In [1]:
import pandas as pd
import mlflow
import logging
import dvc.api 
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score,precision_score,f1_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Only surface warnings and errors from the libraries used in this notebook.
logging.basicConfig(level=logging.WARNING)

# Notebook-level logger, named after the running module.
logger = logging.getLogger(__name__)

In [3]:
# Location of the dataset inside the DVC-tracked repository.
path = 'data/titanic_data.csv'

# Repository that DVC resolves the file against.
# NOTE(review): absolute, machine-specific path; confirm before running elsewhere.
repo = "D:/ITI/data science/project/.git"

# Git revision (tag/branch/commit) of the dataset to load.
version = 'v2'

In [4]:
# Resolve where the requested revision of the dataset is stored, then read it.
data_url = dvc.api.get_url(
    path=path,
    repo=repo,
    rev=version,
)

# Comma is read_csv's default separator, so it does not need to be spelled out.
data = pd.read_csv(data_url)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [5]:
# Inspect column dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [6]:
# Impute missing values: numeric columns get their column mean, and the
# categorical 'Embarked' column gets its most frequent value.
# numeric_only=True keeps the mean from being attempted on the object columns
# ('Sex', 'Embarked'), which raises TypeError on pandas >= 2.0.
data = data.fillna(data.mean(numeric_only=True))

# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form does not update `data` when
# pandas Copy-on-Write is active.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode().iloc[0])

In [7]:
# Re-check non-null counts to confirm the imputation left no missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [8]:
# One-hot encode the two categorical columns; drop_first removes one level per
# column so the remaining indicator columns are not redundant.
data = pd.get_dummies(
    data,
    drop_first=True,
    columns=['Embarked', 'Sex'],
)
data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Sex_male
0,0,3,22.0,1,0,7.25,0,1,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,1,0
3,1,1,35.0,1,0,53.1,0,1,0
4,0,3,35.0,0,0,8.05,0,1,1


In [9]:
# Separate the feature matrix from the target column.
# `columns=` is used because passing the axis positionally
# (`drop('Survived', 1)`) is no longer accepted as of pandas 2.0.
X = data.drop(columns='Survived')
y = data['Survived']

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
def eval_metrics(actual, pred):
    """Score predictions against the true labels.

    Args:
        actual: Ground-truth labels.
        pred: Predicted labels, aligned with ``actual``.

    Returns:
        A ``(recall, precision, f1)`` tuple, in that order.
    """
    recall = recall_score(actual, pred)
    precision = precision_score(actual, pred)
    f1 = f1_score(actual, pred)
    return recall, precision, f1
    

In [12]:
# Decision-tree hyperparameters, passed to the classifier and logged with the run.
maxDepth = 30  # max_depth
minLeaf = 3    # min_samples_leaf
minSplit = 5   # min_samples_split

In [14]:
# Point MLflow at the tracking server and select the experiment for this run.
remote_server_uri = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment("/titanic")

# Seed for the tree. Without a fixed random_state scikit-learn may choose
# between equally good splits differently on each fit, so the logged metrics
# would not be repeatable.
tree_seed = 42

with mlflow.start_run():
    # Train the classifier with the configured hyperparameters.
    clf = DecisionTreeClassifier(
        max_depth=maxDepth,
        min_samples_split=minSplit,
        min_samples_leaf=minLeaf,
        random_state=tree_seed,
    )
    clf.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_test_predicted = clf.predict(X_test)
    recall, precision, f1 = eval_metrics(y_test, y_test_predicted)

    # Record data provenance, so the run can be traced to a dataset version.
    mlflow.log_param('data_url', data_url)
    mlflow.log_param("data_version", version)
    mlflow.log_param("input_rows", data.shape[0])
    mlflow.log_param("input_cols", data.shape[1])

    # Record the model configuration, including the seed.
    mlflow.log_param("max_depth", maxDepth)
    mlflow.log_param("min_samples_split", minSplit)
    mlflow.log_param("min_samples_leaf", minLeaf)
    mlflow.log_param("random_state", tree_seed)

    # Record the evaluation metrics.
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("f1", f1)

print(recall, precision, f1)

0.7837837837837838 0.8169014084507042 0.8
