In [None]:
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn
!pip install mlflow

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
import mlflow
#cross validation
from sklearn.model_selection import cross_val_score

# Load the dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(url)

data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [2]:
X=data[['Age','Sex','Pclass','SibSp','Parch','Fare']]
y=data['Survived']

In [3]:
X.isnull().sum()

Age       177
Sex         0
Pclass      0
SibSp       0
Parch       0
Fare        0
dtype: int64

In [25]:
class AgeImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['Age'] = X['Age'].fillna(X['Age'].mean())
        return X
    

In [5]:
# Add IsChild feature
class ChildAdder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X['IsChild'] = (X['Age'] < 18).astype(int)
        return X
    

In [6]:
# Custom Encoder
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = X.copy()
        X['Sex'] = X['Sex'].map({'male': 1, 'female': 0})
        return X

In [7]:
# Apply MinMax scaler for Age and Fare
class Scaler(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = X.copy()
        X['Age'] = (X['Age'] - X['Age'].min()) / (X['Age'].max() - X['Age'].min())
        X['Fare'] = (X['Fare'] - X['Fare'].min()) / (X['Fare'].max() - X['Fare'].min())
        return X
    


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def fit_and_evaluate_pipeline(pipeline, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):

    pipeline.fit(X_train, y_train)
    # Evaluate the model
    y_pred = pipeline.predict(X_test)
    Y_train_pred=pipeline.predict(X_train)
    print("Train_Accuracy:", accuracy_score(y_train, Y_train_pred))
    print("Test_Accuracy:", accuracy_score(y_test, y_pred))
    print("Train_Precision:", precision_score(y_train, Y_train_pred))
    print("Test_Precision:", precision_score(y_test, y_pred))
    print("Train_Recall:", recall_score(y_train, Y_train_pred))
    print("Test_Recall:", recall_score(y_test, y_pred))
    scores = cross_val_score(pipeline, X, y, cv=5)
    print("Cross-validation scores:", scores)
    print("Mean cross-validation score:", scores.mean())


In [26]:
# use for random forest
RFC_Pipeline=Pipeline([
    ('age_imputer',AgeImputer()),
    ('child_adder',ChildAdder()),
    ('categorical_encoder',CategoricalEncoder()),
    ('scaler',Scaler()),
    ('random_forest',RandomForestClassifier( n_estimators=100, random_state=42, max_depth=5))

])

fit_and_evaluate_pipeline(RFC_Pipeline)

Train_Accuracy: 0.848314606741573
Test_Accuracy: 0.8324022346368715
Train_Precision: 0.8539823008849557
Test_Precision: 0.8666666666666667
Train_Recall: 0.7201492537313433
Test_Recall: 0.7027027027027027
Cross-validation scores: [0.73743017 0.80898876 0.8258427  0.79775281 0.83707865]
Mean cross-validation score: 0.801418617789216


In [27]:
# use for logistic regression
from sklearn.linear_model import LogisticRegression
LR_Pipeline=Pipeline([
    ('age_imputer',AgeImputer()),
    ('child_adder',ChildAdder()),
    ('categorical_encoder',CategoricalEncoder()),
    ('scaler',Scaler()),
    ('logistic_regression',LogisticRegression())
])

fit_and_evaluate_pipeline(LR_Pipeline)

Train_Accuracy: 0.8033707865168539
Test_Accuracy: 0.7932960893854749
Train_Precision: 0.7711864406779662
Test_Precision: 0.7761194029850746
Train_Recall: 0.6791044776119403
Test_Recall: 0.7027027027027027
Cross-validation scores: [0.80446927 0.80337079 0.78089888 0.76404494 0.8258427 ]
Mean cross-validation score: 0.7957253154227607


In [28]:
# USE FOR decision tree
from sklearn.tree import DecisionTreeClassifier
DT_Pipeline=Pipeline([
    ('age_imputer',AgeImputer()),
    ('child_adder',ChildAdder()),
    ('categorical_encoder',CategoricalEncoder()),
    ('scaler',Scaler()),
    ('decision_tree',DecisionTreeClassifier( max_depth=5, random_state=42))
])

fit_and_evaluate_pipeline(DT_Pipeline)

Train_Accuracy: 0.8581460674157303
Test_Accuracy: 0.7821229050279329
Train_Precision: 0.8883720930232558
Test_Precision: 0.8431372549019608
Train_Recall: 0.7126865671641791
Test_Recall: 0.581081081081081
Cross-validation scores: [0.79329609 0.82022472 0.79213483 0.78651685 0.84269663]
Mean cross-validation score: 0.806973824618668


In [29]:
# USE FOR KNN
from sklearn.neighbors import KNeighborsClassifier
KNN_Pipeline=Pipeline([
    ('age_imputer',AgeImputer()),
    ('child_adder',ChildAdder()),
    ('categorical_encoder',CategoricalEncoder()),
    ('scaler',Scaler()),
    ('knn',KNeighborsClassifier( n_neighbors=5))
])

fit_and_evaluate_pipeline(KNN_Pipeline)

Train_Accuracy: 0.8665730337078652
Test_Accuracy: 0.7653631284916201
Train_Precision: 0.8559670781893004
Test_Precision: 0.7580645161290323
Train_Recall: 0.7761194029850746
Test_Recall: 0.6351351351351351
Cross-validation scores: [0.77094972 0.74157303 0.80898876 0.84269663 0.85393258]
Mean cross-validation score: 0.8036281463812692


In [30]:
# USE FOR SVC
from sklearn.svm import SVC
SVC_Pipeline=Pipeline([
    ('age_imputer',AgeImputer()),
    ('child_adder',ChildAdder()),
    ('categorical_encoder',CategoricalEncoder()),
    ('scaler',Scaler()),
    ('svc',SVC())
])

fit_and_evaluate_pipeline(SVC_Pipeline)

Train_Accuracy: 0.827247191011236
Test_Accuracy: 0.8156424581005587
Train_Precision: 0.8085106382978723
Test_Precision: 0.8059701492537313
Train_Recall: 0.7089552238805971
Test_Recall: 0.7297297297297297
Cross-validation scores: [0.83240223 0.82022472 0.82022472 0.80898876 0.8258427 ]
Mean cross-validation score: 0.8215366267026551


In [18]:
import mlflow 

mlflow.set_experiment("Titanic-Experiment")
mlflow.set_tracking_uri('http://127.0.0.1:5000')
with mlflow.start_run(nested=True):
    estimator=50
    depth=10
    # use for random forest
    
    RFC_Pipeline=Pipeline([
        ('age_imputer',AgeImputer()),
        ('child_adder',ChildAdder()),
        ('categorical_encoder',CategoricalEncoder()),
        ('scaler',Scaler()),
        ('random_forest',RandomForestClassifier( n_estimators=estimator, random_state=42, max_depth=depth))

    ])
    fit_and_evaluate_pipeline(RFC_Pipeline)
    mlflow.log_param("Random Forest", "Random Forest Pipeline")

    mlflow.log_param("n_estimators", estimator)

    mlflow.log_param("max_depth", depth)
    mlflow.log_metric("Train_accuracy", accuracy_score(y_train, RFC_Pipeline.predict(X_train)))
    mlflow.log_metric("Test_accuracy", accuracy_score(y_test, RFC_Pipeline.predict(X_test)))
    mlflow.log_metric("Train_precision", precision_score(y_train, RFC_Pipeline.predict(X_train)))
    mlflow.log_metric("Test_precision", precision_score(y_test, RFC_Pipeline.predict(X_test)))
    mlflow.log_metric("Train_recall", recall_score(y_train, RFC_Pipeline.predict(X_train)))
    mlflow.log_metric("Test_recall", recall_score(y_test, RFC_Pipeline.predict(X_test)))

    mlflow.log_metric("cross_val", cross_val_score(RFC_Pipeline, X, y, cv=5).mean())

    mlflow.sklearn.log_model(RFC_Pipeline, "pipline")

Train_Accuracy: 0.9297752808988764
Test_Accuracy: 0.7932960893854749
Train_Precision: 0.9698275862068966
Test_Precision: 0.8245614035087719
Train_Recall: 0.8395522388059702
Test_Recall: 0.6351351351351351
Cross-validation scores: [0.77094972 0.8258427  0.81460674 0.80898876 0.83146067]
Mean cross-validation score: 0.810369719414977


2024/08/19 18:34:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run whimsical-yak-450 at: http://127.0.0.1:5000/#/experiments/106823123035668066/runs/21022e0ea6f14654ae860712cfc6689f.
2024/08/19 18:34:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/106823123035668066.
