# **Model Trainer - Titanic (Machine Learning from Disaster)**

### **Importing Libraries**

In [1]:
import os
import numpy as np
import pandas as pd
import joblib

from dataclasses import dataclass

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

### **Data Ingestion**

In [2]:
@dataclass
class DataIngestionConfig:
    root_dir: str = os.path.abspath(os.path.join(os.getcwd(), '..'))  # Assuming the script is inside the notebook folder
    artifacts_dir: str = os.path.join(root_dir, 'artifacts')
    train_data_path: str=os.path.join(artifacts_dir,'train.csv')
    test_data_path: str=os.path.join(artifacts_dir,'test.csv')

class DataIngestion:
    def __init__(self):
        self.ingestion_config=DataIngestionConfig()

    def initiate_data_ingestion(self):
        try:
            train_data=pd.read_csv('D:/Projects/KaggalProjects/Titanic_Machine_Learning_from_Disaster/notebook/train.csv')
            test_data=pd.read_csv('D:/Projects/KaggalProjects/Titanic_Machine_Learning_from_Disaster/notebook/test.csv')

            os.makedirs(self.ingestion_config.artifacts_dir,exist_ok=True)

            train_data.to_csv(self.ingestion_config.train_data_path,index=False,header=True)
            test_data.to_csv(self.ingestion_config.test_data_path,index=False,header=True)

            return train_data,test_data

        except Exception as e:
            print('The error is: ',e)

obj=DataIngestion()
titanic_train_data,titanic_test_data=obj.initiate_data_ingestion()

### **Data Cleaning and Transformation**

In [3]:
class DataTransformation:
    def data_imputing(self,train_data,test_data):
        '''
        This function is used to impute the missing values.
        - In the training data, the Embarked column has two missing values and will be replaced with the most occuring embarked value.
        - There is one missing value in Fare column of test data and will be replaced with the mode of the respective pClass.
        - There are some missing values in Age feature of both training and testing and will be replaced with the mean of the Age column.
        '''
        try:
            imputer=SimpleImputer(strategy='mean')
            
            #----------------------------------------------------------------------------------------------------------------------------------------
                                                                    #Training Part
            #----------------------------------------------------------------------------------------------------------------------------------------

            mode_value = train_data['Embarked'].mode()[0]  # Calculate the mode of the column
            train_data['Embarked'].fillna(mode_value, inplace=True) 
            train_data['Age']=imputer.fit_transform(train_data[['Age']])
            
            #----------------------------------------------------------------------------------------------------------------------------------------
                                                                    #Testing Part
            #----------------------------------------------------------------------------------------------------------------------------------------
            
            missing_index = test_data['Fare'].isnull()
            for idx in test_data[missing_index].index:
                pclass_value = test_data.loc[idx, 'Pclass']
                mode_fare = test_data[test_data['Pclass'] == pclass_value]['Fare'].mode()[0]
                test_data.loc[idx, 'Fare'] = mode_fare
            
            test_data['Age']=imputer.fit_transform(test_data[['Age']])

            return train_data,test_data
        
        except Exception as e:
            print('The error is: ',e)
    
    def feature_encoder(self,train_data,test_data):
        '''
        This function is used to give OneHotEncoder values to categorical columns.
        - Categorical columns like Embarked and Sex are encoded using OneHotEncoder.
        '''
        try:
            encoder=OneHotEncoder(sparse_output=False)

            #----------------------------------------------------------------------------------------------------------------------------------------
                                                                    #Training Part
            #----------------------------------------------------------------------------------------------------------------------------------------
            
            encoded_embarked_train=encoder.fit_transform(train_data[["Embarked"]])
            encoded_col_names = ['Embarked_' + str(cat) for cat in encoder.categories_[0]]
            encoded_embarked_df_train = pd.DataFrame(encoded_embarked_train,columns=encoded_col_names,index=train_data.index)
            train_data = train_data.join(encoded_embarked_df_train)

            encoded_sex_train=encoder.fit_transform(train_data[["Sex"]])
            encoded_col_names = ['Sex_' + str(cat) for cat in encoder.categories_[0]]
            encoded_sex_df_train = pd.DataFrame(encoded_sex_train,columns=encoded_col_names,index=train_data.index)
            train_data = train_data.join(encoded_sex_df_train)
            
            #----------------------------------------------------------------------------------------------------------------------------------------
                                                                    #Testing Part
            #----------------------------------------------------------------------------------------------------------------------------------------

            encoded_embarked_test=encoder.fit_transform(test_data[["Embarked"]])
            encoded_col_names = ['Embarked_' + str(cat) for cat in encoder.categories_[0]]
            encoded_embarked_df_test = pd.DataFrame(encoded_embarked_test,columns=encoded_col_names,index=test_data.index)
            test_data = test_data.join(encoded_embarked_df_test)

            encoded_sex_test=encoder.fit_transform(test_data[["Sex"]])
            encoded_col_names = ['Sex_' + str(cat) for cat in encoder.categories_[0]]
            encoded_sex_df_test = pd.DataFrame(encoded_sex_test,columns=encoded_col_names,index=test_data.index)
            test_data = test_data.join(encoded_sex_df_test)

            return train_data,test_data
        
        except Exception as e:
            print('The error is: ',e)
    
    def feature_dropper(self,train_data,test_data):
        '''
        This function is used to drop few features from the dataframe.
        - Features like Name, Sex, Cabin, Embarked, Ticket are dropped from the dataframe.
        '''
        try:
            #----------------------------------------------------------------------------------------------------------------------------------------
                                                                    #Training Part
            #----------------------------------------------------------------------------------------------------------------------------------------

            train_data=train_data.drop(["Name", "Sex", "Cabin", "Embarked", "Ticket"],axis=1)

            #----------------------------------------------------------------------------------------------------------------------------------------
                                                                    #Testing Part
            #----------------------------------------------------------------------------------------------------------------------------------------

            test_data=test_data.drop(["Name", "Sex", "Cabin", "Embarked", "Ticket"],axis=1)

            return train_data,test_data
        
        except Exception as e:
            print('The error is: ',e)


In [4]:
data_impute=DataTransformation()
impute_train_data,impute_test_data=data_impute.data_imputing(titanic_train_data,titanic_test_data)
encoded_train_data,encoded_test_data=data_impute.feature_encoder(impute_train_data,impute_test_data)
train_data,test_data=data_impute.feature_dropper(encoded_train_data,encoded_test_data)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Embarked'].fillna(mode_value, inplace=True)


### **Utilities**

In [5]:
def model_metrics(true, predicted):
    try:
        mae = mean_absolute_error(true, predicted)
        mse = mean_squared_error(true, predicted)
        rmse = np.sqrt(mean_squared_error(true, predicted))
        r2_square = r2_score(true, predicted)
        acc=accuracy_score(true, predicted)
        precision=precision_score(true, predicted)
        recall=recall_score(true, predicted)
        f1=f1_score(true, predicted)

        return mae, rmse, r2_square,acc,precision,recall,f1

    except Exception as e:
        print('The error is: ',e)

def print_evaluated_models(xtrain,xtest,ytrain,ytest,models_list):
    # try:
    model_list = []
    r2_list =[]

    for model_name, model in models_list.items():
        if model_name == 'Neural Network':
            model.fit(xtrain, ytrain, epochs=50, batch_size=32, verbose=0)
            y_train_pred = (model.predict(xtrain) > 0.5).astype("int32")
            y_test_pred = (model.predict(xtest) > 0.5).astype("int32")

        else:
            model.fit(xtrain, ytrain.values.flatten()) # Train model
            y_train_pred = model.predict(xtrain)
            y_test_pred = model.predict(xtest)
        
        # Evaluate Train and Test dataset
        model_train_mae , model_train_rmse, model_train_r2,model_train_acc,model_train_precision,model_train_recall,model_train_f1 = model_metrics(ytrain, y_train_pred)
        model_test_mae , model_test_rmse, model_test_r2,model_test_acc,model_test_precision,model_test_recall,model_test_f1 = model_metrics(ytest, y_test_pred)

        print(f"\033[1m{model_name}\033[0m")
        model_list.append(model_name)
        
        print('Model performance for Training set')
        print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
        print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
        print("- R2 Score: {:.4f}".format(model_train_r2))
        print("- Accuracy: {:.4f}".format(model_train_acc))
        print("- Precision: {:.4f}".format(model_train_precision))
        print("- Recall: {:.4f}".format(model_train_recall))
        print("- F1 Score: {:.4f}".format(model_train_f1))

        print('----------------------------------')
        
        print('Model performance for Validation set')
        print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
        print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
        print("- R2 Score: {:.4f}".format(model_test_r2))
        print("- Accuracy: {:.4f}".format(model_test_acc))
        print("- Precision: {:.4f}".format(model_test_precision))
        print("- Recall: {:.4f}".format(model_test_recall))
        print("- F1 Score: {:.4f}".format(model_test_f1))
        r2_list.append(model_test_r2)
        
        print('='*35)
        print('\n')

    # except Exception as e:
    #     print('The error is: ',e)

def save_model(model,model_name):
    '''
    This function is used to save the best model.
    '''
    try:
        root_dir: str = os.path.abspath(os.path.join(os.getcwd(), '..'))  # Assuming the script is inside the notebook folder
        model_dir: str = os.path.join(root_dir, 'model')
        # Ensure the directory exists
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        # Construct the full path
        file_path = os.path.join(model_dir, model_name)

        joblib.dump(model, file_path)

    except Exception as e:
        print('The error is: ',e)

def load_model(model_path):
    '''
    This function is used to load the model.
    '''
    try:
        model=joblib.load(model_path)
        return model
    
    except Exception as e:
        print('The error is: ',e)

### **Model Training**

In [6]:
X=train_data.drop("Survived",axis=1)
y=train_data["Survived"]
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.2,random_state=42)

In [7]:
# Neural Network Model
def create_neural_network():
    model = Sequential([
        Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

models = {
    'Logistic Regression': LogisticRegression(),
    # 'Decision Tree': DecisionTreeClassifier(),
    # 'Random Forest': RandomForestClassifier(random_state=42),
    # 'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    # 'Support Vector Machine': SVC(),
    # 'K-Nearest Neighbors': KNeighborsClassifier()
}

nn_model = create_neural_network()
models['Neural Network'] = nn_model

print_evaluated_models(X_train,X_val,y_train,y_val,models)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


NameError: name 'i' is not defined