# **Model Trainer - Titanic (Machine Learning from Disaster)**

### **Importing Libraries**

In [10]:
import os
import numpy as np
import pandas as pd

from dataclasses import dataclass

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

### **Data Ingestion**

In [11]:
@dataclass
class DataIngestionConfig:
    root_dir: str = os.path.abspath(os.path.join(os.getcwd(), '..'))  # Assuming the script is inside the notebook folder
    artifacts_dir: str = os.path.join(root_dir, 'artifacts')
    train_data_path: str=os.path.join(artifacts_dir,'train.csv')
    test_data_path: str=os.path.join(artifacts_dir,'test.csv')

class DataIngestion:
    def __init__(self):
        self.ingestion_config=DataIngestionConfig()

    def initiate_data_ingestion(self):
        try:
            train_data=pd.read_csv('D:/Projects/KaggalProjects/Titanic_Machine_Learning_from_Disaster/notebook/train.csv')
            test_data=pd.read_csv('D:/Projects/KaggalProjects/Titanic_Machine_Learning_from_Disaster/notebook/test.csv')

            os.makedirs(self.ingestion_config.artifacts_dir,exist_ok=True)

            train_data.to_csv(self.ingestion_config.train_data_path,index=False,header=True)
            test_data.to_csv(self.ingestion_config.test_data_path,index=False,header=True)

            return train_data,test_data

        except Exception as e:
            print('The error is: ',e)

obj=DataIngestion()
titanic_train_data,titanic_test_data=obj.initiate_data_ingestion()

### **Data Cleaning and Transformation**

In [12]:
class DataTransformation:
    def data_imputing(self,train_data,test_data):
        '''
        This function is used to impute the missing values.
        - In the training data, the Embarked column has two missing values and will be replaced with the most occuring embarked value.
        - There is one missing value in Fare column of test data and will be replaced with the mode of the respective pClass.
        - There are some missing values in Age feature of both training and testing and will be replaced with the mean of the Age column.
        '''
        try:
            imputer=SimpleImputer(strategy='mean')
            
            #----------------------------------------------------------------------------------------------------------------------------------------
                                                                    #Training Part
            #----------------------------------------------------------------------------------------------------------------------------------------

            mode_value = train_data['Embarked'].mode()[0]  # Calculate the mode of the column
            train_data['Embarked'].fillna(mode_value, inplace=True) 
            train_data['Age']=imputer.fit_transform(train_data[['Age']])
            
            #----------------------------------------------------------------------------------------------------------------------------------------
                                                                    #Testing Part
            #----------------------------------------------------------------------------------------------------------------------------------------
            
            missing_index = test_data['Fare'].isnull()
            for idx in test_data[missing_index].index:
                pclass_value = test_data.loc[idx, 'Pclass']
                mode_fare = test_data[test_data['Pclass'] == pclass_value]['Fare'].mode()[0]
                test_data.loc[idx, 'Fare'] = mode_fare
            
            test_data['Age']=imputer.fit_transform(test_data[['Age']])

            return train_data,test_data
        
        except Exception as e:
            print('The error is: ',e)
    
    def feature_encoder(self,train_data,test_data):
        '''
        This function is used to give OneHotEncoder values to categorical columns.
        - Categorical columns like Embarked and Sex are encoded using OneHotEncoder.
        '''
        try:
            encoder=OneHotEncoder(sparse_output=False)

            #----------------------------------------------------------------------------------------------------------------------------------------
                                                                    #Training Part
            #----------------------------------------------------------------------------------------------------------------------------------------
            
            encoded_embarked_train=encoder.fit_transform(train_data[["Embarked"]])
            encoded_col_names = ['Embarked_' + str(cat) for cat in encoder.categories_[0]]
            encoded_embarked_df_train = pd.DataFrame(encoded_embarked_train,columns=encoded_col_names,index=train_data.index)
            train_data = train_data.join(encoded_embarked_df_train)

            encoded_sex_train=encoder.fit_transform(train_data[["Sex"]])
            encoded_col_names = ['Sex_' + str(cat) for cat in encoder.categories_[0]]
            encoded_sex_df_train = pd.DataFrame(encoded_sex_train,columns=encoded_col_names,index=train_data.index)
            train_data = train_data.join(encoded_sex_df_train)
            
            #----------------------------------------------------------------------------------------------------------------------------------------
                                                                    #Testing Part
            #----------------------------------------------------------------------------------------------------------------------------------------

            encoded_embarked_test=encoder.fit_transform(test_data[["Embarked"]])
            encoded_col_names = ['Embarked_' + str(cat) for cat in encoder.categories_[0]]
            encoded_embarked_df_test = pd.DataFrame(encoded_embarked_test,columns=encoded_col_names,index=test_data.index)
            test_data = test_data.join(encoded_embarked_df_test)

            encoded_sex_test=encoder.fit_transform(test_data[["Sex"]])
            encoded_col_names = ['Sex_' + str(cat) for cat in encoder.categories_[0]]
            encoded_sex_df_test = pd.DataFrame(encoded_sex_test,columns=encoded_col_names,index=test_data.index)
            test_data = test_data.join(encoded_sex_df_test)

            return train_data,test_data
        
        except Exception as e:
            print('The error is: ',e)
    
    def feature_dropper(self,train_data,test_data):
        '''
        This function is used to drop few features from the dataframe.
        - Features like Name, Sex, Cabin, Embarked, Ticket are dropped from the dataframe.
        '''
        try:
            #----------------------------------------------------------------------------------------------------------------------------------------
                                                                    #Training Part
            #----------------------------------------------------------------------------------------------------------------------------------------

            train_data=train_data.drop(["Name", "Sex", "Cabin", "Embarked", "Ticket"],axis=1)

            #----------------------------------------------------------------------------------------------------------------------------------------
                                                                    #Testing Part
            #----------------------------------------------------------------------------------------------------------------------------------------

            test_data=test_data.drop(["Name", "Sex", "Cabin", "Embarked", "Ticket"],axis=1)

            return train_data,test_data
        
        except Exception as e:
            print('The error is: ',e)


In [13]:
data_impute=DataTransformation()
impute_train_data,impute_test_data=data_impute.data_imputing(titanic_train_data,titanic_test_data)
encoded_train_data,encoded_test_data=data_impute.feature_encoder(impute_train_data,impute_test_data)
train_data,test_data=data_impute.feature_dropper(encoded_train_data,encoded_test_data)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Embarked'].fillna(mode_value, inplace=True)


### **Utilities**

In [None]:
def evaluate_models(xtrain,ytrain,xtest,ytest,models):
    try:
        report = {}
        for i in range(len(models)):
            model = list(models.values())[i]
            # Train model
            model.fit(xtrain,ytrain)

            # Predict Training data
            y_train_pred = model.predict(xtrain)

            # Predict Testing data
            y_test_pred =model.predict(xtest)

            # Get R2 scores for train and test data
            train_model_score = r2_score(ytrain,y_train_pred)
            test_model_score = r2_score(ytest,y_test_pred)

            report[list(models.keys())[i]] =  test_model_score

        return report

    except Exception as e:
        logging.info('Exception occured during model training')
        raise CustomException(e,sys)