In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [14]:
def dataprocessing(data):
    """Turn a raw passenger table into an all-numeric feature frame.

    Processing steps, in order:

    * drop 'Name' and the five spending columns
      ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck');
    * fill missing 'Age' with the mean of the known ages, truncated to an int;
    * derive 'Group' from 'PassengerId' and drop the id column;
    * split 'Cabin' on '/' into 'Deck' and 'Side' (the middle part is
      discarded) and drop 'Cabin';
    * label missing 'Deck', 'Side' and 'Destination' values as 'Missing';
    * encode 'CryoSleep' and 'VIP' as 0/1, each from its own column;
    * one-hot encode whatever non-numeric columns remain.

    The input frame is never modified, so the function can be re-run on the
    same frame and used on any frame (train or test) without relying on
    notebook globals.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'PassengerId', 'Cabin', 'Destination',
        'Age', 'CryoSleep', 'VIP', 'Name' and the five spending columns.

    Returns
    -------
    pandas.DataFrame
        New frame with integer dummy columns from ``pd.get_dummies``.
    """
    dropped_columns = ['Name', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    # drop() returns a new frame, so every assignment below leaves the
    # caller's `data` untouched.
    features = data.drop(columns=dropped_columns)

    # Plain assignment instead of `col.fillna(..., inplace=True)`: the chained
    # in-place form may act on a temporary and not reach the frame.
    mean_age = features['Age'].mean(skipna=True)
    if pd.notna(mean_age):  # all ages missing -> nothing sensible to fill with
        features['Age'] = features['Age'].fillna(int(mean_age))

    # NOTE(review): this keeps the token *after* the underscore. If ids look
    # like '<group>_<member>' this is the member index, not the group - confirm
    # which one is intended before changing it.
    features['Group'] = features['PassengerId'].apply(lambda x: int(x.split('_')[1]))
    features = features.drop(columns='PassengerId')

    # reindex guarantees three columns even when no row has a full cabin string.
    cabin_parts = features['Cabin'].str.split('/', expand=True).reindex(columns=range(3))
    features['Deck'] = cabin_parts[0].fillna('Missing')
    features['Side'] = cabin_parts[2].fillna('Missing')
    features = features.drop(columns='Cabin')

    features['Destination'] = features['Destination'].fillna('Missing')

    # eq(True) is False for missing values, so unknown flags become 0
    # (a bare truthiness test would turn NaN into 1).
    for flag in ('CryoSleep', 'VIP'):
        features[flag] = features[flag].eq(True).astype(int)

    return pd.get_dummies(features, dtype=int)

In [15]:
# Read the labelled training set.
train_data = pd.read_csv('train.csv')

# Keep the ids aside, then pull the label out of the feature frame:
# pop() removes 'Transported' from train_data in place and returns it.
train_IDs = train_data['PassengerId']
target_train = train_data.pop('Transported')

# Build the model-ready features and persist them for inspection.
clean_train_data = dataprocessing(train_data)
clean_train_data.to_csv('clean_train_2.csv', index = False)

In [16]:
# Hold out 20% of the cleaned training rows for evaluation; the fixed
# random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    clean_train_data,
    target_train,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Candidate models, keyed by a short alias. Each one is a two-step pipeline:
# MinMaxScaler followed by the classifier (seeded for repeatable fits).
pipelines = dict(
    rf=make_pipeline(MinMaxScaler(), RandomForestClassifier(random_state=1234)),
    gb=make_pipeline(MinMaxScaler(), GradientBoostingClassifier(random_state=1234)),
)

# Hyper-parameter grids, keyed by the same aliases as `pipelines`.
# Parameter names use the '<pipeline step>__<parameter>' form.
grid = dict(
    rf={'randomforestclassifier__n_estimators': [100, 200, 300]},
    gb={'gradientboostingclassifier__n_estimators': [100, 200, 300]},
)

In [None]:
# Fitted grid searches, keyed by the same aliases as `pipelines`.
fit_models = {}

# Run a 10-fold cross-validated grid search for every candidate pipeline,
# using all available cores.
for algo, pipeline in pipelines.items():
    print(f'Training the {algo} model.')
    # fit() returns the fitted search object itself, so it can be stored directly.
    fit_models[algo] = GridSearchCV(
        estimator=pipeline,
        param_grid=grid[algo],
        n_jobs=-1,
        cv=10,
    ).fit(X_train, y_train)


# Score every fitted search on the held-out split.
for algo, model in fit_models.items():
    yhat = model.predict(X_test)
    accuracy, precision, recall = (
        metric(y_test, yhat)
        for metric in (accuracy_score, precision_score, recall_score)
    )
    print(f'Metrics for {algo}: accuracy- {accuracy}, recall- {recall}, precision- {precision}')

Training the rf model.


In [12]:
# Read the unlabelled test set and keep its ids for the submission file.
test_data = pd.read_csv('test.csv')

test_IDs = test_data['PassengerId']
clean_test_data = dataprocessing(test_data)

# One-hot encoding is computed separately for each file, so a category that
# appears in only one of them yields a different column set. Align the test
# features to the columns the models were fitted on (same names, same order);
# dummy columns absent from the test file are filled with 0.
clean_test_data = clean_test_data.reindex(columns=clean_train_data.columns, fill_value=0)

# Predict with the gradient-boosting grid search fitted in the training cell.
y_results = fit_models['gb'].predict(clean_test_data)

# Write the submission: one row per passenger id with its predicted label.
pd.DataFrame({
    'PassengerId':test_IDs,
    'Transported':y_results
}).to_csv('results.csv', index=False)
