<a href="https://colab.research.google.com/github/SuccessPear/Kaggle-Competitions/blob/main/Titanic/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Note
- Sex is highly correlated with Survival
- PassengerID and Name are unique feature -> Drop
- Numeric features: Age, SibSp, Parch, Fare
- Categorical features: Pclass, Sex, Embarked
- Target: Survived
- Cabin has more than 70% missing instances -> Drop

# Kaggle authentication

In [2]:
!mkdir /.kaggle

In [3]:
!cp kaggle.json ~/.kaggle/kaggle.json

In [4]:
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
!kaggle competitions download -c titanic

Downloading titanic.zip to /content
  0% 0.00/34.1k [00:00<?, ?B/s]
100% 34.1k/34.1k [00:00<00:00, 42.9MB/s]


In [6]:
!unzip titanic.zip

Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# Load dataset

In [18]:
import pandas as pd
from matplotlib import pyplot as plt

In [8]:
df = pd.read_csv('train.csv')

# Simple EDA

In [9]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [13]:
df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
PassengerId,891.0,,,,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,,,,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,,,,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Name,891.0,891.0,"Braund, Mr. Owen Harris",1.0,,,,,,,
Sex,891.0,2.0,male,577.0,,,,,,,
Age,714.0,,,,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,,,,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Ticket,891.0,681.0,347082,7.0,,,,,,,
Fare,891.0,,,,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


## Setup pandas-profiling

In [None]:
! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [None]:
from pandas_profiling import ProfileReport

In [21]:
profile = ProfileReport(df, title='Titanic')

In [22]:
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [24]:
df_test = pd.read_csv('test.csv')

In [25]:
profile_test = ProfileReport(df_test, title='Titanic_test')

In [27]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [26]:
profile_test.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

# Preprocessing data

In [67]:
selected_df = df.drop(['PassengerId', 'Survived', 'Name', 'Cabin', 'Ticket'], axis=1)

In [68]:
# - Numeric features: Age, SibSp, Parch, Fare
# - Categorical features: Pclass, Sex, Embarked
def preprocessing(df):
  # Fill missing values in numeric features
  df['Age'].fillna(df['Age'].mean(), inplace=True)
  df['SibSp'].fillna(df['SibSp'].mean(), inplace=True)
  df['Parch'].fillna(df['Parch'].mean(), inplace=True)
  df['Fare'].fillna(df['Fare'].mean(), inplace=True)

  # Fill missing values in categorical features
  df['Pclass'].fillna('Unknown', inplace=True)
  df['Sex'].fillna('Unknown', inplace=True)
  df['Embarked'].fillna('Unknown', inplace=True)

  return df


In [90]:
X = preprocessing(selected_df)
y = df.Survived

In [91]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  891 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB


# Train test split

In [92]:
from sklearn.model_selection import train_test_split

In [149]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=39)

# Create ML pipelines

In [94]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score

In [155]:
# Categorical features
cat_features = ['Pclass', 'Sex', 'Embarked']
cat_pipeline = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])
# Numeric features
num_features = ['Age', 'SibSp', 'Parch', 'Fare']
num_pipeline = Pipeline([('scaler', StandardScaler())])

In [156]:
ct = ColumnTransformer([('cat_pipeline', cat_pipeline, cat_features),
                        ('num_pipeline', num_pipeline, num_features)])

In [157]:
# Fit the column transformer with train data
X_train_ct = ct.fit_transform(X_train)
X_train_ct = pd.DataFrame(X_train_ct, columns=ct.get_feature_names_out())
# Use the ct to transform test data
X_test_ct = ct.transform(X_test)
X_test_ct = pd.DataFrame(X_test_ct, columns=ct.get_feature_names_out())

In [158]:
X_train_ct.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 13 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   cat_pipeline__Pclass_1          712 non-null    float64
 1   cat_pipeline__Pclass_2          712 non-null    float64
 2   cat_pipeline__Pclass_3          712 non-null    float64
 3   cat_pipeline__Sex_female        712 non-null    float64
 4   cat_pipeline__Sex_male          712 non-null    float64
 5   cat_pipeline__Embarked_C        712 non-null    float64
 6   cat_pipeline__Embarked_Q        712 non-null    float64
 7   cat_pipeline__Embarked_S        712 non-null    float64
 8   cat_pipeline__Embarked_Unknown  712 non-null    float64
 9   num_pipeline__Age               712 non-null    float64
 10  num_pipeline__SibSp             712 non-null    float64
 11  num_pipeline__Parch             712 non-null    float64
 12  num_pipeline__Fare              712 

In [159]:
X_test_ct.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179 entries, 0 to 178
Data columns (total 13 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   cat_pipeline__Pclass_1          179 non-null    float64
 1   cat_pipeline__Pclass_2          179 non-null    float64
 2   cat_pipeline__Pclass_3          179 non-null    float64
 3   cat_pipeline__Sex_female        179 non-null    float64
 4   cat_pipeline__Sex_male          179 non-null    float64
 5   cat_pipeline__Embarked_C        179 non-null    float64
 6   cat_pipeline__Embarked_Q        179 non-null    float64
 7   cat_pipeline__Embarked_S        179 non-null    float64
 8   cat_pipeline__Embarked_Unknown  179 non-null    float64
 9   num_pipeline__Age               179 non-null    float64
 10  num_pipeline__SibSp             179 non-null    float64
 11  num_pipeline__Parch             179 non-null    float64
 12  num_pipeline__Fare              179 

In [160]:
ct.get_feature_names_out()

array(['cat_pipeline__Pclass_1', 'cat_pipeline__Pclass_2',
       'cat_pipeline__Pclass_3', 'cat_pipeline__Sex_female',
       'cat_pipeline__Sex_male', 'cat_pipeline__Embarked_C',
       'cat_pipeline__Embarked_Q', 'cat_pipeline__Embarked_S',
       'cat_pipeline__Embarked_Unknown', 'num_pipeline__Age',
       'num_pipeline__SibSp', 'num_pipeline__Parch', 'num_pipeline__Fare'],
      dtype=object)

In [161]:
ml_pipelines = {
    'rf' : Pipeline([('randomforestclassifier', RandomForestClassifier())]),
    'gb' : Pipeline([('gradientboostingclassifier', RandomForestClassifier())])
}

In [169]:
grid = {
    'rf': {
        'randomforestclassifier__n_estimators':[100,200,300]
    },
    'gb': {
        'gradientboostingclassifier__n_estimators':[200,300,400,500]
    }
}

In [170]:
# Create a blank dictionary to hold the models
fit_models = {}
# Loop through all the algos
for algo, pipeline in ml_pipelines.items():
  print(f'Training the {algo} model')
  # Create new Grid Search CV class
  model = GridSearchCV(pipeline, grid[algo], n_jobs=-1, cv=10)
  # Train the model
  model.fit(X_train_ct, y_train)
  # Store results inside of the dictionary
  fit_models[algo] = model

Training the rf model
Training the gb model


In [171]:
print(fit_models['rf'].best_params_)
print(fit_models['gb'].best_params_)

{'randomforestclassifier__n_estimators': 100}
{'gradientboostingclassifier__n_estimators': 500}


# Evaluation on test data

In [167]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [172]:
# Evaluate the performance of the model
for algo, model in fit_models.items():
  yhat = model.predict(X_test_ct)
  accuracy = accuracy_score(y_test, yhat)
  precision = precision_score(y_test, yhat)
  recall = recall_score(y_test, yhat)
  print(f'Metrics for {algo}: accuracy- {accuracy}, recall- {recall}, precision- {precision}')

Metrics for rf: accuracy- 0.7932960893854749, recall- 0.7142857142857143, precision- 0.6557377049180327
Metrics for gb: accuracy- 0.7932960893854749, recall- 0.7142857142857143, precision- 0.6557377049180327


# Submit to Kaggle

In [173]:
# Read test.csv
test_df = pd.read_csv('test.csv')

In [174]:
# Make a deep copy
abt_test = test_df.copy()
# Run through the preprocessing pipeline
preprocessing(abt_test)
# Column Transformer on test data
abt_test = ct.transform(abt_test)


In [176]:
yhat_test = fit_models['rf'].predict(abt_test)



In [178]:
submission = pd.DataFrame([test_df['PassengerId'], yhat_test]).T
submission.columns = ['PassengerId', 'Survived']

In [180]:
submission.to_csv('kaggle_submission.csv', index=False)

In [181]:
!kaggle competitions submit -c titanic -f "kaggle_submission.csv" -m "initial rf model"

100% 2.77k/2.77k [00:00<00:00, 3.65kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster