In [1]:
import numpy as np 
import pandas as pd 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


Upload the csv and replace any NaN in the Embarked column with Unknown

In [2]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data['Embarked'] = train_data['Embarked'].fillna('Unknown')
train_data.head()
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Import One Hot Encoder in order to transform columns with categorical data like Embarked and Sex to quantitative data

In [3]:
ohe = OneHotEncoder(handle_unknown = 'ignore',sparse_output = False).set_output(transform="pandas")
ohetransform = ohe.fit_transform(train_data[["Sex"]])
ohetransformtwo = ohe.fit_transform(train_data[["Embarked"]])
ohetransformthree = ohe.fit_transform(test_data[["Sex"]])
ohetransformfour = ohe.fit_transform(test_data[["Embarked"]])
train_data = pd.concat([train_data,ohetransformtwo],axis=1).drop(columns = ["Embarked"])
train_data = pd.concat([train_data,ohetransform],axis=1).drop(columns= ["Sex"])
test_data = pd.concat([test_data,ohetransformthree],axis=1).drop(columns = ["Embarked"])
test_data = pd.concat([test_data,ohetransformfour],axis=1).drop(columns= ["Sex"])

## For the missing values in Age and Fare, use the median to insert a value

In [4]:
train_data['Age'] = train_data['Age'].fillna(train_data.groupby('Pclass')['Age'].transform('median'))
test_data['Age'] = test_data['Age'].fillna(test_data.groupby('Pclass')['Age'].transform('median'))
test_data['Fare'] = test_data['Fare'].fillna(test_data.groupby('Pclass')['Fare'].transform('median'))

## Ensure that all columns are as wanted

In [5]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S,Embarked_Unknown,Sex_female,Sex_male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0.0,0.0,1.0,0.0,0.0,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1.0,0.0,0.0,0.0,1.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0.0,0.0,1.0,0.0,1.0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0.0,0.0,1.0,0.0,1.0,0.0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0.0,0.0,1.0,0.0,0.0,1.0


In [6]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,,0.0,1.0,0.0,1.0,0.0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0,,1.0,0.0,0.0,0.0,1.0
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,,0.0,1.0,0.0,1.0,0.0
3,895,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,,0.0,1.0,0.0,0.0,1.0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,,1.0,0.0,0.0,0.0,1.0


## Preprocess data in order to train model, split into 2 sections

In [7]:
Y = train_data.Survived
feature_names = ["Pclass","Sex_female","Sex_male","Age","SibSp","Parch","Fare","Embarked_C","Embarked_Q","Embarked_S"]
X = train_data[feature_names]
train_X, val_X, train_y, val_y = train_test_split(X,Y)

In [8]:
print(X.describe())
print(X.head())

           Pclass  Sex_female    Sex_male         Age       SibSp       Parch  \
count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000   
mean     2.308642    0.352413    0.647587   29.066409    0.523008    0.381594   
std      0.836071    0.477990    0.477990   13.244532    1.102743    0.806057   
min      1.000000    0.000000    0.000000    0.420000    0.000000    0.000000   
25%      2.000000    0.000000    0.000000   22.000000    0.000000    0.000000   
50%      3.000000    0.000000    1.000000   26.000000    0.000000    0.000000   
75%      3.000000    1.000000    1.000000   37.000000    1.000000    0.000000   
max      3.000000    1.000000    1.000000   80.000000    8.000000    6.000000   

             Fare  Embarked_C  Embarked_Q  Embarked_S  
count  891.000000  891.000000  891.000000  891.000000  
mean    32.204208    0.188552    0.086420    0.722783  
std     49.693429    0.391372    0.281141    0.447876  
min      0.000000    0.000000    0.000000    0

## Ensure no columns are empty

In [9]:
#titanic_model.fit(X,Y)
print(X.isna().sum())

Pclass        0
Sex_female    0
Sex_male      0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64


In [10]:
print(test_data[feature_names].isna().sum())

Pclass        0
Sex_female    0
Sex_male      0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64


In [11]:
train_data.head()
train_data.dropna(axis=1)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S,Embarked_Unknown,Sex_female,Sex_male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0.0,0.0,1.0,0.0,0.0,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1.0,0.0,0.0,0.0,1.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0.0,0.0,1.0,0.0,1.0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0.0,0.0,1.0,0.0,1.0,0.0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0.0,0.0,1.0,0.0,0.0,1.0


## Feature engineer the max_leaf_nodes sizes of a DecisionTreeClassifier and a RandomForestClassifier model to see which is the lowest

In [12]:
feature_names = ["Pclass","Sex_female","Sex_male","Age","SibSp","Parch","Fare","Embarked_C","Embarked_Q","Embarked_S"]
X = train_data[feature_names]
train_X, val_X, train_y, val_y = train_test_split(X,Y,random_state=0)

def get_mae(max_leaf_nodes,train_X,val_X,train_y,val_y):
    model = DecisionTreeClassifier(max_leaf_nodes = max_leaf_nodes,random_state =25)
    model.fit(train_X,train_y)
    prediction = model.predict(val_X)
    mae = mean_absolute_error(val_y,prediction)
    return mae

def get_mae_rf(max_leaf_nodes,train_X,val_X,train_y,val_y):
    model = RandomForestClassifier(max_leaf_nodes = max_leaf_nodes,random_state =25)
    model.fit(train_X,train_y)
    prediction = model.predict(val_X)
    mae = mean_absolute_error(val_y,prediction)
    return mae



list = []
list_two = []
x = range(100)
for i in x:
    i = i +2
    mae = get_mae(i,train_X,val_X,train_y,val_y)
    list.append(mae)
    print(mae, "the number is: ", i)
    
print(min(list),"at index:", np.argmin(list))
for a in x:
    a = a +2
    mae = get_mae_rf(a,train_X,val_X,train_y,val_y)
    list_two.append(mae)
    print(mae, "the number is: ", a)
    
print(min(list),"at index:", np.argmin(list)+2)
print(min(list_two)," at index: ", np.argmin(list_two)+2)

0.21973094170403587 the number is:  2
0.21973094170403587 the number is:  3
0.2242152466367713 the number is:  4
0.20179372197309417 the number is:  5
0.17937219730941703 the number is:  6
0.17937219730941703 the number is:  7
0.17488789237668162 the number is:  8
0.17488789237668162 the number is:  9
0.17488789237668162 the number is:  10
0.17488789237668162 the number is:  11
0.16591928251121077 the number is:  12
0.16591928251121077 the number is:  13
0.17040358744394618 the number is:  14
0.17040358744394618 the number is:  15
0.17040358744394618 the number is:  16
0.17040358744394618 the number is:  17
0.16591928251121077 the number is:  18
0.16591928251121077 the number is:  19
0.16591928251121077 the number is:  20
0.17040358744394618 the number is:  21
0.16591928251121077 the number is:  22
0.16591928251121077 the number is:  23
0.16591928251121077 the number is:  24
0.17040358744394618 the number is:  25
0.17040358744394618 the number is:  26
0.17488789237668162 the number is:

## Random Forest Model was the lowest mean absolute error with 36 leaf nodes, create and train model

In [13]:
rf_model = RandomForestClassifier(max_leaf_nodes =36,random_state=25)
rf_model.fit(train_X,train_y)
rf_prediction = rf_model.predict(val_X)
rf_mae = mean_absolute_error(rf_prediction,val_y)
rf_model.fit(X,Y)

In [14]:
predictions = rf_model.predict(test_data[feature_names])

In [15]:
#output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
#output['Survived'] = output['Survived'].round().astype(int)
#output.to_csv('submission.csv', index=False)
output = pd.DataFrame({'PassengerID': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv',index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
