In [None]:
import numpy as np 
import pandas as pd 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import os
# List every file available under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)



Load the CSV files and replace any NaN in the Embarked column with "Unknown"

In [None]:
# Load the Titanic training and test sets from the Kaggle input directory.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

# Treat a missing port of embarkation as its own "Unknown" category.
# The same fill is applied to both frames so they are preprocessed consistently.
train_data['Embarked'] = train_data['Embarked'].fillna('Unknown')
test_data['Embarked'] = test_data['Embarked'].fillna('Unknown')

# Only the last expression of a cell is rendered automatically, so the
# training preview is shown explicitly with the notebook's display() helper.
display(train_data.head())
test_data.head()

## Use a one-hot encoder to convert categorical columns such as Embarked and Sex into numeric indicator columns

In [None]:
# One-hot encode the categorical columns.
# The encoder is fitted on the training data only and then re-used to
# transform the test data, so both frames get exactly the same indicator
# columns. Categories that appear only in the test data are encoded as
# all zeros because of handle_unknown='ignore'.
# NOTE: this cell replaces the "Sex" and "Embarked" columns, so it can only be
# run once per fresh load of the data.
categorical_columns = ["Sex", "Embarked"]

ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform="pandas")
train_encoded = ohe.fit_transform(train_data[categorical_columns])
test_encoded = ohe.transform(test_data[categorical_columns])

# Swap the original categorical columns for their indicator columns
# (named "<column>_<category>", e.g. "Sex_female").
train_data = pd.concat([train_data.drop(columns=categorical_columns), train_encoded], axis=1)
test_data = pd.concat([test_data.drop(columns=categorical_columns), test_encoded], axis=1)

## Fill missing Age and Fare values with the median for the passenger's class (Pclass)

In [None]:
# Per-class medians are computed from the training data only and then applied
# to both frames, so no statistics are derived from the test set.
# (median() skips NaN values by default.)
age_median_by_class = train_data.groupby('Pclass')['Age'].median()
fare_median_by_class = train_data.groupby('Pclass')['Fare'].median()

# Map each row's Pclass to the matching median and use it only where the
# original value is missing.
train_data['Age'] = train_data['Age'].fillna(train_data['Pclass'].map(age_median_by_class))
test_data['Age'] = test_data['Age'].fillna(test_data['Pclass'].map(age_median_by_class))
test_data['Fare'] = test_data['Fare'].fillna(test_data['Pclass'].map(fare_median_by_class))

## Check that the columns look as expected

In [None]:
# Preview the first five rows of the processed training data.
train_data.head(n=5)

In [None]:
# Preview the first five rows of the processed test data.
test_data.head(n=5)

## Select the target and features, then split the data into training and validation sets

In [None]:
# Target column the models are trained to predict.
Y = train_data.Survived

# Numeric and one-hot encoded columns used as model inputs.
feature_names = [
    "Pclass", "Sex_female", "Sex_male", "Age", "SibSp", "Parch", "Fare",
    "Embarked_C", "Embarked_Q", "Embarked_S",
]
X = train_data[feature_names]

# A fixed seed makes the split reproducible across runs; it is the same seed
# used for the split in the tuning cell further down.
train_X, val_X, train_y, val_y = train_test_split(X, Y, random_state=0)

In [None]:
# Print summary statistics followed by the first rows of the feature matrix.
for feature_view in (X.describe(), X.head()):
    print(feature_view)

## Check that no feature columns contain missing values

In [None]:
# Count the missing values in each training feature column.
train_missing_counts = X.isna().sum()
print(train_missing_counts)

In [None]:
# Count the missing values in each feature column of the test data.
test_missing_counts = test_data[feature_names].isna().sum()
print(test_missing_counts)

In [None]:
# dropna() returns a new frame rather than modifying train_data in place, so
# the result has to be assigned for the drop to take effect.
# axis=1 removes every column that still contains at least one NaN.
train_data_complete = train_data.dropna(axis=1)

# Guard: none of the model features may be removed by the drop.
dropped_features = [col for col in feature_names if col not in train_data_complete.columns]
assert not dropped_features, f"feature columns still contain NaN: {dropped_features}"

train_data = train_data_complete
train_data.head()

## Tune the max_leaf_nodes hyperparameter of a DecisionTreeClassifier and a RandomForestClassifier to see which gives the lowest validation error

In [None]:
# Rebuild the feature matrix and make a seeded train/validation split
# for the hyperparameter search below.
feature_names = [
    "Pclass",
    "Sex_female",
    "Sex_male",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked_C",
    "Embarked_Q",
    "Embarked_S",
]
X = train_data.loc[:, feature_names]
split_parts = train_test_split(X, Y, random_state=0)
train_X, val_X, train_y, val_y = split_parts

def get_mae(max_leaf_nodes,train_X,val_X,train_y,val_y):
    """Return the validation mean absolute error of a decision tree.

    A DecisionTreeClassifier limited to ``max_leaf_nodes`` leaves (with a
    fixed ``random_state`` of 25) is fitted on ``train_X``/``train_y`` and
    scored on ``val_X`` against ``val_y``. For 0/1 labels the mean absolute
    error equals the fraction of misclassified rows.
    """
    tree = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes, random_state=25)
    val_predictions = tree.fit(train_X, train_y).predict(val_X)
    return mean_absolute_error(val_y, val_predictions)

def get_mae_rf(max_leaf_nodes,train_X,val_X,train_y,val_y):
    """Return the validation mean absolute error of a random forest.

    A RandomForestClassifier whose trees are limited to ``max_leaf_nodes``
    leaves (with a fixed ``random_state`` of 25) is fitted on
    ``train_X``/``train_y`` and scored on ``val_X`` against ``val_y``. For
    0/1 labels the mean absolute error equals the fraction of misclassified
    rows.
    """
    forest = RandomForestClassifier(max_leaf_nodes=max_leaf_nodes, random_state=25)
    val_predictions = forest.fit(train_X, train_y).predict(val_X)
    return mean_absolute_error(val_y, val_predictions)



# Candidate max_leaf_nodes values: 2 through 101 inclusive
# (a tree needs at least two leaves).
leaf_node_options = range(2, 102)

# Validation error for each candidate, kept in the same order as
# leaf_node_options. The names avoid shadowing the builtin `list`.
tree_maes = []
forest_maes = []

for n_leaves in leaf_node_options:
    mae = get_mae(n_leaves, train_X, val_X, train_y, val_y)
    tree_maes.append(mae)
    print(mae, "the number is: ", n_leaves)

# np.argmin gives a position in the results; look it up in leaf_node_options
# to report the actual max_leaf_nodes value rather than the raw position.
best_tree_leaves = leaf_node_options[int(np.argmin(tree_maes))]
print(min(tree_maes), "at index:", best_tree_leaves)

for n_leaves in leaf_node_options:
    mae = get_mae_rf(n_leaves, train_X, val_X, train_y, val_y)
    forest_maes.append(mae)
    print(mae, "the number is: ", n_leaves)

# Final side-by-side summary: decision tree first, then random forest.
best_forest_leaves = leaf_node_options[int(np.argmin(forest_maes))]
print(min(tree_maes), "at index:", best_tree_leaves)
print(min(forest_maes), " at index: ", best_forest_leaves)

## The Random Forest model had the lowest mean absolute error, with 36 leaf nodes; create and train that model

In [None]:
# Random forest using the max_leaf_nodes value chosen in the search above.
rf_model = RandomForestClassifier(max_leaf_nodes=36, random_state=25)

# First fit on the training split only, so the validation error can be reported.
rf_model.fit(train_X, train_y)
rf_prediction = rf_model.predict(val_X)
# Arguments are in (y_true, y_pred) order, matching get_mae / get_mae_rf.
rf_mae = mean_absolute_error(val_y, rf_prediction)
print("Validation MAE:", rf_mae)

# Refit on all labelled data before predicting on the test set.
rf_model.fit(X, Y)

In [None]:
# Predict survival for the test passengers using the training feature columns.
test_X = test_data.loc[:, feature_names]
predictions = rf_model.predict(test_X)

In [None]:
# Build the submission file: one row per test passenger with the predicted label.
# The id column must be spelled "PassengerId" (lower-case "d") to match the
# header of the competition data files.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")