# Titanic code

In [145]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


pd.set_option('display.max_columns', None)

## Data analysis

In [146]:
dataset = pd.read_csv("train.csv")
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [147]:
print("Dataset info: ")
dataset.info()

print("Dataset description: ")
dataset.describe()


Dataset info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
Dataset description: 


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [148]:
# Check null values of Age
dataset[dataset['Age'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


There are some age null values of Age. We could impute them, but that would probably bias the result. We will drop them

In [149]:
dataset = dataset.dropna(subset=['Age'])

There are alsosome missing values for cabin. This is probably because they didn't have any cabin. So we will just put it to zero

In [150]:
dataset.loc[:, 'Cabin'] = dataset['Cabin'].fillna(0)
dataset[dataset['Cabin'].isnull()]


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


We need to take care of the Name, Sex and Ticket. We will drop the Name and Ticket column (since that will prob. not have any influence in the result), and perform one-hot encoding in the Sex column.

The Cabin column is also problematic. We will split it into two: the cabin letter and the cabin number. We fill the null values with 0, and 

In [158]:
if 'Name' in dataset.columns: dataset = dataset.drop("Name", axis=1)
if 'Ticket' in dataset.columns: dataset = dataset.drop("Ticket", axis=1)

if 'Sex' in dataset.columns: dataset = pd.get_dummies(dataset, columns=["Sex"])

if 'Cabin' in dataset.columns:
    # Split the "Cabin" column into "Cabin_Letter" and "Cabin_Primary"
    dataset.loc[:, 'Num_Cabins'] = dataset['Cabin'].str.split().apply(lambda x: len(x) if isinstance(x, list) else 0)
    
    dataset.loc[:, 'Cabin_Letter'] = dataset['Cabin'].str.get(0)

    # Fix empty string
    dataset.loc[:,'Cabin_Primary'] = dataset['Cabin'].str.split().str[0].str.slice(start=1)
    

    dataset = pd.get_dummies(dataset, columns=['Cabin_Letter'], prefix='Cabin')

    dataset.loc[:, 'Cabin_Primary'] = dataset.loc[:, 'Cabin_Primary'].fillna(0)

    dataset = dataset.drop('Cabin', axis=1)

if 'Embarked' in dataset.columns:
    dataset = pd.get_dummies(dataset, columns=['Embarked'], prefix='Embarked')
    

# dataset.head()
dataset[dataset['Cabin_Primary'] == '']


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Num_Cabins,Cabin_Primary,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
75,76,0,3,25.0,0,0,7.65,0,1,2,,0,0,0,0,0,1,0,0,0,0,1
292,293,0,2,36.0,0,0,12.875,0,1,1,,0,0,0,1,0,0,0,0,1,0,0
327,328,1,2,36.0,0,0,13.0,1,0,1,,0,0,0,1,0,0,0,0,0,0,1
339,340,0,1,45.0,0,0,35.5,0,1,1,,0,0,0,0,0,0,0,1,0,0,1
473,474,1,2,23.0,0,0,13.7917,1,0,1,,0,0,0,1,0,0,0,0,1,0,0
699,700,0,3,42.0,0,0,7.65,0,1,2,,0,0,0,0,0,1,0,0,0,0,1
715,716,0,3,19.0,0,0,7.65,0,1,2,,0,0,0,0,0,1,0,0,0,0,1


## Model Training

In [152]:
X = dataset.drop("Survived", axis=1)  # Features (all columns except "Survived")
y = dataset["Survived"]  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [153]:
model = RandomForestClassifier(n_estimators=100)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(accuracy)

ValueError: could not convert string to float: ''