# Titanic

-----

Import Cell

In [89]:
import pandas as pd
import sys
import string
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

Checking versions

In [72]:
print("Python - " + str(sys.version) )
print("Pandas - " + str(pd.__version__))
print("SKLearn - " + str(sklearn.__version__))

Python - 3.11.4 | packaged by Anaconda, Inc. | (main, Jul  5 2023, 13:38:37) [MSC v.1916 64 bit (AMD64)]
Pandas - 1.5.3
SKLearn - 1.3.0


Import training Dataset

In [73]:
df = pd.read_csv('Datasets/train.csv')
print(df.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


----
## Feature Engineering

### 1. Titles

Converting the names to their respective titles.

Fetch all the unique titles from the dataset

In [74]:
all_titles = []

for name_string in df["Name"] :
    comma_pos = name_string.find(',')
    dot_pos = name_string.find('.')
    title = name_string[comma_pos + 2:dot_pos]
    if title not in all_titles :
        all_titles.append(title)

print(all_titles)

['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms', 'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess', 'Jonkheer']


In [75]:
# Function to return the title for a given name
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if big_string.find(substring) != -1:
            return substring
    # print(big_string)
    return np.nan

In [76]:
# Evalute the above function to each row and add it as a column

df['Title']=df['Name'].map(lambda x: substrings_in_string(x, all_titles))
 
#replacing all titles with mr, mrs, miss, master
def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
df['Title']=df.apply(replace_titles, axis=1)
print(df.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked Title  
0      0         A/5 21171   7.2500   NaN        S    Mr  
1      0          PC 17599  71.2833   C85        C    Mr  
2      0  STON/O2. 3101282   7.9250   NaN        S  Miss  
3      0            113803  53.1000  C123        S    Mr  
4      0            37345

In [77]:
# Turning cabin number into Deck
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Nan']

df['Deck']=df['Cabin'].map(lambda x: substrings_in_string(str(x), cabin_list))
print(df['Deck'].head())

0    NaN
1      C
2    NaN
3      C
4    NaN
Name: Deck, dtype: object


In [78]:
# Creating new family_size column
df['Family_Size']=df['SibSp']+df['Parch']

# Since age and class are both numbers we can just multiply them.
df['Age*Class']=df['Age']*df['Pclass']

# Here we divide the fare by the number of family members traveling together, I’m not exactly sure what this represents, but it’s easy enough to add in.

df['Fare_Per_Person']=df['Fare']/(df['Family_Size']+1)

In [86]:
# Dropping the first column

df = df.drop('PassengerId', axis=1)
df = df.drop('Name', axis=1)
df = df.drop('Ticket', axis=1)

In [90]:
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

In [91]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,Deck,Family_Size,Age*Class,Fare_Per_Person
0,0,3,1,22.0,1,0,7.25,147,2,2,8,1,66.0,3.625
1,1,1,0,38.0,1,0,71.2833,81,0,2,2,1,38.0,35.64165
2,1,3,0,26.0,0,0,7.925,147,2,1,8,0,78.0,7.925
3,1,1,0,35.0,1,0,53.1,55,2,2,2,1,35.0,26.55
4,0,3,1,35.0,0,0,8.05,147,2,2,8,0,105.0,8.05


---

## Implement Decision Tree

In [92]:
X = df.iloc[:, 1:]
Y = df.iloc[:, 0]

In [93]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [98]:
clf = DecisionTreeClassifier()
clf.fit(X_train, Y_train)

In [100]:
Y_pred = clf.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7932960893854749
