In [None]:
import numpy as np
import pandas as pd

# Load the raw training CSV (relative path) into `train_df`.
train_df = pd.read_csv(filepath_or_buffer='../../data/titanic/raw/train.csv')

# Preview the first five rows (5 is the default for DataFrame.head()).
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
def parse_cabin(string):
    """Parse a raw ``Cabin`` value into a list of ``{'deck', 'room'}`` dicts.

    The value is split on whitespace and each token is expected to be a
    one-character deck followed by an integer room number (e.g. ``'C85'``).

    Parameters
    ----------
    string : str or missing
        Raw cabin text such as ``'C85'`` or ``'C23 C25 C27'``. Any value for
        which ``pd.isna`` is true is treated as "no cabin recorded".

    Returns
    -------
    list of dict
        One ``{'deck': str, 'room': int}`` entry per token, in input order.
        Returns ``[]`` for a missing value, and also for any value that
        contains a token whose room part is not an integer (e.g.
        ``'F G67'``) -- the whole value is discarded, not just the bad token.
    """
    if pd.isna(string):
        return []
    result = []
    for token in string.split():
        try:
            # int('') raises ValueError, so a bare deck letter such as the
            # 'F' in 'F G67' ends up in the except branch.
            room = int(token[1:])
        except ValueError:
            # Unexpected format: deliberately drop the entire cabin value.
            return []
        # str.split() never yields empty tokens, so token[0] always exists.
        result.append({'deck': token[0], 'room': room})
    return result

# def parse_ticket(string):
#     # remove all non numeric characters
#     return ''.join(filter(str.isdigit, string))

# --- Derived columns, added to train_df in place (order determines column order) ---

# SibSp + Parch + 1.
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1

# Family size relative to age: try to capture youngest member(s) of large families.
# Rows with a missing Age come out as NaN.
train_df['FamilySizeDivAge'] = train_df['FamilySize'] / train_df['Age']

# Text before the first comma of Name.
train_df['LastName'] = train_df['Name'].map(lambda full_name: full_name.split(',')[0])

# Parsed cabin tokens; an empty list means missing or unparseable Cabin.
train_df['Rooms'] = train_df['Cabin'].map(parse_cabin)

# Number of parsed rooms, or NaN when nothing was parsed (len 0 is falsy).
train_df['RoomCount'] = train_df['Rooms'].map(lambda rooms: len(rooms) or np.nan)

# Deck letter of the first parsed room, or NaN when nothing was parsed.
train_df['Deck'] = train_df['Rooms'].map(lambda rooms: rooms[0]['deck'] if rooms else np.nan)

# train_df['TicketNum'] = train_df['Ticket'].apply(parse_ticket)

# Count of rows that share the same Ticket value.
train_df['TicketGroupSize'] = train_df.groupby('Ticket')['Ticket'].transform('count')

# Fare relative to age -- exploratory, usefulness unverified.
train_df['FareDivAge'] = train_df['Fare'] / train_df['Age']

# Columns fed to the model. ('AgeMissing' was considered but is not computed.)
features = [
    'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch', 'FamilySize', 'FamilySizeDivAge',
    'LastName',
    'Ticket', 'TicketGroupSize',
    'Fare', 'FareDivAge',
    'Embarked',
    'Deck',
]

train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,FamilySizeDivAge,LastName,Rooms,RoomCount,Deck,TicketGroupSize,FareDivAge
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,2,0.090909,Braund,[],,,1,0.329545
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,0.052632,Cumings,"[{'deck': 'C', 'room': 85}]",1.0,C,1,1.875876
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1,0.038462,Heikkinen,[],,,1,0.304808
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,2,0.057143,Futrelle,"[{'deck': 'C', 'room': 123}]",1.0,C,2,1.517143
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1,0.028571,Allen,[],,,1,0.230000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,1,0.037037,Montvila,[],,,1,0.481481
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,1,0.052632,Graham,"[{'deck': 'B', 'room': 42}]",1.0,B,1,1.578947
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,4,,Johnston,[],,,2,
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1,0.038462,Behr,"[{'deck': 'C', 'room': 148}]",1.0,C,1,1.153846


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

# Encode the selected feature columns; pd.get_dummies expands the
# non-numeric ones into indicator columns and leaves numeric ones as-is.
X = pd.get_dummies(train_df[features])
# NOTE(review): X_test is built from train_df, so it is identical to X and is
# NOT a held-out test set. Replace with the encoded real test data (aligned to
# X's columns) before using it for predictions.
X_test = pd.get_dummies(train_df[features])

y = train_df['Survived']

# Seed the forest (same seed as the CV splitter below) so the reported
# baseline score is reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(rf, X, y, cv=cv, scoring='accuracy')
print(f'Mean Accuracy: {scores.mean()}, Standard Deviation: {scores.std()}')

Mean Accuracy: 0.8349883874207519, Standard Deviation: 0.02825174215450522


In [10]:
from sklearn.model_selection import RandomizedSearchCV

# Seeded, shuffled 5-fold stratified splitter and a seeded base estimator.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(random_state=42)

# Candidate values per hyper-parameter; the search draws combinations from these lists.
param_dist = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
    class_weight=[None, 'balanced', 'balanced_subsample'],
)

# Evaluate 50 sampled combinations by cross-validated accuracy, using all cores.
search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=cv,
    random_state=42,
    n_jobs=-1,
)

search.fit(X, y)
print(f'Best Score: {search.best_score_}, Best Parameters: {search.best_params_}')

Best Score: 0.8327663046889711, Best Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 20, 'class_weight': 'balanced', 'bootstrap': True}


In [4]:
# Hold out 20% of the rows for validation (seeded split).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest as well: without it the validation metrics below, and any
# feature importances read from rf_clf afterwards, change on every run.
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
clf_report = classification_report(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')
print('\nClassification Report:\n', clf_report)

Validation Accuracy: 0.8268156424581006

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.90      0.86       105
           1       0.84      0.72      0.77        74

    accuracy                           0.83       179
   macro avg       0.83      0.81      0.82       179
weighted avg       0.83      0.83      0.82       179



In [5]:
feature_names = X_train.columns
importances = rf_clf.feature_importances_

# Group each model column under the text before its first underscore.
# Assumes encoded columns follow pd.get_dummies' default '<column>_<value>'
# naming and that no source column name itself contains an underscore.
fi = pd.DataFrame({'feature': feature_names, 'importance': importances}).assign(
    group=lambda frame: frame['feature'].str.split('_').str[0]
)

# Total importance per source column, largest first.
(
    fi.groupby('group')['importance']
    .sum()
    .sort_values(ascending=False)
)

group
Ticket              0.222206
LastName            0.220792
Sex                 0.187025
FareDivAge          0.063446
Fare                0.057230
FamilySizeDivAge    0.046551
Age                 0.041643
Pclass              0.036156
Deck                0.030851
TicketGroupSize     0.026761
FamilySize          0.021330
Embarked            0.018948
SibSp               0.016175
Parch               0.010885
Name: importance, dtype: float64