In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DataConversionWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
train = pd.read_csv('data/train.csv')

train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
test = pd.read_csv('data/test.csv')

test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
def extract_title(df):
    df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

    df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df['Title'] = df['Title'].replace(['Countess', 'Mme', 'Lady'], 'Mrs')

    other_male = ['Col', 'Capt', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir', 'Dr']
    df['Title'] = df['Title'].replace(other_male, 'Other Male')

    df.loc[(df['Sex'] == 'female') & (df['Title'] == 'Other Male'), 'Title'] = 'Mrs'

    df['Title'] = df['Title'].map({'Mrs': 0, 'Miss': 1, 'Mr': 2, 'Master': 3, 'Other Male': 4})
    
    return df

In [9]:
def infer_age(df):
    age_inference = np.zeros((3,5))
    pclasses = [1, 2, 3]
    titles = [0, 1, 2, 3, 4]

    for pclass in pclasses:
        for title in titles:
            filtered_df = df[(df['Pclass'] == pclass) & (df['Title'] == title)]

            median = filtered_df['Age'].median()
            if not median:
                median = df[df['PClass'] == pclass]['Age'].median()
            if not median:
                median = df['Age'].median()

            print(f'{pclass}{title}')
            print(median)
            age_inference[pclass - 1, title] = int(median / 0.5 + 0.5 ) * 0.5

    for idx1, pclass in enumerate(pclasses):
        for idx2, title in enumerate(titles):
            df.loc[(df['Age'].isnull()) & (df['Sex'] == idx1) & (df['Pclass'] == idx2 + 1), 'Age'] = guess_ages[idx1,idx2]

    return df
        

In [7]:
most_common_city = train['Embarked'].value_counts().idxmax()

def data_preparation(df):
    df = extract_title(df)
    
    columns_to_drop = ['Survived', 'Name', 'Ticket', 'Cabin']
    
    for column in columns_to_drop:
        if column in df.columns:
            df= df.drop(column, axis='columns')
    
    df['Embarked'] = df['Embarked'].fillna(most_common_city) # fill Nan values in Embarked column with the most common embark city

    mean_fare = df['Fare'].mean() 
    df['Fare'] = df['Fare'].fillna(mean_fare)

    df.loc[:,'Sex'] = df.loc[:,'Sex'].map({'male': 0,'female': 1})
    df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

    df = infer_age(df)

    return df

In [10]:
x_train, y_train = data_preparation(train), train[['Survived']]

print(x_train.shape)
print(y_train.shape)

x_train

10
41.5
11
30.0
12
40.0
13
4.0
14
49.0
20
32.0
21
24.0
22
31.0
23
1.0
24
46.5
30
31.0
31
18.0
32
26.0
33
4.0
34
nan


ValueError: cannot convert float NaN to integer

In [None]:
x_train.info()

### Grid Search for the best Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],   
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'] 
}

clf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='f1_macro')

grid_search.fit(x_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
random_forest = grid_search.best_estimator_

random_forest

### Grid Search for the best Stochastic Gradient Descent Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

param_grid = {
    'loss': ['log_loss', 'modified_huber', 'squared_hinge', 'perceptron'],
    'penalty': ['l2', 'l1', 'elasticnet'], 
    'alpha': [0.0001, 0.001, 0.01, 0.1], 
    'max_iter': [1000, 3000, 5000],
    'tol': [1e-3, 1e-4, 1e-5],  
}

clf = SGDClassifier(random_state=42)

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='f1_macro')

grid_search.fit(x_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
sgd = grid_search.best_estimator_

sgd

### Grid Search for the best K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

param_grid = {
    'n_neighbors': [3, 5, 7, 9],  
    'weights': ['uniform', 'distance'],  
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  
    'p': [1, 2]
}


clf = KNeighborsClassifier()

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='f1_macro')

grid_search.fit(x_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
knn = grid_search.best_estimator_

knn

### Grid Search for the best Multi-layer Perceptron

In [None]:
from sklearn.neural_network import MLPClassifier

param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['relu', 'tanh', 'logistic'], 
    'solver': ['adam', 'sgd'],  
    'alpha': [0.0001, 0.001, 0.01],  
}



clf = MLPClassifier(random_state=42)

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='f1_macro')

grid_search.fit(x_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
mlp = grid_search.best_estimator_

mlp

### Performing Ensemble method

In [None]:
from sklearn.ensemble import VotingClassifier

# weighted voting, with proportions 5:3:2 based on obtained accuracies 
estimators_weighted = [
    ('random_forest1', random_forest),
    ('random_forest2', random_forest),
    ('random_forest3', random_forest),
    ('random_forest4', random_forest),
    ('random_forest5', random_forest),
    ('mlp1', mlp),
    ('mlp2', mlp),
    ('mlp3', mlp),
    ('sgd1', sgd),
    ('sgd2', sgd)
]

ensemble = VotingClassifier(estimators_weighted, voting='soft')

In [None]:
test = data_preparation(test)

test.info()

In [None]:
x_train = data_preparation(train)

x_train.info()

In [None]:
from sklearn.metrics import f1_score, confusion_matrix

ensemble.fit(x_train, y_train)

y_pred = ensemble.predict(test)
y_pred_train = ensemble.predict(x_train)

y_pred_mlp = mlp.predict(test)
y_pred_result = random_forest.predict(test)
y_pred_sgd = sgd.predict(test)

print(f1_score(y_train, y_pred_train, average='macro'))
confusion_matrix(y_train, y_pred_train)

In [None]:
y_pred

In [None]:
def save_predictions(y_pred, name):
    y_pred_series = pd.Series(y_pred, index=test['PassengerId'])
    
    y_pred_df = y_pred_series.to_frame().reset_index()

    y_pred_df.rename(columns={0:'Survived'}, inplace=True)

    y_pred_df.to_csv(name, index=False)

### Save results to the CSV file

In [None]:
save_predictions(y_pred, 'result_ensemble.csv')
save_predictions(y_pred_mlp, 'result_mlp.csv')
save_predictions(y_pred_result, 'result_random_forest.csv')
save_predictions(y_pred_sgd, 'result_sgd.csv')