# &emsp;Modules

In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.pipeline import make_pipeline

In [2]:
from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import OneClassSVM
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier    
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB  
from sklearn.neighbors import NearestCentroid
from sklearn.svm import NuSVC
from sklearn.linear_model import Perceptron
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC

# Understanding Data

## &emsp;Importing Data

### &emsp;&emsp;Submission Sample

In [3]:
# Load the sample submission shipped with the data set.
# The builtin `bool` is used for the Survived column: the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
submission_sample = pd.read_csv('data/gender_submission.csv', dtype={'PassengerId': np.uint16, 'Survived': bool})
submission_sample.head(5)

Unnamed: 0,PassengerId,Survived
0,892,False
1,893,True
2,894,False
3,895,False
4,896,True


### &emsp;&emsp;Test Data

In [4]:
# Load the test set with compact, explicit column dtypes
# (free-text columns stay as generic Python objects).
test_data = pd.read_csv(
    'data/test.csv',
    dtype=dict(
        PassengerId=np.uint16,
        Pclass=np.uint8,
        Name='object',
        Sex='object',
        Age=np.float32,
        SibSp=np.uint8,
        Parch=np.uint8,
        Ticket='object',
        Fare=np.float32,
        Cabin='object',
        Embarked='object',
    ),
)
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### &emsp;&emsp;Train Data

In [8]:
# Load the labelled training set with compact, explicit column dtypes.
# `Survived` uses the builtin `bool`: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
original_train_data = pd.read_csv('data/train.csv', dtype={'PassengerId': np.uint16,
                                                'Survived': bool,
                                                'Pclass': np.uint8,
                                                'Name': 'object',
                                                'Sex': 'object',
                                                'Age': np.float32,
                                                'SibSp': np.uint8,
                                                'Parch': np.uint8,
                                                'Ticket': 'object',
                                                'Fare': np.float32,
                                                'Cabin': 'object',
                                                'Embarked': 'object'})
original_train_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,False,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,True,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283302,C85,C
2,3,True,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,True,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.099998,C123,S
4,5,False,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Work on a copy so the raw frame stays untouched and this cell can be re-run.
train_data = original_train_data.copy()
# Binary-encode Sex: 'male' -> 0, every other value (including missing) -> 1.
train_data['Sex'] = train_data['Sex'].ne('male').astype('int64')

## &emsp;Analysing Data

### &emsp;&emsp;Balance

In [11]:
# Number of passengers per outcome class in the training data.
train_data.groupby('Survived')[['PassengerId']].count()

Unnamed: 0_level_0,PassengerId
Survived,Unnamed: 1_level_1
False,549
True,342


#### &emsp;&emsp;&emsp;Undersampling Majority Class

In [12]:
# Randomly drop rows of the majority class until both classes have the same size.
rus = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_resampled, y_X_resampled = rus.fit_resample(
    train_data.drop(columns='Survived'),
    train_data['Survived'],
)
# Re-attach the label column so the result has the same layout as train_data.
undersampled_train_data = pd.concat((X_resampled, y_X_resampled), axis='columns')

In [13]:
# Class counts after undersampling (both classes should now match).
undersampled_train_data.groupby('Survived')[['PassengerId']].count()

Unnamed: 0_level_0,PassengerId
Survived,Unnamed: 1_level_1
False,342
True,342


#### &emsp;&emsp;&emsp;Oversampling Minority Class

In [14]:
# Randomly duplicate rows of the minority class until both classes have the same size.
ros = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_resampled, y_X_resampled = ros.fit_resample(
    train_data.drop(columns='Survived'),
    train_data['Survived'],
)
# Re-attach the label column so the result has the same layout as train_data.
oversampled_train_data = pd.concat((X_resampled, y_X_resampled), axis='columns')

In [15]:
# Class counts after oversampling (both classes should now match).
oversampled_train_data.groupby('Survived')[['PassengerId']].count()

Unnamed: 0_level_0,PassengerId
Survived,Unnamed: 1_level_1
False,549
True,549


# MODELS

## Supervised

In [16]:
# Class names of every candidate estimator to benchmark. Each name is later
# resolved to the imported class through the notebook's global namespace.
models = [
    'ExtraTreeClassifier',
    'DecisionTreeClassifier',
    'OneClassSVM',
    'MLPClassifier',
    'RadiusNeighborsClassifier',
    'KNeighborsClassifier',
    'ClassifierChain',
    'MultiOutputClassifier',
    'OutputCodeClassifier',
    'OneVsOneClassifier',
    'OneVsRestClassifier',
    'SGDClassifier',
    'RidgeClassifierCV',
    'RidgeClassifier',
    'PassiveAggressiveClassifier',
    'GaussianProcessClassifier',
    'VotingClassifier',
    'AdaBoostClassifier',
    'GradientBoostingClassifier',
    'BaggingClassifier',
    'ExtraTreesClassifier',
    'RandomForestClassifier',
    'BernoulliNB',
    'CalibratedClassifierCV',
    'GaussianNB',
    'LabelPropagation',
    'LabelSpreading',
    'LinearDiscriminantAnalysis',
    'LinearSVC',
    'LogisticRegression',
    'LogisticRegressionCV',
    'MultinomialNB',
    'NearestCentroid',
    'NuSVC',
    'Perceptron',
    'QuadraticDiscriminantAnalysis',
    'SVC',
]

In [83]:
# NOTE(review): within the visible notebook, X is first assigned in the
# model-comparison cell that follows, so on a fresh kernel (Restart & Run All)
# this cell raises NameError. Consider moving it below that cell.
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
312,2,26.000000,1,1,26.000000
124,1,54.000000,0,1,77.287498
783,3,29.923729,1,2,23.450001
167,3,45.000000,1,4,27.900000
772,2,57.000000,0,0,10.500000
...,...,...,...,...,...
875,3,15.000000,0,0,7.225000
879,1,56.000000,0,1,83.158302
880,2,25.000000,0,1,26.000000
887,1,19.000000,0,0,30.000000


In [84]:
def _evaluate_model(model_name, X_train, X_test, y_train, y_test):
    """Fit one candidate classifier behind a MinMaxScaler and score it on the hold-out split.

    Parameters
    ----------
    model_name : str
        Name of an estimator class available in the notebook's global namespace.
    X_train, X_test, y_train, y_test
        The four pieces returned by ``train_test_split``.

    Returns
    -------
    list
        ``[model_name, accuracy, error]``. On success ``error`` is ``''``.
        If the estimator cannot be constructed, fitted or scored, ``accuracy``
        is NaN and ``error`` holds the exception. A failure is never scored
        with predictions left over from another model.
    """
    try:
        # The pipeline is always trained through fit(); the former
        # partial_fit / fit_transform / fit_predict fallbacks were only
        # reachable for an estimator without a fit method.
        clf = make_pipeline(MinMaxScaler(), globals()[model_name]())
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Some candidates emit -1 as a label (presumably the outlier detector
        # OneClassSVM - confirm); fold it into the negative class before scoring.
        if -1 in y_pred:
            y_pred[y_pred == -1] = 0

        return [model_name, accuracy_score(y_test, y_pred), '']
    except Exception as e:
        # Deliberate best-effort: keep benchmarking the remaining models, but
        # record NaN so the failure is visible and cannot enter the ranking.
        return [model_name, np.nan, e]


# Feature preparation and the split do not depend on the run or the model, so
# they are built once. Only numeric columns are kept; identifier and label are dropped.
X = undersampled_train_data.select_dtypes(exclude='object').drop(columns=['Survived', 'PassengerId'])
y = undersampled_train_data['Survived']

# Missing ages are replaced by the mean age.
X['Age'] = X['Age'].fillna(X['Age'].mean())

# The split seed is constant, so the repeated runs below differ only through
# each estimator's own randomness.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for i in range(10):
    history = [_evaluate_model(model, X_train, X_test, y_train, y_test) for model in models]
    run_df = pd.DataFrame(history, columns=['model', 'accuracy', 'error'])

    if i == 0:
        # The first run defines the row order and carries the error column.
        acc_df = run_df.sort_values(by='accuracy', ascending=False)
    else:
        # Later runs contribute one extra accuracy column each: accuracy_1, accuracy_2, ...
        acc_df = pd.merge(acc_df, run_df[['model', 'accuracy']], on='model', how='left', suffixes=('', f'_{i}'))



In [87]:
# Index the accuracy table by model name. Reassigning (instead of inplace=True)
# and guarding on the column keeps this cell safe to re-run: a second in-place
# call would raise KeyError because 'model' is already the index.
if 'model' in acc_df.columns:
    acc_df = acc_df.set_index('model')

In [90]:
# Summarise each model's accuracy over all runs (row-wise mean and standard
# deviation) and keep the five models with the best mean.
top5_models = (
    acc_df
    .drop(columns=['error'])
    .agg(['mean', 'std'], axis=1)
    .sort_values(by='mean', ascending=False)
    .head(5)
)

In [91]:
# Display the five models with the highest mean accuracy and their spread across runs.
top5_models

Unnamed: 0_level_0,mean,std
model,Unnamed: 1_level_1,Unnamed: 2_level_1
BaggingClassifier,0.79562,0.01685694
GradientBoostingClassifier,0.79562,0.0
KNeighborsClassifier,0.788321,1.170278e-16
RandomForestClassifier,0.786861,0.01130798
NuSVC,0.773723,1.170278e-16


In [None]:
# Keyword arguments per model abbreviation; 'BC' configures bagging over depth-1 trees.
# NOTE(review): newer scikit-learn releases renamed `base_estimator` to
# `estimator` - confirm against the installed version before using this dict.
dict_kwargs = {
    'BC': dict(
        base_estimator=DecisionTreeClassifier(max_depth=1),
        n_estimators=100,
    ),
}

In [94]:
# Refit the bagging model on the training split and predict the competition test set.
# A fixed random_state makes the bootstrap draws, and so the submission, reproducible.
clf = make_pipeline(MinMaxScaler(), BaggingClassifier(random_state=42))
clf.fit(X_train, y_train)

# Encode Sex on a derived frame instead of overwriting test_data: the in-place
# version was not re-runnable (a second execution mapped every value to 1).
test_features = test_data.assign(Sex=test_data['Sex'].ne('male').astype('int64'))[X.columns]

# Apply the same treatment the training features received: fill missing values
# with the training-column means, so the model never sees NaN at predict time.
test_features = test_features.fillna(X.mean())

y_pred = clf.predict(test_features)

In [100]:
# Assemble the submission frame: one row per test passenger with the predicted label.
# The labels were loaded as booleans, so the predictions are booleans too and
# would be written as True/False; cast them to 0/1 integers.
# NOTE(review): presumably the competition expects 0/1 - confirm against the sample file.
submission = test_data[['PassengerId']].assign(Survived=np.asarray(y_pred).astype(int))

In [102]:
# Write the predictions to submission.csv, omitting the DataFrame index column.
submission.to_csv('submission.csv', index=False)