In [146]:
import pandas as pd
import matplotlib.pyplot as plt

In [147]:
df = pd.read_csv('train.csv')
df.shape

(891, 12)

In [148]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [149]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [150]:
df['Age'].fillna(df['Age'].median(), inplace=True)
df.drop('Cabin', axis='columns', inplace=True)
df.dropna(inplace=True)
df.shape, df.isna().sum()

((889, 11),
 PassengerId    0
 Survived       0
 Pclass         0
 Name           0
 Sex            0
 Age            0
 SibSp          0
 Parch          0
 Ticket         0
 Fare           0
 Embarked       0
 dtype: int64)

In [151]:
#df.drop(['Name', 'Ticket', 'Embarked', 'PassengerId'], axis='columns', inplace=True)
df.drop(['Name', 'Ticket', 'PassengerId'], axis='columns', inplace=True)

In [152]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [153]:
df['Fare'].describe()

count    889.000000
mean      32.096681
std       49.697504
min        0.000000
25%        7.895800
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [154]:
def get_MAD(series, median):
    diff = abs(series - median)
    return diff.median()

fare_median = df['Fare'].median()
MAD = get_MAD(df['Fare'], fare_median)

upper_fare_limit = fare_median + ((3.5 * MAD) / 0.6745)
lower_fare_limit = fare_median + ((-3.5 * MAD) / 0.6745)

fare_outliers = df[(df['Fare'] < lower_fare_limit) | (df['Fare'] > upper_fare_limit)]

df.drop(fare_outliers.index, axis='index', inplace=True)

fare_outliers.shape, df.shape

((158, 8), (731, 8))

In [155]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'])

In [156]:
X = df.drop(['Survived'], axis='columns')
y = df['Survived']

In [157]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
2,3,0,26.0,0,0,7.925,2
4,3,1,35.0,0,0,8.05,2
5,3,1,28.0,0,0,8.4583,1
7,3,1,2.0,3,1,21.075,2


In [158]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=X['Pclass'])

In [159]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

In [160]:
model_params = {
    'SVM': {
        'model': SVC(),
        'params': {
            'svc__C': [1e-4, 1e-3, 1e-2, 1e-1, 1, 2],
            'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(),
        'params': {
            'randomforestclassifier__n_estimators': [100, 125, 150, 175, 200],
            'randomforestclassifier__criterion': ['gini', 'entropy', 'log_loss']
        }
    },
    'Ridge': {
        'model': RidgeClassifier(),
        'params': {
            'ridgeclassifier__alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 2]
        }
    },
    'Logistic Regression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'logisticregression__C': [1e-4, 1e-3, 1e-2, 1e-1, 1, 2],
            'logisticregression__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky']
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'kneighborsclassifier__n_neighbors': [5, 10 ,15],
            'kneighborsclassifier__weights': ['uniform', 'distance']
        }
    },
    'Gaussian NB': {
        'model': GaussianNB(),
        'params': {}
    }
}

In [161]:
scores = []
best_estimators = {}

for mn, mp in model_params.items():
    pipe = make_pipeline(MinMaxScaler(), mp['model'])
    gscv = GridSearchCV(pipe, mp['params'], cv=5, return_train_score=False)
    gscv.fit(X_train, y_train)

    scores.append({
        'Model': mn,
        'Best Params': gscv.best_params_,
        'Best Score': gscv.best_score_
    })
    best_estimators[mn] = gscv.best_estimator_

pd.DataFrame(scores, columns=['Model', 'Best Params', 'Best Score'])

Unnamed: 0,Model,Best Params,Best Score
0,SVM,"{'svc__C': 1, 'svc__kernel': 'poly'}",0.803053
1,Random Forest,{'randomforestclassifier__criterion': 'entropy...,0.821268
2,Ridge,{'ridgeclassifier__alpha': 1},0.790242
3,Logistic Regression,"{'logisticregression__C': 0.1, 'logisticregres...",0.79573
4,KNN,"{'kneighborsclassifier__n_neighbors': 15, 'kne...",0.806656
5,Gaussian NB,{},0.779299


In [162]:
best_estimators

{'SVM': Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                 ('svc', SVC(C=1, kernel='poly'))]),
 'Random Forest': Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                 ('randomforestclassifier',
                  RandomForestClassifier(criterion='entropy',
                                         n_estimators=125))]),
 'Ridge': Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                 ('ridgeclassifier', RidgeClassifier(alpha=1))]),
 'Logistic Regression': Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                 ('logisticregression',
                  LogisticRegression(C=0.1, max_iter=1000))]),
 'KNN': Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                 ('kneighborsclassifier', KNeighborsClassifier(n_neighbors=15))]),
 'Gaussian NB': Pipeline(steps=[('minmaxscaler', MinMaxScaler()), ('gaussiannb', GaussianNB())])}

In [163]:
best_estimators['SVM'].score(X_test, y_test)

0.8415300546448088

In [164]:
best_estimators['Random Forest'].score(X_test, y_test)

0.825136612021858

In [165]:
best_estimators['Ridge'].score(X_test, y_test)

0.8032786885245902

In [166]:
best_estimators['Logistic Regression'].score(X_test, y_test)

0.8087431693989071

In [167]:
best_estimators['KNN'].score(X_test, y_test)

0.825136612021858

In [168]:
best_estimators['Gaussian NB'].score(X_test, y_test)

0.7814207650273224

In [184]:
best_model = best_estimators['SVM']

In [195]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [196]:
df_output = pd.DataFrame()
df_output['PassengerId'] = df_test['PassengerId']
df_output.head()

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896


In [197]:
df_test.drop(['Name', 'Ticket', 'PassengerId', 'Cabin'], axis='columns', inplace=True)
df_test.isna().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [198]:
df_test['Age'].fillna(df_test['Age'].median(), inplace=True)
df_test['Fare'].fillna(df['Fare'].mean(), inplace=True)

In [199]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [200]:
df_test['Sex'] = le.fit_transform(df_test['Sex'])
df_test['Embarked'] = le.fit_transform(df_test['Embarked'])
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,7.8292,1
1,3,0,47.0,1,0,7.0,2
2,2,1,62.0,0,0,9.6875,1
3,3,1,27.0,0,0,8.6625,2
4,3,0,22.0,1,1,12.2875,2


In [201]:
df_output['Survived'] = best_model.predict(df_test)
df_output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0


In [203]:
df_output.to_csv('output.csv', index=False)