In [381]:
import pandas as pd
import matplotlib.pyplot as plt

In [382]:
df = pd.read_csv('train.csv')
df.shape

(891, 12)

In [383]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [384]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [385]:
df['Age'].fillna(df['Age'].median(), inplace=True)
df.dropna(inplace=True)
df.shape, df.isna().sum()

((202, 12),
 PassengerId    0
 Survived       0
 Pclass         0
 Name           0
 Sex            0
 Age            0
 SibSp          0
 Parch          0
 Ticket         0
 Fare           0
 Cabin          0
 Embarked       0
 dtype: int64)

In [386]:
#df.drop(['Name', 'Ticket', 'Embarked', 'PassengerId', 'Cabin'], axis='columns', inplace=True)
#df.drop(['Name', 'Ticket', 'Embarked', 'PassengerId', 'Fare', 'Cabin'], axis='columns', inplace=True)
df.drop(['Name', 'Ticket', 'PassengerId', 'Cabin'], axis='columns', inplace=True)

In [387]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1,1,1,female,38.0,1,0,71.2833,C
3,1,1,female,35.0,1,0,53.1,S
6,0,1,male,54.0,0,0,51.8625,S
10,1,3,female,4.0,1,1,16.7,S
11,1,1,female,58.0,0,0,26.55,S


In [388]:
df['Fare'].describe()

count    202.000000
mean      76.103301
std       74.759941
min        0.000000
25%       28.959375
50%       55.000000
75%       89.776050
max      512.329200
Name: Fare, dtype: float64

In [389]:
def get_MAD(series, median):
    diff = abs(series - median)
    return diff.median()

fare_median = df['Fare'].median()
MAD = get_MAD(df['Fare'], fare_median)

upper_fare_limit = fare_median + ((3.5 * MAD) / 0.6745)
lower_fare_limit = fare_median + ((-3.5 * MAD) / 0.6745)

fare_outliers = df[(df['Fare'] < lower_fare_limit) | (df['Fare'] > upper_fare_limit)]

df.drop(fare_outliers.index, axis='index', inplace=True)

fare_outliers.shape, df.shape

((17, 8), (185, 8))

In [390]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'])

In [391]:
X = df.drop(['Survived'], axis='columns')
y = df['Survived']

In [392]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1,1,0,38.0,1,0,71.2833,0
3,1,0,35.0,1,0,53.1,2
6,1,1,54.0,0,0,51.8625,2
10,3,0,4.0,1,1,16.7,2
11,1,0,58.0,0,0,26.55,2


In [393]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=X['Pclass'])

In [394]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

In [395]:
model_params = {
    'SVM': {
        'model': SVC(),
        'params': {
            'svc__C': [1e-4, 1e-3, 1e-2, 1e-1, 1, 2],
            'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(),
        'params': {
            'randomforestclassifier__n_estimators': [100, 125, 150, 175, 200],
            'randomforestclassifier__criterion': ['gini', 'entropy', 'log_loss']
        }
    },
    'Ridge': {
        'model': RidgeClassifier(),
        'params': {
            'ridgeclassifier__alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 3, 4, 5]
        }
    },
    'Logistic Regression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'logisticregression__C': [1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 3, 4, 5],
            'logisticregression__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky']
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'kneighborsclassifier__n_neighbors': [5, 10, 15, 20, 25, 30 ,35],
            'kneighborsclassifier__weights': ['uniform', 'distance']
        }
    },
    'Gaussian NB': {
        'model': GaussianNB(),
        'params': {}
    }
}

In [396]:
scores = []
best_estimators = {}

for mn, mp in model_params.items():
    pipe = make_pipeline(MinMaxScaler(), mp['model'])
    gscv = GridSearchCV(pipe, mp['params'], cv=5, return_train_score=False)
    gscv.fit(X_train, y_train)

    scores.append({
        'Model': mn,
        'Best Params': gscv.best_params_,
        'Best Score': gscv.best_score_
    })
    best_estimators[mn] = gscv.best_estimator_

pd.DataFrame(scores, columns=['Model', 'Best Params', 'Best Score'])

Unnamed: 0,Model,Best Params,Best Score
0,SVM,"{'svc__C': 0.1, 'svc__kernel': 'poly'}",0.738095
1,Random Forest,{'randomforestclassifier__criterion': 'entropy...,0.679894
2,Ridge,{'ridgeclassifier__alpha': 4},0.687302
3,Logistic Regression,"{'logisticregression__C': 2, 'logisticregressi...",0.694709
4,KNN,"{'kneighborsclassifier__n_neighbors': 30, 'kne...",0.744974
5,Gaussian NB,{},0.680159


In [397]:
best_estimators

{'SVM': Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                 ('svc', SVC(C=0.1, kernel='poly'))]),
 'Random Forest': Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                 ('randomforestclassifier',
                  RandomForestClassifier(criterion='entropy',
                                         n_estimators=200))]),
 'Ridge': Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                 ('ridgeclassifier', RidgeClassifier(alpha=4))]),
 'Logistic Regression': Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                 ('logisticregression', LogisticRegression(C=2, max_iter=1000))]),
 'KNN': Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                 ('kneighborsclassifier', KNeighborsClassifier(n_neighbors=30))]),
 'Gaussian NB': Pipeline(steps=[('minmaxscaler', MinMaxScaler()), ('gaussiannb', GaussianNB())])}

In [398]:
best_estimators['SVM'].score(X_test, y_test)

0.7872340425531915

In [399]:
best_estimators['Random Forest'].score(X_test, y_test)

0.7872340425531915

In [400]:
best_estimators['Ridge'].score(X_test, y_test)

0.8297872340425532

In [401]:
best_estimators['Logistic Regression'].score(X_test, y_test)

0.7872340425531915

In [402]:
best_estimators['KNN'].score(X_test, y_test)

0.7659574468085106

In [403]:
best_estimators['Gaussian NB'].score(X_test, y_test)

0.7872340425531915

In [413]:
best_model = best_estimators['Ridge']

In [414]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [415]:
df_output = pd.DataFrame()
df_output['PassengerId'] = df_test['PassengerId']
df_output.head()

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896


In [416]:
df_test.drop(['Name', 'Ticket', 'PassengerId', 'Cabin'], axis='columns', inplace=True)
df_test.isna().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [417]:
df_test['Age'].fillna(df_test['Age'].median(), inplace=True)
df_test['Fare'].fillna(df['Fare'].mean(), inplace=True)

In [418]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [419]:
df_test['Sex'] = le.fit_transform(df_test['Sex'])
df_test['Embarked'] = le.fit_transform(df_test['Embarked'])
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,7.8292,1
1,3,0,47.0,1,0,7.0,2
2,2,1,62.0,0,0,9.6875,1
3,3,1,27.0,0,0,8.6625,2
4,3,0,22.0,1,1,12.2875,2


In [420]:
df_output['Survived'] = best_model.predict(df_test)
df_output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [421]:
df_output.to_csv('output.csv', index=False)