In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from collections import Counter
from sklearn.model_selection import GridSearchCV

In [None]:
import tensorflow as tf

# Ask TensorFlow to grow GPU memory on demand instead of reserving it up front.
# `dev` stays defined (possibly empty) so later cells can check for a GPU.
dev = tf.config.list_physical_devices('GPU')
for gpu in dev:
    # The flag is a boolean (the original passed the string 'True'), and it is
    # applied to every visible GPU rather than only the first one.
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
!nvidia-smi

In [None]:
# Read the train and test CSV files from the competition input folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)
# Both frames side by side, for code that wants to iterate over them.
combine = [train, test]
train.info()

In [None]:
train.head()

In [None]:
def detect_outliers(df, n, features):
    """
    Return the index labels of rows that are Tukey outliers in more than `n`
    of the given columns.

    A value is an outlier for a column when it lies below Q1 - 1.5 * IQR or
    above Q3 + 1.5 * IQR, where Q1/Q3 are that column's 25th/75th percentiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    n : int
        A row is reported only when it is an outlier in strictly more than
        `n` columns.
    features : iterable of str
        Names of the numeric columns to check.

    Returns
    -------
    list
        Index labels of the offending rows, in order of first detection.
    """
    outlier_counts = Counter()

    for col in features:
        values = df[col]
        # NaN-aware quartiles: plain np.percentile returns NaN as soon as the
        # column has a missing value, which silently disabled the check for
        # that column (every comparison against NaN is False).
        q1, q3 = np.nanpercentile(values, [25, 75])
        outlier_step = 1.5 * (q3 - q1)

        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
        # Missing values compare False on both sides, so they are never flagged.
        outlier_counts.update(df[is_outlier].index)

    # Keep rows flagged in more than n columns.
    return [label for label, hits in outlier_counts.items() if hits > n]
# Index labels of training rows that are outliers in more than two of the four columns.
out = detect_outliers(df=train, n=2, features=['Fare', 'SibSp', 'Parch', 'Age'])

In [None]:
# Remove the flagged rows in place. errors='ignore' keeps the cell re-runnable:
# on a second run the labels are already gone and a plain drop raises KeyError.
train.drop(index=out, inplace=True, errors='ignore')

In [None]:
train.info()

In [None]:
# Give the test rows an empty Survived column so both sets share one schema,
# then stack train on top of test with a fresh 0..N-1 index.
test1 = test.assign(Survived=np.nan)
data = pd.concat([train, test1], ignore_index=True)
len(data)

In [None]:
data.info()

In [None]:
# Survival counts split by each feature, one panel per feature. The 3x3 grid
# of the original layout is kept; only the first six slots are used.
plot_features = ['Sex', 'Pclass', 'Embarked', 'Age', 'SibSp', 'Parch']

plt.figure(figsize=[20, 15])
for position, feature in enumerate(plot_features, start=1):
    plt.subplot(3, 3, position)
    sns.histplot(data=train, x=feature, hue='Survived', multiple='dodge')
# Render the figure (the original ended with an argument-less plt.plot()).
plt.show()

# Data Preprocessing

### Adding Title attribute to the data

In [None]:
# Pull the honorific out of each name: the run of letters between a space and
# a period. The pattern is a raw string so that "\." reaches the regex engine
# as an escaped dot instead of being parsed as an invalid string escape.
data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
pd.crosstab(data['Title'], data['Sex'])

In [None]:
# Titles too infrequent to model on their own are pooled under 'Rare'.
col_rep = ['Capt', 'Col', 'Countess', 'Don', 'Dona', 'Dr', 'Jonkheer', 'Lady',
           'Major', 'Rev', 'Sir']

# Single old -> new lookup: rare titles collapse to 'Rare', and the variant
# spellings map onto 'Miss' / 'Mrs'.
title_map = {rare_title: 'Rare' for rare_title in col_rep}
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the in-place form mutates an intermediate object and is not
# guaranteed to write through to `data`.
data['Title'] = data['Title'].replace(title_map)
pd.crosstab(data['Title'], data['Sex'])

### Encoding the Sex attribute

In [None]:
# Recode Sex as integers: male -> 0, female -> 1.
for label, code in (('male', 0), ('female', 1)):
    data.loc[data['Sex'] == label, 'Sex'] = code

### Replacing the null values in Age attribute with the median age of passengers sharing the same Sex, Title and Pclass (falling back to the median age for the Title)

In [None]:
# Impute missing ages with the median age of passengers that share the same
# Sex, Title and Pclass; when that group has no known age at all, fall back to
# the median age of everyone with the same Title.
#
# Both medians are computed once from the observed ages. The previous
# row-by-row loop recomputed them after every fill, so the Title-level fallback
# could depend on the order in which rows were imputed.
age_missing = data['Age'].isnull()
group_median_age = data.groupby(['Sex', 'Title', 'Pclass'])['Age'].transform('median')
title_median_age = data.groupby('Title')['Age'].transform('median')

# fillna supplies the Title-level median wherever the finer group median is NaN.
age_fill = group_median_age.fillna(title_median_age)
data.loc[age_missing, 'Age'] = age_fill[age_missing]

data.info()

### Adding unknown value to the null values of the Cabin Attribute

In [None]:
# Keep only the first character of the cabin code; passengers with no
# recorded cabin get the placeholder 'U' (unknown).
cabin_letter = data['Cabin'].str.extract('(^.{0,1})', expand=False)
data['Cabin'] = cabin_letter.fillna('U')

In [None]:
# Number of passengers per cabin letter (Series method; the top-level
# pd.value_counts helper is deprecated in recent pandas releases).
data['Cabin'].value_counts()

In [None]:
# Fold cabin letter 'T' into 'A', then plot survival per cabin letter for the
# training rows (the first len(train) rows of `data`).
data['Cabin'] = data['Cabin'].replace('T', 'A')
train_rows = data[:len(train)]
sns.histplot(data=train_rows, x='Cabin', hue='Survived', multiple='stack');

In [None]:
# Per-cabin-letter averages over the training rows. numeric_only=True skips
# text columns, which newer pandas versions refuse to average.
data[:len(train)].groupby('Cabin').mean(numeric_only=True)

In [None]:
# Bucket the cabin letter into coarse deck groups. Everything that matches no
# bucket keeps the default 'U'; the original `== np.nan` test could never be
# True (NaN compares unequal to everything), so it has been dropped.
data['Deck'] = 'U'
data.loc[data['Cabin'].isin(['A', 'B', 'C']), 'Deck'] = 'ABC'
data.loc[data['Cabin'].isin(['D', 'E']), 'Deck'] = 'DE'
data.loc[data['Cabin'].isin(['F', 'G']), 'Deck'] = 'FG'

In [None]:
# Per-deck averages; numeric_only=True skips text columns, which newer pandas
# versions refuse to average.
data.groupby('Deck').mean(numeric_only=True)

### Adding Relatives and Companion attributes

In [None]:
# Relatives counts the passenger as well (SibSp + Parch + 1), so for
# non-negative counts its smallest value is 1, never 0.
data['Relatives'] = data['SibSp'] + data['Parch'] + 1

# Companions: 0.0 when travelling alone (Relatives == 1), 1.0 otherwise.
# The original compared against 0, which never matches, so every passenger
# ended up flagged as having companions.
data['Companions'] = (data['Relatives'] > 1).astype(float)

In [None]:
# train.groupby('Relatives').mean()

In [None]:
# Family_Size buckets built from Relatives (which includes the passenger):
#   1 -> alone, 2 -> party of 2-4, 3 -> party of 5 or more.
# The original "Relatives == 0 -> 0" bucket is removed: Relatives is
# SibSp + Parch + 1, so that branch never matched any row.
data.loc[data['Relatives'] == 1, 'Family_Size'] = 1
data.loc[(data['Relatives'] > 1) & (data['Relatives'] < 5), 'Family_Size'] = 2
data.loc[data['Relatives'] >= 5, 'Family_Size'] = 3

In [None]:
# Per-bucket averages; numeric_only=True skips text columns, which newer
# pandas versions refuse to average.
data.groupby('Family_Size').mean(numeric_only=True)

### Adding Surname Attribute

In [None]:
data['Name']

In [None]:
# The surname is everything before the first comma of the Name field.
data['Surname'] = data['Name'].str.split(',').str[0]

### Ticket Frequency

In [None]:
# How many passengers (train and test together) share each ticket value.
ticket_counts = data['Ticket'].value_counts()
data['Ticket_Frequency'] = data['Ticket'].map(ticket_counts)

### Breaking down fare into different classes

In [None]:
data.Fare.describe()

In [None]:
# Box plot of Fare, restricted to fares below 200.
fare_below_200 = data[data['Fare'] < 200]
plt.figure(figsize=(6, 9))
sns.boxplot(y='Fare', data=fare_below_200);

### Adding a Mother column, because mothers had a higher chance of survival than other women

In [None]:
# Mother = 1.0 for women (Sex == 1) titled 'Mrs', older than 18, with at least
# one parent/child aboard; 0.0 for everyone else.
is_mother = (
    (data['Parch'] > 0)
    & (data['Sex'] == 1)
    & (data['Age'] > 18)
    & (data['Title'] == 'Mrs')
)
data['Mother'] = is_mother.astype(float)


### Replacing null values in Embarked attribute with port 'C', after comparing fares by port and passenger class

In [None]:
data[data['Embarked'].isnull()]

In [None]:
# Fare distribution per embarkation port, split by passenger class.
plt.figure(figsize=(10, 8))
sns.boxplot(x='Embarked', y='Fare', hue='Pclass', data=data);
# Passengers with no recorded port are assigned 'C'.
data['Embarked'] = data['Embarked'].fillna('C')

### Imputing missing values in Fare

In [None]:
data[data['Fare'].isnull()]

In [None]:
np.nanmedian(data[data['Pclass']==3]['Fare'])

In [None]:
# Fill each missing fare with the median fare of that passenger's own class.
# The original always used the Pclass 1 median, regardless of which class the
# passenger with the missing fare belongs to.
class_median_fare = data.groupby('Pclass')['Fare'].transform('median')
data['Fare'] = data['Fare'].fillna(class_median_fare)

### Transforming Fare attribute with log transformation

In [None]:
sns.histplot(data['Fare'], kde = True);

In [None]:
# Natural-log transform of Fare. Fares that are not strictly positive are
# mapped to 0: they are swapped for 1.0 first, and log(1) == 0.
# NOTE: running this cell twice applies the log twice.
fare = data['Fare']
data['Fare'] = np.log(fare.where(fare > 0, 1.0))

In [None]:
sns.histplot(data['Fare'], kde = True);

### Scaling the Numerical attributes

In [None]:
col = ['Fare', 'Relatives', 'Age', 'SibSp', 'Parch']
ss = StandardScaler()
# Learn mean and standard deviation from the training rows only (the first
# len(train) rows of `data`), then apply them to every row, so test-set
# statistics do not leak into the scaled features.
ss.fit(data[:len(train)][col])
data[col] = ss.transform(data[col])

In [None]:
data.head()

### Dividing the Fare into different categories based on the price of the ticket

In [None]:
# data.loc[data.Fare < 15, 'Cost'] = 0
# data.loc[(data.Fare >= 15) & (data.Fare < 60), 'Cost'] = 1
# data.loc[(data.Fare >= 60) & (data.Fare < 150), 'Cost'] = 2
# data.loc[(data.Fare >= 150), 'Cost'] = 3

In [None]:
# data[:891].groupby('Cost').mean()

In [None]:
# sns.distplot(data[:len(train)]['Cost'], kde = False)

In [None]:
data.info()

### Encoding the categorical attributes

In [None]:
# Columns that are one-hot encoded; each is replaced by one indicator column
# per distinct value.
cat = ['Title', 'Pclass', 'Sex', 'Embarked', 'Family_Size', 'Deck']
data = pd.get_dummies(data=data, columns=cat)

In [None]:
# Columns that are not fed to the models.
drop_c = [
    'Name', 'PassengerId', 'Ticket', 'Ticket_Frequency',
    'Mother', 'Companions', 'Cabin', 'Surname',
]
data = data.drop(columns=drop_c)


In [None]:
data.info()

In [None]:
# test_id = data[data['Age'].isnull()].index
# train_id = data[~data['Age'].isnull()].index
# train_sample = data.iloc[train_id].drop(['Age', 'Survived'], axis = 1)
# train_label = data.iloc[train_id]['Age']
# test_sample = data.iloc[test_id].drop(['Age', 'Survived'], axis = 1)
# param = [
#     {'max_depth': range(2, 8, 1)}, 
#     {'n_estimators': range(30, 160, 10)}, 
#     {'max_leaf_nodes': range(10, 20, 2)}
# ]

# rf = RandomForestRegressor()
# gs_rf = GridSearchCV(rf, param, cv = 5, n_jobs = -1, verbose = 1)
# gs_rf.fit(train_sample, train_label)
# gs_rf.score(train_sample, train_label), gs_rf.best_params_

In [None]:
data.info()

In [None]:
data

In [None]:
data.info()

In [None]:
# cc = ['Sex', 'Relatives', 'Pclass', 'Family_Size']
# aa = data[cc].astype(np.uint8)
# data[cc] = aa
# data.info()

In [None]:
# Cut the combined frame back into its training rows (the first len(train)
# rows) and test rows (the remainder).
n_train = len(train)
train, test = data.iloc[:n_train], data.iloc[n_train:]

In [None]:
# Separate the Survived target from the features; the test rows only carry a
# placeholder Survived column, which is discarded.
X_train = train.drop(columns=['Survived'])
y_train = train['Survived']
X_test = test.drop(columns=['Survived'])
X_train.shape, y_train.shape, X_test.shape

In [None]:
X_train.info()

In [None]:
X_test.info()

In [None]:
# Keep references to the complete training features/labels: X_train and
# y_train are re-bound to a smaller split by the following train_test_split cell.
X_train_full, y_train_full = X_train, y_train

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 12.5% of the training rows for validation.
# * The split always starts from the full training set, so re-running this
#   cell does not keep shrinking X_train.
# * The seed is fixed so the split (and every score below) is repeatable; the
#   original drew a new random seed on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, test_size=0.125, random_state=42
)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

In [None]:
# for train_id, valid_id in ids.split(X_train, y_train):
#     X_train, X_valid = X_train.iloc[train_id], X_train.iloc[valid_id]
#     y_train, y_valid = y_train[train_id], y_train[valid_id]
# X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

# Feature Selection

In [None]:
# Quick extra-trees fit whose feature importances are inspected in the next
# cell. ExtraTreesClassifier is already imported in the first cell; the fixed
# seed makes the scores and importances repeatable between runs.
et = ExtraTreesClassifier(n_estimators=39, random_state=42)
et.fit(X_train, y_train)
et.score(X_valid, y_valid), et.score(X_train, y_train)

In [None]:
pd.Series(et.feature_importances_, 
             index = X_train.columns)

# Modeling

# SVM

In [None]:
# RBF-kernel SVM grid over the regularisation strength C and kernel width gamma.
param = [
    {
        'kernel': ['rbf'], 'C': [4, 5, 6, 8, 9, 10, 12],
        # The stray 0 that sat between 0.03 and 0.1 is removed: gamma = 0 gives
        # a constant kernel (and some scikit-learn versions reject it).
        'gamma': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
    },
]

# probability=True is needed so the model can take part in soft voting later.
svc = SVC(probability = True)
gs_svc = GridSearchCV(svc, param, cv = 5, n_jobs = -1, verbose = 1)
gs_svc.fit(X_train, y_train)
# Best model, validation accuracy, training accuracy.
gs_svc.best_estimator_, gs_svc.score(X_valid, y_valid), gs_svc.score(X_train, y_train)

In [None]:
svc_best = gs_svc.best_estimator_

### Predictions!!

In [None]:
# !rm 'submission.csv'

In [None]:
# predictions = gs_svc.predict(X_test)
# sub = pd.read_csv('../input/titanic/gender_submission.csv')
# sub['Survived'] = predictions
# sub.to_csv('submission.csv', index = False)

# XGBoost

In [None]:

from xgboost import XGBClassifier

param_grid={
    'max_depth': range(3, 8, 3),  # step of 3 -> depths 3 and 6
    'n_estimators': range(10, 40, 2),
    'learning_rate': [0.2, 0.1, 0.03, 0.01]
}

# Use the GPU histogram tree builder only when a GPU was detected by the
# TensorFlow setup cell (`dev`); otherwise fall back to the CPU 'hist' builder
# so the notebook still runs on a machine without a GPU.
tree_method = 'gpu_hist' if len(dev) > 0 else 'hist'

xg = XGBClassifier(eval_metric='logloss', n_jobs = -1, tree_method=tree_method, use_label_encoder = False)
gs_xg = GridSearchCV(xg, param_grid, cv = 5, n_jobs = -1, verbose = 1)
gs_xg.fit(X_train, y_train)

# Best parameters, validation accuracy, training accuracy.
gs_xg.best_params_, gs_xg.score(X_valid, y_valid), gs_xg.score(X_train, y_train)

In [None]:
xg_best = gs_xg.best_estimator_

# Random Forest

In [None]:
# Random-forest grid: forest size, tree depth and leaf budget.
param = [dict(
    n_estimators=[50, 100, 150],
    max_depth=[4, 9, 15],
    max_leaf_nodes=[15, 30, 50, 100],
)]

rf = RandomForestClassifier()
gs_rf = GridSearchCV(estimator=rf, param_grid=param, cv=5, n_jobs=-1, verbose=1)
gs_rf.fit(X_train, y_train)
# Best model, validation accuracy, training accuracy.
gs_rf.best_estimator_, gs_rf.score(X_valid, y_valid), gs_rf.score(X_train, y_train)

In [None]:
rf_best = gs_rf.best_estimator_

# Adaptive Boosting

In [None]:
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier

# AdaBoost grid: only the number of boosting rounds is tuned.
param = [dict(n_estimators=[150, 200, 300, 400, 450, 500, 550])]

ada = AdaBoostClassifier()
gs_ada = GridSearchCV(estimator=ada, param_grid=param, cv=5, n_jobs=-1, verbose=1)
gs_ada.fit(X_train, y_train)
# Best model, validation accuracy, training accuracy.
gs_ada.best_estimator_, gs_ada.score(X_valid, y_valid), gs_ada.score(X_train, y_train)

In [None]:
ada_best = gs_ada.best_estimator_

# Extra Trees Classifier

In [None]:
# Extra-trees grid: forest size, tree depth and leaf budget.
param = [dict(
    n_estimators=range(8, 28, 4),
    max_depth=range(4, 20, 4),
    max_leaf_nodes=range(4, 20, 4),
)]

et = ExtraTreesClassifier()
gs_et = GridSearchCV(estimator=et, param_grid=param, cv=5, n_jobs=-1, verbose=1)
gs_et.fit(X_train, y_train)
# Best model, validation accuracy, training accuracy.
gs_et.best_estimator_, gs_et.score(X_valid, y_valid), gs_et.score(X_train, y_train)

In [None]:
et_best = gs_et.best_estimator_

In [None]:
# Feature importances of the tuned extra-trees model. The grid search was run
# with its default refit behaviour, so et_best is already fitted on X_train;
# fitting it again here (unseeded) would only swap it for a different random
# forest than the one that was scored above.
pd.Series(et_best.feature_importances_, index = X_train.columns)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours grid: only the neighbour count is tuned.
param = [
    {'n_neighbors': range(2, 8, 1)}
]

knn = KNeighborsClassifier()
gs_knn = GridSearchCV(knn, param, cv = 5, n_jobs = -1)
gs_knn.fit(X_train, y_train)
# Best model, validation accuracy, training accuracy. The training score is
# taken on X_train like every other model cell; the original scored on
# X_train_full, which also contains the validation rows.
gs_knn.best_estimator_, gs_knn.score(X_valid, y_valid), gs_knn.score(X_train, y_train)

In [None]:
knn_best = gs_knn.best_estimator_

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree grid: split criterion, tree depth and leaf budget.
param = [dict(
    criterion=['gini', 'entropy'],
    max_depth=range(4, 20, 2),
    max_leaf_nodes=range(4, 20, 2),
)]

dt = DecisionTreeClassifier()
gs_dt = GridSearchCV(estimator=dt, param_grid=param, cv=5, n_jobs=-1, verbose=1)
gs_dt.fit(X_train, y_train)
# Best model, validation accuracy, training accuracy.
gs_dt.best_estimator_, gs_dt.score(X_valid, y_valid), gs_dt.score(X_train, y_train)

In [None]:
dt_best = gs_dt.best_estimator_

# Voting Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Two untuned members added to the ensemble alongside the tuned models.
# (The unused `vc_dt` tree and its duplicate import were removed: the ensemble
# uses the tuned `dt_best` instead.)
vc_lr = LogisticRegression(solver = 'sag')
vc_mlp = MLPClassifier()

# Soft voting combines the predicted class probabilities of all members.
vc = VotingClassifier(estimators = [('rf', rf_best), ('svc', svc_best), ('lr', vc_lr),
                                    ('mlp', vc_mlp), ('xgc', xg_best), ('knn', knn_best),
                                    ('ada', ada_best), ('ET', et_best), ('dt', dt_best)],
                                    voting = 'soft', n_jobs = -1, verbose = 1)
vc.fit(X_train, y_train)
# Accuracy on the validation split, the training split, and the full training set.
vc.score(X_valid, y_valid), vc.score(X_train, y_train), vc.score(X_train_full, y_train_full)

In [None]:
# Predict the test rows with the voting ensemble, cast the labels to small
# unsigned integers, and write them into the sample submission layout.
predictions = vc.predict(X_test).astype(np.uint8)
sub = pd.read_csv('../input/titanic/gender_submission.csv').assign(Survived=predictions)
sub.to_csv('submission.csv', index=False)

In [None]:
type(predictions[0])

In [None]:
# How many test passengers fall into each predicted class (Series method; the
# top-level pd.value_counts helper is deprecated in recent pandas releases).
pd.Series(predictions).value_counts()

In [None]:
# import pickle
# models = [svc_best, rf_best, xg_best, knn_best, ada_best, et_best]
# name = ['svc.sav', 'rf.sav', 'xg.sav', 'knn.sav', 'ada.sav', 'et.sav']
# for i in range(len(models)):
#     pickle.dump(models[i], open(name[i], 'wb'))
    

# Logistic Regression

In [None]:
# from sklearn.linear_model import LogisticRegression
# lr = LogisticRegression()
# lr.fit(X_train, y_train)

# lr.score(X_valid, y_valid), lr.score(X_train, y_train)

In [None]:
# !rm 'submission.csv'
# predictions = lr.predict(X_test)
# sub = pd.read_csv('../input/titanic/gender_submission.csv')
# sub['Survived'] = predictions
# sub.to_csv('submission.csv', index = False)