In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load train.csv from the Kaggle input directory and preview its first ten rows.
train_data = pd.read_csv(filepath_or_buffer = '/kaggle/input/titanic/train.csv')
train_data.head(n = 10)

In [None]:
# Load test.csv from the Kaggle input directory and preview its first ten rows.
test_data = pd.read_csv(filepath_or_buffer = '/kaggle/input/titanic/test.csv')
test_data.head(n = 10)

In [None]:
train_data['train_test'] = 1
test_data['train_test'] = 0
test_data['Survived'] = np.NaN
all_data = pd.concat([train_data, test_data])

%matplotlib inline
all_data.columns

In [None]:
# Data Exploration

# Print a per-column summary of train_data (non-null counts and dtypes) to spot missing values.
train_data.info()

In [None]:
# Display summary statistics (count, mean, std, quartiles, min/max) for train_data.
train_data.describe()

In [None]:
# Split the training columns into a numeric group and a categorical group
# so each can be explored with the appropriate kind of plot.
numeric_columns = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_columns = ['Survived', 'Pclass', 'Sex', 'Embarked', 'Cabin']

df_num = train_data[numeric_columns]
df_cat = train_data[categorical_columns]

In [None]:
# Draw one histogram per numeric column, titled with the column name.
for col_name, col_values in df_num.items():
    plt.hist(col_values)
    plt.title(col_name)
    plt.show()

In [None]:
# Pairwise correlations between the numeric columns: printed as a table, then drawn as a heatmap.
num_corr = df_num.corr()
print(num_corr)
sns.heatmap(num_corr)

In [None]:
# Mean of each numeric column for each value of Survived (pivot_table's default aggregation is the mean).
train_data.pivot_table(
    index = 'Survived',
    values = ['Age', 'Parch', 'SibSp', 'Fare'],
)

In [None]:
# Bar chart of value frequencies for each categorical column, titled with the column name.
for col_name in df_cat.columns:
    # Count once and reuse the result for both axes.
    counts = df_cat[col_name].value_counts()
    # x and y are passed by keyword: seaborn 0.12+ no longer accepts them
    # positionally (the first positional argument is now `data`).
    sns.barplot(x = counts.index, y = counts).set_title(col_name)
    plt.show()

In [None]:
# Passenger counts per Survived value, broken down by Sex, Pclass and Embarked in turn.
# Each table is followed by a blank line.
for split_column in ('Sex', 'Pclass', 'Embarked'):
    breakdown = train_data.pivot_table(index = 'Survived', columns = split_column, values = 'Ticket', aggfunc = 'count')
    print(breakdown)
    print()

In [None]:
# Feature Engineering

# Number of cabin entries listed for each passenger: 0 when Cabin is missing,
# otherwise the number of space-separated pieces in the Cabin string.
# (A bare `df_cat.Cabin` expression used to sit here; its value was discarded, so it was removed.)
train_data['cabin_multiple'] = train_data.Cabin.apply(
    lambda cabin: 0 if pd.isna(cabin) else len(cabin.split(' '))
)
train_data['cabin_multiple'].value_counts()

In [None]:
# Passenger counts per Survived value for each cabin_multiple value.
train_data.pivot_table(
    index = 'Survived',
    columns = 'cabin_multiple',
    values = 'Ticket',
    aggfunc = 'count',
)

In [None]:
# First character of the stringified Cabin value. A missing Cabin stringifies
# to 'nan', so those rows end up in the 'n' bucket.
train_data['cabin_adv'] = [str(cabin)[0] for cabin in train_data.Cabin]
train_data['cabin_adv'].value_counts()

In [None]:
# Passenger counts per Survived value for each cabin_adv letter.
train_data.pivot_table(
    index = 'Survived',
    columns = 'cabin_adv',
    values = 'Ticket',
    aggfunc = 'count',
)

In [None]:
def _ticket_prefix(ticket):
    """Return the text before the ticket's last space-separated piece, or 0 if there is none.

    The prefix pieces are joined together, '.' and '/' are stripped out, and
    the result is lower-cased.
    """
    leading_parts = ticket.split(' ')[:-1]
    if len(leading_parts) == 0:
        return 0
    return ''.join(leading_parts).replace('.', '').replace('/', '').lower()


# 1 when the whole ticket string is numeric, otherwise 0.
train_data['numeric_ticket'] = train_data.Ticket.apply(lambda ticket: int(ticket.isnumeric()))
train_data['lettered_ticket'] = train_data.Ticket.apply(_ticket_prefix)

In [None]:
# How many tickets are purely numeric (1) versus not (0).
numeric_ticket_counts = train_data['numeric_ticket'].value_counts()
print(numeric_ticket_counts)

In [None]:
# Lift the row-display limit so every ticket prefix is shown.
# The full option name is required: on pandas 1.4+ the short pattern 'max_rows'
# matches more than one option and set_option raises OptionError.
pd.set_option('display.max_rows', None)
train_data['lettered_ticket'].value_counts()

In [None]:
# Passenger counts per Survived value for numeric versus non-numeric tickets.
train_data.pivot_table(
    index = 'Survived',
    columns = 'numeric_ticket',
    values = 'Ticket',
    aggfunc = 'count',
)

In [None]:
# Passenger counts per Survived value for each ticket prefix.
train_data.pivot_table(
    index = 'Survived',
    columns = 'lettered_ticket',
    values = 'Ticket',
    aggfunc = 'count',
)

In [None]:
def _title_from_name(full_name):
    """Return the text between the first comma and the following period, stripped of whitespace."""
    after_comma = full_name.split(',')[1]
    before_period = after_comma.split('.')[0]
    return before_period.strip()


train_data['name_title'] = train_data.Name.apply(_title_from_name)
train_data['name_title'].value_counts()

In [None]:
# Passenger counts per Survived value for each extracted name title.
train_data.pivot_table(
    index = 'Survived',
    columns = 'name_title',
    values = 'Name',
    aggfunc = 'count',
)

In [None]:
# Data Preprocessing

# Fill missing ages in the combined frame with the mean age of the training rows.
# The fill must start from all_data.Age: all_data's index carries duplicate labels
# (train and test rows are both numbered from 0), so assigning a Series built from
# train_data alone is aligned by label and overwrites test rows with the ages of
# unrelated training passengers.
all_data.Age = all_data.Age.fillna(train_data.Age.mean())

In [None]:
# Discard the rows that have no Embarked value.
all_data = all_data.dropna(subset = ['Embarked'])

In [None]:
# Log-transform the fare (log(fare + 1)).
# The transform is computed from all_data's own Fare column: all_data's index has
# duplicate labels, so a Series derived from train_data alone is aligned by label
# and would hand test rows the fares of unrelated training passengers.
# Missing fares are filled with the training median first so norm_fare has no NaN;
# this is the same value the later Fare-imputation cell uses.
fare_filled = all_data.Fare.fillna(train_data.Fare.median())
all_data['norm_fare'] = np.log(fare_filled + 1)
all_data['norm_fare'].hist()

In [None]:
# Build the engineered columns on the combined frame so train and test rows share them.
cabin_values = all_data.Cabin
# 0 when Cabin is missing, otherwise the number of space-separated pieces.
all_data['cabin_multiple'] = [0 if pd.isna(cabin) else len(cabin.split(' ')) for cabin in cabin_values]
# First character of the stringified Cabin ('n' when Cabin is missing, from 'nan').
all_data['cabin_adv'] = [str(cabin)[0] for cabin in cabin_values]
# 1 when the whole ticket string is numeric, otherwise 0.
all_data['numeric_ticket'] = [int(ticket.isnumeric()) for ticket in all_data.Ticket]
# Text between the first comma and the following period of the name.
all_data['name_title'] = [name.split(',')[1].split('.')[0].strip() for name in all_data.Name]

In [None]:
# Replace missing fares with the median fare of the training rows.
train_fare_median = train_data.Fare.median()
all_data['Fare'] = all_data['Fare'].fillna(train_fare_median)

In [None]:
# Columns fed to the models; train_test rides along so the rows can be split again later.
model_columns = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'norm_fare', 'Embarked',
    'cabin_adv', 'cabin_multiple', 'numeric_ticket', 'name_title', 'train_test',
]
# One-hot encode the non-numeric columns among them.
all_dummies = pd.get_dummies(all_data[model_columns])

In [None]:
# Separate the encoded rows back into train and test using the train_test flag,
# dropping the flag itself from the feature matrices.
is_train_row = all_dummies.train_test == 1
is_test_row = all_dummies.train_test == 0
X_train = all_dummies[is_train_row].drop(columns = ['train_test'])
X_test = all_dummies[is_test_row].drop(columns = ['train_test'])

# Target values for the training rows only.
train_rows = all_data[all_data.train_test == 1]
y_train = train_rows['Survived']
y_train.shape

In [None]:
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
# Continuous columns that get standardised; the remaining columns are left untouched.
scaled_columns = ['Age', 'Parch', 'SibSp', 'norm_fare']

# Learn the mean and standard deviation from the training rows only.
X_train_scaled = X_train.copy()
X_train_scaled[scaled_columns] = scale.fit_transform(X_train_scaled[scaled_columns])
# Apply the *training* statistics to the test rows. Re-fitting on the test set
# would put train and test features on different scales.
X_test_scaled = X_test.copy()
X_test_scaled[scaled_columns] = scale.transform(X_test_scaled[scaled_columns])

In [None]:
# Model Building

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

In [None]:
# Baseline: Gaussian naive Bayes, 5-fold cross-validation on the scaled features.
# Prints the per-fold scores, then their mean.
gnb = GaussianNB()
cv = cross_val_score(estimator = gnb, X = X_train_scaled, y = y_train, cv = 5)
print(cv, cv.mean(), sep = '\n')

In [None]:
# Baseline: logistic regression, 5-fold cross-validation.
# Scored on X_train_scaled like every other baseline (and like the voting
# ensemble and grid search that reuse this model), so the numbers are comparable.
# max_iter is raised to 2000 iterations for the solver.
lr = LogisticRegression(max_iter = 2000)
cv = cross_val_score(lr, X_train_scaled, y_train, cv = 5)
print(cv)
print(cv.mean())

In [None]:
# Baseline: decision tree (fixed seed), 5-fold cross-validation on the scaled features.
# Prints the per-fold scores, then their mean.
dt = tree.DecisionTreeClassifier(random_state = 1)
cv = cross_val_score(estimator = dt, X = X_train_scaled, y = y_train, cv = 5)
print(cv, cv.mean(), sep = '\n')

In [None]:
# Baseline: k-nearest neighbours with default settings, 5-fold cross-validation on the scaled features.
# Prints the per-fold scores, then their mean.
knn = KNeighborsClassifier()
cv = cross_val_score(estimator = knn, X = X_train_scaled, y = y_train, cv = 5)
print(cv, cv.mean(), sep = '\n')

In [None]:
# Baseline: random forest (fixed seed), 5-fold cross-validation on the scaled features.
# Prints the per-fold scores, then their mean.
rf = RandomForestClassifier(random_state = 1)
cv = cross_val_score(estimator = rf, X = X_train_scaled, y = y_train, cv = 5)
print(cv, cv.mean(), sep = '\n')

In [None]:
# Baseline: support vector classifier, 5-fold cross-validation on the scaled features.
# probability=True is set so the model can take part in the soft-voting ensemble.
# Prints the per-fold scores, then their mean.
svc = SVC(probability = True)
cv = cross_val_score(estimator = svc, X = X_train_scaled, y = y_train, cv = 5)
print(cv, cv.mean(), sep = '\n')

In [None]:
# Baseline: XGBoost classifier (fixed seed), 5-fold cross-validation on the scaled features.
# Prints the per-fold scores, then their mean.
xgb = XGBClassifier(random_state = 1)
cv = cross_val_score(estimator = xgb, X = X_train_scaled, y = y_train, cv = 5)
print(cv, cv.mean(), sep = '\n')

In [None]:
# Soft-voting ensemble over six of the baseline models (the decision tree is not included).
# Scored with 5-fold cross-validation; prints the per-fold scores, then their mean.
ensemble_members = [
    ('lr', lr),
    ('knn', knn),
    ('rf', rf),
    ('gnb', gnb),
    ('svc', svc),
    ('xgb', xgb),
]
voting_clf = VotingClassifier(estimators = ensemble_members, voting = 'soft')
cv = cross_val_score(estimator = voting_clf, X = X_train_scaled, y = y_train, cv = 5)
print(cv, cv.mean(), sep = '\n')

In [None]:
# Fit the baseline ensemble on all training rows and predict the test rows.
voting_clf.fit(X_train_scaled, y_train)
# y_train is float (Survived was padded with NaN for the test rows before the
# frames were combined), so predictions come back as 0.0/1.0. Cast to int so
# the submission file contains plain 0/1 labels.
y_pred_base_vc = voting_clf.predict(X_test_scaled).astype(int)
base_submission = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y_pred_base_vc})
base_submission.to_csv('base_submission.csv', index = False)

In [None]:
# Model tuning

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def clf_performance(classifier, model_name):
    """Print a short report for a fitted hyper-parameter search.

    Parameters
    ----------
    classifier : object exposing ``best_score_`` and ``best_params_`` attributes.
    model_name : heading printed on the first line of the report.

    Returns
    -------
    None. The report is written to standard output as three lines.
    """
    print(model_name)
    print(f'Best Score: {classifier.best_score_!s}')
    print(f'Best Parameters: {classifier.best_params_!s}')

In [None]:
lr = LogisticRegression()

# Search space: 20 log-spaced C values from 1e-4 to 1e4, crossed with both
# penalties, always using the liblinear solver and max_iter = 2000.
param_grid = dict(
    max_iter = [2000],
    penalty = ['l1', 'l2'],
    C = np.logspace(-4, 4, 20),
    solver = ['liblinear'],
)

# Exhaustive 5-fold search on the scaled training features, using all cores.
clf_lr = GridSearchCV(estimator = lr, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_lr = clf_lr.fit(X_train_scaled, y_train)
clf_performance(best_clf_lr, 'Logistic Regression')

In [None]:
knn = KNeighborsClassifier()

# Search space: neighbourhood size, vote weighting, index structure and Minkowski power p.
param_grid = dict(
    n_neighbors = [3, 5, 7, 9],
    weights = ['uniform', 'distance'],
    algorithm = ['auto', 'ball_tree', 'kd_tree'],
    p = [1, 2],
)

# Exhaustive 5-fold search on the scaled training features, using all cores.
clf_knn = GridSearchCV(estimator = knn, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_knn = clf_knn.fit(X_train_scaled, y_train)
clf_performance(best_clf_knn, 'KNN')

In [None]:
svc = SVC(probability = True)

# One sub-grid per kernel, so only the parameters that kernel is given are varied.
c_values = [.1, 1, 10, 100, 1000]
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [.1, .5, 1, 2, 5, 10], 'C': list(c_values)},
    {'kernel': ['linear'], 'C': list(c_values)},
    {'kernel': ['poly'], 'degree': [2, 3, 4, 5], 'C': list(c_values)},
]
# Both names refer to the same list.
param_grid = tuned_parameters

# Exhaustive 5-fold search on the scaled training features, using all cores.
clf_svc = GridSearchCV(estimator = svc, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_svc = clf_svc.fit(X_train_scaled, y_train)
clf_performance(best_clf_svc, 'SVC')

In [None]:
rf = RandomForestClassifier(random_state = 1)

# Broad search space for a first, randomized pass.
# 'auto' is no longer listed for max_features: scikit-learn 1.3 removed it, and
# for classifiers it was the same setting as 'sqrt', so no distinct
# configuration is lost.
param_grid = {'n_estimators': [100,500,1000], 'bootstrap': [True,False], 'max_depth': [3,5,10,20,50,75,100,None],
              'max_features': ['sqrt'], 'min_samples_leaf': [1,2,4,10], 'min_samples_split': [2,5,10]}

# Sample 100 configurations and score each with 5-fold cross-validation on all cores.
# random_state pins which configurations are drawn so a re-run reproduces the
# same search (the seed matches the one used for the models).
clf_rf_rnd = RandomizedSearchCV(rf, param_distributions = param_grid, n_iter = 100, cv = 5, verbose = True, n_jobs = -1, random_state = 1)
best_clf_rf_rnd = clf_rf_rnd.fit(X_train_scaled,y_train)
clf_performance(best_clf_rf_rnd, 'Random Forest')

In [None]:
rf = RandomForestClassifier(random_state = 1)

# Narrower grid for an exhaustive search.
# 'auto' is no longer listed for max_features: scikit-learn 1.3 removed it, and
# for classifiers it was the same setting as 'sqrt', so dropping it only removes
# duplicate fits.
param_grid = {'n_estimators': [400,450,500,550], 'criterion' : ['gini', 'entropy'], 'bootstrap': [True], 'max_depth': [15,20,25],
              'max_features': ['sqrt', 10], 'min_samples_leaf': [2,3], 'min_samples_split': [2,3]}

# Exhaustive 5-fold search on the scaled training features, using all cores.
clf_rf = GridSearchCV(rf, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_rf = clf_rf.fit(X_train_scaled,y_train)
clf_performance(best_clf_rf, 'Random Forest')



In [None]:
# Take the best forest from the grid search and fit it on the scaled training set
# (fit returns the estimator itself, so best_rf is that same object).
best_rf = best_clf_rf.best_estimator_
best_rf.fit(X_train_scaled, y_train)

# Label each importance with its feature name and chart the 20 largest as horizontal bars.
feat_importances = pd.Series(data = best_rf.feature_importances_, index = X_train_scaled.columns)
top_importances = feat_importances.nlargest(20)
top_importances.plot.barh()