In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import Counter

from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier , GradientBoostingClassifier , ExtraTreesClassifier , VotingClassifier
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
sns.set(style='white' , context='notebook' , palette='deep')

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
IDtest = test['PassengerId']
# Any results you write to the current directory are saved as output.

In [None]:
train.head(10)

<h2>Function to Detect Outliers</h2>

In [None]:
def detect_outliers(df, n, features , drop):
    """Find, and optionally drop, rows that are outliers in several features.

    A value is an outlier for a feature when it lies more than 1.5 * IQR
    below the first quartile or above the third quartile of that feature
    (Tukey's rule). Missing values are ignored when the quartiles are computed
    and are never flagged themselves.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    n : int
        A row is selected when it is an outlier in strictly more than ``n``
        of the given features.
    features : list of str
        Numeric column names to check.
    drop : int
        ``1`` to return a copy of ``df`` without the selected rows (index
        reset); any other value to return the selected index labels instead.

    Returns
    -------
    pandas.DataFrame or list
        The cleaned frame when ``drop == 1``, otherwise the list of index
        labels of the selected rows.
    """
    outlier_indicies = []

    for col in features:
        # np.percentile returns NaN as soon as the column holds one NaN, which
        # makes both comparisons below False for every row and silently
        # disables the check for that feature; nanpercentile skips the NaNs.
        Q1 = np.nanpercentile(df[col] , 25)
        Q3 = np.nanpercentile(df[col] , 75)
        # inter-quartile range
        IQR = Q3 - Q1
        outlier_range = 1.5 * IQR
        is_outlier = (df[col] < Q1 - outlier_range) | (df[col] > Q3 + outlier_range)
        outlier_indicies.extend(df[is_outlier].index)
    # number of features in which each row was flagged
    OutlierIndex_count = Counter(outlier_indicies)
    RowWith_N_outlier = [row_index for row_index , hits in OutlierIndex_count.items() if hits > n]
    if drop == 1:
        return df.drop(RowWith_N_outlier , axis=0).reset_index(drop=True)
    return RowWith_N_outlier
print(train.shape)
Outliers_to_drop = detect_outliers(train,2,["Age","SibSp","Parch","Fare"] , drop=0)
train.loc[Outliers_to_drop] # Show the outliers rows

In [None]:
train = detect_outliers(train,2,["Age","SibSp","Parch","Fare"] , drop=1)

<h3>Join Train and Test Data for Future Steps</h3>

In [None]:
train_len = len(train)
dataset = pd.concat(objs=[train , test] , axis=0 , sort=False).reset_index(drop=True)
dataset.tail()

<h3>Function to Report Missing Data</h3>

In [None]:
def get_missingcols(data , num_of_rows , message):
    """Print a per-column missing-value report for ``data``.

    Prints ``message`` first, then a table with one row per column holding the
    number of nulls ('Total'), the share of nulls in percent ('Percent') and
    the column dtype ('Type'), ordered from most to fewest nulls. Only the
    first ``num_of_rows`` rows of the table are printed. Returns nothing.
    """
    print(message)
    null_mask = data.isnull()
    missing_count = null_mask.sum().sort_values(ascending=False)
    # count() on the boolean mask is the number of rows, so this is the null share
    missing_share = (null_mask.sum() / null_mask.count()).sort_values(ascending=False) * 100
    column_types = data[missing_count.index].dtypes
    report = pd.concat(
        [missing_count , missing_share , column_types] ,
        axis=1 ,
        keys=['Total' , 'Percent' , 'Type'] ,
    )
    print(report.head(num_of_rows))
get_missingcols(train , 12 , "Training Data")
get_missingcols(test , 11 , "test Data")

In [None]:
train.dtypes

In [None]:
train.describe()

In [None]:
heatmap = sns.heatmap(train[["Survived","SibSp","Parch","Age","Fare"]].corr() , annot=True , fmt=".3f" , cmap="coolwarm")

<h2>Numerical data Analysis</h2>

In [None]:
#SibSp
plot = sns.catplot(x='SibSp' , y='Survived' , data=train , kind='bar' , height=6)
plot.despine(left=True)

In [None]:
#Age
plot  = sns.FacetGrid(train , col='Survived')
plot = plot.map(sns.distplot , "Age")

In [None]:
plot = sns.kdeplot(train['Age'][(train['Survived']==0) & (train['Age'].notnull())] , color='Red' , shade=True)
plot = sns.kdeplot(train["Age"][(train["Survived"] == 1) & (train["Age"].notnull())], ax=plot, color="Blue", shade= True)
plot.set_xlabel("Age")
plot.set_ylabel("Frequency")
plot = plot.legend(["Not Survived","Survived"])

In [None]:
#Fare
dataset['Fare'].isnull().sum()

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# dataset['Fare'] selection: the in-place form mutates an intermediate object
# and is not guaranteed to write through to `dataset`.
dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].mean())

In [None]:
plot = sns.distplot(dataset['Fare'] , color='m' , label="Skewness : %.2f"%(dataset['Fare'].skew()))
plot = plot.legend(loc="best")

In [None]:
dataset['Fare'] = dataset['Fare'].map(lambda x:np.log(x) if x>0 else 0)
plot = sns.distplot(dataset['Fare'] , color='m' , label="Skewness : %.2f"%(dataset['Fare'].skew()))
plot = plot.legend(loc="best")

<h2>Categorical Values</h2>

In [None]:
#Sex
plot = sns.catplot(x="Sex",y="Survived",data=train,kind="bar", height = 6)
plot.despine(left=True)

In [None]:
train[['Sex' , 'Survived']].groupby('Sex').mean()

In [None]:
#Pclass
plot = sns.catplot(x="Pclass",y="Survived",data=train,kind="bar", height = 6)
plot.despine(left=True)

In [None]:
plot = sns.catplot(x="Pclass",y="Survived" , hue='Sex',data=train,kind="bar", height = 6)
plot.despine(left=True)

In [None]:
#Embarked
dataset['Embarked'].isnull().sum()

In [None]:
# Fill missing ports with the most frequent port in the training rows.
# Assigned back rather than filled in place on the column selection, which is
# not guaranteed to write through to `dataset`.
dataset['Embarked'] = dataset['Embarked'].fillna(train['Embarked'].value_counts().idxmax())

In [None]:
plot = sns.catplot(x='Embarked' , y='Survived' , data = train , kind='bar') 
plot.despine(left=True)

In [None]:
# Passenger-class counts, one panel per port of embarkation.
# The plotted variable is passed as x= because a positional first argument
# is not accepted as `x` by every seaborn release.
plot = sns.catplot(x='Pclass', col='Embarked' , data=train , kind='count')
plot.despine(left=True)

In [None]:
# plot = sns.catplot(x='Pclass', y='Survived' , col='Embarked' , data=train , kind='bar')
# plot.despine(left=True)

<h2>Filling Missing Values</h2>

In [None]:
#Age


# Explore Age vs Sex, Parch , Pclass and SibSP
plot = sns.catplot(y="Age",x="Sex",data=dataset,kind="box")
plot = sns.catplot(y="Age",x="Pclass", data=dataset,kind="box")
plot = sns.catplot(y="Age",x="Parch", data=dataset,kind="box")
plot = sns.catplot(y="Age",x="SibSp", data=dataset,kind="box")

<p>These box plots suggest that Age can be predicted from SibSp, Parch and Pclass.</p>

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# Impute missing ages with a random forest trained on the rows whose age is
# known, using SibSp, Pclass and Parch as predictors.
age_features = ['SibSp','Pclass','Parch']
age_missing = dataset['Age'].isnull()
train_X = dataset.loc[~age_missing , age_features]
# 1-D target: a single-column DataFrame makes scikit-learn warn about a column-vector y
train_y = dataset.loc[~age_missing , 'Age']
test_X = dataset.loc[age_missing , age_features]
model = RandomForestRegressor(n_estimators=100,random_state=1)
model.fit(train_X , train_y)
prediction = np.round(model.predict(test_X),decimals=0)
# Single .loc assignment instead of dataset['Age'][mask] = ..., which is
# chained assignment and may write into a temporary copy.
dataset.loc[age_missing , 'Age'] = prediction

<h1>Feature Engineering</h1>

In [None]:
dataset.rename(columns={'Name':'Title'} , inplace=True)
dataset['Title'].head()

In [None]:
#Name
# The 'Title' column still holds the full passenger name at this point (it was
# only renamed from 'Name'). Keep the text between the first comma and the
# next period, stripped of whitespace.
# assumes every name contains a comma followed by a period - TODO confirm
unique_list = [name.split(',')[1].split('.')[0].strip() for name in dataset['Title']]
# The new Series has a default 0..n-1 index and the assignment aligns on index
# labels, so this relies on `dataset` having been built with reset_index(drop=True).
dataset['Title'] = pd.Series(unique_list)
dataset['Title'].head()

In [None]:
def replace_titles(x):
    """Collapse rare honorifics into a common title for one passenger row.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide 'Title'; 'Sex' is read only when the title is 'Dr'.

    Returns
    -------
    str
        'Mr', 'Mrs' or 'Miss' for the recognised rare titles; any other title
        is returned unchanged.
    """
    title = x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col' , 'Sir']:
        return 'Mr'
    elif title in ['Mme' , 'the Countess']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms' , 'Lady' , 'Dona']:
        return 'Miss'
    elif title == 'Dr':
        # 'Dr' says nothing about gender, so fall back on the Sex column.
        # The data stores it lower-case ('male'); comparing against 'Male'
        # never matched and turned every doctor into 'Mrs'.
        if str(x['Sex']).lower() == 'male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
dataset['Title']=dataset.apply(replace_titles, axis=1)
dataset['Title'].value_counts()

In [None]:
# Survival rate per title. catplot is the current name of the deprecated
# factorplot and is what the rest of this notebook already uses.
plot = sns.catplot(x='Title' , y='Survived' , data=dataset , kind='bar')

In [None]:
#Family Size(new Feature)
dataset['Fsize'] = dataset['SibSp'] + dataset['Parch']

In [None]:
# Survival rate per family size. catplot replaces the deprecated factorplot,
# consistent with the other categorical plots in this notebook.
plot = sns.catplot(x='Fsize' , y='Survived' , data=dataset , kind='bar')

In [None]:


# Create new feature of family size
dataset['Single'] = dataset['Fsize'].map(lambda s: 1 if s == 0 else 0)
dataset['SmallF'] = dataset['Fsize'].map(lambda s: 1 if  1<= s <= 2  else 0)
dataset['MedF'] = dataset['Fsize'].map(lambda s: 1 if s == 3 else 0)
dataset['LargeF'] = dataset['Fsize'].map(lambda s: 1 if s >= 4 else 0)

In [None]:
# One survival-rate bar chart per family-size indicator column.
# Looping replaces four copy-pasted calls; catplot replaces the deprecated
# factorplot, consistent with the other categorical plots in this notebook.
for family_flag in ["Single" , "SmallF" , "MedF" , "LargeF"]:
    g = sns.catplot(x=family_flag,y="Survived",data=dataset,kind="bar")
    g = g.set_ylabels("Survival Probability")


In [None]:
#Apply onehot encoding
dataset = pd.get_dummies(dataset , columns=['Title' , 'Embarked'])
#Change Sex to 0 and 1
dataset['Sex'] = dataset['Sex'].map(lambda s:1 if s=='male' else 0)

In [None]:
dataset.head()

In [None]:
#Cabin
dataset['Cabin'].head()

In [None]:
dataset['Cabin'] = pd.Series(cabin[0] if not pd.isnull(cabin) else 'X' for cabin in dataset['Cabin'].values)

In [None]:
# Number of passengers per cabin deck ('X' = unknown cabin). The column is
# named through x=/data= because a positional first argument is not accepted
# as `x` by every seaborn release.
g = sns.countplot(x="Cabin",data=dataset,order=['A','B','C','D','E','F','G','T','X'])

In [None]:
# Survival rate per cabin deck. catplot replaces the deprecated factorplot,
# consistent with the other categorical plots in this notebook.
plot = sns.catplot(y="Survived",x="Cabin",data=dataset,kind="bar",order=['A','B','C','D','E','F','G','T','X'])
plot = plot.set_ylabels("Survival Probability")

In [None]:
dataset = pd.get_dummies(dataset , columns=['Cabin'])

In [None]:
#Tickets
dataset['Ticket'].head()

In [None]:
# Reduce every ticket to its prefix: purely numeric tickets become "X",
# anything else keeps its first space-separated token with "." and "/" removed.
Tickets = [
    "X" if t.isdigit() else t.split(" ")[0].replace(".","").replace("/","").strip()
    for t in list(dataset.Ticket)
]
dataset['Ticket'] = pd.Series(Tickets)

In [None]:
dataset['Ticket'].head()

In [None]:
dataset['Ticket'].value_counts()

In [None]:
dataset = pd.get_dummies(dataset , columns=['Pclass' , 'Ticket'])

In [None]:
dataset.drop(['PassengerId'] , axis=1 , inplace=True)

In [None]:
dataset.head()

In [None]:
dataset.shape

<h1>Modeling</h1>

In [None]:
# Split the combined frame back into its training and test rows.
# Both parts are independent frames: the explicit copy, and drop() without
# inplace, avoid mutating a slice of `dataset` (SettingWithCopyWarning).
train = dataset[:train_len].copy()
test = dataset[train_len:].drop('Survived' , axis=1)

In [None]:
# assign() builds a new frame instead of writing a column into what may be a
# slice of `dataset`, so this works without a SettingWithCopyWarning.
train = train.assign(Survived=train['Survived'].astype(int))
y_train = train['Survived']
X_train= train.drop('Survived', axis=1)

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
# random_state=1
# classifiers = []
# classifiers.append(SVC(random_state=random_state))
# classifiers.append(DecisionTreeClassifier(random_state=random_state))
# classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),random_state=random_state,learning_rate=0.1))
# classifiers.append(RandomForestClassifier(random_state=random_state))
# classifiers.append(GradientBoostingClassifier(random_state=random_state))
# classifiers.append(KNeighborsClassifier())
# classifiers.append(LogisticRegression(random_state = random_state))
# classifiers.append(XGBClassifier(n_estimator =100, random_state=random_state))

# cv_scores = []
# for classifier in classifiers:
#     cv_scores.append(cross_val_score(classifier , X_train , y=y_train , scoring='accuracy' , cv=10 , n_jobs=4).mean())
# print(cv_scores)

# cv_df = pd.DataFrame({"CrossValMeans":cv_scores,"Algorithm":["SVC","DecisionTree","AdaBoost","RandomForest","GradientBoosting"
#                      ,"KNeighboors","LogisticRegression","XGBClassifier"]})
# plot = sns.barplot(x='CrossValMeans' , y='Algorithm' , data=cv_df , orient='h')

<h2>Feature Selection</h2>

In [None]:
#Using XGBoost
from matplotlib import pyplot
from xgboost import plot_importance
# The keyword is `n_estimators`; the misspelled `n_estimater` is not a model
# parameter, so the requested tree count was never applied.
model = XGBClassifier(n_estimators=100  , random_state=2)
model.fit(X_train,y_train)


In [None]:
importance_features = model.feature_importances_
importance_df = pd.DataFrame({"features":X_train.columns , "importance":importance_features})
importance_df = importance_df.sort_values(by='importance' , ascending=False).reset_index(drop=True)

In [None]:
plot_importance(model)

In [None]:
pyplot.figure(figsize=(15,10))
plot = sns.barplot(x='importance' , y='features' , orient='h' , data=importance_df[:26])

In [None]:
from matplotlib import pyplot
from xgboost import plot_importance
XGB_model = XGBClassifier(n_estimators=100  , random_state=2)
#Choise Best Features
def getBestFeature(model , X_train , y_train , plot=False , feature_num=0 , F_range=[0,0]):
    """Pick the top-k most important features that maximise CV accuracy.

    The model is fitted on all columns, the columns are ranked by
    ``model.feature_importances_`` and, for every k from 1 to the number of
    columns, the model is scored with 10-fold cross-validation on the k
    highest-ranked columns. The accuracies and the k values are printed and
    accuracy is plotted against k.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``feature_importances_``
        Fitted in place on the full feature set.
    X_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training labels.
    plot : bool, default False
        When true, also draw a bar chart of the ``feature_num`` highest
        importances.
    feature_num : int, default 0
        Number of features shown in that bar chart.
    F_range : list, default [0, 0]
        Unused; kept so existing calls keep working.

    Returns
    -------
    tuple
        ``(features, accuracy)``: the Series of the best k feature names and
        the corresponding mean cross-validated accuracy. Ties go to the
        smallest k.
    """
    model.fit(X_train,y_train)
    important_features = model.feature_importances_
    importance_df = pd.DataFrame({"features":X_train.columns , "importance":important_features})
    importance_df = importance_df.sort_values(by='importance' , ascending=False).reset_index(drop=True)
    #Plot the Features
    if plot:
        pyplot.figure(figsize=(15,10))
        # result deliberately not bound to `plot`, which is the flag argument
        sns.barplot(x='importance' , y='features' , orient='h' , data=importance_df[:feature_num])
        pyplot.show()
    #Get the Best Features
    feature_len = list(range(1 , len(important_features) + 1))
    accuracy = []
    for F_len in feature_len:
        features = importance_df['features'][:F_len]
        accuracy.append(cross_val_score(model, X_train[features], y = y_train, scoring = "accuracy", cv = 10, n_jobs=4).mean())
    print(accuracy,feature_len)
    pyplot.close()
    pyplot.plot(feature_len , accuracy)
    best_accuracy = max(accuracy)
    best_len = feature_len[accuracy.index(best_accuracy)]
    return (importance_df['features'][:best_len] , best_accuracy)

In [None]:
Best_features_XGB , Best_acc_XGB = getBestFeature(XGB_model ,X_train , y_train , plot=True , feature_num=30)

In [None]:
print(Best_features_XGB)
print("Best_acc :", Best_acc_XGB)

In [None]:
# Seeded like XGB_model so the importance ranking, and therefore the selected
# feature set, is the same on every run.
Ada_model = AdaBoostClassifier(random_state=2)
Best_features_Ada , Best_acc_Ada = getBestFeature(Ada_model , X_train , y_train , plot=True , feature_num=30)

In [None]:
print(Best_features_Ada )
print("Best_acc :",Best_acc_Ada)

In [None]:
# Seeded like XGB_model so the importance ranking, and therefore the selected
# feature set, is the same on every run.
RF_model = RandomForestClassifier(random_state=2)
Best_features_RF,Best_acc_RF = getBestFeature(RF_model , X_train , y_train , plot=True , feature_num=30)

In [None]:
print(Best_features_RF,Best_acc_RF)

In [None]:
# Seeded like XGB_model so the importance ranking, and therefore the selected
# feature set, is the same on every run.
Extree_model = ExtraTreesClassifier(random_state=2)
Best_features_Extree,Best_acc_Extree = getBestFeature(Extree_model , X_train , y_train , plot=True , feature_num=30)

In [None]:
print(Best_features_Extree,Best_acc_Extree)

In [None]:
# Knn_model = KNeighborsClassifier()
# Best_features_Knn,Best_acc_Knn = getBestFeature(Knn_model , X_train , y_train , plot=True , feature_num=30)

In [None]:
# print(Best_features_knn,Best_acc_knn)

In [None]:
# svm_model = SVC()
# Best_features_svc,Best_acc_svc = getBestFeature(svm_model , X_train , y_train , plot=True , feature_num=30)

In [None]:
#Greedy Approach To Select Features
# def getBestFeature_BeingGreedy(model , X_train , y_train , plot=False , feature_num=0 , F_range=[0,0]):
#     model.fit(X_train,y_train)
#     important_features = model.feature_importances_
#     importance_df = pd.DataFrame({"features":X_train.columns , "importance":important_features})
#     importance_df = importance_df.sort_values(by='importance' , ascending=False).reset_index(drop=True)
#     #Get the Best Features
#     accuracy = [0]
#     feature_list = []
#     for feature in importance_df['features']:
#         acc = accuracy[-1]
#         accuracy.append(cross_val_score(model, X_train[feature_list+[feature]], y = y_train, scoring = "accuracy", cv = 10, n_jobs=4).mean())
#         if accuracy[-1] > acc:
#             feature_list.append(feature)
#     accuracy.remove(0)
#     print(accuracy,feature_list)
#     df = pd.DataFrame({"feature_len":importance_df['features'], "accuracy":accuracy})
#     pyplot.close()
#     plot = pyplot.plot(df['feature_len'] , accuracy)
#     print(accuracy.index(max(accuracy)))
#     return importance_df['features'][:feature_list[accuracy.index(max(accuracy))]]

In [None]:
# Best_Features = getBestFeature_BeingGreedy(model ,X_train , y_train , plot=True , feature_num=30)

In [None]:
# Grid search for XGBoost on the feature subset chosen by getBestFeature.
# The grid below holds a single candidate per parameter, so the search only
# cross-validates that one configuration and refits it.
# Author's notes, presumably the ranges and results of earlier, wider
# searches - TODO confirm:
#range(1,250,10), gamma=0.05, learning_rate=0.08,n_estimators=231
#np.round(np.linspace(0 , 0.3 , 19) , 2), 'gamma':[0.01,0.05,0.1,0.5]
XGB = XGBClassifier()
gb_param_grid = {
              'n_estimators' :[231],
              'learning_rate':[0.08],
                'gamma':[0.05]
              }

gsXGB = GridSearchCV(XGB,param_grid = gb_param_grid, cv=10, scoring="accuracy", n_jobs= 4, verbose = 1)

gsXGB.fit(X_train[Best_features_XGB],y_train)

# Estimator refitted with the best parameters found; used later for prediction.
XGB_best = gsXGB.best_estimator_
# NOTE(review): the two values below differ from the grid above; they look
# like the result of an older run - confirm or remove.
#n_estimators=61
#learning_rate=0.22
# Best score
gsXGB.best_score_

In [None]:
def tuneParams(classifier,params,train_x,train_y):
    """Grid-search ``classifier`` over ``params`` with 10-fold CV accuracy.

    Returns a 3-tuple: the best parameter dict, the best mean accuracy as a
    percentage rounded to two decimals, and the best estimator found by the
    search.
    """
    search = GridSearchCV(classifier,params,cv=10,scoring='accuracy',n_jobs=-1)
    search.fit(train_x,train_y)
    score_percent = np.round(search.best_score_*100,2)
    return search.best_params_ , score_percent , search.best_estimator_

In [None]:
# Wide grids that were searched originally (kept for reference):
# ABParams = {'n_estimators':range(1,500,25),'learning_rate':[0.1,0.2,0.3,0.22,0.01,0.02,0.03,0.04,0.05]}
# RFParams = {'n_estimators':[5,10,15,20,25,30,35,40,45,50],'max_depth':[3,5,8,10],'min_samples_split':[2,5,10],'min_samples_leaf':[2,4,10],'random_state':[5]}
# ETParams = {'n_estimators':range(1,500,25),'max_depth':[3,8,12],'min_samples_leaf':[2,4,9],'verbose':[0]}
#KNNParams = {'n_neighbors':[3,8,14],'leaf_size':[2,5,9],'weights':['uniform']}
# SVCParams = {'C':[0.01,0.1,0.5],'gamma':[0.01,0.2]}

# Single-candidate grids pinned to the best values recorded at the bottom of this cell.
ABParams= {'learning_rate': [0.1], 'n_estimators': [250]}
# n_estimators is 10, as in the recorded best result below; the previous value
# of 1 was not part of the searched grid and trains a one-tree forest.
RFParams= {'max_depth': [8], 'min_samples_leaf': [2], 'min_samples_split': [5], 'n_estimators': [10], 'random_state': [5]}
ETParams= {'max_depth': [12], 'min_samples_leaf': [2], 'n_estimators': [300], 'verbose': [0]}
KNNParams= {'leaf_size': [2], 'n_neighbors': [3], 'weights': ['uniform']}
SVCParams= {'C': [0.5], 'gamma':[0.2]}
# Recorded best parameters and CV accuracy (%):
# AdaBoost: {'learning_rate': 0.1, 'n_estimators': 250} 82.97
# RandomForest: {'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 10, 'random_state': 5} 84.11
# ExtraTrees: {'max_depth': 12, 'min_samples_leaf': 2, 'n_estimators': 300, 'verbose': 0} 83.88
# SVC: {'C': 0.5, 'gamma': 0.2} 80.02

In [None]:
# Tune each ensemble on the feature subset that getBestFeature selected for it.
# tuneParams returns (best params, best CV accuracy in percent, best estimator);
# the estimators are kept for the prediction cells further down.
processed_train_data = X_train
TargetLabel = y_train
AB_best_Params,AB_best_score,ada_Best = tuneParams(AdaBoostClassifier(),ABParams,processed_train_data[Best_features_Ada],TargetLabel)
print("AdaBoost:",AB_best_Params,AB_best_score)
RF_best_Params,RF_best_score,RF_best = tuneParams(RandomForestClassifier(),RFParams,processed_train_data[Best_features_RF],TargetLabel)
print("RandomForest:",RF_best_Params,RF_best_score)
ET_best_Params,ET_best_score,ExtC_best = tuneParams(ExtraTreesClassifier(),ETParams,processed_train_data[Best_features_Extree],TargetLabel)
print("ExtraTrees:",ET_best_Params,ET_best_score)
# KNN and SVC are disabled; they are not part of the final vote.
# KNN_best_Params,KNN_best_score,KNN_best = tuneParams(KNeighborsClassifier(),KNNParams,processed_train_data,TargetLabel)
# print("KNeighbors:",KNN_best_Params,KNN_best_score)
# SVC_best_Params,SVC_best_score,SVMC_best = tuneParams(SVC(),SVCParams,processed_train_data,TargetLabel)
# print("SVC:",SVC_best_Params,SVC_best_score)



In [None]:
# votingC = VotingClassifier(estimators=[ ('extc', ExtC_best),
# ('svc', SVMC_best), ('adac',ada_Best),('rf',RF_best),('knn',KNN_best)], voting='soft', n_jobs=4)

# votingC = votingC.fit(X_train, y_train)
# votingC 'SVM':SVMC_best.predict(test)
# One column of class predictions per tuned model, for the test rows.
# Each model is given only the feature subset it was tuned on.
prediction_df = pd.DataFrame({'Xgboot':XGB_best.predict(test[Best_features_XGB]) ,'Ada':ada_Best.predict(test[Best_features_Ada]) , 
                              'RandomForest':RF_best.predict(test[Best_features_RF]) , 'ExtraTrees':ExtC_best.predict(test[Best_features_Extree])
                              })
# Same layout for the training rows, used to report training accuracy.
training_prediction = pd.DataFrame({'Xgboot':XGB_best.predict(X_train[Best_features_XGB]) ,'Ada':ada_Best.predict(X_train[Best_features_Ada]) , 
                              'RandomForest':RF_best.predict(X_train[Best_features_RF]) , 'ExtraTrees':ExtC_best.predict(X_train[Best_features_Extree])
                              })

In [None]:
from sklearn.metrics import accuracy_score
# Majority vote over the base-model columns only. Excluding 'Final_test' and
# averaging this frame's own columns keeps the cell idempotent: re-running it
# no longer folds the previous vote into the sum, and the divisor no longer
# depends on how many columns prediction_df has at the moment.
vote_columns = [name for name in training_prediction.columns if name != 'Final_test']
training_prediction['Final_test'] = training_prediction[vote_columns].mean(axis=1)
# a tie (half of the models voting 1) counts as 1
training_prediction['Final_test'] = training_prediction['Final_test'].map(lambda x: 1 if x>=0.5 else 0)
for columns in training_prediction.columns:
    # the column name is printed so each score can be attributed to its model
    print("training_score" , columns , accuracy_score(y_train , training_prediction[columns]))
training_prediction

In [None]:
# Majority vote over the base-model columns only, so re-running this cell does
# not count a previously computed 'Final_test' as an extra voter.
vote_columns = [name for name in prediction_df.columns if name != 'Final_test']
prediction_df['Final_test'] = prediction_df[vote_columns].mean(axis=1)
# a tie (half of the models voting 1) counts as 1
prediction_df['Final_test'] = prediction_df['Final_test'].map(lambda x: 1 if x>=0.5 else 0)
prediction_df

In [None]:
print(XGB_best)

In [None]:
# Pair the PassengerId column saved from the raw test file with the ensemble
# vote and write the result to submission.csv without the index column.
my_submission = pd.DataFrame({'PassengerId': IDtest, 'Survived':prediction_df['Final_test'] })
my_submission.to_csv('submission.csv', index=False)