In [1]:
# Import standard libraries
import pandas as pd
import math, time, random, datetime
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
plt.style.use('seaborn-whitegrid')

# for Preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize

# Machine Learning
import catboost
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC, SVC, NuSVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier, PassiveAggressiveClassifier, RidgeClassifierCV, Perceptron
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
import lightgbm as lgb

from catboost import CatBoostClassifier, Pool, cv


# ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
data_train = pd.read_csv(r'C:\Users\Shalini\Documents\Kaggle\Titanic\train.csv')
data_test = pd.read_csv(r'C:\Users\Shalini\Documents\Kaggle\Titanic\test.csv')

In [3]:
data_all = [data_train, data_test]
data_test1 = data_test.copy()

In [4]:
data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [6]:
def check_missing_data(df):
    flag=df.isna().sum().any()
    if flag==True:
        total = df.isnull().sum()
        percent = (df.isnull().sum()*100)/(df.isnull().count())
        output = pd.concat([total, percent], axis=1, keys=['Total', 'Percent(%)'])
        data_type = []
        for col in df.columns:
            dtype = str(df[col].dtype)
            data_type.append(dtype)
        output['Types'] = data_type
        return(output.sort_values(['Total'], ascending=0))
    else:
        return(False)

In [7]:
check_missing_data(data_train)

Unnamed: 0,Total,Percent(%),Types
Cabin,687,77.104377,object
Age,177,19.86532,float64
Embarked,2,0.224467,object
PassengerId,0,0.0,int64
Survived,0,0.0,int64
Pclass,0,0.0,int64
Name,0,0.0,object
Sex,0,0.0,object
SibSp,0,0.0,int64
Parch,0,0.0,int64


In [8]:
check_missing_data(data_test)

Unnamed: 0,Total,Percent(%),Types
Cabin,327,78.229665,object
Age,86,20.574163,float64
Fare,1,0.239234,float64
PassengerId,0,0.0,int64
Pclass,0,0.0,int64
Name,0,0.0,object
Sex,0,0.0,object
SibSp,0,0.0,int64
Parch,0,0.0,int64
Ticket,0,0.0,object


In [9]:
drop_columns = ['PassengerId','Cabin', 'Ticket']
for dataset in data_all:
    dataset['Age'].fillna(dataset['Age'].median(), inplace = True)

    dataset['Embarked'].fillna(dataset['Embarked'].mode()[0], inplace = True)

    dataset['Fare'].fillna(dataset['Fare'].median(), inplace = True)
    dataset.drop(drop_columns, axis=1, inplace = True)

In [10]:
check_missing_data(data_test)

False

In [11]:
data_train.count()

Survived    891
Pclass      891
Name        891
Sex         891
Age         891
SibSp       891
Parch       891
Fare        891
Embarked    891
dtype: int64

In [12]:
for dataset in data_all:    
    dataset['FamilySize'] = dataset ['SibSp'] + dataset['Parch'] + 1

    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)

    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)

In [13]:
data_train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Title,FareBin,AgeBin
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,2,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,2,Mrs,"(31.0, 512.329]","(32.0, 48.0]"
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,1,Miss,"(7.91, 14.454]","(16.0, 32.0]"
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,2,Mrs,"(31.0, 512.329]","(32.0, 48.0]"
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1,Mr,"(7.91, 14.454]","(32.0, 48.0]"


In [14]:
#cleanup rare title names
#print(data_train['Title'].value_counts())
for dataset in data_all:
    stat_min = 10
    title_name = (dataset['Title'].value_counts() < stat_min) #this will create a true false series with title name as index
    #apply and lambda functions are quick and dirty code to find and replace with fewer lines of code: https://community.modeanalytics.com/python/tutorial/pandas-groupby-and-python-lambda-functions/
    dataset['Title'] = dataset['Title'].apply(lambda x: 'Misc' if title_name.loc[x] == True else x)

In [15]:
data_test.nunique()

Pclass          3
Name          418
Sex             2
Age            79
SibSp           7
Parch           8
Fare          169
Embarked        3
FamilySize      9
Title           5
FareBin         4
AgeBin          5
dtype: int64

In [16]:
cat_features = ['Pclass', 'Sex', 'Embarked', 'FamilySize', 'Title','FareBin', 'AgeBin']

In [17]:
encoder = LabelEncoder()

In [18]:
encoded = data_train[cat_features].apply(encoder.fit_transform)
encoded.head()

Unnamed: 0,Pclass,Sex,Embarked,FamilySize,Title,FareBin,AgeBin
0,2,1,2,1,3,0,1
1,0,0,0,1,4,3,2
2,2,0,2,0,2,1,1
3,0,0,2,1,4,3,2
4,2,1,2,0,3,1,2


In [19]:
X_train = encoded
y_train = data_train['Survived']

In [20]:
#Machine Learning Algorithm (MLA) Selection and Initialization
MLA = [
    #Ensemble Methods
    AdaBoostClassifier(),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
    RandomForestClassifier(),

    #Gaussian Processes
    GaussianProcessClassifier(),
    
    #GLM
    LogisticRegression(),
    PassiveAggressiveClassifier(),
    RidgeClassifierCV(),
    SGDClassifier(),
    Perceptron(),
    
    #Navies Bayes
    BernoulliNB(),
    GaussianNB(),
    
    #Nearest Neighbor
    KNeighborsClassifier(),
    
    #SVM
    SVC(probability=True),
    NuSVC(probability=True),
    LinearSVC(),
    
    #Trees    
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    
    #Discriminant Analysis
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),

    
    #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    XGBClassifier()    
    ]



#split dataset in cross-validation. This is an alternative to train_test_split
cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .2, train_size = .8, random_state = 1 ) # run model 10x with 60/30 split intentionally leaving out 10%

#create table to compare MLA metrics
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']
MLA_compare = pd.DataFrame(columns = MLA_columns)

#MLA_predict = y_train

#index through MLA and save performance to table
row_index = 0
for alg in MLA:

    #set name and parameters
    MLA_name = alg
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    
    #score model with cross validation: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
    cv_results = model_selection.cross_validate(alg, X_train, y_train, cv  = cv_split)

    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()   
    #if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean, should statistically capture 99.7% of the subsets
    MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std()*3   #let's know the worst that can happen!
    

    #save MLA predictions - see section 6 for usage
    alg.fit(X_train, y_train)
    alg.predict(X_train)
    
    row_index+=1

    
#print and sort table: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False, inplace = True)
MLA_compare
#MLA_predict

Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy Mean,MLA Test Accuracy Mean,MLA Test Accuracy 3*STD,MLA Time
14,"SVC(C=1.0, cache_size=200, class_weight=None, ...","{'C': 1.0, 'cache_size': 200, 'class_weight': ...",0.839185,0.816201,0.0913216,0.0415826
15,"NuSVC(cache_size=200, class_weight=None, coef0...","{'cache_size': 200, 'class_weight': None, 'coe...",0.835393,0.812291,0.0972067,0.0482699
0,"(DecisionTreeClassifier(class_weight=None, cri...","{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.823736,0.804469,0.0874083,0.0400878
20,"QuadraticDiscriminantAnalysis(priors=None, reg...","{'priors': None, 'reg_param': 0.0, 'store_cova...",0.827949,0.804469,0.0914906,0.00229373
4,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': True, 'class_weight': None, 'cri...",0.891573,0.798324,0.0872314,0.00817735
21,"XGBClassifier(base_score=0.5, booster='gbtree'...","{'base_score': 0.5, 'booster': 'gbtree', 'cols...",0.853652,0.797207,0.113683,0.0366031
3,([DecisionTreeRegressor(criterion='friedman_ms...,"{'criterion': 'friedman_mse', 'init': None, 'l...",0.860112,0.797207,0.109658,0.0593181
1,"(DecisionTreeClassifier(class_weight=None, cri...","{'base_estimator': None, 'bootstrap': True, 'b...",0.892275,0.794972,0.100852,0.0088738
2,"(ExtraTreeClassifier(class_weight=None, criter...","{'bootstrap': False, 'class_weight': None, 'cr...",0.895225,0.792179,0.112877,0.00747907
17,"DecisionTreeClassifier(class_weight=None, crit...","{'class_weight': None, 'criterion': 'gini', 'm...",0.895225,0.789944,0.117127,0.00179503


In [21]:
classifiers = []
model1 = XGBClassifier()
classifiers.append(model1)
model2 = svm.SVC()
classifiers.append(model2)
model3 = tree.DecisionTreeClassifier()
classifiers.append(model3)
model4 = RandomForestClassifier()
classifiers.append(model4)
model5 = LinearSVC()
classifiers.append(model5)
model6 = GradientBoostingClassifier()
classifiers.append(model6)
model7 = KNeighborsClassifier()
classifiers.append(model7)
model8 = GaussianNB()
classifiers.append(model8)
model9 = SGDClassifier()
classifiers.append(model9)
#model10 = SGDClassifier()
#classifiers.append(model10)


In [22]:
def fit_ml_algo(algo, X_train, y_train, cv):
    model = algo.fit(X_train, y_train)
    acc = round(model.score(X_train, y_train)*100,2)
    train_pred = model_selection.cross_val_predict(algo, X_train, y_train, cv=cv,n_jobs=-1)
    acc_cv = round(metrics.accuracy_score(y_train, train_pred) *100, 2)
    return train_pred, acc, acc_cv

In [23]:
for clf in classifiers:
    start_time = time.time()
    train_pred_log, acc_log, acc_cv_log = fit_ml_algo(clf, X_train, y_train, 10)
    log_time = (time.time()-start_time)
    print("Accuracy for %s:  %s" % (clf, acc_log))
    print("Accuracy CV 10-fold for %s: %s" % (clf, acc_cv_log))
    print("Running Time for %s: %s" % (clf, datetime.timedelta(seconds=log_time)))

Accuracy for XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1):  84.4
Accuracy CV 10-fold for XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1): 82.49
Running Time for XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       

In [24]:
data_test.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Title,FareBin,AgeBin
0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q,1,Mr,"(-0.001, 7.896]","(30.4, 45.6]"
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S,2,Mrs,"(-0.001, 7.896]","(45.6, 60.8]"
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q,1,Mr,"(7.896, 14.454]","(60.8, 76.0]"
3,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S,1,Mr,"(7.896, 14.454]","(15.2, 30.4]"
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S,3,Mrs,"(7.896, 14.454]","(15.2, 30.4]"


In [25]:
data_test.isnull().sum()

Pclass        0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked      0
FamilySize    0
Title         0
FareBin       0
AgeBin        0
dtype: int64

In [26]:
t_encoded = data_test[cat_features].apply(encoder.fit_transform)
t_encoded.head()

Unnamed: 0,Pclass,Sex,Embarked,FamilySize,Title,FareBin,AgeBin
0,2,1,1,0,3,0,2
1,2,0,2,1,4,0,3
2,1,1,1,0,3,1,4
3,2,1,2,0,3,1,1
4,2,0,2,2,4,1,1


In [27]:
X_test = t_encoded
X_test.head()

Unnamed: 0,Pclass,Sex,Embarked,FamilySize,Title,FareBin,AgeBin
0,2,1,1,0,3,0,2
1,2,0,2,1,4,0,3
2,1,1,1,0,3,1,4
3,2,1,2,0,3,1,1
4,2,0,2,2,4,1,1


In [28]:
predict = model2.predict(X_test)

In [29]:
predict[:20]

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1],
      dtype=int64)

In [30]:
data_test1.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [31]:
submission = pd.DataFrame()
submission['PassengerId'] = data_test1['PassengerId']
submission['Survived'] = predict
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [32]:
submission.to_csv(r'C:\Users\Shalini\Documents\Kaggle\Titanic\SubmissionNew.csv', index=False)
print('Submission CSV is ready!')