In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv("/Users/patrickfahy99/Documents/Kaggle_datasets/titanic/train.csv")
X_test = pd.read_csv("/Users/patrickfahy99/Documents/Kaggle_datasets/titanic/test.csv")

train.index = train["PassengerId"]
del train["PassengerId"]
del train["Cabin"]
del train['Ticket']

X_test.index = X_test["PassengerId"]
del X_test["PassengerId"]
del X_test["Cabin"]
del X_test["Ticket"]

y_train = train["Survived"]
X_train = train.drop(labels = ["Survived"],axis = 1)

The Cabin column simply has too many missing values to be of great use.

In [2]:
# Having a look at the datasets to look for missing values

X_train.info()
print("\n")
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Name      891 non-null    object 
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 62.6+ KB


<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Fare      417 non

We see that the train columns with missing data are Age and Embarked.
The test columns with missing data are Age and fare.

I will input missing Age and Fare values with the median, and the Embarked with the mode.

In [3]:
data = [X_train, X_test]

for i in data:    
    #complete missing age with median
    i['Age'].fillna(i['Age'].median(), inplace = True)

    #complete embarked with mode
    i['Embarked'].fillna(i['Embarked'].mode()[0], inplace = True)

    #complete missing fare with median
    i['Fare'].fillna(i['Fare'].median(), inplace = True)

In [4]:
for i in data:    
    #Discrete variables
    i['FamilySize'] = i['SibSp'] + i['Parch'] + 1

    i['IsAlone'] = 1 #initialize to yes/1 is alone
    i['IsAlone'].loc[i['FamilySize'] > 1] = 0 # now update to no/0 if family size is greater than 1

    #quick and dirty code split title from name: http://www.pythonforbeginners.com/dictionary/python-split
    i['Title'] = i['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]


    #Continuous variable bins; qcut vs cut: https://stackoverflow.com/questions/30211923/what-is-the-difference-between-pandas-qcut-and-pandas-cut
    #Fare Bins/Buckets using qcut or frequency bins: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.qcut.html
    i['FareBin'] = pd.qcut(i['Fare'], 4)
    
    del i['Name']
    del i['SibSp']
    del i['Parch']
    del i['Fare']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [5]:
X_train['Title'].value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
Mme               1
Jonkheer          1
Don               1
Capt              1
Lady              1
Sir               1
the Countess      1
Ms                1
Name: Title, dtype: int64

We see that there are multiple entries which only come up a few times, we will group these together as 'Other'.

In [6]:
other_train = (X_train['Title'].value_counts() < 8)
X_train['Title'] = X_train['Title'].apply(lambda x: 'Misc' if other_train.loc[x] == True else x)

other_test = (X_test['Title'].value_counts() < 8)
X_test['Title'] = X_test['Title'].apply(lambda x: 'Misc' if other_test.loc[x] == True else x)

X_test['Title'].value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Misc        7
Name: Title, dtype: int64

In [7]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

label = LabelEncoder()
for i in data:    
    i['Sex_Code'] = label.fit_transform(i['Sex'])
    i['Embarked_Code'] = label.fit_transform(i['Embarked'])
    i['Title_Code'] = label.fit_transform(i['Title'])
    i['FareBin_Code'] = label.fit_transform(i['FareBin'])
    del i['Sex']
    del i['Embarked']
    del i['Title']
    del i['FareBin']

X_train

Unnamed: 0_level_0,Pclass,Age,FamilySize,IsAlone,Sex_Code,Embarked_Code,Title_Code,FareBin_Code
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,22.0,2,0,1,2,3,0
2,1,38.0,2,0,0,0,4,3
3,3,26.0,1,1,0,2,2,1
4,1,35.0,2,0,0,2,4,3
5,3,35.0,1,1,1,2,3,1
...,...,...,...,...,...,...,...,...
887,2,27.0,1,1,1,2,1,1
888,1,19.0,1,1,0,2,2,2
889,3,28.0,4,0,0,2,2,2
890,1,26.0,1,1,1,0,3,2


In [8]:
from fancyimpute import KNN
X_train_imp = KNN(k=5).fit_transform(X_train)
X_test_imp = KNN(k=5).fit_transform(X_test)

Using TensorFlow backend.


Imputing row 1/891 with 0 missing, elapsed time: 0.118
Imputing row 101/891 with 0 missing, elapsed time: 0.118
Imputing row 201/891 with 0 missing, elapsed time: 0.119
Imputing row 301/891 with 0 missing, elapsed time: 0.119
Imputing row 401/891 with 0 missing, elapsed time: 0.119
Imputing row 501/891 with 0 missing, elapsed time: 0.121
Imputing row 601/891 with 0 missing, elapsed time: 0.121
Imputing row 701/891 with 0 missing, elapsed time: 0.121
Imputing row 801/891 with 0 missing, elapsed time: 0.122
Imputing row 1/418 with 0 missing, elapsed time: 0.035
Imputing row 101/418 with 0 missing, elapsed time: 0.035
Imputing row 201/418 with 0 missing, elapsed time: 0.036
Imputing row 301/418 with 0 missing, elapsed time: 0.036
Imputing row 401/418 with 0 missing, elapsed time: 0.036




In [9]:
X_train['Age'] = X_train_imp[:,1]
X_test['Age'] = X_test_imp[:,1]

for i in data:
    i['AgeBin'] = pd.cut(i['Age'].astype(int), 5)
    del i['Age']
    i['AgeBin_Code'] = label.fit_transform(i['AgeBin'])
    del i['AgeBin']

Split training data into training and validation sets

In [10]:
from sklearn import model_selection

X_train1, X_val1, y_train1, y_val1 = model_selection.train_test_split(X_train, y_train, random_state = 0)

In [11]:
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process


#Machine Learning Algorithm (MLA) Selection and Initialization
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),
    
    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
    
    #Navies Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    #SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),
    
    #Trees    
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
    
    #Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis()
    ]

from sklearn.preprocessing import MinMaxScaler

# Create the scaler object with a range of 0-1
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit on the training data
scaler.fit(X_train1)

# Transform both the training and testing data
X_train1 = scaler.transform(X_train1)
X_val1 = scaler.transform(X_val1)

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [12]:
# Function to calculate mean absolute error
def mae(y_val1, y_pred):
    return np.mean(abs(y_val1 - y_pred))

# Takes in a model, trains the model, and evaluates the model on the test set
def fit_and_evaluate(model):
    
    # Train the model
    model.fit(X_train1, y_train1)
    
    # Make predictions and evalute
    model_pred = model.predict(X_val1)
    model_mae = mae(y_val1, model_pred)
    
    # Return the performance metric
    return model_mae

In [13]:
ada = ensemble.AdaBoostClassifier()
ada_mae = fit_and_evaluate(ada)
ada_mae

0.16591928251121077

In [14]:
for model in MLA:
    print('Performance on the test set: MAE = %0.4f' % fit_and_evaluate(model))

Performance on the test set: MAE = 0.1659
Performance on the test set: MAE = 0.1614
Performance on the test set: MAE = 0.1659
Performance on the test set: MAE = 0.1704
Performance on the test set: MAE = 0.1794




Performance on the test set: MAE = 0.1883




Performance on the test set: MAE = 0.2018
Performance on the test set: MAE = 0.3498
Performance on the test set: MAE = 0.2152
Performance on the test set: MAE = 0.2063
Performance on the test set: MAE = 0.1973
Performance on the test set: MAE = 0.2287
Performance on the test set: MAE = 0.2287
Performance on the test set: MAE = 0.1883
Performance on the test set: MAE = 0.2197
Performance on the test set: MAE = 0.2197
Performance on the test set: MAE = 0.2108
Performance on the test set: MAE = 0.1749
Performance on the test set: MAE = 0.1839
Performance on the test set: MAE = 0.2152
Performance on the test set: MAE = 0.2063




Let's use the Adaboost classifier

In [15]:
model = ensemble.AdaBoostClassifier()

# Number of trees used in the boosting process
n_estimators = [10, 25, 50, 75, 100, 150, 250]

# Learning rate
learning_rate = [0.01, 0.05, 0.1, 0.5, 1]

# Define the grid of hyperparameters to search
hyperparameter_grid = {'n_estimators': n_estimators,
                       'learning_rate': learning_rate}

In [16]:
from sklearn.model_selection import RandomizedSearchCV

# Set up the random search with 4-fold cross validation
random_cv = RandomizedSearchCV(estimator=model,
                               param_distributions=hyperparameter_grid,
                               cv=4, n_iter=25, 
                               scoring = 'neg_mean_absolute_error',
                               n_jobs = -1, verbose = 1, 
                               return_train_score = True,
                               random_state=42)

In [17]:
# Fit on the training data
random_cv.fit(X_train1, y_train1)

Fitting 4 folds for each of 25 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.9s finished


RandomizedSearchCV(cv=4, error_score='raise-deprecating',
                   estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                                base_estimator=None,
                                                learning_rate=1.0,
                                                n_estimators=50,
                                                random_state=None),
                   iid='warn', n_iter=25, n_jobs=-1,
                   param_distributions={'learning_rate': [0.01, 0.05, 0.1, 0.5,
                                                          1],
                                        'n_estimators': [10, 25, 50, 75, 100,
                                                         150, 250]},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=True, scoring='neg_mean_absolute_error',
                   verbose=1)

In [18]:
random_results = pd.DataFrame(random_cv.cv_results_).sort_values('mean_test_score', ascending = False)

random_results.head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,mean_train_score,std_train_score
14,0.147767,0.004879,0.016078,0.001641,75,0.1,"{'n_estimators': 75, 'learning_rate': 0.1}",-0.166667,-0.184524,-0.174699,-0.138554,-0.166168,0.017099,1,-0.164,-0.158,-0.157371,-0.173307,-0.163169,0.006399
7,0.312475,0.010468,0.0562,0.001291,150,0.05,"{'n_estimators': 150, 'learning_rate': 0.05}",-0.166667,-0.184524,-0.174699,-0.144578,-0.167665,0.014713,2,-0.164,-0.158,-0.157371,-0.173307,-0.163169,0.006399
1,0.557303,0.007465,0.063839,0.002208,250,0.05,"{'n_estimators': 250, 'learning_rate': 0.05}",-0.172619,-0.184524,-0.180723,-0.144578,-0.170659,0.015605,3,-0.162,-0.158,-0.163347,-0.175299,-0.164661,0.006449
6,0.294656,0.003662,0.033567,0.001601,150,0.1,"{'n_estimators': 150, 'learning_rate': 0.1}",-0.172619,-0.184524,-0.180723,-0.150602,-0.172156,0.013123,4,-0.162,-0.158,-0.163347,-0.173307,-0.164163,0.005633
3,0.025706,0.004791,0.003514,0.000755,10,0.5,"{'n_estimators': 10, 'learning_rate': 0.5}",-0.178571,-0.196429,-0.198795,-0.138554,-0.178144,0.024075,5,-0.168,-0.168,-0.167331,-0.185259,-0.172147,0.007575
9,0.147719,0.015746,0.01375,0.002883,50,0.1,"{'n_estimators': 50, 'learning_rate': 0.1}",-0.190476,-0.190476,-0.186747,-0.150602,-0.179641,0.016767,6,-0.18,-0.166,-0.167331,-0.185259,-0.174647,0.008209
17,0.190281,0.011216,0.024975,0.004491,100,0.05,"{'n_estimators': 100, 'learning_rate': 0.05}",-0.190476,-0.190476,-0.186747,-0.156627,-0.181138,0.014177,7,-0.182,-0.166,-0.167331,-0.185259,-0.175147,0.008573
23,0.098255,0.003906,0.012836,0.001594,50,0.5,"{'n_estimators': 50, 'learning_rate': 0.5}",-0.178571,-0.190476,-0.180723,-0.186747,-0.184132,0.004742,8,-0.168,-0.162,-0.163347,-0.181275,-0.168655,0.007618
2,0.17908,0.013133,0.025321,0.005563,75,0.5,"{'n_estimators': 75, 'learning_rate': 0.5}",-0.178571,-0.196429,-0.192771,-0.186747,-0.188623,0.006774,9,-0.168,-0.166,-0.163347,-0.181275,-0.169655,0.006909
22,0.102453,0.004819,0.014511,0.00282,50,1.0,"{'n_estimators': 50, 'learning_rate': 1}",-0.178571,-0.190476,-0.198795,-0.186747,-0.188623,0.007271,9,-0.168,-0.168,-0.177291,-0.203187,-0.17912,0.014404


In [19]:
random_cv.best_estimator_

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.1,
                   n_estimators=75, random_state=None)

In [29]:
final_model = random_cv.best_estimator_
final_model.fit(X_train, y_train)
pred = final_model.predict(X_test)

testfin = pd.read_csv("/Users/patrickfahy99/Documents/Kaggle_datasets/titanic/test.csv")

pred = pd.DataFrame(pred, index=testfin["PassengerId"])

pred.columns = ['Survived']

#final_pred = np.column_stack((testfin["PassengerId"].values, pred.values))

#final_pred

pred.to_csv("/Users/patrickfahy99/Documents/Kaggle_datasets/titanic/submit.csv", index=True)

