In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix,accuracy_score

In [2]:
# Load in the training Data
train_pd = pd.read_csv('data/train.csv')


In [3]:
train_pd.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Now we summaize some Information about the data set
train_pd.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [5]:
# Now we summaize some Information about the data set
train_pd.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Because we were missing values for the age, we now replace the na values with the mean age

train_pd['Age'].describe()
mean_age = train_pd['Age'].describe()[1]

print 'mean age: ', mean_age

train_pd['Age'] = train_pd['Age'].fillna(mean_age)

# We also discard any values for the Fair that appear as outliers, in this case, the >= $500 ticket price
train_pd = train_pd[ train_pd['Fare'] < 500.0]

# Now we want to convert the gender Category into a numerical label
# This function will be applied to the data frame
def label_cat(x):
    if x=='male':
        return 1
    else:
        return 0


train_pd['Sex'] = train_pd.apply(lambda x: label_cat(x['Sex']), axis=1)

train_pd.count()

mean age:  29.6991176471


PassengerId    888
Survived       888
Pclass         888
Name           888
Sex            888
Age            888
SibSp          888
Parch          888
Ticket         888
Fare           888
Cabin          202
Embarked       886
dtype: int64

In [7]:
# Now we check to see that we dropped the outliers

train_pd.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
count,888.0,888.0,888.0,888.0,888.0,888.0,888.0,888.0
mean,445.618243,0.381757,2.313063,0.647523,29.680083,0.524775,0.381757,30.582164
std,257.405474,0.486091,0.834007,0.478011,13.019819,1.104186,0.806949,41.176366
min,1.0,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,222.75,0.0,2.0,0.0,22.0,0.0,0.0,7.8958
50%,445.5,0.0,3.0,1.0,29.699118,0.0,0.0,14.4542
75%,667.25,1.0,3.0,1.0,35.0,1.0,0.0,30.77185
max,891.0,1.0,3.0,1.0,80.0,8.0,6.0,263.0


In [8]:
# We can now group the data set according the the ticket class
subset_df = train_pd[['Survived','Pclass','Sex','Age','Fare']]

# Now we examine how Survival and Ticket class are related
sns.pairplot(subset_df,hue='Survived',vars=["Fare", "Pclass","Age","Sex"]);
plt.show()

subset_df.hist()
plt.show()

# Now lets look at our data set
subset_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.0,7.25
1,1,1,0,38.0,71.2833
2,1,3,0,26.0,7.925
3,1,1,0,35.0,53.1
4,0,3,1,35.0,8.05


In [98]:
# Now that we have explored the data a bit, it's time to train a model and make some predictions
# Models that we will consider: Naive Bayes, Random Forests, SVM 
# A. We will weigh the model predictions according to their confusion matrix 
# B. We will use K-fold cross validation to check accuracy of our models
# C. We will take our classifiers and feed it into a another set of models (NB, RF, SVM) and see if we do better (Stacking)

# The Features that we will use to train the predictive models will be:
# 1. Pclass
# 2. Sex
# 3. Their Age
# 4. Fare Paid

# Let us take a subset of the data to train upon now:
split_frac = 0.6
total_indx = len(subset_np)
split_indx = int(0.7*total_indx)

print 'total rows: ', total_indx
print 'split index: ', split_indx

shuffle_df = shuffle(subset_np)
train_np = shuffle_df[0:split_indx]
test_np  = shuffle_df[split_indx:total_indx]

print 'size of train : ', len(train_df)
print 'size of split : ', len(test_df)


# First we extract the labels of the data
#train_np = train_df.values
#test_np = test_df.values

# Extract the Survival labels from the subsets
train_labels = np.asarray([train_np[k][0] for k in range(len(train_np))])

# Extract the feature matrix for all of the entries
train_feature_matrix = np.asarray([train_np[k][1:] for k in range(len(train_np))])


# Now we extract the tests labels and feature matrix
test_labels = np.asarray([test_np[k][0] for k in range(len(test_np))])
test_feature_matrix = np.asarray([test_np[k][1:] for k in range(len(test_np))])


# Here we define all of the models that we use
gnb = GaussianNB()
lr = LogisticRegression()
svc = LinearSVC(C=1.0)
rfc = RandomForestClassifier(n_estimators=100)
adab = AdaBoostClassifier()


#GaussianNB Fitting Procedure
gnb.fit(train_feature_matrix,train_labels)
lr.fit(train_feature_matrix,train_labels)
svc.fit(train_feature_matrix,train_labels)
rfc.fit(train_feature_matrix,train_labels)
adab.fit(train_feature_matrix,train_labels)

scores_gnb = cross_val_score(gnb, train_feature_matrix, train_labels, cv=5)
scores_lr = cross_val_score(lr, train_feature_matrix,  train_labels, cv=5)
scores_svc = cross_val_score(svc, train_feature_matrix,  train_labels, cv=5)
scores_rfc = cross_val_score(rfc, train_feature_matrix,  train_labels, cv=5)
scores_adab = cross_val_score(adab, train_feature_matrix, train_labels, cv=5)

print scores_gnb
print scores_lr
print scores_svc
print scores_rfc
print scores_adab


y_pred_gnb = gnb.predict(test_feature_matrix)
y_pred_lr = lr.predict(test_feature_matrix)
y_pred_svc = svc.predict(test_feature_matrix)
y_pred_rfc = rfc.predict(test_feature_matrix)
y_pred_adab = adab.predict(test_feature_matrix)

# Now we make predictions on the test data set using all of the different models
cm_gnb = confusion_matrix(test_labels, y_pred_gnb)
cm_lr = confusion_matrix(test_labels, y_pred_lr)
cm_svc =  confusion_matrix(test_labels, y_pred_svc)
cm_rfc = confusion_matrix(test_labels, y_pred_rfc)
cm_adab = confusion_matrix(test_labels, y_pred_adab)

cm_gnb =  cm_gnb/cm_gnb.astype(np.float).sum(axis=1)
cm_lr = cm_lr/cm_lr.astype(np.float).sum(axis=1)
cm_svc = cm_svc/cm_svc.astype(np.float).sum(axis=1)
cm_rfc = cm_rfc/cm_rfc.astype(np.float).sum(axis=1)
cm_adab = cm_adab/cm_adab.astype(np.float).sum(axis=1)

print ''
print 'Normalized Confusion Matrix'
print ''
print cm_gnb
print cm_lr
print cm_svc
print cm_rfc
print cm_adab



total rows:  888
split index:  621
size of train :  621
size of split :  267
[ 0.752       0.78225806  0.78225806  0.81451613  0.75806452]
[ 0.76        0.7983871   0.80645161  0.82258065  0.77419355]
[ 0.728       0.80645161  0.65322581  0.71774194  0.75806452]
[ 0.832       0.84677419  0.78225806  0.7983871   0.79032258]
[ 0.744       0.7983871   0.78225806  0.81451613  0.78225806]

Normalized Confusion Matrix

[[ 0.78616352  0.31481481]
 [ 0.18238994  0.73148148]]
[[ 0.82389937  0.25925926]
 [ 0.19496855  0.71296296]]
[[ 0.1509434   1.25      ]
 [ 0.05031447  0.92592593]]
[[ 0.91823899  0.12037037]
 [ 0.18867925  0.72222222]]
[[ 0.82389937  0.25925926]
 [ 0.16981132  0.75      ]]


In [99]:
# Now we can experiment in the way that we combine all the prediction from the models

def bayes_average(feature_matrix,model_array,cm_array):
    
    # Every row corresponds to all of the predctions of the models
    y_pred_matrix = np.asarray([model.predict(feature_matrix) for model in model_array ]) 
    
    y = []
    
    # Now for every model, we combine the predictions using the confusion matrix
    # model_indx
    # pred_indx
    for pred_indx in range(0,len(y_pred_matrix[0])):
        
        prob_fake_cond_results = 0.0
        prob_true_cond_results = 0.0
         
        for model_indx in range(0,len(model_array)):
            
            if y_pred_matrix[model_indx][pred_indx]==0:
                cond_prob_result_given_false_k = cm_array[model_indx][0,1] # Wrong Classifications
                cond_prob_result_given_true_k  = cm_array[model_indx][0,0] # Correct Classifications
            else:
                cond_prob_result_given_false_k = cm_array[model_indx][1,1] # Correct Classification
                cond_prob_result_given_true_k  = cm_array[model_indx][1,0] # Wrong Classifications
            
            prob_fake_cond_results = prob_fake_cond_results+cond_prob_result_given_false_k
            prob_true_cond_results = prob_true_cond_results+cond_prob_result_given_true_k
                
        
        Normalization = prob_fake_cond_results+prob_true_cond_results
        
        prob_fake_cond_results = prob_fake_cond_results/Normalization
        prob_true_cond_results = prob_true_cond_results/Normalization
        
        # The Final Prediction:
        if prob_fake_cond_results >= prob_true_cond_results:
            yk = 1.0
        else:
            yk = 0.0
            
        y.extend([yk])
        
    # Convert to Numy array 
    y = np.asarray(y)
    
    return y


In [100]:
model_array = [gnb,lr,svc,rfc,adab]
cm_array =[cm_gnb,cm_lr,cm_svc,cm_rfc,cm_adab]
#bayes_average(test_feature_matrix,model_array,cm_array)

y_pred_bayes = bayes_average(test_feature_matrix,model_array,cm_array)

cm_bayes = confusion_matrix(test_labels, y_pred_bayes)
cm_bayes = cm_bayes/cm_bayes.astype(np.float).sum(axis=1)

print cm_bayes

[[ 0.80503145  0.28703704]
 [ 0.13836478  0.7962963 ]]


In [102]:
# Now lets compute the accuracy of this Bayesian combination of the models
accuracy_score(test_labels, y_pred_bayes)

0.80149812734082393

In [170]:
import csv

# Now we read in the prediction data set and output our predictions as required.
# We must fill in missing values, and process the data.
# "You should submit a csv file with exactly 418 entries plus a header row."
# "Your submission will show an error if you have extra columns (beyond PassengerId and Survived) or rows."
pred_df = pd.read_csv('data/test.csv')
pred_df['Fare'] = pred_df['Fare'].fillna(pred_df["Fare"].describe()[1])
pred_df['Age'] = pred_df['Age'].fillna(pred_df["Age"].describe()[1])
pred_df['Sex'] = pred_df.apply(lambda x: label_cat(x['Sex']), axis=1)
##pred_df.head()

# Now we extract the features
feature_df = pred_df[['Pclass','Sex','Age','Fare']]
pass_df = pred_df[['PassengerId']]

#feature_df.head().describe()
pred_np = np.asarray(feature_df)
pass_np = np.asarray(pass_df)

pred_feature_matrix = np.asarray([pred_np[k][:] for k in range(len(pred_np))])

# Now that we have loaded the features, we make predictions
y_pred = bayes_average(pred_feature_matrix,model_array,cm_array)

y_pred = rfc.predict(pred_feature_matrix)


with open('kaggle_predictions.csv', "wb") as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(['PassengerId','Survived'])
        
        for k in range(0,len(pass_df)):
            #writer.writerow(line)
            writer.writerow([int(pass_np[k]), int(y_pred[k])])


