In [1]:
import sys
import pandas as pd
import matplotlib
import numpy as np
import scipy as sp
import IPython
from IPython import display
import sklearn
import random
import time
import warnings
warnings.filterwarnings('ignore')
print('-'*25)

from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# scatter_matrix lives in pandas.plotting in current pandas releases; the old
# pandas.tools.plotting location is kept as a fallback for legacy installs.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

-------------------------


In [2]:
# Notebook-wide plot styling: apply matplotlib's 'ggplot' style sheet first,
# then seaborn's 'white' axes style on top of it.
mpl.style.use('ggplot')
sns.set_style('white')

# Default figure size used by every figure that does not pass its own figsize.
mpl.rcParams['figure.figsize'] = (12, 8)

## Read data

In [3]:
import os

# Folder that contains the Titanic train.csv and test.csv files.
# Set the TITANIC_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original local folder is used.
DATA_DIR = os.environ.get(
    'TITANIC_DATA_DIR',
    r'C:\Users\pc\Desktop\Data Science Folder\Titanic Machine Learning From Disaster',
)

# data_raw: labelled training data; data_val: the unlabelled test/validation file.
data_raw = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
data_val = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [4]:
# Work on a copy so the frame read from train.csv (data_raw) stays untouched
# (DataFrame.copy is a deep copy by default).
data1 = data_raw.copy()

# The list holds references to the two frames, not copies: cleaning loops over
# data_cleaner therefore modify data1 and data_val themselves.
data_cleaner = [data1, data_val]

In [5]:
# Per-column count of missing values, first for the training copy and then for
# the test/validation frame, each followed by a dashed separator line.
null_reports = (
    ('Train columns with null values:\n', data1),
    ('Test/Validation columns with null values:\n', data_val),
)
for heading, frame in null_reports:
    print(heading, frame.isnull().sum())
    print("-"*10)

Train columns with null values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
----------
Test/Validation columns with null values:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
----------


## Fill missing values


In [6]:
# Impute missing values in both frames. Each frame is filled with statistics
# computed from that same frame (train and validation are not pooled).
#
# The filled column is assigned back explicitly rather than calling
# fillna(..., inplace=True) on dataset[col]: that form mutates an intermediate
# Series and is not guaranteed to update the parent frame (it does nothing
# under pandas Copy-on-Write).
for dataset in data_cleaner:
    # Age: numeric, filled with the median age.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

    # Embarked: categorical, filled with the most frequent value.
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])

    # Fare: numeric, filled with the median fare.
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

## Drop unnecessary data

In [7]:
# Columns removed from the training copy only; data_val is not touched here.
drop_column = ['PassengerId', 'Cabin', 'Ticket']

# Dropped in place on purpose: data_cleaner holds a reference to this same
# data1 object, so rebinding data1 to a new frame would detach it from the list.
data1.drop(labels=drop_column, axis='columns', inplace=True)

In [25]:
# Missing-value counts for the training copy and the validation frame,
# separated by a dashed line.
# NOTE(review): the saved output of this cell already lists FamilySize, IsAlone,
# Title, FareBin and AgeBin, which are only created in a cell further down, so
# it was last executed out of order. Re-run the notebook top to bottom.
print(data1.isnull().sum(), "-"*10, data_val.isnull().sum(), sep='\n')



Survived      0
Pclass        0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked      0
FamilySize    0
IsAlone       0
Title         0
FareBin       0
AgeBin        0
dtype: int64
----------
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
FamilySize       0
IsAlone          0
Title            0
FareBin          0
AgeBin           0
dtype: int64


## Adding new columns to data1 and data_val

In [8]:
# Add engineered feature columns to both frames.
for dataset in data_cleaner:
    # Passenger plus siblings/spouses plus parents/children aboard.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # 1 = travelling alone, 0 = has family aboard. Start at 1 and clear the
    # flag with a single .loc assignment; the previous chained form
    # dataset['IsAlone'][mask] = 0 can write to a temporary copy.
    dataset['IsAlone'] = 1
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0

    # Title is the text between ", " and the first "." in Name.
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

    # Fare in 4 quantile bins; Age (truncated to int) in 5 equal-width bins.
    # NOTE(review): bin edges are computed separately for each frame, so the
    # train and validation bins need not line up; confirm this is intended.
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)


## Modify 'Title' column

In [9]:
print('******before******\n' , data1['Title'].value_counts(), '\n')

# Boolean Series indexed by title: True where the title occurs fewer than
# 10 times in data1.
title_names = data1['Title'].value_counts() < 10

# Replace every such rare title with 'Misc'. Only data1 is modified here;
# data_val keeps its original titles.
rare_titles = title_names[title_names].index
is_rare = data1['Title'].isin(rare_titles)
data1['Title'] = data1['Title'].where(~is_rare, 'Misc')

print('******after*******\n' , data1['Title'].value_counts())

******before******
 Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Col               2
Major             2
Mlle              2
the Countess      1
Don               1
Jonkheer          1
Ms                1
Capt              1
Lady              1
Mme               1
Sir               1
Name: Title, dtype: int64 

******after*******
 Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64


## Introduction to LabelEncoder().fit_transform

In [None]:
# Demo: encode Embarked as integer labels and show the codes of the first
# 20 rows. A slice is used so the cell also works on frames with fewer rows.
embarked_codes = LabelEncoder().fit_transform(data1['Embarked'])
print(embarked_codes[:20])


## Adding label-encoded columns to data1 and data_val

In [None]:
# (source column, new integer-coded column) pairs, encoded in this order.
code_columns = (
    ('Sex', 'Sex_Code'),
    ('Embarked', 'Embarked_Code'),
    ('Title', 'Title_Code'),
    ('AgeBin', 'AgeBin_Code'),
    ('FareBin', 'FareBin_Code'),
)

# A single encoder instance is re-fitted for every column of every frame.
# NOTE(review): because it is re-fitted per frame, the same integer code is not
# guaranteed to mean the same category in data1 and data_val; verify before
# scoring data_val with a model trained on data1.
label = LabelEncoder()
for dataset in data_cleaner:
    for source_col, code_col in code_columns:
        dataset[code_col] = label.fit_transform(dataset[source_col])

## Lists containing column names


In [None]:
# Label column, kept as a list so it can be concatenated with feature lists.
Target = ['Survived']

# Original (un-encoded) feature columns, and the same list with the target in front.
data1_x = [
    'Sex', 'Pclass', 'Embarked', 'Title',
    'SibSp', 'Parch', 'Age', 'Fare',
    'FamilySize', 'IsAlone',
]
data1_xy = Target + data1_x

# Label-encoded features with continuous Age/Fare (coded for algorithm calculation).
data1_x_calc = [
    'Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code',
    'SibSp', 'Parch', 'Age', 'Fare',
]

# Label-encoded features using the binned Age/Fare codes and FamilySize.
data1_x_bin = [
    'Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code',
    'FamilySize', 'AgeBin_Code', 'FareBin_Code',
]
data1_xy_bin = Target + data1_x_bin

# One-hot (dummy) encoding of the original feature columns, plus its column names.
data1_dummy = pd.get_dummies(data1[data1_x])
data1_x_dummy = list(data1_dummy.columns)



In [None]:

#**** for learning *****
# Inspect the dummy-encoded frame, the type of its column index, the column
# names as a plain list, and the label-encoded Sex column.
dummy_columns = data1_dummy.columns
print( data1_dummy )
print ( type(dummy_columns) )
L = dummy_columns.tolist()
print(L)
print( data1['Sex_Code'])

In [None]:
# Target column name followed by every dummy-encoded feature column name.
data1_xy_dummy = list(Target)
data1_xy_dummy.extend(data1_x_dummy)

print('Dummy X Y: ', data1_xy_dummy, '\n')


## Checking the clean data

In [None]:
# Final sanity check of both frames: missing-value counts followed by the
# column/dtype summary.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
print('Train columns with null values: \n', data1.isnull().sum())
print("-"*10)
data1.info()
print("-"*10)

print('Test/Validation columns with null values: \n', data_val.isnull().sum())
print("-"*10)
data_val.info()
print("-"*10)

In [None]:
#******** for learning *********
# Show the Python types produced by selecting the feature columns and the
# target column; both selections index data1 with a list of column names.
feature_selection_type = type(data1[data1_x_calc])
target_selection_type = type(data1[Target])
print( feature_selection_type, '   ' , target_selection_type )

In [None]:

def _split_with_target(feature_frame):
    """Split feature_frame together with data1[Target] using random_state=0.

    Returns whatever model_selection.train_test_split returns for the two
    inputs, unpacked below as (train_x, test_x, train_y, test_y).
    """
    return model_selection.train_test_split(feature_frame, data1[Target], random_state=0)


# One split per feature representation: label-encoded, binned, and dummy-encoded.
# The same random_state is used for all three.
train1_x, test1_x, train1_y, test1_y = _split_with_target(data1[data1_x_calc])
train1_x_bin, test1_x_bin, train1_y_bin, test1_y_bin = _split_with_target(data1[data1_x_bin])
train1_x_dummy, test1_x_dummy, train1_y_dummy, test1_y_dummy = _split_with_target(data1_dummy[data1_x_dummy])

## Verifying the shape of train1_x & test1_x

In [None]:
# Report the (rows, columns) shape of the full frame and of the train/test
# feature splits.
shape_report = (
    ("Data1 Shape: {}", data1),
    ("Train1 Shape: {}", train1_x),
    ("Test1 Shape: {}", test1_x),
)
for template, frame in shape_report:
    print(template.format(frame.shape))

## Perform Exploratory Analysis with Statistics

In [None]:
# Mean of Survived per distinct value of each feature. Columns stored as
# float64 are skipped.
for x in data1_x:
    if data1[x].dtype == 'float64':
        continue
    print('Survival Correlation by:', x)
    survival_by_value = data1[[x, Target[0]]].groupby(x, as_index=False).mean()
    print(survival_by_value)

# Passenger counts for every Title / Survived combination.
title_vs_survived = pd.crosstab(data1['Title'], data1['Survived'])
print(title_vs_survived)


In [None]:
# 2x3 figure: box plots of Fare / Age / FamilySize on the top row and, below
# each of them, a stacked histogram of the same column split by survival.
fig, axes = plt.subplots(2, 3, figsize=[16, 12])
box_axes, hist_axes = axes

# (column, extra boxplot options, title, y-axis label)
box_panels = [
    ('Fare', {}, 'Fare Boxplot', 'Fare ($)'),
    ('Age', {'showmeans': True, 'meanline': True}, 'Age Boxplot', 'Age (Years)'),
    ('FamilySize', {'showmeans': True, 'meanline': True}, 'Family Size Boxplot', 'Family Size (#)'),
]
for ax, (column, box_options, title, ylabel) in zip(box_axes, box_panels):
    ax.boxplot(data1[column], **box_options)
    ax.set_title(title)
    ax.set_ylabel(ylabel)

# (column, title, x-axis label)
hist_panels = [
    ('Fare', 'Fare Histogram by Survival', 'Fare ($)'),
    ('Age', 'Age Histogram by Survival', 'Age (Years)'),
    ('FamilySize', 'Family Size Histogram by Survival', 'Family Size (#)'),
]
for ax, (column, title, xlabel) in zip(hist_axes, hist_panels):
    survived_values = data1[data1['Survived'] == 1][column]
    dead_values = data1[data1['Survived'] == 0][column]
    # Survivors (green) are stacked first, non-survivors (red) on top.
    ax.hist(x=[survived_values, dead_values], stacked=True,
            color=['g', 'r'], label=['Survived', 'Dead'])
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('# of Passengers')
    ax.legend()

plt.show()


In [None]:
# 2x3 figure of Survived against six features: bar plots on the top row,
# point plots on the bottom row.
# Each entry: (seaborn plotting function, x column, extra keyword arguments).
survival_panels = [
    (sns.barplot, 'Embarked', {}),
    (sns.barplot, 'Pclass', {'order': [3, 2, 1]}),
    (sns.barplot, 'IsAlone', {'order': [1, 0]}),
    (sns.pointplot, 'FareBin', {}),
    (sns.pointplot, 'AgeBin', {}),
    (sns.pointplot, 'FamilySize', {}),
]

fig, axes = plt.subplots(2, 3, figsize=[16, 12])
for ax, (plotter, column, extra_options) in zip(axes.flat, survival_panels):
    plotter(x=column, y='Survived', data=data1, ax=ax, **extra_options)
