In [1]:
import scipy
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import xlrd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
data = pd.read_excel('../titanic_data/titanic3.xls')
data.head(3)

Unnamed: 0,passengerid,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,2,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,3,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
data = data.drop(['passengerid','name','ticket','cabin','boat','body','home.dest'],axis = 1)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   survived  1309 non-null   int64  
 2   sex       1309 non-null   object 
 3   age       1046 non-null   float64
 4   sibsp     1309 non-null   int64  
 5   parch     1309 non-null   int64  
 6   fare      1308 non-null   float64
 7   embarked  1307 non-null   object 
dtypes: float64(2), int64(4), object(2)
memory usage: 81.9+ KB


In [5]:
value = data['age'].mean()
data['age']= data['age'].fillna(value=value)

In [6]:
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1306 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1306 non-null   int64  
 1   survived  1306 non-null   int64  
 2   sex       1306 non-null   object 
 3   age       1306 non-null   float64
 4   sibsp     1306 non-null   int64  
 5   parch     1306 non-null   int64  
 6   fare      1306 non-null   float64
 7   embarked  1306 non-null   object 
dtypes: float64(2), int64(4), object(2)
memory usage: 91.8+ KB


In [7]:
from sklearn.preprocessing import LabelEncoder

data['sex_binary'] = LabelEncoder().fit_transform(data['sex'])

data['embark']= LabelEncoder().fit_transform(data['embarked'])


In [8]:
data=data.drop(['sex','embarked'],axis=1)

In [9]:
X = data.drop('survived',axis=1)
Y = data['survived']

In [11]:
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X,Y)

#use inbuilt class feature_importances of tree based classifiers
print(model.feature_importances_)

[0.09421022 0.25093977 0.03972489 0.04349337 0.25517074 0.28626845
 0.03019255]


In [12]:
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1)

In [13]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# Building and Evaluating Models

models = []
models.append(('LR', LogisticRegression(solver='liblinear',multi_class='ovr')))
# models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM',SVC(gamma='auto')))

# evaluate each model

result = []
names = []
for name, model in models:
        kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
        cv_result = cross_val_score(model, X_train_scaled,Y_train, cv=kfold, scoring='accuracy')
        result.append(cv_result)
        names.append(name)
        print('%s:%f(%f)'%(name,cv_result.mean(),cv_result.std()))

LR:0.780370(0.014162)
KNN:0.772186(0.042017)
CART:0.753787(0.047995)
NB:0.773206(0.025224)
SVM:0.809005(0.027291)
