In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
import matplotlib as plt
from scipy.stats import pearsonr
%matplotlib inline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score

In [2]:
df_train = pd.read_csv('train.csv')
df_train.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis = 1, inplace = True)
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [3]:
#Check the number of null values in each column
for i in list(df_train.columns):
    print(i + " : " + str(df_train[i].isna().sum()))

Survived : 0
Pclass : 0
Sex : 0
Age : 177
SibSp : 0
Parch : 0
Fare : 0
Embarked : 2


In [4]:
# We observe that the Age column has the
# Maximum number of NaNs
#Hence we fill age with a mean value
#preprocess embarked column
df_train['Age'].fillna(value = np.mean(df_train['Age']), inplace = True)
for i in list(df_train.columns):
    print(i + " : " + str(df_train[i].isna().sum()))

Survived : 0
Pclass : 0
Sex : 0
Age : 0
SibSp : 0
Parch : 0
Fare : 0
Embarked : 2


In [5]:
#preprocess Sex column
#Since Sex can be considered to be both nominal or ordinal,
#We assume it as nominal and one hot encode it
df_train = pd.get_dummies(df_train, columns = ['Sex'])
df_train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_female,Sex_male
0,0,3,22.0,1,0,7.25,S,0,1
1,1,1,38.0,1,0,71.2833,C,1,0
2,1,3,26.0,0,0,7.925,S,1,0
3,1,1,35.0,1,0,53.1,S,1,0
4,0,3,35.0,0,0,8.05,S,0,1


In [6]:
#preprocess Embarked column
#Since all values are distinct and thus nominal,
#we one hot encode them
df_train = pd.get_dummies(df_train, columns = ['Embarked'])
df_train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,1
3,1,1,35.0,1,0,53.1,1,0,0,0,1
4,0,3,35.0,0,0,8.05,0,1,0,0,1


In [7]:
#Check the correlations between features and target
features = list(df_train.iloc[:, 1:].columns)
target = df_train.iloc[:, 0].name

In [8]:
#check for correlations
correlations = {}
for f in features:
    df_train_temp = df_train[[f,target]]
    x1 = df_train_temp[f].values
    x2 = df_train_temp[target].values
    key = f + ' vs ' + target
    correlations[key] = pearsonr(x1.ravel(),x2.ravel())[0]

In [9]:
#display correlations
data_correlations = pd.DataFrame(correlations, index=['Value']).T
data_correlations.loc[data_correlations['Value'].abs().sort_values(ascending=False).index]

Unnamed: 0,Value
Sex_male vs Survived,-0.543351
Sex_female vs Survived,0.543351
Pclass vs Survived,-0.338481
Fare vs Survived,0.257307
Embarked_C vs Survived,0.16824
Embarked_S vs Survived,-0.15566
Parch vs Survived,0.081629
Age vs Survived,-0.069809
SibSp vs Survived,-0.035322
Embarked_Q vs Survived,0.00365


In [10]:
y = df_train.loc[:,['Sex_female','Sex_male','Pclass', 'Fare', target]].sort_values(target, ascending=True).values
x = np.arange(y.shape[0])

In [11]:
#plt.subplot(5,1,1)
#plt.plot(x,y[:,0])
#plt.title('Sex_female, Sex_male, Pclass & Fare vs Survived')
#plt.ylabel('Sqft')

#plt.subplot(5,1,2)
#plt.plot(x,y[:,1])
#plt.ylabel('Sex_female')

#plt.subplot(5,1,3)
#plt.plot(x,y[:,2],'r')
#plt.ylabel("Sex_male")

#plt.subplot(5,1,4)
#plt.plot(x,y[:,3],'r')
#plt.ylabel("Pclass")

#plt.subplot(5,1,5)
#plt.plot(x,y[:,4],'r')
#plt.ylabel("Survived")

#plt.show()

In [12]:
y = df_train['Survived']
X = df_train.drop(columns = ['Survived'], axis = 1)
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,26.0,0,0,7.925,1,0,0,0,1
3,1,35.0,1,0,53.1,1,0,0,0,1
4,3,35.0,0,0,8.05,0,1,0,0,1


In [13]:
#Split into testing and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2)

In [14]:
#distributed parameters
params_grid = {
    'max_depth' : [1, 2, 3],
    'n_estimators' : [5, 10, 25, 50],
    'learrning_rate' : np.linspace(0.01, 0.1, 1, 3, 5)
}

In [64]:
xgb = XGBClassifier(max_depth = 1, learning_rate = 0.01, n_estimators = 50)
rfc = RandomForestClassifier()
gnb = GaussianNB()
sgd = SGDClassifier()
dtc = DecisionTreeClassifier()
knn = KNeighborsClassifier(n_neighbors = 4)
logreg = LogisticRegression(solver = 'lbfgs')
svc = SVC()
lsvc = LinearSVC()

In [65]:
xgb.fit(X_train, y_train)
xgb_predictions = xgb.predict(X_val)
print("XGBClassifier : " + str(accuracy_score(y_val, xgb_predictions)))

  if diff:


In [61]:
rfc.fit(X_train, y_train)
gnb.fit(X_train, y_train)
sgd.fit(X_train, y_train)
dtc.fit(X_train, y_train)
knn.fit(X_train, y_train)
svc.fit(X_train, y_train)
lsvc.fit(X_train, y_train)
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [62]:
xgb_predictions = xgb.predict(X_val)
rfc_predictions = rfc.predict(X_val)
gnb_predictions = gnb.predict(X_val)
sgd_predictions = sgd.predict(X_val)
dtc_predictions = dtc.predict(X_val)
knn_predictions = knn.predict(X_val)
svc_predictions = svc.predict(X_val)
lsvc_predictions = lsvc.predict(X_val)
logreg_predictions = logreg.predict(X_val)

  if diff:


In [63]:
print("XGBClassifier : " + str(accuracy_score(y_val, xgb_predictions)))
print("RandomForestClassifier : " + str(accuracy_score(y_val, rfc_predictions)))
print("GaussianNB : " + str(accuracy_score(y_val, gnb_predictions)))
print("SGDClassifier : " + str(accuracy_score(y_val, sgd_predictions)))
print("DecisionTreeClassifier : " + str(accuracy_score(y_val, dtc_predictions)))
print("KNN : " + str(accuracy_score(y_val, knn_predictions)))
print("SVC : " + str(accuracy_score(y_val, svc_predictions)))
print("LinearSVC : " + str(accuracy_score(y_val, lsvc_predictions)))
print("LogisticRegression : " + str(accuracy_score(y_val, logreg_predictions)))

XGBClassifier : 0.8435754189944135
RandomForestClassifier : 0.8324022346368715
GaussianNB : 0.8268156424581006
SGDClassifier : 0.7653631284916201
DecisionTreeClassifier : 0.8044692737430168
KNN : 0.7039106145251397
SVC : 0.7541899441340782
LinearSVC : 0.8268156424581006
LogisticRegression : 0.8435754189944135
