Variables/Features
--------------------------
survival  -	Survival	0 = No, 1 = Yes
pclass -	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
sex	- Sex	 male or female
Age	 - Age in years	
sibsp -	# of siblings / spouses aboard the Titanic	
parch -	# of parents / children aboard the Titanic	
ticket -	Ticket number	
fare - Passenger fare	
cabin - Cabin number	
embarked - Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

In [48]:
import numpy as np
import pandas as pd
import math
import warnings
warnings.filterwarnings('ignore')         
train=pd.read_csv('titanic_train.csv')
test=pd.read_csv('test.csv')

In [2]:
train.head()                      #displays the first 5 entries of the data frame

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
train.info()       #gives a complete picture of number of non-null entries in a column along with their data types

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [49]:
train.drop('Cabin',axis=1,inplace=True)         #column 'cabin' is dropped as it contains too many null values
train.dropna(subset=['Embarked'],inplace=True)  #Rows having Embarked column as null are dropped   
test.drop('Cabin',axis=1,inplace=True)
train.describe()              #this gives a detailed information  about all the numerical columns present in the data frame

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,889.0,889.0,889.0,712.0,889.0,889.0,889.0
mean,446.0,0.382452,2.311586,29.642093,0.524184,0.382452,32.096681
std,256.998173,0.48626,0.8347,14.492933,1.103705,0.806761,49.697504
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,224.0,0.0,2.0,20.0,0.0,0.0,7.8958
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.0,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [50]:
dummies_df=pd.get_dummies(train[['Embarked','Sex']],drop_first=True) #creates binary values for the columns Embarked and Sex
train=pd.concat([train,dummies_df],axis=1)   #concatentaes the two data frames
dummies_df_test=pd.get_dummies(test[['Embarked','Sex']],drop_first=True)
test=pd.concat([test,dummies_df_test],axis=1)

In [51]:
Age_values=train.groupby('Sex').mean()['Age']       #this is being calculated to obatin mean age of all males and females
Age_values

Sex
female    27.745174
male      30.726645
Name: Age, dtype: float64

In [52]:
def fill_age(a):
    if math.isnan(a.Age):
        return Age_values[a['Sex']]
    return a.Age
train['Age']=train.apply(lambda x:fill_age(x),axis=1)# to impute the missing age values in train data frame 
#Here the whole data frame is being passed to lambda function which in turn calls the fill_age which fills the rows of
#train data frame containing null values for age taking into account their gender
test['Age']=test.apply(lambda x:fill_age(x),axis=1)

In [53]:
train.drop(columns=['Sex','Embarked'],axis=1,inplace=True) #as binary columns are created for these two columns,original 
#columns are dropped
test.drop(columns=['Sex','Embarked'],axis=1,inplace=True)

In [54]:
test['Fare']=test['Fare'].fillna(method='ffill')

In [24]:
train.info()  #Here we can see the age values are imputed 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    889 non-null int64
Survived       889 non-null int64
Pclass         889 non-null int64
Name           889 non-null object
Age            889 non-null float64
SibSp          889 non-null int64
Parch          889 non-null int64
Ticket         889 non-null object
Fare           889 non-null float64
Embarked_Q     889 non-null uint8
Embarked_S     889 non-null uint8
Sex_male       889 non-null uint8
dtypes: float64(2), int64(5), object(2), uint8(3)
memory usage: 72.1+ KB


In [8]:
train.columns   #lists all the columns present in the data frame

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Embarked_Q', 'Embarked_S', 'Sex_male'],
      dtype='object')

In [55]:
features_to_train=['Age','Pclass','Fare', 'Embarked_Q', 'Embarked_S', 'Sex_male'] #we have decided to train the ML model only on 
                                                                                  #these columns
X=train[features_to_train]
y=train['Survived']
test=test[features_to_train]

In [29]:
X.head()

Unnamed: 0,Age,Pclass,Fare,Embarked_Q,Embarked_S,Sex_male
0,22.0,3,7.25,0,1,1
1,38.0,1,71.2833,0,0,0
2,26.0,3,7.925,0,1,0
3,35.0,1,53.1,0,1,0
4,35.0,3,8.05,0,1,1


In [30]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV   #used for parameter tuning

In [56]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.33)   #splits the given data set to two parts, train and test sets

In [36]:
sv_clf=SVC(kernel='linear')      #creating an object for SVC classifier
parameter_values={'gamma':[0.001,0.05,0.1,1,10],'C':[0.1,1,10]}          #parameter values to iterate upon
grid_clf_acc=GridSearchCV(sv_clf,param_grid=parameter_values,cv=3)       #This might take a while based on processor speed
'''
For all the parameter values given in param_grid, each combination of gamma and C values are used as parameters to train
the model and the best combination(having maximum accuracy score) is used to train the final model over the entire set of data.
This can be considered as nested for loops with outer loop iterating over gamma values and inner loop over C values.
See the Cross Validation section to know how the cv parameter works.
The defualt parameter for scoring is 'accuarcy'. However we can specify the scoring parameter based on our requirements.
'''
grid_clf_acc.fit(X_train,y_train)
print('The best fit parameters for the model with highest accuracy are:',grid_clf_acc.best_params_)

The best fit parameters for the model with highest accuracy are: {'C': 0.1, 'gamma': 0.001}


In [40]:
grid_clf_acc.score(X_test,y_test)        #performance score of the model with best parameters on unseen test data

0.7959183673469388

Cross Validation
------------------------
This is just an example to show how Cross validation parameter works.

Below, cv=5 divides the dataframe(containing features) into 5 sets(same as in train-test-split). Hence 5 models are trained in total with 4 sets used for training and 1 set for cross validation. 
For example if 
X=[1,2,3,4,5,6,7,8,9,10]
if cv =5 then X is divided into 5 sets randomly as below
X1=[2,5],
X2=[1,8],
X3=[3,7],
X4=[4,9],
X5=[6,10]
Now 5 models are trained in total each time one of the 5 sets being the test set and the remaining 4 as training sets.
The mean scores of the model is taken and stored as the resultant score for the particular combination of parameters.
In GridSearch CV, the combination(grid parameters) for the best mean score are selected as the best parameters and fitted over the entire training set.

In [43]:
from sklearn.model_selection import cross_val_score

In [44]:
print('The cross validation score across the 5 sets  ',cross_val_score(SVC(kernel='linear',C=1,gamma=10),X,y,cv=5))

The cross validation score across the 5 sets   [0.80337079 0.80337079 0.78651685 0.75280899 0.78531073]


In [95]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

In [70]:
def Stacking(model,n_fold,train,y,test):
    skf=StratifiedKFold(n_splits=n_fold)
    test_pred=np.empty((test.shape[0],1),float)
    train_pred=np.empty((0,1),float)
    for train_indices,val_indices in skf.split(train,y):
        X_train,X_val=X.iloc[train_indices],X.iloc[val_indices]
        y_train,y_val=y.iloc[train_indices],y.iloc[val_indices]
        model.fit(X_train,y_train)
        train_pred=np.append(train_pred,model.predict(X_val))
        test_pred=np.append(test_pred,model.predict(test))
    return test_pred.reshape(-1,1),train_pred
    

In [87]:
model1 = DecisionTreeClassifier(random_state=1)
test_pred1 ,train_pred1=Stacking(model=model1,n_fold=10,train=X_train,test=X_test,y=y_train)
train_pred1=pd.DataFrame(train_pred1,columns=['decision_tree']).astype(int)
test_pred1=pd.DataFrame(test_pred1,columns=['decision_tree']).astype(int)

In [88]:
model2 = KNeighborsClassifier()

test_pred2 ,train_pred2=Stacking(model=model2,n_fold=10,train=X_train,test=X_test,y=y_train)

train_pred2=pd.DataFrame(train_pred2,columns=['knn']).astype(int)
test_pred2=pd.DataFrame(test_pred2,columns=['knn']).astype(int)

In [91]:
df = pd.concat([train_pred1, train_pred2],axis=1)
df_test = pd.concat([test_pred1, test_pred2],axis=1)

In [108]:
y.shape

(3234, 2)

In [97]:
model = LogisticRegression(random_state=1)
model.fit(df,y_train)
#model.score(df_test, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [107]:
model.score(y_test,model.predict(df_test))

ValueError: Expected 2D array, got 1D array instead:
array=[0 0 0 0 1 1 1 0 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 1 0 0 1
 1 1 1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 1 1 0 1 0 0 0
 0 0 1 0 1 0 1 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1
 1 0 0 1 1 0 0 0 0 1 0 1 0 0 0 1 1 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0 0 0 1
 1 1 0 1 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 1 1 0 0 1 0
 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 1 0 1 1 0 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 1
 1 0 0 0 1 0 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1
 1 1 1 1 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 1 1 0 1 1].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.