Importing necessary packages

In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb

Reading the data

In [2]:
# Folder that holds train.csv and test.csv. Set the TITANIC_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original local folder is used, so existing behaviour is unchanged.
DATA_DIR = Path(os.environ.get('TITANIC_DATA_DIR', 'C:/Users/HP/Desktop/New folder'))

# `dis` is the training frame (it contains the 'Survived' column) and `dat`
# is the test frame.
dis = pd.read_csv(DATA_DIR / 'train.csv', encoding='utf-8')
dat = pd.read_csv(DATA_DIR / 'test.csv', encoding='utf-8')
dis.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Preview the first five rows of the raw test frame.
dat.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


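The 'Name', 'Ticket' and 'Cabin' columns are not used as features, so we drop them from both data sets. 'PassengerId' is also dropped from the training set; the test set keeps it for now and it is removed later, when the test feature matrix is built.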

In [4]:
# Remove the columns that are not used as model features. 'PassengerId' is
# removed from the training frame only; the test frame keeps it at this stage.
dis = dis.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
dat = dat.drop(columns=['Name', 'Ticket', 'Cabin'])

Now we'll check for missing values in the test set.

In [5]:
# Number of missing values in each column of the test frame.
dat.isna().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Fare            1
Embarked        0
dtype: int64

Now we'll replace the missing values in 'Fare' and 'Age' with the median of the respective column, and then cast both columns to integers (which truncates the fractional part).

In [6]:
# Fill the missing test-set fare with the test-set median, then truncate fares
# to whole numbers in both frames.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on dat['Fare']: that chained in-place call is deprecated
# in pandas 2.x and does nothing under Copy-on-Write, which would leave the NaN
# in place and make astype(int) fail.
dat['Fare'] = dat['Fare'].fillna(dat['Fare'].median()).astype(int)
dis['Fare'] = dis['Fare'].astype(int)

In [7]:
# Fill missing ages with the median age of the same frame, then truncate ages
# to whole years.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place call is
# deprecated in pandas 2.x and does nothing under Copy-on-Write, which would
# leave the NaNs in place and make astype(int) fail.
dat['Age'] = dat['Age'].fillna(dat['Age'].median()).astype(int)
dis['Age'] = dis['Age'].fillna(dis['Age'].median()).astype(int)

Let's have a look at the data after replacing the missing values.

In [8]:
# First five rows of the test frame after imputation and integer casting.
dat.head(n=5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34,0,0,7,Q
1,893,3,female,47,1,0,7,S
2,894,2,male,62,0,0,9,Q
3,895,3,male,27,0,0,8,S
4,896,3,female,22,1,1,12,S


In [9]:
# First five rows of the training frame after imputation and integer casting.
dis.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22,1,0,7,S
1,1,1,female,38,1,0,71,C
2,1,3,female,26,0,0,7,S
3,1,1,female,35,1,0,53,S
4,0,3,male,35,0,0,8,S


For the column 'Embarked' you can either use one-hot encoding or drop it (as it doesn't seem to be important). Here we drop it.

In [10]:
# 'Embarked' is not used as a feature, so remove it from both frames.
dis = dis.drop(columns=['Embarked'])
dat = dat.drop(columns=['Embarked'])

The column 'Sex' is categorical, so it has to be one-hot encoded.
We first create a column 'P' with three categories: 'child' (age under 16), 'female' and 'male'.
One-hot encoding 'P' gives three columns, which we name 'C', 'F' and 'M'. The 'M' column is redundant
(a passenger who is neither 'C' nor 'F' must be 'M'), so we drop it as well.
Finally we drop column 'P' after we join the 'C' and 'F' columns to the dataset.

In [11]:
def get_person(passenger):
    """Classify one passenger as 'child' or by their sex.

    Parameters
    ----------
    passenger : iterable of exactly two items
        The passenger's age followed by their sex, e.g. one row of
        ``df[['Age', 'Sex']]``.

    Returns
    -------
    The string 'child' when the age is below 16, otherwise the sex value
    unchanged.
    """
    passenger_age, passenger_sex = passenger
    if passenger_age < 16:
        return 'child'
    return passenger_sex

def person_dummies(labels):
    """One-hot encode person labels into the indicator columns 'C' and 'F'.

    Parameters
    ----------
    labels : pandas.Series
        Values produced by ``get_person``: 'child', 'female' or 'male'.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 columns 'C' (child) and 'F' (female), in that order and on
        the same index as ``labels``. No 'male' column is returned: a row with
        both indicators at 0 is a male passenger.
    """
    return (
        # dtype=int keeps 0/1 indicators on every pandas version (newer
        # versions default to booleans).
        pd.get_dummies(labels, dtype=int)
        # Rename by label rather than by position, so a column can never be
        # mislabelled if the set of categories differs.
        .rename(columns={'child': 'C', 'female': 'F', 'male': 'M'})
        # Always return exactly 'C' and 'F'; a category that does not occur in
        # this frame becomes an all-zero column instead of raising.
        .reindex(columns=['C', 'F'], fill_value=0)
    )


# Label every passenger as 'child' (age under 16) or by their sex.
dis['P'] = dis[['Age', 'Sex']].apply(get_person, axis=1)
dat['P'] = dat[['Age', 'Sex']].apply(get_person, axis=1)

# 'Sex' is now fully represented by 'P'.
dis = dis.drop(columns=['Sex'])
dat = dat.drop(columns=['Sex'])

dt_train = person_dummies(dis['P'])
dt_test = person_dummies(dat['P'])

# Attach the indicator columns; the join is on the row index.
dis = dis.join(dt_train)
dat = dat.join(dt_test)


In [12]:
# The helper column 'P' has been encoded into 'C' and 'F', so remove it.
dis = dis.drop(columns=['P'])
dat = dat.drop(columns=['P'])

In [13]:
# First five rows of the training frame after encoding.
dis.head(n=5)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,C,F
0,0,3,22,1,0,7,0,0
1,1,1,38,1,0,71,0,1
2,1,3,26,0,0,7,0,1
3,1,1,35,1,0,53,0,1
4,0,3,35,0,0,8,0,0


In [14]:
# First five rows of the test frame after encoding.
dat.head(n=5)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,C,F
0,892,3,34,0,0,7,0,0
1,893,3,47,1,0,7,0,1
2,894,2,62,0,0,9,0,0
3,895,3,27,0,0,8,0,0
4,896,3,22,1,1,12,0,1


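We split the training set into the features (X_train) and the target (Y_train), and build the test feature matrix (X_test) by dropping 'PassengerId' from the test set.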

In [15]:
# Training features are every column except the target 'Survived'.
X_train = dis.drop(columns=["Survived"])
# Training target.
Y_train = dis.loc[:, "Survived"]
# Test features: 'PassengerId' is not a feature, so it is left out.
X_test = dat.drop(columns=["PassengerId"])

Modelling using Scikit-learn's Logistic Regression and calculating the model's accuracy on the training data.

In [16]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with scikit-learn's default settings;
# fit() returns the fitted estimator itself.
model = LogisticRegression().fit(X_train, Y_train)

# Accuracy on the same rows the model was trained on (not a held-out score).
sc = model.score(X_train, Y_train)

# Predicted labels for the test features.
pred = model.predict(X_test)

print(sc)

0.8215488215488216




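Let's look at the coefficients of the fitted logistic regression. Each coefficient gives the change in the log-odds of survival
for a one-unit increase in that feature. These are model coefficients, not correlation coefficients, although the column below is labelled 'Correlation'.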

In [17]:
# Pair every fitted coefficient with the feature it belongs to.
# The feature names are taken from X_train, the frame the model was fitted on,
# rather than from dis.columns.delete(0), which is only correct while
# 'Survived' happens to be the first column of `dis`.
# Building the frame from two equal-length columns also raises on a length
# mismatch instead of silently padding with NaN.
# NOTE: the values are logistic-regression coefficients, not correlation
# coefficients; the column label 'Correlation' is kept unchanged so the table
# still matches the recorded output.
coeff_df = pd.DataFrame({
    'Feature': list(X_train.columns),
    'Correlation': model.coef_[0],
})

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
6,F,2.968291
5,C,2.676475
4,Fare,0.004384
1,Age,-0.020603
3,Parch,-0.216602
2,SibSp,-0.452731
0,Pclass,-0.938192
