In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")


In [50]:
# test = pd.read_csv('test.csv')
# test.head()

In [51]:
passengers = pd.read_csv('train.csv')
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [52]:
passengers.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [53]:
passengers.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### UPDATE SEX COLUMN TO NUMERICAL

In [54]:
passengers['Sex'] = passengers['Sex'].map({'male':'0','female':'1'})
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


### Filling NaN values in Age column as mean.

In [55]:
passengers = passengers.fillna(value={'Age':passengers['Age'].mean()})

In [56]:
print(passengers.Age.values)


[22.         38.         26.         35.         35.         29.69911765
 54.          2.         27.         14.          4.         58.
 20.         39.         14.         55.          2.         29.69911765
 31.         29.69911765 35.         34.         15.         28.
  8.         38.         29.69911765 19.         29.69911765 29.69911765
 40.         29.69911765 29.69911765 66.         28.         42.
 29.69911765 21.         18.         14.         40.         27.
 29.69911765  3.         19.         29.69911765 29.69911765 29.69911765
 29.69911765 18.          7.         21.         49.         29.
 65.         29.69911765 21.         28.5         5.         11.
 22.         38.         45.          4.         29.69911765 29.69911765
 29.         19.         17.         26.         32.         16.
 21.         26.         32.         25.         29.69911765 29.69911765
  0.83       30.         22.         29.         29.69911765 28.
 17.         33.         16.         29.69

### Creating a first class column

In [57]:
passengers['FirstClass'] = passengers['Pclass'].apply(lambda x: 1 if x == 1 else 0)
passengers.tail(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass
881,882,0,3,"Markun, Mr. Johann",0,33.0,0,0,349257,7.8958,,S,0
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",1,22.0,0,0,7552,10.5167,,S,0
883,884,0,2,"Banfield, Mr. Frederick James",0,28.0,0,0,C.A./SOTON 34068,10.5,,S,0
884,885,0,3,"Sutehall, Mr. Henry Jr",0,25.0,0,0,SOTON/OQ 392076,7.05,,S,0
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",1,39.0,0,5,382652,29.125,,Q,0
886,887,0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0,,S,0
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0,B42,S,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,29.699118,1,2,W./C. 6607,23.45,,S,0
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0,C148,C,1
890,891,0,3,"Dooley, Mr. Patrick",0,32.0,0,0,370376,7.75,,Q,0


### Creating a second class column

In [58]:
# Create a second class column
passengers['SecondClass'] = passengers['Pclass'].apply(lambda x: 1 if x == 2 else 0)
passengers.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,0
5,6,0,3,"Moran, Mr. James",0,29.699118,0,0,330877,8.4583,,Q,0,0
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,E46,S,1,0
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.075,,S,0,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,,S,0,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,,C,0,1


### Selecting independent variables

In [59]:
features = passengers[['Sex', 'Age', 'FirstClass', 'SecondClass']]
survival = passengers[['Survived']]

### Spliting data into train and test

In [60]:
Xtrain,Xtest, ytrain, ytest = train_test_split(features, survival,test_size=0.8)

### Normalizing the features for the logistic regression

In [61]:
scaler = StandardScaler()
Xtrain = scaler.fit_transform(Xtrain)
Xtest = scaler.transform(Xtest)

### Train

In [65]:
model = LogisticRegression()
model.fit(Xtrain, ytrain)
print(model.score(Xtrain, ytrain))
print(model.score(Xtest, ytest))

0.8146067415730337
0.7896213183730715


### Checking out which features holds how much weight.

In [68]:
print(model.coef_)

print('by looking at the coefficients, "Sex" holds the most and "Age" the least.')

[[ 1.19613565 -0.27255349  0.81465639  0.31417216]]
by looking at the coefficients, "Sex" holds the most and "Age" the least.


### Predicting

In [69]:
Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
Vijit = np.array([0.0,25.0,0.0,1.0])

In [71]:
sample_passengers = np.array([Jack, Rose, Vijit])
print(sample_passengers)

[[ 0. 20.  0.  0.]
 [ 1. 17.  1.  0.]
 [ 0. 25.  0.  1.]]


In [72]:
# Normalization
sample_passengers = scaler.transform(sample_passengers)
print(sample_passengers)


[[-0.74926865 -0.71262782 -0.56437445 -0.49472744]
 [ 1.33463478 -0.94574888  1.77187327 -0.49472744]
 [-0.74926865 -0.32409271 -0.56437445  2.02131499]]


In [77]:
survival_p = model.predict(sample_passengers)
print(survival_p)

print("Here '0' represent 'Not survived' and '1' represent survived, Therefore by the predictions, jack and vijit did not survive but rose will survive.")

[0 1 0]
Here '0' represent 'Not survived' and '1' represent survived, Therefore by the predictions, jack and vijit did not survive but rose will survive.


In [75]:
survival_per = model.predict_proba(sample_passengers)
print(survival_per)

print('The first column tells the pobablity of not surviving, the second column tells the probabily of surviving')

[[0.8808987  0.1191013 ]
 [0.07882353 0.92117647]
 [0.78858119 0.21141881]]
The first column tells the pobablity of not surviving, the second column tells the probabily of surviving
