In [1]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LogisticRegression

In [2]:
Train = pd.read_csv("train_Titanic.csv")
Test = pd.read_csv("test_Titanic.csv")
Train.shape

(668, 11)

In [3]:
Train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668 entries, 0 to 667
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    668 non-null    int64  
 1   Name      668 non-null    object 
 2   Sex       668 non-null    object 
 3   Age       536 non-null    float64
 4   SibSp     668 non-null    int64  
 5   Parch     668 non-null    int64  
 6   Ticket    668 non-null    object 
 7   Fare      668 non-null    float64
 8   Cabin     154 non-null    object 
 9   Embarked  667 non-null    object 
 10  Survived  668 non-null    int64  
dtypes: float64(2), int64(4), object(5)
memory usage: 57.5+ KB


In [4]:
Train.isnull().sum()

Pclass        0
Name          0
Sex           0
Age         132
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       514
Embarked      1
Survived      0
dtype: int64

## Here by studying Train dataframe 
### 132 entries in Age column are null out of 668 entries
### 514 entries in Cabin column are null out of 668 entries 
### 1 entry in embarked column is null out of 668 entries
### Name and Ticket entity does not have any value in Learning model.
### Also the Sex and Embarked column that I want to keep has object string which is categorical data. 

### So, Here's what I going to do :

### 1) As the Age have a dtype of float, I'm going to replace Age NAN          entries with the mean of their respective survived class, that is, I going to calculate the age mean based of survived class i.e 0 or 1 and I'll replace age of a particular row based on is survived entry.

### 2) As Cabin has majority of NAN entries I am simply going to drop that column. 

### 3) Embarked column only has one NAN entry so I'll drop only that row.

### 4) Drop Name and Ticket column 

### 5) Convert categorical data into numeric 


In [5]:
a = Train[Train.Survived == 1]
b = Train[Train.Survived == 0]
print(a.Age.mean())
print(b.Age.mean())



28.138157894736842
30.857142857142858


In [6]:
# filiing nan entries of Age 
m1 = Train.Age.isna()
m2 = Train.Survived.eq(0)
m3 = Train.Survived.eq(1)
Train.loc[:,'Age'] = np.select([m1&m2, m1&m3], [30.85,28.13], 
                                       default=Train.Age)

#dropping Cabin and age column 
Train.drop(["Name","Cabin","Ticket"], axis = 1 , inplace = True)
Train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,2,female,29.00,1,0,26.0000,S,1
1,3,male,30.85,0,0,8.0500,S,0
2,2,male,39.00,0,0,26.0000,S,0
3,3,female,29.00,0,4,21.0750,S,0
4,3,male,25.00,0,0,7.0500,S,0
...,...,...,...,...,...,...,...,...
663,2,female,17.00,0,0,10.5000,S,1
664,3,male,30.85,0,0,7.7500,Q,0
665,3,male,32.00,0,0,56.4958,S,1
666,3,female,22.00,0,0,9.8375,S,0


In [7]:
#Dropping NAN entry in embarked column
Train.dropna(inplace=True)

Train.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Survived    0
dtype: int64

In [8]:
# replace gender with 1(male) and 0(female) for test data as well as train data 
def replace_sex(x):
    if x['Sex'] == "male":
        return 1
    elif x['Sex'] == "female":
        return 0
    else:
        return 2
# print(Train['Embarked'].unique())  ==> ['S' 'C' 'Q']
"""
as I've already used 0,1,2 for replace in sex column
I'll use 3,4,5 here so that the values of these different attribute does not get conflict or mixed up by my algorithm
"""
def replace_Embarked(x):
    if x["Embarked"] == 'S':
        return 3
    elif x["Embarked"] == 'C':
        return 4 
    else:
        return 5
    
Train["Embarked"] =  Train.apply(replace_Embarked,axis=1)

Train["Sex"] = Train.apply(replace_sex, axis=1)

In [9]:
Train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,2,0,29.00,1,0,26.0000,3,1
1,3,1,30.85,0,0,8.0500,3,0
2,2,1,39.00,0,0,26.0000,3,0
3,3,0,29.00,0,4,21.0750,3,0
4,3,1,25.00,0,0,7.0500,3,0
...,...,...,...,...,...,...,...,...
663,2,0,17.00,0,0,10.5000,3,1
664,3,1,30.85,0,0,7.7500,5,0
665,3,1,32.00,0,0,56.4958,3,1
666,3,0,22.00,0,0,9.8375,3,0


In [10]:
Train.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
count,667.0,667.0,667.0,667.0,667.0,667.0,667.0,667.0
mean,2.298351,0.64018,29.748396,0.529235,0.407796,31.992684,3.349325,0.401799
std,0.830742,0.480308,12.772018,1.080943,0.85519,45.316739,0.614645,0.49063
min,1.0,0.0,0.67,0.0,0.0,0.0,3.0,0.0
25%,2.0,0.0,23.0,0.0,0.0,7.925,3.0,0.0
50%,3.0,1.0,30.0,0.0,0.0,14.5,3.0,0.0
75%,3.0,1.0,35.0,1.0,0.0,31.275,4.0,1.0
max,3.0,1.0,80.0,8.0,6.0,512.3292,5.0,1.0


In [11]:
print("Updated Train dataset",Train.shape)

Updated Train dataset (667, 8)


In [12]:
Test.drop(["Name","Cabin","Ticket"], axis = 1 , inplace = True)
Test.dropna(inplace=True)


In [13]:
Test["Embarked"] = Test.apply(replace_Embarked, axis=1)
Test["Sex"] = Test.apply(replace_sex, axis=1)
Test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,2,1,8.0,1,1,36.7500,3
1,1,0,49.0,0,0,25.9292,3
3,2,0,24.0,2,1,27.0000,3
4,1,1,36.0,0,0,26.2875,3
5,1,1,71.0,0,0,49.5042,4
...,...,...,...,...,...,...,...
218,3,1,20.0,1,0,7.9250,3
219,1,1,45.0,0,0,26.5500,3
220,1,0,17.0,1,0,108.9000,4
221,3,1,43.0,0,0,6.4500,3


In [14]:
Test.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,177.0,177.0,177.0,177.0,177.0,177.0,177.0
mean,2.271186,0.666667,29.51226,0.502825,0.338983,36.296679,3.276836
std,0.856071,0.472742,15.247426,1.006368,0.646938,67.455592,0.561223
min,1.0,0.0,0.42,0.0,0.0,0.0,3.0
25%,1.0,0.0,19.0,0.0,0.0,7.925,3.0
50%,3.0,1.0,27.0,0.0,0.0,13.0,3.0
75%,3.0,1.0,37.0,1.0,0.0,30.5,3.0
max,3.0,1.0,71.0,5.0,2.0,512.3292,5.0


In [15]:
X = Train.values
X

array([[ 2.    ,  0.    , 29.    , ..., 26.    ,  3.    ,  1.    ],
       [ 3.    ,  1.    , 30.85  , ...,  8.05  ,  3.    ,  0.    ],
       [ 2.    ,  1.    , 39.    , ..., 26.    ,  3.    ,  0.    ],
       ...,
       [ 3.    ,  1.    , 32.    , ..., 56.4958,  3.    ,  1.    ],
       [ 3.    ,  0.    , 22.    , ...,  9.8375,  3.    ,  0.    ],
       [ 3.    ,  0.    , 28.13  , ..., 15.5   ,  5.    ,  1.    ]])

In [16]:
X_test = Test.values

In [17]:
X_train = X[:,0:-1]
Y_train = X[:,-1]

In [27]:
Y_train.dtype
Y_train = Y_train.astype(int)

In [39]:
alg = LogisticRegression(max_iter=5900,penalty='none',solver='newton-cg')

In [40]:
alg.fit(X_train,Y_train)

LogisticRegression(max_iter=5900, penalty='none', solver='newton-cg')

In [41]:
alg.score(X_train,Y_train)

0.7886056971514243

In [24]:
alg.predict(X_test)

array([0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0])

In [25]:
alg.predict_proba(X_train)

array([[0.4292843 , 0.5707157 ],
       [0.85170302, 0.14829698],
       [0.77560194, 0.22439806],
       ...,
       [0.81226248, 0.18773752],
       [0.50981386, 0.49018614],
       [0.29053047, 0.70946953]])