In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
dataframe_train = pd.read_csv('train.csv')
dataframe_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
dataframe_train.shape

(891, 12)

In [5]:
dataframe_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [6]:
dataframe_train.Cabin.unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [7]:
dataframe_train.Survived.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [8]:
dataframe_train.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [9]:
dataframe_train.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [10]:
dataframe_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [11]:
dataframe_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [12]:
# All these columns seems to be irrelevant. Though you can add SibSp and Parch in your data.
columns_to_drop = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']

In [13]:
dataframe_train = dataframe_train.drop(columns_to_drop, axis = 1)
dataframe_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [14]:
dataframe_train[dataframe_train['Age'].isna()]

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
5,0,3,male,,8.4583
17,1,2,male,,13.0000
19,1,3,female,,7.2250
26,0,3,male,,7.2250
28,1,3,female,,7.8792
...,...,...,...,...,...
859,0,3,male,,7.2292
863,0,3,female,,69.5500
868,0,3,male,,9.5000
878,0,3,male,,7.8958


In [15]:
# Smart way of replacing null values in age column by looking into Pclass as reference column
dataframe_train.groupby('Pclass').mean()[['Age']]

Unnamed: 0_level_0,Age
Pclass,Unnamed: 1_level_1
1,38.233441
2,29.87763
3,25.14062


In [15]:
def age_approx(cols):
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):
        if Pclass == 1 :
            return 39
        elif Pclass == 2:
            return 30
        else:
            return 24
    else:
        return Age

In [16]:
dataframe_train['Age'] = dataframe_train[['Age','Pclass']].apply(age_approx, axis = 1)
dataframe_train

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.2500
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.9250
3,1,1,female,35.0,53.1000
4,0,3,male,35.0,8.0500
...,...,...,...,...,...
886,0,2,male,27.0,13.0000
887,1,1,female,19.0,30.0000
888,0,3,female,24.0,23.4500
889,1,1,male,26.0,30.0000


In [17]:
dataframe_train.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
dtype: int64

In [18]:
dataframe_train.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
Fare        float64
dtype: object

In [19]:
# Both the columns are nominal variables so applying one hot encoding
dataframe_train_one_hot = pd.get_dummies(dataframe_train, columns = ['Sex'])

In [20]:
dataframe_train_one_hot

Unnamed: 0,Survived,Pclass,Age,Fare,Sex_female,Sex_male
0,0,3,22.0,7.2500,0,1
1,1,1,38.0,71.2833,1,0
2,1,3,26.0,7.9250,1,0
3,1,1,35.0,53.1000,1,0
4,0,3,35.0,8.0500,0,1
...,...,...,...,...,...,...
886,0,2,27.0,13.0000,0,1
887,1,1,19.0,30.0000,1,0
888,0,3,24.0,23.4500,1,0
889,1,1,26.0,30.0000,0,1


In [21]:
dataframe_train_one_hot.columns

Index(['Survived', 'Pclass', 'Age', 'Fare', 'Sex_female', 'Sex_male'], dtype='object')

In [22]:
X = dataframe_train_one_hot[['Pclass', 'Age', 'Fare', 'Sex_female', 'Sex_male']]
y = dataframe_train_one_hot['Survived']

In [23]:
X.shape,y.shape

((891, 5), (891,))

In [24]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [25]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((668, 5), (223, 5), (668,), (223,))

In [26]:
# from sklearn.preprocessing import StandardScaler, MinMaxScaler

# age_mm_scaler = MinMaxScaler()
# fare_std_scaler = StandardScaler()

# X_train['Age'] = age_mm_scaler.fit_transform(X_train[['Age']])
# X_train['Fare'] = fare_std_scaler.fit_transform(X_train[['Fare']])
# X_train

In [27]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [28]:
logreg.fit(X_train,y_train)

LogisticRegression()

In [30]:
y_pred = logreg.predict(X_test)

In [50]:
from sklearn import metrics

print('Accuracy: ',metrics.accuracy_score(y_test, y_pred))
print('Confusion Matrix : ', metrics.confusion_matrix(y_test,y_pred))
print('Recall Score : ', metrics.recall_score(y_test, y_pred))

Accuracy:  0.8116591928251121
Confusion Matrix :  [[119  18]
 [ 24  62]]
Recall Score :  0.7209302325581395


In [37]:
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [38]:
#This is not to be done at any cost
logreg.predict(test_data)

ValueError: could not convert string to float: 'Kelly, Mr. James'

In [39]:
dataframe_test = test_data.drop(columns_to_drop, axis = 1)
dataframe_test.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,34.5,7.8292
1,3,female,47.0,7.0
2,2,male,62.0,9.6875
3,3,male,27.0,8.6625
4,3,female,22.0,12.2875


In [40]:
dataframe_test[dataframe_test['Age'].isna()]

Unnamed: 0,Pclass,Sex,Age,Fare
10,3,male,,7.8958
22,1,female,,31.6833
29,3,male,,21.6792
33,3,female,,23.4500
36,3,female,,8.0500
...,...,...,...,...
408,3,female,,7.7208
410,3,female,,7.7500
413,3,male,,8.0500
416,3,male,,8.0500


In [41]:
dataframe_test['Age'] = dataframe_test[['Age','Pclass']].apply(age_approx, axis = 1)
dataframe_test

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,34.5,7.8292
1,3,female,47.0,7.0000
2,2,male,62.0,9.6875
3,3,male,27.0,8.6625
4,3,female,22.0,12.2875
...,...,...,...,...
413,3,male,24.0,8.0500
414,1,female,39.0,108.9000
415,3,male,38.5,7.2500
416,3,male,24.0,8.0500


In [42]:
# Both the columns are nominal variables so applying one hot encoding
dataframe_test_one_hot = pd.get_dummies(dataframe_test, columns = ['Sex'])

In [43]:
dataframe_test_one_hot.columns

Index(['Pclass', 'Age', 'Fare', 'Sex_female', 'Sex_male'], dtype='object')

In [44]:
logreg.predict(dataframe_test_one_hot)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [45]:
dataframe_test_one_hot.isna().sum()

Pclass        0
Age           0
Fare          1
Sex_female    0
Sex_male      0
dtype: int64

In [46]:
dataframe_test_one_hot.dropna(inplace=True)

In [47]:
logreg.predict(dataframe_test_one_hot)

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,

In [48]:
dataframe_test_one_hot['predictions'] = logreg.predict(dataframe_test_one_hot)
dataframe_test_one_hot.head(10)

Unnamed: 0,Pclass,Age,Fare,Sex_female,Sex_male,predictions
0,3,34.5,7.8292,0,1,0
1,3,47.0,7.0,1,0,0
2,2,62.0,9.6875,0,1,0
3,3,27.0,8.6625,0,1,0
4,3,22.0,12.2875,1,0,1
5,3,14.0,9.225,0,1,0
6,3,30.0,7.6292,1,0,1
7,2,26.0,29.0,0,1,0
8,3,18.0,7.2292,1,0,1
9,3,21.0,24.15,0,1,0


Remember no evaluation can be done on test.csv as the target variable is not shared with us. Generally we do the predictions and make the submission and thats it. 