In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn import model_selection

In [2]:
df = pd.read_csv('training_titanic_x_y_train.csv')
df = df.drop(['Name', 'Cabin'], axis = 1)

In [3]:
df.Survived.value_counts()

0    399
1    269
Name: Survived, dtype: int64

In [4]:
df.isnull().sum()

Pclass        0
Sex           0
Age         132
SibSp         0
Parch         0
Ticket        0
Fare          0
Embarked      1
Survived      0
dtype: int64

In [5]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='median')
df['Age'] = imp.fit_transform(df[['Age']])

In [6]:
for i in df['Pclass'].unique():
    for j in df['Sex'].unique():
        print("Average Age of " + str(j) + "s in Class " + str(i) + " is: ",np.round(np.mean(df[(df.Pclass == i) & (df.Sex == j)].Age)))  

Average Age of females in Class 2 is:  28.0
Average Age of males in Class 2 is:  30.0
Average Age of females in Class 3 is:  25.0
Average Age of males in Class 3 is:  28.0
Average Age of females in Class 1 is:  33.0
Average Age of males in Class 1 is:  38.0


In [7]:
for i in range(df.shape[0]):
    if np.isnan(df.Age[i]):
        df.Age[i] = np.round(np.mean(df[(df.Pclass == df.Pclass[i]) & (df.Sex == df.Sex[i])].Age))

In [8]:
df.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    1
Survived    0
dtype: int64

In [9]:
df.shape

(668, 9)

In [10]:
df1 = df.dropna()
df1.shape

(667, 9)

In [11]:
df1.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
Survived    0
dtype: int64

In [12]:
df1.dtypes

Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Embarked     object
Survived      int64
dtype: object

In [15]:
P_class = pd.get_dummies(df1.Pclass, drop_first = True)
Gender = pd.get_dummies(df1.Sex, drop_first = True)
Embark = pd.get_dummies(df1.Embarked, drop_first = True)
df2 = pd.concat([df1, P_class, Gender, Embark], axis = 1)
df2

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived,2,3,male,Q,S
0,2,female,29.0,1,0,228414,26.0000,S,1,1,0,0,0,1
1,3,male,29.0,0,0,A/5 2466,8.0500,S,0,0,1,1,0,1
2,2,male,39.0,0,0,250655,26.0000,S,0,1,0,1,0,1
3,3,female,29.0,0,4,349909,21.0750,S,0,0,1,0,0,1
4,3,male,25.0,0,0,SOTON/OQ 392076,7.0500,S,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,2,female,17.0,0,0,SO/C 14885,10.5000,S,1,1,0,0,0,1
664,3,male,29.0,0,0,372622,7.7500,Q,0,0,1,1,1,0
665,3,male,32.0,0,0,1601,56.4958,S,1,0,1,1,0,1
666,3,female,22.0,0,0,7553,9.8375,S,0,0,1,0,0,1


In [16]:
df3 = df2.drop(['Pclass', 'Sex', 'Ticket', 'Embarked'], axis = 1)
df3

Unnamed: 0,Age,SibSp,Parch,Fare,Survived,2,3,male,Q,S
0,29.0,1,0,26.0000,1,1,0,0,0,1
1,29.0,0,0,8.0500,0,0,1,1,0,1
2,39.0,0,0,26.0000,0,1,0,1,0,1
3,29.0,0,4,21.0750,0,0,1,0,0,1
4,25.0,0,0,7.0500,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
663,17.0,0,0,10.5000,1,1,0,0,0,1
664,29.0,0,0,7.7500,0,0,1,1,1,0
665,32.0,0,0,56.4958,1,0,1,1,0,1
666,22.0,0,0,9.8375,0,0,1,0,0,1


In [17]:
seed = 10
x_train, x_test, y_train, y_test = model_selection.train_test_split(df3.drop('Survived', axis=1), df3['Survived'], random_state = seed, stratify = df3['Survived'])

In [19]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C = 2, solver = 'saga', random_state = 10, class_weight='balanced', max_iter = 100000)
model.fit(x_train,y_train)



LogisticRegression(C=2, class_weight='balanced', max_iter=100000,
                   random_state=10, solver='saga')

In [20]:
model.predict_proba(x_test)



array([[0.60663332, 0.39336668],
       [0.57356816, 0.42643184],
       [0.77967895, 0.22032105],
       [0.75378472, 0.24621528],
       [0.32808182, 0.67191818],
       [0.05679823, 0.94320177],
       [0.64422235, 0.35577765],
       [0.75608498, 0.24391502],
       [0.36617837, 0.63382163],
       [0.10372311, 0.89627689],
       [0.31105819, 0.68894181],
       [0.77324353, 0.22675647],
       [0.45401976, 0.54598024],
       [0.44292503, 0.55707497],
       [0.90579322, 0.09420678],
       [0.49257228, 0.50742772],
       [0.14280186, 0.85719814],
       [0.76337113, 0.23662887],
       [0.45354801, 0.54645199],
       [0.60967003, 0.39032997],
       [0.72832597, 0.27167403],
       [0.5621626 , 0.4378374 ],
       [0.62579648, 0.37420352],
       [0.4425997 , 0.5574003 ],
       [0.76427911, 0.23572089],
       [0.27786229, 0.72213771],
       [0.05424265, 0.94575735],
       [0.3383203 , 0.6616797 ],
       [0.59503092, 0.40496908],
       [0.75018528, 0.24981472],
       [0.

In [21]:
y_pred = model.predict(x_test)
y_pred



array([0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1], dtype=int64)

## Writing Predictions, for Testing data

In [22]:
df_test = pd.read_csv('test_titanic_x_test.csv')
df_test = df_test.drop(['Name', 'Cabin'], axis = 1)
df_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,2,male,8.0,1,1,C.A. 33112,36.7500,S
1,1,female,49.0,0,0,17465,25.9292,S
2,3,male,,0,0,36865,7.7375,Q
3,2,female,24.0,2,1,243847,27.0000,S
4,1,male,36.0,0,0,PC 17473,26.2875,S
...,...,...,...,...,...,...,...,...
218,3,male,20.0,1,0,STON/O 2. 3101285,7.9250,S
219,1,male,45.0,0,0,113050,26.5500,S
220,1,female,17.0,1,0,PC 17758,108.9000,C
221,3,male,43.0,0,0,C 7075,6.4500,S


In [23]:
df_test.isnull().sum()

Pclass       0
Sex          0
Age         45
SibSp        0
Parch        0
Ticket       0
Fare         0
Embarked     1
dtype: int64

In [24]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='median')
df_test['Age'] = imp.fit_transform(df_test[['Age']])

In [25]:
for i in df_test['Pclass'].unique():
    for j in df_test['Sex'].unique():
        print("Average Age of " + str(j) + "s in Class " + str(i) + " is: ",np.round(np.mean(df_test[(df_test.Pclass == i) & (df_test.Sex == j)].Age)))  

Average Age of males in Class 2 is:  32.0
Average Age of females in Class 2 is:  30.0
Average Age of males in Class 1 is:  41.0
Average Age of females in Class 1 is:  37.0
Average Age of males in Class 3 is:  26.0
Average Age of females in Class 3 is:  21.0


In [26]:
for i in range(df_test.shape[0]):
    if np.isnan(df_test.Age[i]):
        df_test.Age[i] = np.round(np.mean(df_test[(df_test.Pclass == df_test.Pclass[i]) & (df_test.Sex == df_test.Sex[i])].Age))

In [2]:
df1_test = df_test.dropna()
df1_test.shape


NameError: name 'df_test' is not defined

In [28]:
df1_test.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
dtype: int64

In [29]:
P_class = pd.get_dummies(df1_test.Pclass, drop_first = True)
Gender = pd.get_dummies(df1_test.Sex, drop_first = True)
Embark = pd.get_dummies(df1_test.Embarked, drop_first = True)
df2_test = pd.concat([df1_test, P_class, Gender, Embark], axis = 1)
df2_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,2,3,male,Q,S
0,2,male,8.0,1,1,C.A. 33112,36.7500,S,1,0,1,0,1
1,1,female,49.0,0,0,17465,25.9292,S,0,0,0,0,1
2,3,male,27.0,0,0,36865,7.7375,Q,0,1,1,1,0
3,2,female,24.0,2,1,243847,27.0000,S,1,0,0,0,1
4,1,male,36.0,0,0,PC 17473,26.2875,S,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,3,male,20.0,1,0,STON/O 2. 3101285,7.9250,S,0,1,1,0,1
219,1,male,45.0,0,0,113050,26.5500,S,0,0,1,0,1
220,1,female,17.0,1,0,PC 17758,108.9000,C,0,0,0,0,0
221,3,male,43.0,0,0,C 7075,6.4500,S,0,1,1,0,1


In [30]:
df3_test = df2_test.drop(['Pclass', 'Sex', 'Ticket', 'Embarked'], axis = 1)
df3_test

Unnamed: 0,Age,SibSp,Parch,Fare,2,3,male,Q,S
0,8.0,1,1,36.7500,1,0,1,0,1
1,49.0,0,0,25.9292,0,0,0,0,1
2,27.0,0,0,7.7375,0,1,1,1,0
3,24.0,2,1,27.0000,1,0,0,0,1
4,36.0,0,0,26.2875,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...
218,20.0,1,0,7.9250,0,1,1,0,1
219,45.0,0,0,26.5500,0,0,1,0,1
220,17.0,1,0,108.9000,0,0,0,0,0
221,43.0,0,0,6.4500,0,1,1,0,1


In [31]:
X_test = df3_test.to_numpy()
print(X_test)

[[ 8.   1.   1.  ...  1.   0.   1. ]
 [49.   0.   0.  ...  0.   0.   1. ]
 [27.   0.   0.  ...  1.   1.   0. ]
 ...
 [17.   1.   0.  ...  0.   0.   0. ]
 [43.   0.   0.  ...  1.   0.   1. ]
 [36.5  0.   2.  ...  1.   0.   1. ]]


In [32]:
ans_Y_pred = model.predict(X_test)
print(ans_Y_pred)

[0 1 0 1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 1
 0 0 0 1 0 0 0 1 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0
 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 1 0 1 1 0 0 0 0 0
 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0
 1 1 1 0 0 0 1 0 0 1 0 0 0 1 1 1 1 0 0 0 1 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 1
 0 1 1 1 0 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0]


In [35]:
len(ans_Y_pred)

222

In [34]:
np.savetxt("titanic_result.csv", ans_Y_pred, delimiter=",", fmt='%f')