In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.groupby('Survived')['Fare'].mean()

Survived
0    22.117887
1    48.395408
Name: Fare, dtype: float64

In [5]:
# frequency encoding
df['ticket_encoded'] = df['Ticket'].map(df['Ticket'].value_counts())

In [6]:
df['ticket_encoded'].value_counts()

ticket_encoded
1    547
2    188
3     63
4     44
7     21
6     18
5     10
Name: count, dtype: int64

In [7]:
df['Ticket'].value_counts()

Ticket
347082              7
1601                7
CA. 2343            7
3101295             6
CA 2144             6
                   ..
PC 17590            1
17463               1
330877              1
373450              1
STON/O2. 3101282    1
Name: count, Length: 681, dtype: int64

In [8]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_encoded
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,2
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1


In [9]:
df['Gender'] = pd.get_dummies(data = df['Sex'], drop_first=True, dtype=np.int_)

In [10]:
# male -> 1
# female -> 0

In [11]:
df[df['Embarked'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_encoded,Gender
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,2,0
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,2,0


In [12]:
df[df['Cabin'] == 'B28']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_encoded,Gender
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,2,0
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,2,0


In [13]:
df['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [14]:
df.isna().sum()

PassengerId         0
Survived            0
Pclass              0
Name                0
Sex                 0
Age               177
SibSp               0
Parch               0
Ticket              0
Fare                0
Cabin             687
Embarked            2
ticket_encoded      0
Gender              0
dtype: int64

In [15]:
# dropping two rows with missing embarked
df.dropna(axis = 0, how = 'any', subset = 'Embarked', inplace = True)

In [16]:
df.shape[0]

889

In [17]:
embarked_dummies = pd.get_dummies(data = df['Embarked'], dtype=np.int_ )

In [18]:
# if both Q and S are 0 then it is C
df = pd.concat(objs=[df, embarked_dummies], axis = 1)

In [19]:
df['Age'] = df['Age'].fillna(df['Age'].median())

In [20]:
df_numeric = df.select_dtypes(include=np.number)

In [21]:
df_numeric

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,ticket_encoded,Gender,C,Q,S
0,1,0,3,22.0,1,0,7.2500,1,1,0,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,1,3,26.0,0,0,7.9250,1,0,0,0,1
3,4,1,1,35.0,1,0,53.1000,2,0,0,0,1
4,5,0,3,35.0,0,0,8.0500,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.0,0,0,13.0000,1,1,0,0,1
887,888,1,1,19.0,0,0,30.0000,1,0,0,0,1
888,889,0,3,28.0,1,2,23.4500,2,0,0,0,1
889,890,1,1,26.0,0,0,30.0000,1,1,1,0,0


In [22]:
Pclass_dummies = pd.get_dummies(df['Pclass'], dtype= np.int_)

In [23]:
ticket_encoded_dummies = pd.get_dummies(df['ticket_encoded'], dtype=np.int_)

In [24]:
df_numeric = pd.concat(objs= [df_numeric, Pclass_dummies, ticket_encoded_dummies], axis = 1)

In [25]:
df_numeric.drop(labels=['Pclass', 'ticket_encoded', 'PassengerId'], axis = 1, inplace= True)

In [26]:
df_numeric

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Gender,C,Q,S,1,2,3,1.1,2.1,3.1,4,5,6,7
0,0,22.0,1,0,7.2500,1,0,0,1,0,0,1,1,0,0,0,0,0,0
1,1,38.0,1,0,71.2833,0,1,0,0,1,0,0,1,0,0,0,0,0,0
2,1,26.0,0,0,7.9250,0,0,0,1,0,0,1,1,0,0,0,0,0,0
3,1,35.0,1,0,53.1000,0,0,0,1,1,0,0,0,1,0,0,0,0,0
4,0,35.0,0,0,8.0500,1,0,0,1,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,0,13.0000,1,0,0,1,0,1,0,1,0,0,0,0,0,0
887,1,19.0,0,0,30.0000,0,0,0,1,1,0,0,1,0,0,0,0,0,0
888,0,28.0,1,2,23.4500,0,0,0,1,0,0,1,0,1,0,0,0,0,0
889,1,26.0,0,0,30.0000,1,1,0,0,1,0,0,1,0,0,0,0,0,0


In [27]:
def strType(x):
    stringified = {}
    for el in x:
        stringified[el] =str(el)
    return stringified

In [28]:
# while encoding, the encoded numerical data got the non string feature name, so converting to string
df_numeric.rename(columns= strType(df_numeric.columns), inplace =True)

In [29]:
X = df_numeric.drop('Survived', axis = 1)
y = df_numeric['Survived']

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

In [31]:
scaler = StandardScaler()
scaler.fit(X)
scaled_X_train = scaler.transform(X)

In [32]:
model = LogisticRegression(solver='saga', penalty='elasticnet', max_iter= 10000 )
C = np.logspace(0, 20, 10)
l1_ratio = np.linspace(0,1,10)
grid_params = {'C':C, 'l1_ratio':l1_ratio}

In [33]:
grid_model = GridSearchCV(estimator= model, param_grid= grid_params, cv = 5, verbose = 1)

In [34]:
grid_model.fit(scaled_X_train, y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [35]:
grid_model.best_estimator_

In [36]:
grid_model.best_score_

np.float64(0.7964324255697328)

In [37]:
y_pred = grid_model.predict(scaled_X_train)

In [38]:
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       549
           1       0.76      0.71      0.73       340

    accuracy                           0.80       889
   macro avg       0.79      0.79      0.79       889
weighted avg       0.80      0.80      0.80       889



In [39]:
# 1. check without drop_first in dummy encoding (i.e do one-hot encoding, donot drop first)
# 2. missing ages were filled with median of age feature. Find if there's better way to fillna
# 3. Analyze other way to improve model if not work with test.csv to predict and upload to kaggle 

In [40]:
test_df = pd.read_csv('test.csv')

In [41]:
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [42]:
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

In [43]:
test_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [44]:
# frequency encoding for tickets
test_df['ticket_encoded'] = test_df['Ticket'].map(test_df['Ticket'].value_counts())

# one-hot encoding for other categorical features
test_Pclass_dummies = pd.get_dummies(test_df['Pclass'], dtype= np.int_)
test_ticket_encoded_dummies = pd.get_dummies(test_df['ticket_encoded'], dtype=np.int_)
test_embarked_dummies = pd.get_dummies(test_df['Embarked'], dtype = np.int_)
test_Sex_encoded = pd.get_dummies(test_df['Sex'], dtype = np.int_, drop_first=True)

In [45]:
test_Sex_encoded.rename(columns={'male':'Gender'}, inplace=True)

In [46]:
test_df = pd.concat(objs=[test_df, test_Sex_encoded, 
                          test_embarked_dummies,
                          test_Pclass_dummies,
                          test_ticket_encoded_dummies,
                          ], axis = 1)

In [47]:
test_df_numeric = test_df.select_dtypes(include = np.number)

In [48]:
final_data = pd.DataFrame()

In [49]:
final_data['PassengerId'] = test_df_numeric['PassengerId']

In [50]:
test_df_numeric['ticket_encoded'].value_counts()

ticket_encoded
1    321
2     66
3     18
4      8
5      5
Name: count, dtype: int64

In [51]:
test_df_numeric

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,ticket_encoded,Gender,C,Q,S,1,2,3,1.1,2.1,3.1,4,5
0,892,3,34.5,0,0,7.8292,1,1,0,1,0,0,0,1,1,0,0,0,0
1,893,3,47.0,1,0,7.0000,1,0,0,0,1,0,0,1,1,0,0,0,0
2,894,2,62.0,0,0,9.6875,1,1,0,1,0,0,1,0,1,0,0,0,0
3,895,3,27.0,0,0,8.6625,1,1,0,0,1,0,0,1,1,0,0,0,0
4,896,3,22.0,1,1,12.2875,1,0,0,0,1,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,27.0,0,0,8.0500,1,1,0,0,1,0,0,1,1,0,0,0,0
414,1306,1,39.0,0,0,108.9000,1,0,1,0,0,1,0,0,1,0,0,0,0
415,1307,3,38.5,0,0,7.2500,1,1,0,0,1,0,0,1,1,0,0,0,0
416,1308,3,27.0,0,0,8.0500,1,1,0,0,1,0,0,1,1,0,0,0,0


In [52]:
X_test = test_df_numeric.drop(labels=['PassengerId', 'ticket_encoded', 'Pclass'], axis = 1)

In [53]:
X_test.shape

(418, 16)

In [54]:
X_test['6'] = np.zeros(418, dtype = np.int_)
X_test['7'] = np.zeros(418, dtype = np.int_)

In [55]:
X_test.rename(columns=strType(X_test.columns), inplace = True)

In [56]:
scaled_X_test = scaler.transform(X_test)

In [57]:
y_test_pred = grid_model.predict(scaled_X_test)

In [60]:
final_data['Survived'] = y_test_pred

In [62]:
final_data['Survived'].value_counts()

Survived
0    262
1    156
Name: count, dtype: int64

In [63]:
final_data.to_csv('final_data_encod_grid.csv', index= False)