# Logistic Regression with Titanic Dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("./titanic.csv", index_col="PassengerId")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [3]:
# age has normal distribution, fill with mean age
df.Age.fillna(df.Age.mean(), inplace=True)
# replace categorical variables with numbers
sex = pd.get_dummies(df.Sex, drop_first=True)
embark = pd.get_dummies(df.Embarked, drop_first=True)
# cabin null is 0 and take the first Letter of Cabin number
df.Cabin = df.Cabin.apply(lambda x : "0" if pd.isnull(x) else str(x)[0])
# replace categorical variables with numbers
df.Cabin.replace({"A": 1, "B":2, "C": 3,"D": 4,"E": 5,"F": 6,"G": 7,"T": 8}, inplace=True)
# unused columns are dropped
df.drop(["Name","Ticket","Embarked", "Sex"], axis=1, inplace=True)
df = pd.concat([df,sex,embark], axis = 1)
# 2 null values are dropped
df.dropna(inplace=True)

In [4]:
x = df.drop("Survived",axis = 1)
y = df["Survived"]

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25,random_state= 42)

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
logmodel = LogisticRegression(max_iter= 350)
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 668 entries, 299 to 103
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  668 non-null    int64  
 1   Age     668 non-null    float64
 2   SibSp   668 non-null    int64  
 3   Parch   668 non-null    int64  
 4   Fare    668 non-null    float64
 5   Cabin   668 non-null    object 
 6   male    668 non-null    uint8  
 7   Q       668 non-null    uint8  
 8   S       668 non-null    uint8  
dtypes: float64(2), int64(3), object(1), uint8(3)
memory usage: 38.5+ KB


In [9]:
logmodel.fit(x_train, y_train)

LogisticRegression(max_iter=350)

In [10]:
predictions = logmodel.predict(x_test)

In [11]:
from sklearn.metrics import classification_report

In [12]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       134
           1       0.79      0.74      0.76        89

    accuracy                           0.82       223
   macro avg       0.81      0.80      0.81       223
weighted avg       0.82      0.82      0.82       223



In [13]:
from sklearn.metrics import confusion_matrix

In [14]:
pd.DataFrame(confusion_matrix(y_test,predictions), columns= ["Pred No","Pred Yes"], 
             index= ["Act No", "Act Yes"] )

Unnamed: 0,Pred No,Pred Yes
Act No,116,18
Act Yes,23,66


In [15]:
from sklearn.metrics import accuracy_score

In [16]:
accuracy_score(y_test, predictions)

0.8161434977578476

In [17]:
df.head()

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin,male,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,22.0,1,0,7.25,0,1,0,1
2,1,1,38.0,1,0,71.2833,3,0,0,0
3,1,3,26.0,0,0,7.925,0,0,0,1
4,1,1,35.0,1,0,53.1,3,0,0,1
5,0,3,35.0,0,0,8.05,0,1,0,1
