#### What is Logistic Regression?
Logistic regression is a statistical method for binary classification (i.e., predicting one of two outcomes). Despite the word “regression” in its name, it is used mainly for classification tasks. The core idea is to model the probability that a given input belongs to a particular category.
#### Key Points:
- **Binary output:** The predicted class is either 0 or 1 (equivalently “No”/“Yes” or “False”/“True”). The model itself outputs a probability, which is turned into a class by thresholding (typically at 0.5).
- **Sigmoid function:** Unlike linear regression, logistic regression passes the linear combination of input features, $z = w^\top x + b$, through the sigmoid (logistic) function $\sigma(z) = \frac{1}{1 + e^{-z}}$, which squashes the output into the range between 0 and 1.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  mean_squared_error,r2_score,accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

In [2]:
# Load the Titanic passenger records into a DataFrame.
# NOTE(review): despite "test" in the file name, the data includes a Survived
# column (i.e. labels) -- confirm this is the intended file.
titanic_csv = "titanic_test.csv"
df = pd.read_csv(titanic_csv)

# Preview five randomly chosen rows.
df.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
295,296,0,1,"Lewy, Mr. Ervin G",male,,0,0,PC 17612,27.7208,,C
215,216,1,1,"Newell, Miss. Madeleine",female,31.0,1,0,35273,113.275,D36,C
94,95,0,3,"Coxon, Mr. Daniel",male,59.0,0,0,364500,7.25,,S
286,287,1,3,"de Mulder, Mr. Theodore",male,30.0,0,0,345774,9.5,,S
250,251,0,3,"Reed, Mr. James George",male,,0,0,362316,7.25,,S


In [3]:
# Column dtypes and non-null counts.
df.info()

# Number of missing values in each column.
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Impute missing ages with the column mean. Assigning the result back replaces
# the inplace=True call and leaves every other column untouched.
# NOTE(review): the mean is computed over all rows, i.e. before the train/test split.
df["Age"] = df["Age"].fillna(df["Age"].mean())

# Cabin: first run of digits -> Cabin_num, first character -> Cabin_cat.
# Rows with no cabin stay NaN in both columns.
df['Cabin_num'] = df['Cabin'].str.extract(r'(\d+)', expand=False)
df['Cabin_cat'] = df['Cabin'].str[0]

# Ticket: split on whitespace once (vectorised) instead of once per derived column.
# Last token -> Ticket_num, first token -> Ticket_cat.
ticket_tokens = df['Ticket'].str.split()
df['Ticket_num'] = ticket_tokens.str[-1]

# If the first token is all digits the ticket has no prefix, so its category is NaN.
ticket_prefix = ticket_tokens.str[0]
df['Ticket_cat'] = ticket_prefix.mask(ticket_prefix.str.isdigit())

df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_num,Cabin_cat,Ticket_num,Ticket_cat
279,280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35.0,1,1,C.A. 2673,20.25,,S,,,2673,C.A.
653,654,1,3,"O'Leary, Miss. Hanora ""Norah""",female,29.699118,0,0,330919,7.8292,,Q,,,330919,
591,592,1,1,"Stephenson, Mrs. Walter Bertram (Martha Eustis)",female,52.0,1,0,36947,78.2667,D20,C,20.0,D,36947,
545,546,0,1,"Nicholson, Mr. Arthur Ernest",male,64.0,0,0,693,26.0,,S,,,693,
774,775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54.0,1,3,29105,23.0,,S,,,29105,


In [5]:
# Remove the raw Cabin and Ticket strings; the Cabin_* and Ticket_* columns
# derived from them are kept.
df = df.drop(['Cabin', 'Ticket'], axis='columns')
df.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_num,Cabin_cat,Ticket_num,Ticket_cat
13,14,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,31.275,S,,,347082,
146,147,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,7.7958,S,,,350043,
97,98,1,1,"Greenfield, Mr. William Bertram",male,23.0,0,1,63.3583,C,10.0,D,17759,PC
558,559,1,1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",female,39.0,1,1,79.65,S,67.0,E,110413,
385,386,0,2,"Davies, Mr. Charles Henry",male,18.0,0,0,73.5,S,,,14879,S.O.C.


In [6]:
# Convert categorical (object/string) columns into indicator columns.
# drop_first=True omits the first level of each encoded column.
# NOTE(review): Name is still in the frame here, so every distinct passenger name
# becomes its own Name_* indicator column -- consider dropping it before encoding.
df = pd.get_dummies(data=df, drop_first=True)
df.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,"Name_Abbott, Mr. Rossmore Edward","Name_Abbott, Mrs. Stanton (Rosa Hunt)","Name_Abelson, Mr. Samuel",...,Ticket_cat_SOTON/O.Q.,Ticket_cat_SOTON/O2,Ticket_cat_SOTON/OQ,Ticket_cat_STON/O,Ticket_cat_STON/O2.,Ticket_cat_SW/PP,Ticket_cat_W./C.,Ticket_cat_W.E.P.,Ticket_cat_W/C,Ticket_cat_WE/P
113,114,0,3,20.0,1,0,9.825,False,False,False,...,False,False,False,False,False,False,False,False,False,False
301,302,1,3,29.699118,2,0,23.25,False,False,False,...,False,False,False,False,False,False,False,False,False,False
667,668,0,3,29.699118,0,0,7.775,False,False,False,...,False,False,False,False,False,False,False,False,False,False
839,840,1,1,29.699118,0,0,29.7,False,False,False,...,False,False,False,False,False,False,False,False,False,False
623,624,0,3,21.0,0,0,7.8542,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# Separate the feature matrix X from the target vector y.
column_to_move = 'Survived'

# Move the target column to the end so it is always the last column.
# pop + re-assign is safe to re-run: the column simply ends up last again.
df[column_to_move] = df.pop(column_to_move)

# Features are every column except the last one. Slicing relative to the end
# (instead of a hard-coded column count) guarantees the target never leaks into X
# and no feature is silently dropped when the number of encoded columns changes.
# NOTE(review): PassengerId is still among the features; it looks like a row
# identifier -- confirm whether it should be excluded.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit a logistic regression classifier with scikit-learn's default settings.
logistic = LogisticRegression()
logistic.fit(X_train, y_train)

In [9]:
# Predict classes for the held-out rows.
y_pred = logistic.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")

# scikit-learn's confusion_matrix puts the true labels on the rows and the predicted
# labels on the columns, with classes in sorted order. For the labels 0/1 the layout is
#   [[TN, FP],
#    [FN, TP]]
# The matrix is printed on its own line so that its rows line up.
print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

# The classification report lists precision, recall, F1-score and support for each
# class. It starts on its own line so the header row lines up with its columns.
print(f"\nClassification Report:\n{classification_report(y_test, y_pred)}")

Accuracy: 75.98%

Confusion Matrix: [[102   3]
 [ 40  34]]

Classification Report:              precision    recall  f1-score   support

           0       0.72      0.97      0.83       105
           1       0.92      0.46      0.61        74

    accuracy                           0.76       179
   macro avg       0.82      0.72      0.72       179
weighted avg       0.80      0.76      0.74       179



In [12]:
# Disabled decision-region plot; nothing in this cell executes.
# NOTE(review): presumably disabled because plot_decision_regions cannot draw the
# full feature matrix X directly -- confirm against the mlxtend docs, or delete the cell.
# NOTE(review): xlabel is set twice below (the second call would override the first)
# and "Survied" is misspelled; fix both if this is ever re-enabled.
# from mlxtend.plotting import plot_decision_regions
# plot_decision_regions(X,y,logistic,legend=1)
# plt.xlabel("Survived")
# plt.xlabel("Not Survied")
# plt.title("Titanic Using Logistic Reg")