### This notebook reproduces the baseline model result of the following Kaggle kernel:
https://www.kaggle.com/code/jiteshmd/logistic-regression-from-scratch

In [28]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from preprocessing import encode_categorical_features

In [11]:
# Load data
# Read the heart-disease CSV from the path relative to the notebook's
# working directory.
heart_data = pd.read_csv(filepath_or_buffer="dataset/heart.csv")

# Print the column/dtype summary, then show the first five rows as the
# cell's rich output.
heart_data.info()
heart_data.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [12]:
# Encode categorical features
# Columns stored with dtype "object" are treated as the categorical ones.
object_columns = heart_data.select_dtypes(include=["object"]).columns
categorical_features = object_columns.tolist()

# NOTE(review): encode_categorical_features is a project-local helper from
# `preprocessing`; presumably it returns a frame with the listed columns
# replaced by integer codes -- confirm against its implementation, including
# whether it modifies `heart_data` in place.
heart_data_processed = encode_categorical_features(heart_data, categorical_features)

# Summarise the encoded frame and preview its first rows.
heart_data_processed.info()
heart_data_processed.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    int64  
 2   ChestPainType   918 non-null    int64  
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    int64  
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    int64  
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    int64  
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(11)
memory usage: 86.2 KB


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


In [24]:
SEED = 0
TARGET_COLUMN = "HeartDisease"

# Split train/test set
# Both the feature matrix and the label vector are selected by the target's
# name, so the split no longer depends on the target being the last column
# of the frame (the previous positional slice `iloc[:, :-1]` did).
# test_size is left at the scikit-learn default (25% of the rows held out);
# random_state pins the shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    heart_data_processed.drop(columns=TARGET_COLUMN),
    heart_data_processed[TARGET_COLUMN],
    random_state=SEED,
)

In [42]:
# Train logistic regression model
n_iter = 2000  # upper bound on solver iterations

# Unregularised logistic regression.
# NOTE(review): the string "none" for `penalty` was deprecated in
# scikit-learn 1.2 and removed in 1.4 (replaced by `penalty=None`); this cell
# therefore needs an older scikit-learn as written -- confirm the pinned
# version before upgrading.
lr = LogisticRegression(
    max_iter=n_iter,
    penalty="none",
)

# Fit on the training split; the fitted estimator is the cell's output.
lr.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=2000, penalty='none')

In [43]:
# Accuracy
# Mean accuracy of the fitted classifier on the held-out test split.
acc = lr.score(X=X_test, y=y_test)
print(f"Accuracy: {acc}")

Accuracy: 0.8304347826086956


In [44]:
# F1 score
# Predict hard class labels for the held-out test split.
preds = lr.predict(X=X_test)

# Binary F1 with class 1 treated as the positive label.
f1 = f1_score(
    y_true=y_test,
    y_pred=preds,
    average="binary",
    pos_label=1,
)
print(f"F1 score: {f1}")

F1 score: 0.8612099644128114


### Comment:
- The original kernel built logistic regression from scratch and got an accuracy of approximately 55%. I used `LogisticRegression` from scikit-learn and got an accuracy of 83.04% and an F1 score of 86.12% on the held-out test split.
- New benchmark score:
    - Accuracy: 83.04%
    - F1-score: 86.12%