## Logistic Regression Model

In [4]:
# Importing dependencies
import os
from pathlib import Path

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sqlalchemy import create_engine

In [5]:
# Connecting to PostgreSQL
# The SQLAlchemy connection URL embeds the database password, so it is read
# from the PROJ4_DATABASE_URL environment variable instead of being committed
# with the notebook. Example (shell):
#   export PROJ4_DATABASE_URL='postgresql://<user>:<password>@localhost:5432/proj4_db'
database_url = os.environ.get('PROJ4_DATABASE_URL')
if not database_url:
    # Fail early with an actionable message rather than a driver-level error.
    raise RuntimeError(
        "PROJ4_DATABASE_URL is not set. Export a SQLAlchemy URL such as "
        "'postgresql://<user>:<password>@localhost:5432/proj4_db' before running this notebook."
    )
engine = create_engine(database_url)

# Load every row of the cleaned_data table into a DataFrame and preview it.
query = "SELECT * FROM cleaned_data"
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,sex,age,education,smokingStatus,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHDRisk
0,1.0,39,4,0.0,0,False,False,False,False,195,106.0,70.0,26.97,80,77,False
1,0.0,46,2,0.0,0,False,False,False,False,250,121.0,81.0,28.73,95,76,False
2,1.0,48,1,1.0,20,False,False,False,False,245,127.5,80.0,25.34,75,70,False
3,0.0,61,3,1.0,30,False,False,True,False,225,150.0,95.0,28.58,65,103,True
4,0.0,46,3,1.0,23,False,False,False,False,285,130.0,84.0,23.1,85,85,False


In [6]:
# Separate the feature matrix from the target: CHDRisk is the label to predict,
# every other column is kept as a feature.
X = df.drop(columns=['CHDRisk'])
y = df.loc[:, 'CHDRisk']

In [7]:
# Class distribution of the target over the full dataset (before splitting);
# used to check how imbalanced CHDRisk is.
y.value_counts()

CHDRisk
False    3084
True      553
Name: count, dtype: int64

In [8]:
# Hold out a test set. Stratifying on y keeps the CHDRisk class ratio the same
# in the training and testing splits; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,  # scikit-learn's default when no size is given, stated explicitly
    random_state=1,
    stratify=y,
)

In [9]:
# Class distribution of the target in the training split only
# (before any resampling).
y_train.value_counts()

CHDRisk
False    2312
True      415
Name: count, dtype: int64

In [10]:
# Balance the target classes with SMOTE (synthetic minority over-sampling).
# Only the training split is resampled; X_test / y_test are left untouched.
# NOTE(review): SMOTE synthesizes rows by interpolating between neighbours, and
# several features previewed above look boolean/categorical (e.g. BPMeds,
# diabetes, education) — consider SMOTENC and/or scaling before resampling;
# TODO confirm the impact on the synthetic rows.
smote = SMOTE(random_state=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train, y=y_train)

In [11]:
# Class distribution after SMOTE — both classes should now have equal counts.
y_train_resampled.value_counts()

CHDRisk
False    2312
True     2312
Name: count, dtype: int64

In [12]:
# Sanity check: (rows, features) of the resampled training features.
X_train_resampled.shape

(4624, 15)

In [13]:
# Sanity check: the resampled target must have one label per resampled row.
y_train_resampled.shape

(4624,)

In [14]:
# Standardize the features. The scaler is fitted on the (resampled) training
# data only and then reused for the test data, so no test-set statistics leak
# into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_scaler = scaler  # alias for the fitted scaler

X_test_scaled = scaler.transform(X_test)

In [15]:
# Baseline logistic regression classifier (L-BFGS solver, up to 200 iterations,
# fixed seed for reproducibility)
classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=200,
    random_state=1,
)

In [16]:
# Train the logistic regression model on the standardized, SMOTE-balanced
# training features and their resampled labels.
classifier.fit(X_train_scaled, y_train_resampled)

In [17]:
# Predict CHDRisk labels for the held-out test set (standardized with the
# scaler fitted on the training data).
y_pred = classifier.predict(X_test_scaled)

In [18]:
# Evaluate on the test set: overall accuracy plus per-class
# precision / recall / F1 from the classification report.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy}', 'Classification Report:', report, sep='\n')

Accuracy: 0.6626373626373626
Classification Report:
              precision    recall  f1-score   support

       False       0.91      0.66      0.77       772
        True       0.26      0.65      0.37       138

    accuracy                           0.66       910
   macro avg       0.59      0.66      0.57       910
weighted avg       0.81      0.66      0.71       910



In [19]:
# Confusion matrix on the test set: rows are the true classes, columns the
# predicted classes (0 = CHDRisk False, 1 = CHDRisk True).
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix_df = pd.DataFrame(
    data=conf_matrix,
    index=[f'Actual {label}' for label in (0, 1)],
    columns=[f'Predicted {label}' for label in (0, 1)],
)
print(conf_matrix_df)

          Predicted 0  Predicted 1
Actual 0          513          259
Actual 1           48           90


## Conclusion
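The baseline logistic regression model reached an accuracy of 0.66 on the test set, leaving clear room for improvement. However, out of all the baseline models, this model had the best recall for the minority class (`CHDRisk = True`, shown as 1 in the confusion matrix): a recall of 0.65, meaning 90 of the 138 at-risk cases were identified and 48 were missed (false negatives). Keeping false negatives low is important for health data, where missing an at-risk case is usually costlier than a false alarm. The trade-off is low precision for that class (0.26), since 259 cases that were not at risk were flagged as at risk.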