# Logistic Regression

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler,StandardScaler,OneHotEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

In [3]:
# load the df
df = pd.read_csv('../../data/df_dummies.csv')

In [4]:
df['heart_attack_risk'].value_counts()

heart_attack_risk
0    5624
1    3139
Name: count, dtype: int64

In [5]:
# train_test_split
features = df.drop(columns=['heart_attack_risk','income'])
target = df['heart_attack_risk']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)

In [6]:
# normalization
normalizer = MinMaxScaler()
normalizer.fit(X_train)
X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)

X_train_norm = pd.DataFrame(X_train_norm,columns=X_train.columns)
X_test_norm = pd.DataFrame(X_test_norm,columns=X_test.columns)

# deal with imbalance dataset

## smote

In [1]:
from imblearn.over_sampling import SMOTE

In [7]:
sm = SMOTE(random_state=123,sampling_strategy=1.0)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train_norm,y_train)

In [8]:
y_train_SMOTE.value_counts()

heart_attack_risk
0    4510
1    4510
Name: count, dtype: int64

In [30]:
# LR
weight = {0:1, 1:1.1}

lr=LogisticRegression(class_weight=weight,random_state=10)

lr.fit(X_train_SMOTE,y_train_SMOTE)

In [31]:
pred_lr=lr.predict(X_test_norm)

In [32]:
print(classification_report(y_test,pred_lr))

              precision    recall  f1-score   support

           0       0.65      0.26      0.38      1114
           1       0.37      0.75      0.50       639

    accuracy                           0.44      1753
   macro avg       0.51      0.51      0.44      1753
weighted avg       0.55      0.44      0.42      1753



In [33]:
confusion_matrix(y_test,pred_lr)

array([[295, 819],
       [159, 480]], dtype=int64)

In [13]:
import plotly.express as px

In [34]:
cm = pd.DataFrame(confusion_matrix(y_test, pred_lr))
# Rename columns to predicted values - 0 = No Risk, 1 = Risk
cm.rename({0: 'No - True', 1: 'Yes - True'}, axis=1, inplace=True)
# Rename rows to real values - 0 = No Risk, 1 = Risk
cm.rename({0: 'No - Pred', 1: 'Yes - Pred'}, axis=0, inplace=True)
px.imshow(cm, text_auto=True, color_continuous_scale='RdBu', color_continuous_midpoint=0)

## Grid Search

In [15]:
grid = {"penalty":["l1", "l2"],
        "C":[0.01,0.1,1,10,100]}

In [None]:
log_lr = LogisticRegression(random_state=10)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
model = GridSearchCV(estimator = log_lr, param_grid = grid, cv=5)

In [None]:
model.fit(X_train_SMOTE, y_train_SMOTE)

In [None]:
model.best_params_

In [None]:
best_model = model.best_estimator_

In [None]:
pred = best_model.predict(X_test_norm)

print(classification_report(y_test, pred))

In [None]:
sns.heatmap(confusion_matrix(y_test, pred), annot=True,fmt='g')

In [35]:
df.columns

Index(['Unnamed: 0', 'age', 'cholesterol', 'heart_rate', 'diabetes',
       'family_history', 'smoking', 'obesity', 'alcohol_consumption',
       'exercise_hours_per_week', 'previous_heart_problems', 'medication_use',
       'stress_level', 'sedentary_hours_per_day', 'income', 'bmi',
       'triglycerides', 'physical_activity_days_per_week',
       'sleep_hours_per_day', 'heart_attack_risk', 'sex_Male',
       'continent_Asia', 'continent_Australia', 'continent_Europe',
       'continent_North America', 'continent_South America',
       'hemisphere_Southern Hemisphere', 'diet_Healthy', 'diet_Unhealthy',
       'blood_pressure_class_B', 'blood_pressure_class_C',
       'blood_pressure_class_D'],
      dtype='object')

In [36]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,age,cholesterol,heart_rate,diabetes,family_history,smoking,obesity,alcohol_consumption,exercise_hours_per_week,...,continent_Australia,continent_Europe,continent_North America,continent_South America,hemisphere_Southern Hemisphere,diet_Healthy,diet_Unhealthy,blood_pressure_class_B,blood_pressure_class_C,blood_pressure_class_D
0,0,67,208,72,0,0,1,0,0,4.168189,...,False,False,False,True,True,False,False,False,False,True
1,1,21,389,98,1,1,1,1,1,1.813242,...,False,False,True,False,False,False,True,False,False,True
2,2,21,324,72,1,0,0,0,0,2.078353,...,False,True,False,False,False,True,False,False,False,True


In [None]:
list_cag = ['diabetes','family_history','smoking','obesity','alcohol_consumption','medication_use','stress_level',]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Assuming X contains your features and y contains your target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the columns that you want to give more weight
columns_to_scale = ['feature1', 'feature2', 'feature3']

# Scale the selected columns using StandardScaler
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[columns_to_scale] = scaler.fit_transform(X_train_scaled[columns_to_scale])
X_test_scaled[columns_to_scale] = scaler.transform(X_test_scaled[columns_to_scale])

# Fit the logistic regression model
lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)

# Evaluate the model
accuracy = lr_model.score(X_test_scaled, y_test)


