In [1]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix , classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# Load the lifestyle data and report the raw (rows, columns) shape
file_path = 'lifestyle.csv'
df = pd.read_csv(file_path)
print(df.shape)

# Discard rows containing nulls, then remove the client_id column
df = df.dropna().drop(columns=["client_id"])

# Preview the first rows of the cleaned frame
df.head()

(253681, 8)


Unnamed: 0,diabetes_012,cholcheck,smoker,physactivity,fruits,veggies,hvyalcoholconsump
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,1.0,1.0,1.0,0.0
4,0.0,1.0,0.0,1.0,1.0,1.0,0.0


In [3]:
# Target (y) is the diabetes_012 column; features (X) are every remaining column
y = df["diabetes_012"]
X = df.drop(columns=["diabetes_012"])

In [4]:
# Inspect the dtype of each column left in the cleaned frame
df.dtypes

diabetes_012         float64
cholcheck            float64
smoker               float64
physactivity         float64
fruits               float64
veggies              float64
hvyalcoholconsump    float64
dtype: object

In [5]:
# Summary statistics for the feature columns
X.describe()

Unnamed: 0,cholcheck,smoker,physactivity,fruits,veggies,hvyalcoholconsump
count,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0
mean,0.96267,0.443169,0.756544,0.634256,0.81142,0.056197
std,0.189571,0.496761,0.429169,0.481639,0.391175,0.230302
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,1.0,0.0,1.0,0.0
50%,1.0,0.0,1.0,1.0,1.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Class distribution of the target, to see how imbalanced it is
y.value_counts()

0.0    218334
1.0     35346
Name: diabetes_012, dtype: int64

In [7]:
# Hold out half of the rows for testing; stratifying on y keeps the class
# ratio the same in both halves. train_test_split is imported in the first cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.50,
    stratify=y,
    random_state=1,
)
Counter(y_train)

Counter({0.0: 109167, 1.0: 17673})

In [8]:
# Display the held-out feature rows
X_test

Unnamed: 0,cholcheck,smoker,physactivity,fruits,veggies,hvyalcoholconsump
5404,1.0,1.0,1.0,0.0,1.0,1.0
134844,1.0,0.0,1.0,1.0,1.0,0.0
129255,1.0,0.0,1.0,1.0,1.0,0.0
224524,1.0,0.0,1.0,1.0,1.0,0.0
92752,1.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...
103802,1.0,1.0,0.0,0.0,1.0,0.0
7476,1.0,1.0,1.0,1.0,1.0,0.0
96653,1.0,0.0,1.0,1.0,1.0,0.0
87326,1.0,1.0,1.0,1.0,1.0,0.0


In [9]:
# Build the (not yet fitted) logistic regression classifier
model = LogisticRegression(random_state=1, max_iter=200, solver='lbfgs')



In [10]:
# Fit the model on the original (unresampled) training split
model.fit(X_train,y_train)

LogisticRegression(max_iter=200, random_state=1)

In [11]:
# Predict the held-out rows and line the predictions up against the true labels
predictions = model.predict(X_test)
results = pd.DataFrame(
    {
        "Prediction": predictions,
        "Actual": y_test,
    }
)
results.head(10)



Unnamed: 0,Prediction,Actual
5404,0.0,0.0
134844,0.0,0.0
129255,0.0,0.0
224524,0.0,0.0
92752,0.0,0.0
151708,0.0,0.0
55470,0.0,1.0
30688,0.0,0.0
94938,0.0,0.0
245034,0.0,0.0


In [12]:
# Plain accuracy of the test-split predictions
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.8606669820245979

In [13]:
# classification_report_imbalanced is not used in this cell; later cells rely on this import
from imblearn.metrics import classification_report_imbalanced
# Confusion matrix of the test-split predictions (rows: actual, columns: predicted)
matrix = confusion_matrix(y_test, predictions)
print(matrix)

[[109167      0]
 [ 17673      0]]


In [14]:
# Per-class precision, recall and F1 for the test-split predictions.
# The model never predicts class 1.0 here, so that class's precision is undefined;
# zero_division=0 reports it as 0.0 without emitting a warning for every average.
report = classification_report(y_test, predictions, zero_division=0)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.86      1.00      0.93    109167
         1.0       0.00      0.00      0.00     17673

    accuracy                           0.86    126840
   macro avg       0.43      0.50      0.46    126840
weighted avg       0.74      0.86      0.80    126840



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


   ### Oversampling

In [15]:
# Resample the training data with the RandomOverSampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
# Only the training split is resampled; X_test / y_test are left as they are
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)



In [16]:
# Refit a fresh logistic regression on the randomly oversampled training data
# (LogisticRegression is already imported in the first cell)
model = LogisticRegression(
    solver='lbfgs',
    random_state=1,
)
model.fit(X_resampled, y_resampled)



LogisticRegression(random_state=1)

In [17]:
# Calculate the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
# Predict with the model fitted on the resampled data, then score against the untouched test labels
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5927022262663242

In [18]:
# Display the confusion matrix (rows: actual class, columns: predicted class)
confusion_matrix(y_test, y_pred)

array([[71176, 37991],
       [ 8246,  9427]], dtype=int64)

In [19]:
# Print the imbalanced classification report for the current y_pred
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.90      0.65      0.53      0.75      0.59      0.35    109167
        1.0       0.20      0.53      0.65      0.29      0.59      0.34     17673

avg / total       0.80      0.64      0.55      0.69      0.59      0.35    126840



### SMOTE Oversampling

In [20]:
# Oversample the training split with SMOTE, then show the resulting class counts
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = (
    SMOTE(sampling_strategy='auto', random_state=1)
    .fit_resample(X_train, y_train)
)
Counter(y_resampled)

Counter({0.0: 109167, 1.0: 109167})

In [21]:
# Refit a fresh logistic regression on the SMOTE-resampled training data
model = LogisticRegression(
    solver='lbfgs',
    random_state=1,
)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [22]:
# Calculate the balanced accuracy score
# Refresh y_pred from the most recently fitted model before scoring
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5927022262663242

In [23]:
# Display the confusion matrix (rows: actual class, columns: predicted class)
confusion_matrix(y_test, y_pred)

array([[71176, 37991],
       [ 8246,  9427]], dtype=int64)

In [24]:
# Print the imbalanced classification report for the current y_pred
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.90      0.65      0.53      0.75      0.59      0.35    109167
        1.0       0.20      0.53      0.65      0.29      0.59      0.34     17673

avg / total       0.80      0.64      0.55      0.69      0.59      0.35    126840



### Undersampling

In [None]:
# Undersample the training split with ClusterCentroids
# NOTE(review): this cell has no recorded output; the resampling is presumably slow
# on a training split of this size — confirm it completes before relying on it
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)


In [None]:
# Refit a fresh logistic regression on the undersampled training data
# (LogisticRegression is already imported in the first cell)
model = LogisticRegression(
    solver='lbfgs',
    random_state=1,
)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score for the model trained on undersampled data
from sklearn.metrics import balanced_accuracy_score
# Predict with the model that was just refitted. Without this line y_pred still
# holds the previous model's predictions, so the score would describe the wrong model.
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Predict the test split with the current model and display the confusion matrix
# (confusion_matrix is already imported in the first cell)
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report for the current y_pred
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

### Combination

In [None]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
# Fit on the training split only. Resampling the full X, y would put the test
# rows (and synthetic points derived from them) into the training data, so any
# score computed on X_test afterwards would be optimistically biased.
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
Counter(y_resampled)

In [None]:
# Refit a fresh logistic regression on the SMOTEENN-resampled data
# (LogisticRegression is already imported in the first cell)
model = LogisticRegression(
    solver='lbfgs',
    random_state=1,
)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score for the model trained on SMOTEENN data.
# Predict with the model that was just refitted. Without this line y_pred still
# holds the previous model's predictions, so the score would describe the wrong model.
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix (rows: actual class, columns: predicted class)
# y_pred is refreshed from the current model before the matrix is built
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report for the current y_pred
print(classification_report_imbalanced(y_test, y_pred))