In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import os
os.environ["OMP_NUM_THREADS"] = '1'
pd.set_option('display.max_columns', None)
from sqlalchemy import create_engine

---

## Split the Data into Training and Testing Sets

### Step 1: Read the data from the `csv` file into a Pandas DataFrame.

In [2]:
# Load the BRFSS 2015 diabetes indicators file (the 50/50 split variant named below).
DIABETES_CSV = Path("diabetes_binary_5050split_health_indicators_BRFSS2015.csv")
diabetes = pd.read_csv(DIABETES_CSV)

In [3]:
# sql_file_path = "Diabetes_Indicator_db"


# engine = create_engine('sqlite:///:memory:')

# # Read the .sql file into a pandas DataFrame
# diabetes = pd.read_sql(sql_file_path, engine)

In [4]:
# sql_file_path = 'Diabetes_Indicator_db.sql'

# # Read the content of the SQL file
# with open(sql_file_path, 'r') as file:
#     sql_query = file.read()

# # Create a SQLAlchemy engine
# engine = create_engine('sqlite:///:memory:')

# # Execute the SQL query and read the result into a DataFrame
# df = pd.read_sql_query(sql_query, engine)

In [5]:
# Preview the first five rows to check that the file loaded as expected
diabetes.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


### Step 2: Create the labels set (`y`)  from the “Diabetes_binary” column, and then create the features (`X`) DataFrame from the remaining columns.

In [6]:
# Standardize the columns listed below; the result is a NumPy array whose
# columns follow the order of this list.
# NOTE(review): the scaler is fit on every row of `diabetes`, i.e. before the
# train/test split further down, so its mean/std include rows that later land
# in the test set.
columns_to_scale = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']
scaler = StandardScaler()
df_data_scaled = scaler.fit_transform(diabetes[columns_to_scale])

In [7]:
# Wrap the scaled array back into a DataFrame with the original column names.
# The source frame's index is reused explicitly: the scaled columns are later
# joined to the other features with pd.concat(axis=1), which aligns rows by
# index label, so both frames must carry the same index.
df_data_scaled = pd.DataFrame(
    df_data_scaled,
    columns=['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income'],
    index=diabetes.index,
)


In [8]:
# Split the loaded data into labels (y) and features (X)

# Labels: the diabetes indicator column
y = diabetes["Diabetes_binary"]

# Features: every remaining column except the ones that were standardized
# separately into df_data_scaled
standardized_columns = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']
X = diabetes.drop(columns=["Diabetes_binary"] + standardized_columns)

In [9]:
# Attach the standardized columns to the rest of the features.
# Any copy of those columns already present in X is dropped first, so running
# this cell more than once does not append duplicate columns.
X = pd.concat(
    [X.drop(columns=df_data_scaled.columns, errors="ignore"), df_data_scaled],
    axis=1,
)

In [10]:
# Review the y variable Series (first rows of the Diabetes_binary labels)
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Diabetes_binary, dtype: float64

In [11]:
# Review the X variable DataFrame (first rows; the standardized columns come last)
X.head()

Unnamed: 0,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,DiffWalk,Sex,BMI,GenHlth,MentHlth,PhysHlth,Age,Education,Income
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,-0.542176,0.146304,0.15302,2.404008,-1.607237,1.048562,1.05816
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.542176,0.146304,-0.460058,-0.577451,1.197681,1.048562,1.05816
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,-0.542176,-1.649743,-0.460058,0.416369,1.548296,1.048562,1.05816
3,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,-0.261036,0.146304,-0.460058,-0.279305,0.847066,1.048562,1.05816
4,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,-0.120466,-0.751719,-0.460058,-0.577451,-0.204778,0.076814,1.05816


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [12]:
# Check the balance of our target values by counting the rows in each class
y.value_counts()

Diabetes_binary
0.0    35346
1.0    35346
Name: count, dtype: int64

### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [13]:
# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function so the split is reproducible

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# The columns below were standardized with a scaler fit on every row, so the
# test rows contributed to the mean/std applied to the training data (data
# leakage). Standardization is an affine transform, so re-fitting a scaler on
# the training rows alone gives the same values as scaling the raw columns
# with training-only statistics. The test rows are then transformed with
# those training statistics.
scaled_columns = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']
train_scaler = StandardScaler().fit(X_train[scaled_columns])

# Work on copies so the column assignments below never write back into X
X_train = X_train.copy()
X_test = X_test.copy()
X_train[scaled_columns] = train_scaler.transform(X_train[scaled_columns])
X_test[scaled_columns] = train_scaler.transform(X_test[scaled_columns])

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [14]:
# Import the LogisticRegression estimator from scikit-learn
from sklearn.linear_model import LogisticRegression

# Build the classifier with a fixed seed (random_state=1) and an iteration
# cap of 1000 for the solver
logistic_regression_model = LogisticRegression(max_iter=1000, random_state=1)

# Train on the training split and keep a handle to the fitted estimator
lr_model = logistic_regression_model.fit(X_train, y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [15]:
# Make a prediction using the testing data (X_test) with the fitted model

testing_predictions = lr_model.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the balanced accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [16]:
# Balanced accuracy of the model: test labels vs. the predictions made from X_test
# (shown as the cell's output)
balanced_accuracy_score(y_test, testing_predictions)

0.7498067698559439

In [17]:
# Generate a confusion matrix for the model
test_matrix = confusion_matrix(y_test, testing_predictions)

# Display it with labelled axes so the cell shows a result: scikit-learn puts
# the true classes on the rows and the predicted classes on the columns, in
# sorted label order (0 then 1).
pd.DataFrame(
    test_matrix,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [18]:
# Build the classification report for the model from the test labels and the
# test-set predictions, then print it
testing_report = classification_report(y_test, testing_predictions)
print(testing_report)

              precision    recall  f1-score   support

         0.0       0.76      0.73      0.75      8913
         1.0       0.74      0.77      0.75      8760

    accuracy                           0.75     17673
   macro avg       0.75      0.75      0.75     17673
weighted avg       0.75      0.75      0.75     17673



---

## Fit and Evaluate a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [19]:
# Import the RandomOverSampler class from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Create the oversampler with a fixed seed (random_state=1)
random_oversampler = RandomOverSampler(random_state=1)

# Resample the training split only; the test split is left untouched
X_train_resampled, y_train_resampled = random_oversampler.fit_resample(
    X_train,
    y_train,
)


In [20]:
# Count the rows per class in the resampled labels to confirm both classes
# now have the same number of data points
y_train_resampled.value_counts()

Diabetes_binary
0.0    26586
1.0    26586
Name: count, dtype: int64

### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [21]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model

logistic_regression_model = LogisticRegression(random_state=1)

# Fit the model using the resampled training data

lr_model = logistic_regression_model.fit(X_train_resampled, y_train_resampled)

# Make a prediction using the testing data

testing_predictions = lr_model.predict(X_train_resampled)


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [22]:
# Print the balanced_accuracy score of the model 

balanced_accuracy_score(y_train_resampled, testing_predictions)

0.7482133453697435

In [23]:
# Generate a confusion matrix for the model

test_matrix = confusion_matrix(y_train_resampled, testing_predictions)

In [24]:
# Print the classification report for the model

testing_report = classification_report(y_train_resampled, testing_predictions)
print(testing_report)

              precision    recall  f1-score   support

         0.0       0.76      0.73      0.74     26586
         1.0       0.74      0.77      0.75     26586

    accuracy                           0.75     53172
   macro avg       0.75      0.75      0.75     53172
weighted avg       0.75      0.75      0.75     53172

