In [2]:
!pip install psycopg2-binary sqlalchemy



In [3]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [4]:
# Define the connection details.
# Each value can be overridden with the matching environment variable so that
# real credentials never need to be committed with the notebook; the fallbacks
# are the local-development defaults.
import os
from urllib.parse import quote_plus

dbname = os.environ.get("PGDATABASE", "project4")
user = os.environ.get("PGUSER", "postgres")
password = os.environ.get("PGPASSWORD", "postgres")
host = os.environ.get("PGHOST", "localhost")
port = int(os.environ.get("PGPORT", "5432"))

# Create the connection string.
# User and password are URL-escaped so characters such as "@" or "/" in a
# password cannot corrupt the URL.
connection_string = (
    f'postgresql://{quote_plus(user)}:{quote_plus(password)}@{host}:{port}/{dbname}'
)

# Create the database engine
engine = create_engine(connection_string)

# Select every row and column of the "dementia" table
query = "SELECT * FROM dementia"

# Read the query results into a DataFrame
dementia_df = pd.read_sql(query, engine)

# Preview the first rows of the DataFrame
dementia_df.head()

Unnamed: 0,Diabetic,AlcoholLevel,HeartRate,BloodOxygenLevel,BodyTemperature,Weight,MRI_Delay,Prescription,Dosage_in_mg,Age,...,Smoking_Status,APOE_4,Physical_Activity,Depression_Status,Cognitive_Test_Scores,Medication_History,Nutrition_Diet,Sleep_Quality,Chronic_Health_Conditions,Dementia
0,1,0.084974,98,96.230743,36.224852,57.563978,36.421028,,,60,...,Current Smoker,Negative,Sedentary,No,10,No,Low-Carb Diet,Poor,Diabetes,0
1,0,0.016973,78,93.032122,36.183874,56.832335,31.157633,Galantamine,12.0,61,...,Former Smoker,Positive,Moderate Activity,No,1,Yes,Low-Carb Diet,Poor,Heart Disease,1
2,0,0.009,89,93.566504,37.326321,59.759066,37.640435,,,69,...,Former Smoker,Negative,Moderate Activity,No,8,No,Mediterranean Diet,Poor,Heart Disease,0
3,0,0.086437,60,93.90651,37.03062,58.266471,50.673992,Donepezil,23.0,78,...,Never Smoked,Negative,Mild Activity,Yes,5,Yes,Balanced Diet,Poor,Hypertension,1
4,1,0.150747,67,97.508994,36.062121,67.705027,27.810601,Memantine,20.0,77,...,Never Smoked,Positive,Mild Activity,No,0,Yes,Low-Carb Diet,Good,Diabetes,1


In [5]:
# List the column labels of the loaded table
dementia_df.columns

Index(['Diabetic', 'AlcoholLevel', 'HeartRate', 'BloodOxygenLevel',
       'BodyTemperature', 'Weight', 'MRI_Delay', 'Prescription',
       'Dosage_in_mg', 'Age', 'Education_Level', 'Dominant_Hand', 'Gender',
       'Family_History', 'Smoking_Status', 'APOE_4', 'Physical_Activity',
       'Depression_Status', 'Cognitive_Test_Scores', 'Medication_History',
       'Nutrition_Diet', 'Sleep_Quality', 'Chronic_Health_Conditions',
       'Dementia'],
      dtype='object')

In [6]:
# Collect the names of all columns stored with the "object" dtype;
# these are the categorical variables to be one-hot encoded.
dementia_cat = [
    column_name
    for column_name, column_dtype in dementia_df.dtypes.items()
    if column_dtype == "object"
]

In [7]:
# Count the distinct (non-null) values in each categorical column
dementia_df.loc[:, dementia_cat].nunique()

Prescription                 4
Education_Level              4
Dominant_Hand                2
Gender                       2
Family_History               2
Smoking_Status               3
APOE_4                       2
Physical_Activity            3
Depression_Status            2
Medication_History           2
Nutrition_Diet               3
Sleep_Quality                2
Chronic_Health_Conditions    4
dtype: int64

In [8]:
# Create a OneHotEncoder instance that returns a dense array.
# `sparse_output` replaces the `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
enc = OneHotEncoder(sparse_output=False)

# Fit and transform the OneHotEncoder using the categorical variable list.
# The encoded frame reuses the source index so that a later index-based
# merge lines rows up correctly even if the index is not 0..n-1, and it is
# labelled with the encoder's generated feature names.
encode_df = pd.DataFrame(
    enc.fit_transform(dementia_df[dementia_cat]),
    columns=enc.get_feature_names_out(dementia_cat),
    index=dementia_df.index,
)
encode_df.head()



Unnamed: 0,Prescription_Donepezil,Prescription_Galantamine,Prescription_Memantine,Prescription_Rivastigmine,Prescription_None,Education_Level_Diploma/Degree,Education_Level_No School,Education_Level_Primary School,Education_Level_Secondary School,Dominant_Hand_Left,...,Medication_History_Yes,Nutrition_Diet_Balanced Diet,Nutrition_Diet_Low-Carb Diet,Nutrition_Diet_Mediterranean Diet,Sleep_Quality_Good,Sleep_Quality_Poor,Chronic_Health_Conditions_Diabetes,Chronic_Health_Conditions_Heart Disease,Chronic_Health_Conditions_Hypertension,Chronic_Health_Conditions_None
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [9]:
# Attach the one-hot encoded columns by row index, then remove the original
# text columns they were derived from.
# Note: this reassigns `dementia_df`, so the cell is meant to run once per
# kernel session.
dementia_df = (
    dementia_df
    .merge(encode_df, left_index=True, right_index=True)
    .drop(columns=dementia_cat)
)
dementia_df.head()

Unnamed: 0,Diabetic,AlcoholLevel,HeartRate,BloodOxygenLevel,BodyTemperature,Weight,MRI_Delay,Dosage_in_mg,Age,Cognitive_Test_Scores,...,Medication_History_Yes,Nutrition_Diet_Balanced Diet,Nutrition_Diet_Low-Carb Diet,Nutrition_Diet_Mediterranean Diet,Sleep_Quality_Good,Sleep_Quality_Poor,Chronic_Health_Conditions_Diabetes,Chronic_Health_Conditions_Heart Disease,Chronic_Health_Conditions_Hypertension,Chronic_Health_Conditions_None
0,1,0.084974,98,96.230743,36.224852,57.563978,36.421028,,60,10,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0,0.016973,78,93.032122,36.183874,56.832335,31.157633,12.0,61,1,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0,0.009,89,93.566504,37.326321,59.759066,37.640435,,69,8,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0,0.086437,60,93.90651,37.03062,58.266471,50.673992,23.0,78,5,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1,0.150747,67,97.508994,36.062121,67.705027,27.810601,20.0,77,0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [16]:
# Convert all NaN values in column "Dosage_in_mg" to 0.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: that chained in-place form emits a FutureWarning on
# pandas 2.x and leaves the DataFrame unchanged under Copy-on-Write.
dementia_df["Dosage_in_mg"] = dementia_df["Dosage_in_mg"].fillna(0)
dementia_df

Unnamed: 0,Diabetic,AlcoholLevel,HeartRate,BloodOxygenLevel,BodyTemperature,Weight,MRI_Delay,Dosage_in_mg,Age,Cognitive_Test_Scores,...,Medication_History_Yes,Nutrition_Diet_Balanced Diet,Nutrition_Diet_Low-Carb Diet,Nutrition_Diet_Mediterranean Diet,Sleep_Quality_Good,Sleep_Quality_Poor,Chronic_Health_Conditions_Diabetes,Chronic_Health_Conditions_Heart Disease,Chronic_Health_Conditions_Hypertension,Chronic_Health_Conditions_None
0,1,0.084974,98,96.230743,36.224852,57.563978,36.421028,0.0,60,10,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0,0.016973,78,93.032122,36.183874,56.832335,31.157633,12.0,61,1,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0,0.009000,89,93.566504,37.326321,59.759066,37.640435,0.0,69,8,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0,0.086437,60,93.906510,37.030620,58.266471,50.673992,23.0,78,5,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1,0.150747,67,97.508994,36.062121,67.705027,27.810601,20.0,77,0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,0.081825,87,93.851963,36.495134,50.380106,42.318663,10.0,88,5,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
996,1,0.145249,97,94.522391,36.270804,94.006484,52.812568,0.0,80,9,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
997,1,0.073692,65,98.578388,37.065703,80.088613,13.640229,0.0,67,8,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
998,0,0.037347,71,91.298580,37.037202,95.322210,17.445715,20.0,62,2,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [17]:
# Separate the target variable, y, from the features, X
target_column = "Dementia"
y = dementia_df[target_column]
X = dementia_df.drop(columns=[target_column])

In [18]:
# Preview the first rows of the features data (every column except "Dementia")
X.head()

Unnamed: 0,Diabetic,AlcoholLevel,HeartRate,BloodOxygenLevel,BodyTemperature,Weight,MRI_Delay,Dosage_in_mg,Age,Cognitive_Test_Scores,...,Medication_History_Yes,Nutrition_Diet_Balanced Diet,Nutrition_Diet_Low-Carb Diet,Nutrition_Diet_Mediterranean Diet,Sleep_Quality_Good,Sleep_Quality_Poor,Chronic_Health_Conditions_Diabetes,Chronic_Health_Conditions_Heart Disease,Chronic_Health_Conditions_Hypertension,Chronic_Health_Conditions_None
0,1,0.084974,98,96.230743,36.224852,57.563978,36.421028,0.0,60,10,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0,0.016973,78,93.032122,36.183874,56.832335,31.157633,12.0,61,1,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0,0.009,89,93.566504,37.326321,59.759066,37.640435,0.0,69,8,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0,0.086437,60,93.90651,37.03062,58.266471,50.673992,23.0,78,5,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1,0.150747,67,97.508994,36.062121,67.705027,27.810601,20.0,77,0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [19]:
# Split the dataset using train_test_split.
# No test_size is passed, so scikit-learn's default split applies
# (250 of the 1000 rows end up in the test set);
# random_state=1 makes the shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [31]:
# Build the scaler and fit it on the training split only, so that the test
# split does not influence the scaling parameters.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training and testing data
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [32]:
# Create a Logistic Regression Model with the Data
### Step 1: Fit a logistic regression model by using the training data (X_train and y_train).

# Ref: Module 20/Day1/Act-04

# Import the LogisticRegression module and the pipeline helper from SKLearn
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Bundle standardisation with the model so the classifier is always trained
# and evaluated on scaled features. Fitting the bare model on the raw
# X_train would skip scaling entirely. Because the scaler lives inside the
# pipeline, `classifier.score(...)` and `classifier.predict(...)` can be
# called with raw (unscaled) feature frames.
# Assign a random_state parameter of 1 to the model
classifier = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='lbfgs',
                       max_iter=500,
                       random_state=1),
)

# Fit the scaler and the model using training data
classifier.fit(X_train, y_train)

In [33]:
# Score the model: mean accuracy on the training and testing splits.
# NOTE(review): identical scores of 1.0 on both splits would be unusual for
# real data. Check whether features such as Prescription_*, Dosage_in_mg
# or Medication_History leak the target before trusting this result.
training_score = classifier.score(X_train, y_train)
testing_score = classifier.score(X_test, y_test)
print(f"Training Data Score: {training_score}")
print(f"Testing Data Score: {testing_score}")

Training Data Score: 1.0
Testing Data Score: 1.0


In [34]:
### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [35]:
# Ref: Module 20/Day1/Act-04
# Predict the label of every row in the testing features
y_predictions = classifier.predict(X_test)

# Put predictions next to the true labels and renumber the rows from 0
prediction_vs_actual = {"Prediction": y_predictions, "Actual": y_test}
results = pd.DataFrame(prediction_vs_actual).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,1,1
1,0,0
2,1,1
3,1,1
4,1,1
5,0,0
6,0,0
7,0,0
8,1,1
9,0,0


In [36]:
### Step 3: Evaluate the model’s performance by doing the following:

# * Generate a confusion matrix.
# * Print the classification report.

In [37]:
# Refs: Module 20/Day1/Act-05

from sklearn.metrics import confusion_matrix, classification_report

# Build the confusion matrix for the test split
# (rows: actual class, columns: predicted class)
test_confusion = confusion_matrix(y_test, y_predictions)
print("Confusion Matrix:", test_confusion, sep="\n")

Confusion Matrix:
[[128   0]
 [  0 122]]


In [38]:
# Refs: Module 20/Day1/Act-05
# Human-readable labels for classes 0 and 1, in that order
target_names = ["Does Not Have Dementia [0]", "Has Dementia [1]"]

# Build the per-class precision / recall / F1 summary and show it
report = classification_report(y_test, y_predictions, target_names=target_names)
print(report)

                            precision    recall  f1-score   support

Does Not Have Dementia [0]       1.00      1.00      1.00       128
          Has Dementia [1]       1.00      1.00      1.00       122

                  accuracy                           1.00       250
                 macro avg       1.00      1.00      1.00       250
              weighted avg       1.00      1.00      1.00       250



In [39]:
# Check the DataFrame to see how many patients do not have dementia (0) vs have dementia (1),
# i.e. how balanced the two target classes are
dementia_df["Dementia"].value_counts()

Dementia
0    515
1    485
Name: count, dtype: int64