In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

# DISCLAIMER
This is for all of the data in the dataset

## Split the data into training and testing sets

In [2]:
# read the CSV file into a Pandas Dataframe
alzheimer_path = Path("./alzheimer_clean.csv")
alzheimer_df = pd.read_csv(alzheimer_path)

alzheimer_df

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,6895,61,0,0,1,39.121757,0,1.561126,4.049964,6.555306,...,0,0,4.492838,1,0,0,0,0,1,XXXConfid
2145,6896,75,0,0,2,17.857903,0,18.767261,1.360667,2.904662,...,0,1,9.204952,0,0,0,0,0,1,XXXConfid
2146,6897,77,0,0,1,15.476479,0,4.594670,9.886002,8.120025,...,0,0,5.036334,0,0,0,0,0,1,XXXConfid
2147,6898,78,1,3,1,15.299911,0,8.674505,6.354282,1.263427,...,0,0,3.785399,0,0,0,0,1,1,XXXConfid


In [3]:
# drop "DoctorInCharge" from file as it is not necessary
alzheimer_df = alzheimer_df.drop(columns=["PatientID","DoctorInCharge"])

In [4]:
# seperate the data into labels and features

# seperate the y variable, the labels
y = alzheimer_df["Diagnosis"]

# seperate the X variable, the features
X = alzheimer_df.drop(columns="Diagnosis")

In [5]:
# review the y variable series
y[:5]

0    0
1    0
2    0
3    0
4    0
Name: Diagnosis, dtype: int64

In [6]:
# review the X variable DataFrame
X.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
0,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,9.025679,...,21.463532,6.518877,0,0,1.725883,0,0,0,1,0
1,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,7.151293,...,20.613267,7.118696,0,0,2.592424,0,0,0,0,1
2,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,9.673574,...,7.356249,5.895077,0,0,7.119548,0,1,0,1,0
3,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,8.392554,...,13.991127,8.965106,0,1,6.481226,0,0,0,0,0
4,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,5.597238,...,13.517609,6.045039,0,0,0.014691,0,0,1,1,0


In [7]:
y.value_counts()

Diagnosis
0    1389
1     760
Name: count, dtype: int64

In [8]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Logistic Regression

In [9]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression = LogisticRegression(solver="sag", max_iter=1000,random_state=1)
logistic_regression

# Fit the model using training data
LR_model = logistic_regression.fit(X_train,y_train)



In [10]:
# Make a prediction using the testing data
predictions = logistic_regression.predict(X_test)
LR_predictions = pd.DataFrame({"Predictions": predictions, "Actual":y_test}).reset_index(drop=True)
LR_predictions

Unnamed: 0,Predictions,Actual
0,1,1
1,1,1
2,0,0
3,0,1
4,0,0
...,...,...
533,0,0
534,1,0
535,0,0
536,0,0


In [11]:
# Generate a confusion matrix for the model
confusion_matrix(predictions,y_test)

array([[314,  76],
       [ 34, 114]], dtype=int64)

In [12]:
# Print the classification report for the model
print(classification_report(predictions,y_test))

              precision    recall  f1-score   support

           0       0.90      0.81      0.85       390
           1       0.60      0.77      0.67       148

    accuracy                           0.80       538
   macro avg       0.75      0.79      0.76       538
weighted avg       0.82      0.80      0.80       538



### results
The Logistic regression nwas only able to recall 81% and 77% of the data leaving an accuracy of 80%

# Logistic Regression using RandomOverSampler
    according to Google, imblearn is a library designed to deal with imbalanced datasets. RandomOverSampler's objective is to over-sample the dataset by picking samples at random with replacements.

In [13]:
from imblearn.over_sampling import RandomOverSampler

ros_model = RandomOverSampler(random_state=1)

X_train_resampled, y_train_resampled = ros_model.fit_resample(X_train,y_train)

In [14]:
y_train_resampled.value_counts()

Diagnosis
1    1041
0    1041
Name: count, dtype: int64

In [15]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression2 = LogisticRegression(solver='lbfgs', random_state=1)
logistic_regression2

# Fit the model using training data
LR_model = logistic_regression2.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
# Make a prediction using the testing data
predictions2 = logistic_regression2.predict(X_test)
LR_predictions2 = pd.DataFrame({"Predictions": predictions2, "Actual":y_test}).reset_index(drop=True)
LR_predictions2

Unnamed: 0,Predictions,Actual
0,1,1
1,0,1
2,0,0
3,0,1
4,0,0
...,...,...
533,0,0
534,0,0
535,0,0
536,0,0


In [17]:
# Generate a confusion matrix for the model
confusion_matrix(predictions2,y_test)

array([[314,  87],
       [ 34, 103]], dtype=int64)

In [18]:
# Print the classification report for the model
print(classification_report(predictions2,y_test))

              precision    recall  f1-score   support

           0       0.90      0.78      0.84       401
           1       0.54      0.75      0.63       137

    accuracy                           0.78       538
   macro avg       0.72      0.77      0.73       538
weighted avg       0.81      0.78      0.79       538



### results
with resampling the data, The recall was 78% and 75% with an accuracy of 78%

# RandomForestClassifier

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score


In [20]:
scaler = StandardScaler()

In [21]:
X_scaler = scaler.fit(X_train)

In [22]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [23]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=278)

In [24]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

  rf_model = rf_model.fit(X_train_scaled, y_train.ravel())


In [25]:
# Making predictions using the testing data
predictions3 = rf_model.predict(X_test_scaled)

In [26]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions3)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions3)

In [27]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions3))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,336,12
Actual 1,27,163


Accuracy Score : 0.9275092936802974
Classification Report
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       348
           1       0.93      0.86      0.89       190

    accuracy                           0.93       538
   macro avg       0.93      0.91      0.92       538
weighted avg       0.93      0.93      0.93       538



### results
RandomForestClassifier had a recall of 96% and 86% with an accuracy of 92%

# RandomForestClassifier using RandomOverSampler

In [28]:
X_scaler_resampled = scaler.fit(X_train_resampled)

In [29]:
# Scale the training data
X_train_scaled_resampled = X_scaler_resampled.transform(X_train)
X_test_scaled_resampled = X_scaler_resampled.transform(X_test)

In [30]:
# Create the random forest classifier instance
rf_model_resampled = RandomForestClassifier(n_estimators=500, random_state=278)

In [31]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model_resampled = rf_model_resampled.fit(X_train_scaled_resampled, y_train.ravel())

  rf_model_resampled = rf_model_resampled.fit(X_train_scaled_resampled, y_train.ravel())


In [32]:
# Making predictions using the testing data
predictions_ros_resampled = rf_model_resampled.predict(X_test_scaled_resampled)

In [33]:
# Calculating the confusion matrix
cm_resampled = confusion_matrix(y_test, predictions_ros_resampled)
cm_resampled_df = pd.DataFrame(
    cm_resampled, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score_resampled = accuracy_score(y_test, predictions_ros_resampled)

In [34]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score_resampled}")
print("Classification Report")
print(classification_report(y_test, predictions_ros_resampled))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,336,12
Actual 1,27,163


Accuracy Score : 0.9275092936802974
Classification Report
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       348
           1       0.93      0.86      0.89       190

    accuracy                           0.93       538
   macro avg       0.93      0.91      0.92       538
weighted avg       0.93      0.93      0.93       538



### results
With the resampled data, the RandomForestClassifier model reacelled 97% and 86% of the data with 93% accuracy

# KNeighborsClassifier

In [35]:
from sklearn.cluster import KMeans
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.neighbors import KNeighborsClassifier

In [36]:
# Create a list to store inertia values
inertia = []

# Create a list to store the values of k
k = list(range(1, 11))

In [37]:
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(alzheimer_df)
    inertia.append(k_model.inertia_)

In [38]:
# Create a Dictionary that holds the list values for k and inertia
elbow_data = {"k": k, "inertia": inertia}

# Create a DataFrame using the elbow_data Dictionary
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()

Unnamed: 0,k,inertia
0,1,34134970.0
1,2,17269230.0
2,3,14256750.0
3,4,13073530.0
4,5,12308190.0


In [39]:
# Plot the DataFrame
import hvplot.pandas

df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k
)

In [40]:
# Create the StandardScaler instance
scaler_knn = StandardScaler()
# Fit the Standard Scaler with the training data
X_knn_scaler = scaler_knn.fit(X_train)
# Scale the training data
X_train_scaled_knn = X_knn_scaler.transform(X_train)
X_test_scaled_knn = X_knn_scaler.transform(X_test)

In [41]:
# Instantiate the model with k = 2 neighbors
model = KNeighborsClassifier(n_neighbors=2)

In [42]:
# Train the model
model.fit(X_train_scaled, y_train)

In [43]:
# Create predictions
y_pred = model.predict(X_test_scaled_knn)

# Review the predictions
y_pred

array([0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

In [44]:
# Print confusion matrix
confusion_matrix(y_pred,y_test)

array([[328, 144],
       [ 20,  46]], dtype=int64)

In [45]:
# Print classification report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.94      0.69      0.80       472
           1       0.24      0.70      0.36        66

    accuracy                           0.70       538
   macro avg       0.59      0.70      0.58       538
weighted avg       0.86      0.70      0.75       538



### results
KneighborsClassifier had a recall of 69%and70% with an accuracy of 70%

# Decision Trees

In [46]:
from sklearn import tree

In [47]:
# Define target vector
y_tree = alzheimer_df["Diagnosis"].values.reshape(-1, 1)
y_tree[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]], dtype=int64)

In [48]:
# Define features set
X_Tree = alzheimer_df.copy()
X_Tree.drop("Diagnosis", axis=1, inplace=True)
X_Tree.head()


Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
0,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,9.025679,...,21.463532,6.518877,0,0,1.725883,0,0,0,1,0
1,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,7.151293,...,20.613267,7.118696,0,0,2.592424,0,0,0,0,1
2,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,9.673574,...,7.356249,5.895077,0,0,7.119548,0,1,0,1,0
3,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,8.392554,...,13.991127,8.965106,0,1,6.481226,0,0,0,0,0
4,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,5.597238,...,13.517609,6.045039,0,0,0.014691,0,0,1,1,0


In [49]:
# Splitting into Train and Test sets
X_Tree_train, X_Tree_test, y_tree_train, y_tree_test = train_test_split(X_Tree, y_tree, random_state=78)

In [50]:
# Creating StandardScaler instance
scaler_tree = StandardScaler()

In [51]:
# Fitting Standard Scaller
X_scaler_tree = scaler_tree.fit(X_Tree_train)

In [52]:
# Scaling data
X_train_scaled_tree = X_scaler_tree.transform(X_Tree_train)
X_test_scaled_tree = X_scaler_tree.transform(X_Tree_test)

In [53]:
# Creating the decision tree classifier instance
model_tree = tree.DecisionTreeClassifier()

In [54]:
# Fitting the model
model_tree = model_tree.fit(X_train_scaled_tree, y_tree_train)

In [55]:
# Making predictions using the testing data
predictions_tree = model_tree.predict(X_test_scaled_tree)

In [56]:
# Calculating the confusion matrix
cm_tree = confusion_matrix(y_tree_test, predictions_tree)
cm__tree_df = pd.DataFrame(
    cm_tree, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score_tree = accuracy_score(y_tree_test, predictions_tree)

In [57]:
# Displaying results
print("Confusion Matrix")
display(cm__tree_df)
print(f"Accuracy Score : {acc_score_tree}")
print("Classification Report")
print(classification_report(y_tree_test, predictions_tree))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,334,13
Actual 1,19,172


Accuracy Score : 0.9405204460966543
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       347
           1       0.93      0.90      0.91       191

    accuracy                           0.94       538
   macro avg       0.94      0.93      0.93       538
weighted avg       0.94      0.94      0.94       538



### results
The Decision Tree recalled 96% and 90% of the data with an accuracy of 94%