### Required Libraries

In [64]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler

### Looking at the data

In [6]:
data = pd.read_csv("../../Datasets/alzheimers_disease_data.csv")

In [7]:
data.shape

(2149, 35)

In [8]:
data.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [14]:
data.describe()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
count,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,...,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0
mean,74.908795,0.506282,0.697534,1.286645,27.655697,0.288506,10.039442,4.920202,4.993138,7.051081,...,5.080055,0.208004,0.156817,4.982958,0.205212,0.158213,0.150768,0.158678,0.301536,0.353653
std,8.990221,0.500077,0.996128,0.904527,7.217438,0.453173,5.75791,2.857191,2.909055,1.763573,...,2.892743,0.405974,0.363713,2.949775,0.40395,0.365026,0.357906,0.365461,0.459032,0.478214
min,60.0,0.0,0.0,0.0,15.008851,0.0,0.002003,0.003616,0.009385,4.002629,...,0.00046,0.0,0.0,0.001288,0.0,0.0,0.0,0.0,0.0,0.0
25%,67.0,0.0,0.0,1.0,21.611408,0.0,5.13981,2.570626,2.458455,5.482997,...,2.566281,0.0,0.0,2.342836,0.0,0.0,0.0,0.0,0.0,0.0
50%,75.0,1.0,0.0,1.0,27.823924,0.0,9.934412,4.766424,5.076087,7.115646,...,5.094439,0.0,0.0,5.038973,0.0,0.0,0.0,0.0,0.0,0.0
75%,83.0,1.0,1.0,2.0,33.869778,1.0,15.157931,7.427899,7.558625,8.562521,...,7.546981,0.0,0.0,7.58149,0.0,0.0,0.0,0.0,1.0,1.0
max,90.0,1.0,3.0,3.0,39.992767,1.0,19.989293,9.987429,9.998346,9.99984,...,9.996467,1.0,1.0,9.999747,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
data.isnull().sum()

PatientID                    0
Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyCompletingTasks    0
Forgetfu

In [13]:
# Remove useless columns
data.drop(["PatientID", "DoctorInCharge"], axis=1, inplace=True)

In [15]:
diagnosis = data.Diagnosis
data.drop("Diagnosis", axis = 1, inplace=True)

In [16]:
data.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
0,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,9.025679,...,21.463532,6.518877,0,0,1.725883,0,0,0,1,0
1,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,7.151293,...,20.613267,7.118696,0,0,2.592424,0,0,0,0,1
2,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,9.673574,...,7.356249,5.895077,0,0,7.119548,0,1,0,1,0
3,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,8.392554,...,13.991127,8.965106,0,1,6.481226,0,0,0,0,0
4,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,5.597238,...,13.517609,6.045039,0,0,0.014691,0,0,1,1,0


In [17]:
diagnosis.head()

0    0
1    0
2    0
3    0
4    0
Name: Diagnosis, dtype: int64

### Splitting the data

In [19]:
X_train, X_test, y_train, y_test = train_test_split(data,diagnosis, test_size=0.2, random_state=42, shuffle=False)

In [20]:
X_train.tail()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
1714,81,1,0,3,17.116407,0,1.276614,1.491992,2.734151,8.29965,...,12.649628,3.098182,0,0,6.667228,0,0,0,0,1
1715,73,0,0,2,36.314352,0,1.919937,3.646559,1.649193,9.194689,...,14.685309,1.526947,0,1,3.707218,0,0,1,0,1
1716,86,0,3,1,24.066976,0,12.008247,7.920178,3.625059,9.82267,...,23.631086,9.99261,0,0,9.621897,0,0,1,0,1
1717,60,0,0,1,23.949421,0,15.872405,3.504292,0.157157,5.188237,...,3.803759,9.519397,0,0,4.570167,0,0,0,0,1
1718,65,0,0,0,37.824886,0,10.65073,2.900281,8.652653,4.611515,...,22.072325,7.930717,0,0,7.581196,0,0,0,0,1


In [21]:
X_test.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
1719,66,1,0,1,16.665503,0,1.871312,8.6989,6.968653,6.580947,...,1.498373,4.455211,0,0,1.06336,0,1,0,0,1
1720,90,1,1,1,17.23599,0,6.954826,1.768237,8.647668,7.123329,...,3.070327,7.93703,0,1,3.377024,0,0,1,0,0
1721,77,1,0,1,28.536407,1,19.596522,0.406111,2.929292,6.644174,...,23.258044,8.64097,0,0,0.512065,1,0,0,0,1
1722,83,0,2,2,18.905002,1,8.797696,7.70513,2.079585,7.531138,...,27.227884,3.060459,0,0,6.635915,1,0,0,0,0
1723,62,0,0,2,35.228954,0,7.446076,7.431135,4.486564,5.583009,...,2.018614,0.357186,0,0,0.320422,0,0,0,0,0


### Testing Models

#### Logistic Regression

In [73]:
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [74]:
lr_model.score(X_test,y_test)

0.6953488372093023

#### Decision Tree

In [75]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

In [76]:
dt_model.score(X_test, y_test)

0.7930232558139535

#### Support Vector Machines

In [77]:
svc_model = SVC()
svc_model.fit(X_train, y_train)

In [78]:
svc_model.score(X_test, y_test)

0.6023255813953489

Needs scaling I think

#### Random Forest

In [79]:
rf_model = RandomForestClassifier(n_estimators=75)
rf_model.fit(X_train,y_train)

In [80]:
rf_model.score(X_test,y_test)

0.8

#### XGBoost

In [81]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train,y_train)

In [82]:
xgb_model.score(X_test, y_test)

0.8162790697674419

#### KNN

In [83]:
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

In [84]:
knn_model.score(X_test, y_test)

0.5767441860465117

#### Naive Bayes

In [85]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

In [86]:
nb_model.score(X_test, y_test)

0.7372093023255814

### Scaling Data

In [87]:
min_max_scaler = MinMaxScaler()

In [88]:
X_train_scaled = min_max_scaler.fit_transform(X_train)
X_test_scaled = min_max_scaler.transform(X_test)

### Testing Models

#### Logistic Regression

In [89]:
lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)

In [90]:
lr_model.score(X_test_scaled,y_test)

0.7186046511627907

#### Decision Tree

In [91]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_scaled, y_train)

In [92]:
dt_model.score(X_test_scaled, y_test)

0.7953488372093023

#### Support Vector Machines

In [93]:
svc_model = SVC()
svc_model.fit(X_train_scaled, y_train)

In [94]:
svc_model.score(X_test_scaled, y_test)

0.7348837209302326

Ass even with scaling

#### Random Forest

In [95]:
rf_model = RandomForestClassifier(n_estimators=75)
rf_model.fit(X_train_scaled,y_train)

In [96]:
rf_model.score(X_test_scaled,y_test)

0.7953488372093023

#### XGBoost

In [97]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train_scaled,y_train)

In [98]:
xgb_model.score(X_test_scaled, y_test)

0.8162790697674419

#### KNN

In [99]:
knn_model = KNeighborsClassifier()
knn_model.fit(X_train_scaled, y_train)

In [100]:
knn_model.score(X_test_scaled, y_test)

0.6395348837209303

#### Naive Bayes

In [101]:
nb_model = GaussianNB()
nb_model.fit(X_train_scaled, y_train)

In [102]:
nb_model.score(X_test_scaled, y_test)

0.7372093023255814

### Results 

With No Scaling : 

1. LR - 0.695
2. DT - 0.790
3. SVC - 0.602
4. RF - 0.806
5. XGB - 0.816
6. KNN - 0.576
7. NB - 0.737

With Scaling : 

1. LR - 0.718
2. DT - 0.790
3. SVC - 0.734
4. RF - 0.802
5. XGB - 0.816
6. KNN - 0.639
7. NB - 0.737

### Theory

#### Parameters to tune for increasing accuracy

##### **1\. Logistic Regression**

*   C: Inverse of regularization strength (higher values reduce regularization).
    
*   penalty: Type of regularization (l1, l2, elasticnet, none).
    
*   solver: Optimization algorithm (liblinear, lbfgs, saga, etc.).
    
*   max\_iter: Number of iterations for optimization.
    

**Tuning approach:**

*   Try different values of C (e.g., 0.01, 0.1, 1, 10).
    
*   Test l1 vs l2 penalty for sparsity effects.
    

##### **2\. k-Nearest Neighbors (kNN)**

*   n\_neighbors: Number of neighbors (e.g., 3, 5, 7).
    
*   weights: How neighbors are weighted (uniform vs. distance).
    
*   metric: Distance metric (euclidean, manhattan, minkowski).
    

**Tuning approach:**

*   Try different values of n\_neighbors (usually odd numbers work best).
    
*   Use distance weighting to give closer points more importance.
    

##### **3\. Decision Tree**

*   max\_depth: Maximum depth of the tree (to control overfitting).
    
*   min\_samples\_split: Minimum samples required to split a node.
    
*   min\_samples\_leaf: Minimum samples required in a leaf node.
    
*   criterion: Splitting strategy (gini vs entropy).
    

**Tuning approach:**

*   Use grid search to find the optimal max\_depth (e.g., 3-10).
    
*   Increase min\_samples\_split to reduce overfitting.
    

##### **4\. Naïve Bayes (GaussianNB)**

*   var\_smoothing: Portion of the largest variance of all features added to variance for stability.
    

**Tuning approach:**

*   Try adjusting var\_smoothing (values close to 1e-9).
    
*   Works well when features are normally distributed.
    

##### **5\. Support Vector Machine (SVM)**

*   C: Regularization parameter (higher means less regularization).
    
*   kernel: Kernel type (linear, rbf, poly, sigmoid).
    
*   gamma: Kernel coefficient (used for rbf, poly, sigmoid kernels).
    
*   degree: Degree of polynomial kernel (for poly kernel).
    

**Tuning approach:**

*   Use GridSearchCV to find the best C and gamma.
    
*   Try different kernels (linear for high-dimensional data, rbf for complex decision boundaries).
    

##### **6\. Random Forest**

*   n\_estimators: Number of trees in the forest.
    
*   max\_depth: Maximum depth of trees.
    
*   min\_samples\_split: Minimum samples to split a node.
    
*   min\_samples\_leaf: Minimum samples per leaf node.
    
*   max\_features: Number of features considered for best split.
    

**Tuning approach:**

*   Increase n\_estimators (start with 100, try 300, 500).
    
*   Reduce max\_depth to prevent overfitting.
    

##### **7\. XGBoost**

*   n\_estimators: Number of boosting rounds.
    
*   learning\_rate: Step size shrinkage (e.g., 0.01, 0.1).
    
*   max\_depth: Maximum depth of trees.
    
*   subsample: Fraction of samples used per tree.
    
*   colsample\_bytree: Fraction of features used per tree.
    
*   gamma: Minimum loss reduction to split a node.
    
*   lambda: L2 regularization.
    
*   alpha: L1 regularization.
    

**Tuning approach:**

*   Start with n\_estimators=100 and tune learning\_rate (0.01 to 0.2).
    
*   Use GridSearchCV for max\_depth, subsample, and colsample\_bytree.

#### Reducing Features 

##### **When Removing Low-Correlation Features Improves Accuracy**

✅ **If the dataset has irrelevant or redundant features**

*   Features with very low correlation to the target label may add noise.
    
*   Removing them can help models generalize better.
    

✅ **If the dataset has a small number of samples**

*   High-dimensional data with few samples can lead to overfitting.
    
*   Feature selection reduces complexity and improves generalization.
    

✅ **For simpler models like Logistic Regression, kNN, and Naïve Bayes**

*   These models perform better with fewer but more relevant features.
    
*   Reducing features can improve speed and interpretability.
    

✅ **For models sensitive to irrelevant features**

*   Example: SVM and kNN can suffer from the curse of dimensionality, where too many features can lead to worse performance.
    

##### **When Removing Low-Correlation Features Might Not Help**

❌ **If the removed features contain non-linear relationships with the label**

*   Linear correlation (Pearson’s correlation) may not capture complex relationships.
    
*   Some features may have weak correlation individually but be useful in combination with other features.
    

❌ **For tree-based models (Decision Trees, Random Forest, XGBoost)**

*   These models handle irrelevant features well.
    
*   They automatically prioritize important features via feature importance.
    
*   Removing features **may not** improve accuracy significantly.
    

❌ **If correlation is calculated using Pearson’s method for categorical data**

*   Pearson correlation works well for numerical data but not categorical data.
    
*   Use methods like **mutual information** or **Chi-square test** for categorical features.

#### Accuracy vs Precision vs Recall vs F1 Score

Great question — **Accuracy, Precision, Recall, and F1 score** all evaluate classification models, but they shine in **different scenarios**. Here's a simple guide on **when to use each metric**:

##### **1\. Accuracy**

*   **Use when**:
    
    *   Classes are **balanced** (equal number of examples in each class).
        
    *   All errors are **equally important**.
        
*   **Don't use when**:
    
    *   The dataset is **imbalanced** (e.g., 95% Class A, 5% Class B).
        
*   **Example**:
    
    *   Recognizing handwritten digits (0–9) with fairly equal data per digit.
        

##### **2\. Precision**

*   **What it answers**: "Out of all predicted positives, how many were actually positive?"
    
*   **Use when**:
    
    *   **False positives are costly** (you want fewer _wrong positive_ predictions).
        
*   **Example**:
    
    *   Email spam filter (predicting a legit email as spam is worse than missing a spam).
        
    *   Medical tests for rare but serious diseases — you don't want to wrongly tell someone they have a disease.
        

##### **3\. Recall (Sensitivity / True Positive Rate)**

*   **What it answers**: "Out of all actual positives, how many did we correctly identify?"
    
*   **Use when**:
    
    *   **False negatives are costly** (you don’t want to _miss positives_).
        
*   **Example**:
    
    *   Cancer detection — it's better to have some false alarms than to miss a real case.
        
    *   Fraud detection — better to investigate more suspicious transactions than miss fraud.
        

##### **4\. F1 Score**

*   **What it does**: Harmonic mean of **Precision** and **Recall**.
    
*   **Use when**:
    
    *   There’s a **class imbalance**.
        
    *   You want a **balance between precision and recall**.
        
*   **Example**:
    
    *   Any binary classification with skewed classes (e.g., anomaly detection).
        
    *   NLP tasks like sentiment classification when one class is much more frequent than others.


In the case of **imbalanced classes**, the **best metric** depends on the **specific problem and what kind of error is more critical** — but generally:

##### **Best overall: F1 Score**

*   **Why**: It balances **precision** and **recall**, which is crucial when one class is much more frequent than the other.
    
*   **When to use**: You care about both **catching positives** and **not raising too many false alarms**.
    

##### **Use Recall when**:

*   **False negatives are more dangerous** than false positives.
    
*   **Example**: Disease detection, fraud detection (missing a positive is costly).
    

##### **Use Precision when**:

*   **False positives are more dangerous** than false negatives.
    
*   **Example**: Spam detection (you don’t want to flag legit emails as spam).
    

##### **Avoid using Accuracy** in imbalanced datasets because:

*   It can be **very misleading**.
    
*   Example: In a dataset where 95% are Class A and 5% are Class B, a model that always predicts Class A will have **95% accuracy**, but **0% recall for Class B**.
    

##### Optional: Use **AUC-ROC** or **PR-AUC** for better insight:

*   **AUC-ROC**: Good when classes are **reasonably balanced** and you care about ranking predictions.
    
*   **PR-AUC** (Precision-Recall curve): Better than ROC for **highly imbalanced** datasets.

### Code block for seeing the performance metrics

In [None]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrics
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))