## **ModelX - Dementia Prediction (XPredators)**

# Data Exploration and Flow

## Loading the Dataset

In [1]:
!pip install gdown
import gdown
import pandas as pd

file_id = "19mKGPNFb35kG__3Eihazyv5O69ZUxDcF"
url = f"https://drive.google.com/uc?id={file_id}"

gdown.download(url, "temp.csv", quiet=False)

df = pd.read_csv("temp.csv")
df.head()




Downloading...
From (original): https://drive.google.com/uc?id=19mKGPNFb35kG__3Eihazyv5O69ZUxDcF
From (redirected): https://drive.google.com/uc?id=19mKGPNFb35kG__3Eihazyv5O69ZUxDcF&confirm=t&uuid=2ea09e69-f056-4d23-9a27-3bdf6b5c8e9b
To: /content/temp.csv
100%|██████████| 509M/509M [00:05<00:00, 98.1MB/s]
  df = pd.read_csv("temp.csv")


Unnamed: 0,NACCID,NACCADC,PACKET,FORMVER,VISITMO,VISITDAY,VISITYR,NACCVNUM,NACCAVST,NACCNVST,...,NPATGAM1,NPATGAM2,NPATGAM3,NPATGAM4,NPATGAM5,NPATGFRN,NPATGFR1,NPATGFR2,NPATGFR3,NPATGFR4
0,NACC002909,186,I,3.0,12,28,2022,1,2,2,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
1,NACC002909,186,F,3.0,1,23,2024,2,2,2,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
2,NACC003487,186,I,3.0,11,15,2023,1,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
3,NACC004352,186,I,3.0,10,5,2021,1,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
4,NACC004687,186,I,3.0,11,14,2022,1,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4


## Checking dtype counts of columns

In [2]:
df.dtypes.value_counts()

Unnamed: 0,count
int64,747
object,142
float64,135


In [3]:
object_columns = df.select_dtypes(include='object').columns.tolist()
print(object_columns)

['NACCID', 'PACKET', 'HISPORX', 'RACEX', 'RACESECX', 'RACETERX', 'PRIMLANX', 'INHISPOX', 'INRACEX', 'INRASECX', 'INRATERX', 'INRELTOX', 'NACCAMX', 'NACCAMSX', 'NACCFMX', 'NACCFMSX', 'NACCOMX', 'NACCOMSX', 'DRUG1', 'DRUG2', 'DRUG3', 'DRUG4', 'DRUG5', 'DRUG6', 'DRUG7', 'DRUG8', 'DRUG9', 'DRUG10', 'DRUG11', 'DRUG12', 'DRUG13', 'DRUG14', 'DRUG15', 'DRUG16', 'DRUG17', 'DRUG18', 'DRUG19', 'DRUG20', 'DRUG21', 'DRUG22', 'DRUG23', 'DRUG24', 'DRUG25', 'DRUG26', 'DRUG27', 'DRUG28', 'DRUG29', 'DRUG30', 'DRUG31', 'DRUG32', 'DRUG33', 'DRUG34', 'DRUG35', 'DRUG36', 'DRUG37', 'DRUG38', 'DRUG39', 'DRUG40', 'CVOTHRX', 'NCOTHRX', 'ARTHTYPX', 'OTHSLEEX', 'ABUSX', 'PSYCDISX', 'CVDIMAGX', 'SPEECHX', 'FACEXPX', 'TRESTFAX', 'TRESTRHX', 'TRESTLHX', 'TRESTRFX', 'TRESTLFX', 'TRACTRHX', 'TRACTLHX', 'RIGDNEX', 'RIGDUPRX', 'RIGDUPLX', 'RIGDLORX', 'RIGDLOLX', 'TAPSRTX', 'TAPSLFX', 'HANDMVRX', 'HANDMVLX', 'HANDATRX', 'HANDATLX', 'LEGRTX', 'LEGLFX', 'ARISINGX', 'POSTUREX', 'GAITX', 'POSSTABX', 'BRADYKIX', 'NPIQINFX', '

In [4]:
df['NACCUDSD'].value_counts()

Unnamed: 0_level_0,count
NACCUDSD,Unnamed: 1_level_1
1,94933
4,57590
3,34106
2,8567


1 = Normal cognition
2 = Impaired-not-MCI
3 = MCI
4 = Dementia

In [5]:
df['DEMENTED'].value_counts()

Unnamed: 0_level_0,count
DEMENTED,Unnamed: 1_level_1
0,137606
1,57590


## Dropping all the medical-related columns (Keep only non-medical variables)

In [6]:
medical_cols = [
    # --- A5: Subject Health History ---
    "THYROID","ARTHRIT","ARTHTYPE","ARTHTYPX","ARTHUPEX","ARTHLOEX","ARTHSPIN","ARTHUNK",
    "INCONTU","INCONTF","APNEA","RBD","INSOMN","OTHSLEEP","OTHSLEEX",
    "ALCOHOL","ABUSOTHR","ABUSX","PTSD","BIPOLAR","SCHIZ","DEP2YRS",
    "DEPOTHR","ANXIETY","OCD","NPSYDEV","PSYCDIS","PSYCDISX",
    "TOBAC30","TOBAC100","SMOKYRS","PACKSPER","QUITSMOK",
    "ALCOCCAS","ALCFREQ",
    "CVHATT","HATTMULT","HATTYEAR","CVAFIB","CVANGIO","CVBYPASS",
    "CVPACDEF","CVPACE","CVCHF","CVANGINA","CVHVALVE",
    "CVOTHR","CVOTHRX",
    "CBSTROKE","STROKMUL","NACCSTYR",
    "CBTIA","TIAMULT","NACCTIYR",
    "PD","PDYR","PDOTHR","PDOTHRYR",
    "SEIZURES",
    "TBI","TBIBRIEF","TRAUMBRF","TBIEXTEN","TRAUMEXT","TBIWOLOS","TRAUMCHR","TBIYEAR",
    "NCOTHR","NCOTHRX",
    "DIABETES","DIABTYPE",
    "HYPERTEN","HYPERCHO","B12DEF",

    # --- A4: Medications ---
    "ANYMEDS"
] + [f"DRUG{i}" for i in range(1, 41)] + [
    "NACCAPSY","NACCAANX","NACCADMD","NACCPDMD","NACCEMD","NACCEPMD",
    "NACCDBMD","NACCAMD","NACCAHTN","NACCHTNC",

    # --- D2: Clinician Medical Conditions ---
    "CANCER","CANCSITE","DIABET","MYOINF","CONGHRT","AFIBRILL",
    "HYPERT","ANGINA","HYPCHOL","VB12DEF","THYDIS","ARTH","ARTYPE",
    "ARTYPEX","ARTUPEX","ARTLOEX","ARTSPIN","ARTUNKN",
    "URINEINC","BOWLINC","SLEEPAP","REMDIS","HYPOSOM","SLEEPOTH","SLEEPOTX",
    "ANGIOCP","ANGIOPCI","PACEMAKE","HVALVE","ANTIENC",

    # --- Physical Exam (b1 section) ---
    "HEIGHT","WEIGHT","NACCBMI","BPSYS","BPDIAS","HRATE",
    "VISION","VISCORR","VISWCORR"
]
df = df.drop(columns=[col for col in medical_cols if col in df.columns])


In [7]:
df.shape

(195196, 860)

In [8]:
df.dtypes.value_counts()

Unnamed: 0,count
int64,665
float64,102
object,93


In [9]:
object_columns = df.select_dtypes(include='object').columns.tolist()
print(object_columns)

['NACCID', 'PACKET', 'HISPORX', 'RACEX', 'RACESECX', 'RACETERX', 'PRIMLANX', 'INHISPOX', 'INRACEX', 'INRASECX', 'INRATERX', 'INRELTOX', 'NACCAMX', 'NACCAMSX', 'NACCFMX', 'NACCFMSX', 'NACCOMX', 'NACCOMSX', 'CVDIMAGX', 'SPEECHX', 'FACEXPX', 'TRESTFAX', 'TRESTRHX', 'TRESTLHX', 'TRESTRFX', 'TRESTLFX', 'TRACTRHX', 'TRACTLHX', 'RIGDNEX', 'RIGDUPRX', 'RIGDUPLX', 'RIGDLORX', 'RIGDLOLX', 'TAPSRTX', 'TAPSLFX', 'HANDMVRX', 'HANDMVLX', 'HANDATRX', 'HANDATLX', 'LEGRTX', 'LEGLFX', 'ARISINGX', 'POSTUREX', 'GAITX', 'POSSTABX', 'BRADYKIX', 'NPIQINFX', 'OTHNEURX', 'COGOTHRX', 'NACCCGFX', 'COGMODEX', 'BEOTHRX', 'NACCBEFX', 'BEMODEX', 'MOMODEX', 'MMSELANX', 'NPSYLANX', 'MOCALANX', 'RESPOTHX', 'OTHBIOMX', 'OTHMUTX', 'FTLDSUBX', 'OTHCOGX', 'OTHPSYX', 'COGOTHX', 'COGOTH2X', 'COGOTH3X', 'ANTIENCX', 'OTHCONDX', 'ADGCRND', 'ADGCEXR', 'NGDSGWAC', 'NGDSEXAC', 'NGDSWGAC', 'NGDSWEAC', 'NPFIXX', 'NPTANX', 'NPABANX', 'NPASANX', 'NPTDPANX', 'NPHISOX', 'NPPATHOX', 'NACCWRI1', 'NACCWRI2', 'NACCWRI3', 'NPFAUT1', 'NPFAUT2

In [10]:
len(object_columns)

93

## Removing all non-int data

In [24]:
import numpy as np

object_cols = df.select_dtypes(include=['object']).columns
df = df.drop(columns=object_cols)

df = df.select_dtypes(include=[np.number])



## Handling missing values and initilizing dependant and in-dependant features

In [11]:
if "DEMENTED" in df.columns:
    target = "DEMENTED"
else:
    target = "NACCUDSD"   # fallback multi-class clinical diagnosis

features = df.drop(columns=[target])
y = df[target]


X = features.fillna(features.median())

# ---------------------------------------
# 7. Final ML-ready data
# ---------------------------------------
print("Features shape:", X.shape)
print("Target shape:", y.shape)


Features shape: (195196, 766)
Target shape: (195196,)


## Approaching PCA technique

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X) # Scaling between 1 and 0

pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f"PC{i+1}" for i in range(pca.n_components_)],
    index=X.columns
)

top_PC1 = loadings["PC1"].abs().sort_values(ascending=False).head(10)
print("Top PC1 Features:\n", top_PC1)

print("Explained Variance Ratio:", pca.explained_variance_ratio_)


# Model Training

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


## Logistic Regression

In [19]:
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)

print(classification_report(y_test, lr_preds))


              precision    recall  f1-score   support

           0       0.94      0.93      0.94     41285
           1       0.85      0.86      0.85     17274

    accuracy                           0.91     58559
   macro avg       0.89      0.90      0.90     58559
weighted avg       0.91      0.91      0.91     58559



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Model performance





In [23]:
confusion_matrix(y_test,lr_preds)

array([[38599,  2686],
       [ 2407, 14867]])