## **ModelX - Dementia Prediction (XPredators)**

# Data Exploration and Flow

## Loading the Dataset

In [1]:
!pip install gdown
import gdown
import pandas as pd

file_id = "19mKGPNFb35kG__3Eihazyv5O69ZUxDcF"
url = f"https://drive.google.com/uc?id={file_id}"

gdown.download(url, "temp.csv", quiet=False)

df = pd.read_csv("temp.csv")
df.head()




Downloading...
From (original): https://drive.google.com/uc?id=19mKGPNFb35kG__3Eihazyv5O69ZUxDcF
From (redirected): https://drive.google.com/uc?id=19mKGPNFb35kG__3Eihazyv5O69ZUxDcF&confirm=t&uuid=4c6bfc7d-4a74-409f-93b4-e21c21ba3e67
To: /content/temp.csv
100%|██████████| 509M/509M [00:08<00:00, 57.6MB/s]
  df = pd.read_csv("temp.csv")


Unnamed: 0,NACCID,NACCADC,PACKET,FORMVER,VISITMO,VISITDAY,VISITYR,NACCVNUM,NACCAVST,NACCNVST,...,NPATGAM1,NPATGAM2,NPATGAM3,NPATGAM4,NPATGAM5,NPATGFRN,NPATGFR1,NPATGFR2,NPATGFR3,NPATGFR4
0,NACC002909,186,I,3.0,12,28,2022,1,2,2,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
1,NACC002909,186,F,3.0,1,23,2024,2,2,2,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
2,NACC003487,186,I,3.0,11,15,2023,1,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
3,NACC004352,186,I,3.0,10,5,2021,1,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
4,NACC004687,186,I,3.0,11,14,2022,1,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4


## Checking dtype counts of columns

In [6]:
df.dtypes.value_counts()

Unnamed: 0,count
int64,665
float64,102
object,93


In [7]:
object_columns = df.select_dtypes(include='object').columns.tolist()
print(object_columns)

['NACCID', 'PACKET', 'HISPORX', 'RACEX', 'RACESECX', 'RACETERX', 'PRIMLANX', 'INHISPOX', 'INRACEX', 'INRASECX', 'INRATERX', 'INRELTOX', 'NACCAMX', 'NACCAMSX', 'NACCFMX', 'NACCFMSX', 'NACCOMX', 'NACCOMSX', 'CVDIMAGX', 'SPEECHX', 'FACEXPX', 'TRESTFAX', 'TRESTRHX', 'TRESTLHX', 'TRESTRFX', 'TRESTLFX', 'TRACTRHX', 'TRACTLHX', 'RIGDNEX', 'RIGDUPRX', 'RIGDUPLX', 'RIGDLORX', 'RIGDLOLX', 'TAPSRTX', 'TAPSLFX', 'HANDMVRX', 'HANDMVLX', 'HANDATRX', 'HANDATLX', 'LEGRTX', 'LEGLFX', 'ARISINGX', 'POSTUREX', 'GAITX', 'POSSTABX', 'BRADYKIX', 'NPIQINFX', 'OTHNEURX', 'COGOTHRX', 'NACCCGFX', 'COGMODEX', 'BEOTHRX', 'NACCBEFX', 'BEMODEX', 'MOMODEX', 'MMSELANX', 'NPSYLANX', 'MOCALANX', 'RESPOTHX', 'OTHBIOMX', 'OTHMUTX', 'FTLDSUBX', 'OTHCOGX', 'OTHPSYX', 'COGOTHX', 'COGOTH2X', 'COGOTH3X', 'ANTIENCX', 'OTHCONDX', 'ADGCRND', 'ADGCEXR', 'NGDSGWAC', 'NGDSEXAC', 'NGDSWGAC', 'NGDSWEAC', 'NPFIXX', 'NPTANX', 'NPABANX', 'NPASANX', 'NPTDPANX', 'NPHISOX', 'NPPATHOX', 'NACCWRI1', 'NACCWRI2', 'NACCWRI3', 'NPFAUT1', 'NPFAUT2

In [8]:
df['NACCUDSD'].value_counts()

Unnamed: 0_level_0,count
NACCUDSD,Unnamed: 1_level_1
1,94933
4,57590
3,34106
2,8567


1 = Normal cognition
2 = Impaired-not-MCI
3 = MCI
4 = Dementia

In [9]:
df['DEMENTED'].value_counts()

Unnamed: 0_level_0,count
DEMENTED,Unnamed: 1_level_1
0,137606
1,57590


## Dropping all the medical-related columns (Keep only non-medical variables)

In [10]:
medical_cols = [
    # --- A5: Subject Health History ---
    "THYROID","ARTHRIT","ARTHTYPE","ARTHTYPX","ARTHUPEX","ARTHLOEX","ARTHSPIN","ARTHUNK",
    "INCONTU","INCONTF","APNEA","RBD","INSOMN","OTHSLEEP","OTHSLEEX",
    "ALCOHOL","ABUSOTHR","ABUSX","PTSD","BIPOLAR","SCHIZ","DEP2YRS",
    "DEPOTHR","ANXIETY","OCD","NPSYDEV","PSYCDIS","PSYCDISX",
    "TOBAC30","TOBAC100","SMOKYRS","PACKSPER","QUITSMOK",
    "ALCOCCAS","ALCFREQ",
    "CVHATT","HATTMULT","HATTYEAR","CVAFIB","CVANGIO","CVBYPASS",
    "CVPACDEF","CVPACE","CVCHF","CVANGINA","CVHVALVE",
    "CVOTHR","CVOTHRX",
    "CBSTROKE","STROKMUL","NACCSTYR",
    "CBTIA","TIAMULT","NACCTIYR",
    "PD","PDYR","PDOTHR","PDOTHRYR",
    "SEIZURES",
    "TBI","TBIBRIEF","TRAUMBRF","TBIEXTEN","TRAUMEXT","TBIWOLOS","TRAUMCHR","TBIYEAR",
    "NCOTHR","NCOTHRX",
    "DIABETES","DIABTYPE",
    "HYPERTEN","HYPERCHO","B12DEF",

    # --- A4: Medications ---
    "ANYMEDS"
] + [f"DRUG{i}" for i in range(1, 41)] + [
    "NACCAPSY","NACCAANX","NACCADMD","NACCPDMD","NACCEMD","NACCEPMD",
    "NACCDBMD","NACCAMD","NACCAHTN","NACCHTNC",

    # --- D2: Clinician Medical Conditions ---
    "CANCER","CANCSITE","DIABET","MYOINF","CONGHRT","AFIBRILL",
    "HYPERT","ANGINA","HYPCHOL","VB12DEF","THYDIS","ARTH","ARTYPE",
    "ARTYPEX","ARTUPEX","ARTLOEX","ARTSPIN","ARTUNKN",
    "URINEINC","BOWLINC","SLEEPAP","REMDIS","HYPOSOM","SLEEPOTH","SLEEPOTX",
    "ANGIOCP","ANGIOPCI","PACEMAKE","HVALVE","ANTIENC",

    # --- Physical Exam (b1 section) ---
    "HEIGHT","WEIGHT","NACCBMI","BPSYS","BPDIAS","HRATE",
    "VISION","VISCORR","VISWCORR"
]
df = df.drop(columns=[col for col in medical_cols if col in df.columns])


In [11]:
df.shape

(195196, 860)

In [12]:
df.dtypes.value_counts()

Unnamed: 0,count
int64,665
float64,102
object,93


In [13]:
object_columns = df.select_dtypes(include='object').columns.tolist()
print(object_columns)

['NACCID', 'PACKET', 'HISPORX', 'RACEX', 'RACESECX', 'RACETERX', 'PRIMLANX', 'INHISPOX', 'INRACEX', 'INRASECX', 'INRATERX', 'INRELTOX', 'NACCAMX', 'NACCAMSX', 'NACCFMX', 'NACCFMSX', 'NACCOMX', 'NACCOMSX', 'CVDIMAGX', 'SPEECHX', 'FACEXPX', 'TRESTFAX', 'TRESTRHX', 'TRESTLHX', 'TRESTRFX', 'TRESTLFX', 'TRACTRHX', 'TRACTLHX', 'RIGDNEX', 'RIGDUPRX', 'RIGDUPLX', 'RIGDLORX', 'RIGDLOLX', 'TAPSRTX', 'TAPSLFX', 'HANDMVRX', 'HANDMVLX', 'HANDATRX', 'HANDATLX', 'LEGRTX', 'LEGLFX', 'ARISINGX', 'POSTUREX', 'GAITX', 'POSSTABX', 'BRADYKIX', 'NPIQINFX', 'OTHNEURX', 'COGOTHRX', 'NACCCGFX', 'COGMODEX', 'BEOTHRX', 'NACCBEFX', 'BEMODEX', 'MOMODEX', 'MMSELANX', 'NPSYLANX', 'MOCALANX', 'RESPOTHX', 'OTHBIOMX', 'OTHMUTX', 'FTLDSUBX', 'OTHCOGX', 'OTHPSYX', 'COGOTHX', 'COGOTH2X', 'COGOTH3X', 'ANTIENCX', 'OTHCONDX', 'ADGCRND', 'ADGCEXR', 'NGDSGWAC', 'NGDSEXAC', 'NGDSWGAC', 'NGDSWEAC', 'NPFIXX', 'NPTANX', 'NPABANX', 'NPASANX', 'NPTDPANX', 'NPHISOX', 'NPPATHOX', 'NACCWRI1', 'NACCWRI2', 'NACCWRI3', 'NPFAUT1', 'NPFAUT2

In [14]:
len(object_columns)

93

## Removing all non-int data

In [15]:
import numpy as np

object_cols = df.select_dtypes(include=['object']).columns
df = df.drop(columns=object_cols)

df = df.select_dtypes(include=[np.number])



## Handling missing values and initilizing dependant and in-dependant features

In [16]:
if "DEMENTED" in df.columns:
    target = "DEMENTED"
else:
    target = "NACCUDSD"   # fallback multi-class clinical diagnosis

features = df.drop(columns=[target])
y = df[target]


X = features.fillna(features.median())

# ---------------------------------------
# 7. Final ML-ready data
# ---------------------------------------
print("Features shape:", X.shape)
print("Target shape:", y.shape)


Features shape: (195196, 766)
Target shape: (195196,)


## Applying VarianceThreshold

In [17]:
from sklearn.feature_selection import VarianceThreshold
# Example: drop features with variance < 1
selector = VarianceThreshold(threshold=1)
X_reduced = selector.fit_transform(X)

# Keep only the selected columns
selected_cols = X.columns[selector.get_support()]
X_selected = X[selected_cols]

print("Remaining columns:", len(selected_cols))
print("Dropped columns:", len(X.columns) - len(selected_cols))


Remaining columns: 645
Dropped columns: 121


## Approaching PCA technique

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected) # Scaling between 1 and 0

pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f"PC{i+1}" for i in range(pca.n_components_)],
    index=X_selected.columns
)

top_PC1 = loadings["PC1"].abs().sort_values(ascending=False).head(10)
print("Top PC1 Features:\n", top_PC1)

print("Explained Variance Ratio:", pca.explained_variance_ratio_)


Top PC1 Features:
 DELIRIF     0.069433
MSAIF       0.069433
HIVIF       0.069426
FTLDMOIF    0.069420
SCHIZOIF    0.069418
IMPSUBIF    0.069412
EPILEPIF    0.069409
PTSDDXIF    0.069394
BIPOLDIF    0.069381
ESSTREIF    0.069347
Name: PC1, dtype: float64
Explained Variance Ratio: [0.29840745 0.15479302 0.07302789 0.05531144 0.04578605 0.03658191
 0.02853694 0.02459547 0.01600897 0.01272589 0.01153811 0.0103014
 0.00976755 0.00832827 0.00770637 0.0072006  0.00652689 0.00616085
 0.00533562 0.00501557 0.00461612 0.00453236 0.00429953 0.00418977
 0.00403184 0.00390578 0.00384545 0.00365486 0.00340461 0.00331237
 0.00304659 0.00292635 0.0028285  0.00278748 0.00272212 0.00264902
 0.00257168 0.00254132 0.00240037 0.00232889 0.0022083  0.00214355
 0.00212024 0.00206038 0.0020463  0.00193124 0.00186927 0.00183369
 0.00179276 0.00168254 0.00164998 0.00163572 0.0016002  0.00157004
 0.00155105 0.00153027 0.00149196 0.0014637  0.00145489 0.00141947
 0.00137044 0.00134828 0.00132105 0.00129893 0.001

In [20]:
loadings

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC73,PC74,PC75,PC76,PC77,PC78,PC79,PC80,PC81,PC82
NACCADC,-0.006004,0.002704,0.002979,0.006917,0.002231,-0.023504,0.013546,0.017615,0.026510,0.004422,...,0.042570,0.037153,-0.090989,0.015177,0.087322,-0.010368,0.105419,-0.027231,-0.025487,0.073888
VISITMO,-0.000140,-0.000323,0.000061,-0.001445,0.001123,0.000680,0.000578,0.000249,-0.002716,-0.002314,...,-0.002790,-0.003877,-0.003470,-0.000338,0.007786,-0.001394,-0.015286,-0.011734,-0.007191,0.003696
VISITDAY,0.000010,-0.000162,0.000780,-0.000545,0.000513,-0.001516,0.000266,-0.000901,-0.001873,-0.002855,...,-0.002250,0.005686,-0.006641,0.001034,0.002624,0.004588,-0.007772,-0.002929,-0.006023,0.000067
VISITYR,0.060817,0.013461,-0.007894,0.023058,-0.012764,0.008368,0.008754,-0.002003,0.053872,0.047350,...,0.013805,0.018636,0.007073,0.043409,0.048313,0.053526,0.009679,-0.068393,0.014111,-0.006392
NACCVNUM,0.020228,0.014121,-0.010280,0.032026,-0.015955,-0.024043,0.021098,0.019693,0.127731,0.066507,...,0.025567,0.031486,0.029661,0.043149,0.066279,0.061853,0.001482,-0.084703,0.018138,-0.013608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NPPOTH2,-0.016436,-0.003924,0.131693,0.009078,-0.017639,-0.019601,0.022880,0.018119,-0.000409,0.009256,...,-0.001571,-0.001410,-0.000903,0.001052,0.000421,-0.000949,0.000800,0.001526,-0.000761,0.000932
NPCOTH2,-0.016417,-0.003918,0.131558,0.009061,-0.017586,-0.019570,0.022790,0.017973,-0.000431,0.009044,...,-0.002001,-0.001257,-0.001095,0.001119,0.000079,-0.000677,0.000621,0.001376,-0.000467,0.000886
NPPOTH3,-0.016428,-0.003922,0.131671,0.009077,-0.017635,-0.019605,0.022879,0.018110,-0.000403,0.009208,...,-0.001647,-0.001483,-0.000927,0.001136,0.000292,-0.001065,0.000701,0.001519,-0.000734,0.000832
NPCOTH3,-0.016421,-0.003920,0.131605,0.009058,-0.017583,-0.019572,0.022864,0.017999,-0.000367,0.009084,...,-0.001605,-0.001445,-0.000973,0.001135,0.000108,-0.001219,0.001060,0.001405,-0.000749,0.000826


# Model Training

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)


In [39]:
X_train.shape

(136637, 645)

## Logistic Regression

In [40]:
lr_model = LogisticRegression()
lr_history = lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)

print(classification_report(y_test, lr_preds))


              precision    recall  f1-score   support

           0       0.94      0.93      0.93     41285
           1       0.84      0.85      0.84     17274

    accuracy                           0.91     58559
   macro avg       0.89      0.89      0.89     58559
weighted avg       0.91      0.91      0.91     58559



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### LR - Model performance





In [41]:
confusion_matrix(y_test,lr_preds)

array([[38472,  2813],
       [ 2635, 14639]])

## RandomForest Model

In [42]:
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42
)

rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)



### Rf - Model Performance

In [43]:
print(classification_report(y_test, rf_preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     41285
           1       1.00      1.00      1.00     17274

    accuracy                           1.00     58559
   macro avg       1.00      1.00      1.00     58559
weighted avg       1.00      1.00      1.00     58559



# Model Selection
Chose LogisticRegression Model due to more generelizaiton