## **ModelX - Dementia Prediction (XPredators)**

# Data Exploration and Flow

## Loading the Dataset

In [2]:
!pip install gdown
import gdown
import pandas as pd

file_id = "19mKGPNFb35kG__3Eihazyv5O69ZUxDcF"
url = f"https://drive.google.com/uc?id={file_id}"

gdown.download(url, "temp.csv", quiet=False)

df = pd.read_csv("temp.csv")
df.head()




Downloading...
From (original): https://drive.google.com/uc?id=19mKGPNFb35kG__3Eihazyv5O69ZUxDcF
From (redirected): https://drive.google.com/uc?id=19mKGPNFb35kG__3Eihazyv5O69ZUxDcF&confirm=t&uuid=118bcfe3-73f5-40ce-90b8-dc346cdf1c08
To: /content/temp.csv
100%|██████████| 509M/509M [00:03<00:00, 161MB/s]
  df = pd.read_csv("temp.csv")


Unnamed: 0,NACCID,NACCADC,PACKET,FORMVER,VISITMO,VISITDAY,VISITYR,NACCVNUM,NACCAVST,NACCNVST,...,NPATGAM1,NPATGAM2,NPATGAM3,NPATGAM4,NPATGAM5,NPATGFRN,NPATGFR1,NPATGFR2,NPATGFR3,NPATGFR4
0,NACC002909,186,I,3.0,12,28,2022,1,2,2,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
1,NACC002909,186,F,3.0,1,23,2024,2,2,2,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
2,NACC003487,186,I,3.0,11,15,2023,1,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
3,NACC004352,186,I,3.0,10,5,2021,1,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
4,NACC004687,186,I,3.0,11,14,2022,1,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4


## Checking dtype counts of columns

In [3]:
df.dtypes.value_counts()

Unnamed: 0,count
int64,747
object,142
float64,135


In [4]:
object_columns = df.select_dtypes(include='object').columns.tolist()
print(object_columns)

['NACCID', 'PACKET', 'HISPORX', 'RACEX', 'RACESECX', 'RACETERX', 'PRIMLANX', 'INHISPOX', 'INRACEX', 'INRASECX', 'INRATERX', 'INRELTOX', 'NACCAMX', 'NACCAMSX', 'NACCFMX', 'NACCFMSX', 'NACCOMX', 'NACCOMSX', 'DRUG1', 'DRUG2', 'DRUG3', 'DRUG4', 'DRUG5', 'DRUG6', 'DRUG7', 'DRUG8', 'DRUG9', 'DRUG10', 'DRUG11', 'DRUG12', 'DRUG13', 'DRUG14', 'DRUG15', 'DRUG16', 'DRUG17', 'DRUG18', 'DRUG19', 'DRUG20', 'DRUG21', 'DRUG22', 'DRUG23', 'DRUG24', 'DRUG25', 'DRUG26', 'DRUG27', 'DRUG28', 'DRUG29', 'DRUG30', 'DRUG31', 'DRUG32', 'DRUG33', 'DRUG34', 'DRUG35', 'DRUG36', 'DRUG37', 'DRUG38', 'DRUG39', 'DRUG40', 'CVOTHRX', 'NCOTHRX', 'ARTHTYPX', 'OTHSLEEX', 'ABUSX', 'PSYCDISX', 'CVDIMAGX', 'SPEECHX', 'FACEXPX', 'TRESTFAX', 'TRESTRHX', 'TRESTLHX', 'TRESTRFX', 'TRESTLFX', 'TRACTRHX', 'TRACTLHX', 'RIGDNEX', 'RIGDUPRX', 'RIGDUPLX', 'RIGDLORX', 'RIGDLOLX', 'TAPSRTX', 'TAPSLFX', 'HANDMVRX', 'HANDMVLX', 'HANDATRX', 'HANDATLX', 'LEGRTX', 'LEGLFX', 'ARISINGX', 'POSTUREX', 'GAITX', 'POSSTABX', 'BRADYKIX', 'NPIQINFX', '

In [5]:
df['NACCUDSD'].value_counts()

Unnamed: 0_level_0,count
NACCUDSD,Unnamed: 1_level_1
1,94933
4,57590
3,34106
2,8567


1 = Normal cognition
2 = Impaired-not-MCI
3 = MCI
4 = Dementia

In [6]:
df['DEMENTED'].value_counts()

Unnamed: 0_level_0,count
DEMENTED,Unnamed: 1_level_1
0,137606
1,57590


## Dropping all the medical-related columns (Keep only non-medical variables)

In [7]:
medical_cols = [
    # --- A5: Subject Health History ---
    "THYROID","ARTHRIT","ARTHTYPE","ARTHTYPX","ARTHUPEX","ARTHLOEX","ARTHSPIN","ARTHUNK",
    "INCONTU","INCONTF","APNEA","RBD","INSOMN","OTHSLEEP","OTHSLEEX",
    "ALCOHOL","ABUSOTHR","ABUSX","PTSD","BIPOLAR","SCHIZ","DEP2YRS",
    "DEPOTHR","ANXIETY","OCD","NPSYDEV","PSYCDIS","PSYCDISX",
    "TOBAC30","TOBAC100","SMOKYRS","PACKSPER","QUITSMOK",
    "ALCOCCAS","ALCFREQ",
    "CVHATT","HATTMULT","HATTYEAR","CVAFIB","CVANGIO","CVBYPASS",
    "CVPACDEF","CVPACE","CVCHF","CVANGINA","CVHVALVE",
    "CVOTHR","CVOTHRX",
    "CBSTROKE","STROKMUL","NACCSTYR",
    "CBTIA","TIAMULT","NACCTIYR",
    "PD","PDYR","PDOTHR","PDOTHRYR",
    "SEIZURES",
    "TBI","TBIBRIEF","TRAUMBRF","TBIEXTEN","TRAUMEXT","TBIWOLOS","TRAUMCHR","TBIYEAR",
    "NCOTHR","NCOTHRX",
    "DIABETES","DIABTYPE",
    "HYPERTEN","HYPERCHO","B12DEF",

    # --- A4: Medications ---
    "ANYMEDS"
] + [f"DRUG{i}" for i in range(1, 41)] + [
    "NACCAPSY","NACCAANX","NACCADMD","NACCPDMD","NACCEMD","NACCEPMD",
    "NACCDBMD","NACCAMD","NACCAHTN","NACCHTNC",

    # --- D2: Clinician Medical Conditions ---
    "CANCER","CANCSITE","DIABET","MYOINF","CONGHRT","AFIBRILL",
    "HYPERT","ANGINA","HYPCHOL","VB12DEF","THYDIS","ARTH","ARTYPE",
    "ARTYPEX","ARTUPEX","ARTLOEX","ARTSPIN","ARTUNKN",
    "URINEINC","BOWLINC","SLEEPAP","REMDIS","HYPOSOM","SLEEPOTH","SLEEPOTX",
    "ANGIOCP","ANGIOPCI","PACEMAKE","HVALVE","ANTIENC",

    # --- Physical Exam (b1 section) ---
    "HEIGHT","WEIGHT","NACCBMI","BPSYS","BPDIAS","HRATE",
    "VISION","VISCORR","VISWCORR"
]
df = df.drop(columns=[col for col in medical_cols if col in df.columns])


In [8]:
df.shape

(195196, 860)

In [9]:
df.dtypes.value_counts()

Unnamed: 0,count
int64,665
float64,102
object,93


In [10]:
object_columns = df.select_dtypes(include='object').columns.tolist()
print(object_columns)

['NACCID', 'PACKET', 'HISPORX', 'RACEX', 'RACESECX', 'RACETERX', 'PRIMLANX', 'INHISPOX', 'INRACEX', 'INRASECX', 'INRATERX', 'INRELTOX', 'NACCAMX', 'NACCAMSX', 'NACCFMX', 'NACCFMSX', 'NACCOMX', 'NACCOMSX', 'CVDIMAGX', 'SPEECHX', 'FACEXPX', 'TRESTFAX', 'TRESTRHX', 'TRESTLHX', 'TRESTRFX', 'TRESTLFX', 'TRACTRHX', 'TRACTLHX', 'RIGDNEX', 'RIGDUPRX', 'RIGDUPLX', 'RIGDLORX', 'RIGDLOLX', 'TAPSRTX', 'TAPSLFX', 'HANDMVRX', 'HANDMVLX', 'HANDATRX', 'HANDATLX', 'LEGRTX', 'LEGLFX', 'ARISINGX', 'POSTUREX', 'GAITX', 'POSSTABX', 'BRADYKIX', 'NPIQINFX', 'OTHNEURX', 'COGOTHRX', 'NACCCGFX', 'COGMODEX', 'BEOTHRX', 'NACCBEFX', 'BEMODEX', 'MOMODEX', 'MMSELANX', 'NPSYLANX', 'MOCALANX', 'RESPOTHX', 'OTHBIOMX', 'OTHMUTX', 'FTLDSUBX', 'OTHCOGX', 'OTHPSYX', 'COGOTHX', 'COGOTH2X', 'COGOTH3X', 'ANTIENCX', 'OTHCONDX', 'ADGCRND', 'ADGCEXR', 'NGDSGWAC', 'NGDSEXAC', 'NGDSWGAC', 'NGDSWEAC', 'NPFIXX', 'NPTANX', 'NPABANX', 'NPASANX', 'NPTDPANX', 'NPHISOX', 'NPPATHOX', 'NACCWRI1', 'NACCWRI2', 'NACCWRI3', 'NPFAUT1', 'NPFAUT2

In [11]:
len(object_columns)

93

## Removing all non-int data

In [12]:
import numpy as np

object_cols = df.select_dtypes(include=['object']).columns
df = df.drop(columns=object_cols)

df = df.select_dtypes(include=[np.number])



## Handling missing values and initilizing dependant and in-dependant features

In [13]:
if "DEMENTED" in df.columns:
    target = "DEMENTED"
else:
    target = "NACCUDSD"   # fallback multi-class clinical diagnosis

features = df.drop(columns=[target])
y = df[target]


X = features.fillna(features.median())

# ---------------------------------------
# 7. Final ML-ready data
# ---------------------------------------
print("Features shape:", X.shape)
print("Target shape:", y.shape)


Features shape: (195196, 766)
Target shape: (195196,)


## Applying VarianceThreshold

In [30]:
from sklearn.feature_selection import VarianceThreshold
# Example: drop features with variance < 30
selector = VarianceThreshold(threshold=30)
X_reduced = selector.fit_transform(X)

# Keep only the selected columns
selected_cols = X.columns[selector.get_support()]
X_selected = X[selected_cols]

print("Remaining columns:", len(selected_cols))
print("Dropped columns:", len(X.columns) - len(selected_cols))


Remaining columns: 216
Dropped columns: 550


## Approaching PCA technique

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected) # Scaling between 1 and 0

pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f"PC{i+1}" for i in range(pca.n_components_)],
    index=X_selected.columns
)

top_PC1 = loadings["PC1"].abs().sort_values(ascending=False).head(10)
print("Top PC1 Features:\n", top_PC1)

print("Explained Variance Ratio:", pca.explained_variance_ratio_)


Top PC1 Features:
 MSAIF       0.106183
DELIRIF     0.106183
HIVIF       0.106180
SCHIZOIF    0.106168
IMPSUBIF    0.106161
FTLDMOIF    0.106158
EPILEPIF    0.106140
PTSDDXIF    0.106140
BIPOLDIF    0.106115
ESSTREIF    0.106072
Name: PC1, dtype: float64
Explained Variance Ratio: [0.34205997 0.10691707 0.09689627 0.06316316 0.03929836 0.02876998
 0.02106033 0.01885983 0.01673234 0.01505249 0.01321483 0.01211004
 0.01126559 0.01026935 0.00997407 0.00959642 0.00804772 0.00772833
 0.00706866 0.00674394 0.00626548 0.00590818 0.00583204 0.00546167
 0.00537944 0.0051708  0.00506344 0.00473005 0.0042911  0.0041911
 0.00414679 0.00411666 0.00395804 0.00379387 0.0036492  0.00358118
 0.0034544  0.00336048 0.0032949  0.00318846 0.00310072 0.00300489
 0.00288791 0.00254505 0.00245186 0.00243845]


In [24]:
loadings

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC37,PC38,PC39,PC40,PC41,PC42,PC43,PC44,PC45,PC46
NACCADC,-0.007562,0.011852,0.000026,0.011968,-0.006027,0.049844,-0.036311,-0.051959,-0.040521,0.078373,...,0.046359,0.094563,-0.290913,0.337493,0.039122,-0.191650,-0.109798,0.026991,0.090472,0.081252
VISITDAY,0.000022,-0.000164,-0.000615,-0.000725,0.000812,-0.001217,-0.002211,0.004338,-0.001228,0.007133,...,0.010988,0.005066,0.008070,-0.014608,0.003815,0.005822,-0.004651,-0.000058,-0.008529,-0.000680
VISITYR,0.091979,-0.022323,0.028177,0.009177,0.002108,0.052893,0.004071,-0.013327,-0.009200,-0.093505,...,0.033209,0.000246,-0.012620,0.040680,-0.036494,0.027821,0.004756,-0.010848,0.015610,0.013723
NACCDAYS,0.003537,0.001540,0.020908,0.010053,-0.052041,0.222289,-0.030591,-0.121088,-0.015384,0.062232,...,-0.084454,0.050856,0.216677,-0.076653,-0.144242,0.148393,-0.076078,-0.014696,-0.059092,-0.053868
NACCFDYS,0.035844,0.002244,0.028945,0.034173,-0.013700,0.124443,-0.034088,-0.175048,-0.110407,-0.014228,...,-0.119472,0.012368,0.269987,-0.108469,-0.122510,0.206650,-0.169030,-0.054707,-0.041655,-0.020291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NPINF4D,-0.021089,0.015966,-0.034776,0.232376,-0.040770,0.024785,0.019794,0.089458,-0.044162,-0.029251,...,-0.030386,-0.007909,0.005257,-0.008163,0.007299,0.005170,-0.018396,-0.003447,-0.007619,-0.005867
NPINF4F,-0.021155,0.016009,-0.034868,0.232724,-0.040542,0.024823,0.019823,0.089235,-0.044228,-0.029045,...,-0.030786,-0.007519,0.004572,-0.008792,0.007581,0.004868,-0.017878,-0.003439,-0.007410,-0.005809
NACCDAGE,0.031860,-0.018253,0.035374,-0.164027,-0.020256,0.054011,0.037666,0.155241,-0.099577,-0.138220,...,-0.010851,0.003370,0.020049,0.052259,-0.123537,-0.008458,0.206459,0.004428,-0.001970,-0.007140
NACCINT,0.031616,-0.018220,0.035137,-0.164879,-0.019693,0.052105,0.037948,0.154048,-0.097041,-0.139476,...,-0.006808,0.005802,0.016660,0.051192,-0.114546,-0.010689,0.197152,0.005688,-0.000241,-0.006241


# Model Training

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)


In [33]:
X_train

Unnamed: 0,NACCADC,VISITDAY,NACCDAYS,NACCFDYS,BIRTHYR,HISPOR,RACE,RACESEC,RACETER,EDUC,...,NPINF2F,NPINF3B,NPINF3D,NPINF3F,NPINF4B,NPINF4D,NPINF4F,NACCDAGE,NACCINT,NPCHROM
2835,289,21,728,728,1929,88,1,88,88,14,...,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,81,8,50.0
130545,6499,28,2984,2347,1941,88,2,88,88,18,...,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,888,888,-4.0
92402,4967,9,4748,4096,1935,88,1,88,88,18,...,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,888,888,-4.0
103222,5452,5,5881,5071,1955,88,2,88,88,12,...,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,888,888,-4.0
156622,8646,22,5796,5796,1943,88,2,88,88,19,...,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,888,888,-4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,6061,22,2533,1751,1927,88,1,88,88,16,...,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,888,888,-4.0
103694,5452,19,3291,0,1958,88,1,88,88,16,...,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,888,888,-4.0
131932,6499,3,4131,2195,1945,88,1,88,88,18,...,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,888,888,-4.0
146867,8361,2,1883,1523,1922,88,1,88,88,13,...,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,-4.4,89,12,50.0


## Logistic Regression

In [34]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# --- Scale the data properly ---
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)   # fit on train, transform train
X_test_scaled  = scaler.transform(X_test)        # ONLY transform test

# --- Train Logistic Regression ---
lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)

# --- Predict ---
lr_preds = lr_model.predict(X_test_scaled)

# --- Evaluation ---
print(classification_report(y_test, lr_preds))


              precision    recall  f1-score   support

           0       0.98      0.97      0.97     41285
           1       0.92      0.94      0.93     17274

    accuracy                           0.96     58559
   macro avg       0.95      0.95      0.95     58559
weighted avg       0.96      0.96      0.96     58559



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### LR - Model performance





In [29]:
confusion_matrix(y_test,lr_preds)

array([[40678,   607],
       [  634, 16640]])

## RandomForest Model

In [35]:
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42
)

rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)



### Rf - Model Performance

In [36]:
print(classification_report(y_test, rf_preds))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98     41285
           1       0.95      0.96      0.95     17274

    accuracy                           0.97     58559
   macro avg       0.96      0.97      0.97     58559
weighted avg       0.97      0.97      0.97     58559



# Model Selection
Chose LogisticRegression Model due to more generelizaiton