<a href="https://colab.research.google.com/github/ShathuCodes/ModelX_Hackathon/blob/main/AI_Aces.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Upload the dataset and import the needed libraries

In [None]:
from google.colab import files

# Ask the user to upload the dataset through google.colab's `files` helper.
# The returned value is kept in `uploaded` but is not referenced again in the
# visible cells; the data is later read from a path on disk instead.
uploaded = files.upload()

In [None]:
import pandas as pd
import numpy as np

## Load the dataset into `df`

In [None]:
# Load the dataset straight from the zip archive at a hard-coded /content path.
# NOTE(review): presumably this is where the Colab upload lands — confirm; reading
# a .zip directly relies on pandas inferring the compression from the extension.
df = pd.read_csv("/content/Dementia Prediction Dataset.zip")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

## Keep only the needed columns

In [None]:
# Candidate columns to keep from the raw dataset, grouped by topic.
# Names that are absent from the loaded frame are skipped by the selection
# step, so a misspelled or accidentally merged name silently drops a column.
# Every name must appear exactly once: repeated names produce duplicate
# DataFrame columns when the list is used for selection.
demographic_columns = [
    # Identifier and visit info
    'NACCID', 'NACCADC', 'PACKET', 'FORMVER', 'VISITMO', 'VISITDAY', 'VISITYR',
    'NACCVNUM', 'NACCAVST', 'NACCNVST',

    # TARGET VARIABLE - Dementia Status
    'DEMENTED',

    # Alternative target variables (optional)
    'NACCUDSD', 'NORMCOG', 'CDRGLOB',

    # Demographics & Baseline
    'NACCAGE', 'EDUC', 'SEX', 'NACCNIHR', 'AGE',

    # Cognitive Test Scores
    'NACCMMSE', 'NACCMOCA', 'ANIMALS', 'VEG', 'TRAILA', 'TRAILB',
    'LOGIMEM', 'MEMUNITS', 'BOSTON', 'DIGIF', 'DIGIB',

    # Functional Assessment (FAQ)
    'BILLS', 'TAXES', 'SHOPPING', 'GAMES', 'STOVE', 'MEALPREP',
    'EVENTS', 'PAYATIN', 'REMDATES', 'TRAVEL',

    # Clinical Dementia Rating (CDR); 'CDRGLOB' is already listed above
    'CDRSUM', 'MEMORY', 'ORIENT', 'JUDGMENT',
    'COMMUN', 'HOMEHOBB', 'PERSCARE',

    # Neuropsychiatric Symptoms (NPI-Q)
    'DEL', 'HALL', 'AGIT', 'DEPD', 'ANX', 'ELAT', 'APA',
    'DISN', 'IRR', 'MOT', 'NITE', 'APP',

    # Clinician Judgment of Symptoms
    # NOTE(review): was 'COGIUDG', which looks like a typo for the NACC
    # variable COGJUDG — confirm against the dataset's column names.
    'COGMEM', 'COGORI', 'COGJUDG', 'COGLANG', 'COGVIS', 'COGATTN',

    # Medical History
    'CVHATT', 'CVAFIB', 'CBSTROKE', 'CBTIA', 'DIABETES',
    'HYPERTEN', 'HYPERCHO', 'DEP2YRS', 'NACCTBI',

    # Family History
    # The trailing comma matters: without it Python concatenates the adjacent
    # string literals into the single bogus name 'NACCFAMTOBAC30'.
    'NACCFAM',

    # Lifestyle & living situation ('DIABETES' is already listed above)
    "TOBAC30", "TOBAC100", "SMOKYRS",
    "ALCOCCAS", "ALCFREQ",
    "INDEPEND", "RESIDENC", "MARISTAT", "NACCLIVS", "HANDED",
]

# Guard against re-introducing repeated names.
assert len(demographic_columns) == len(set(demographic_columns)), \
    "demographic_columns contains repeated names"


In [None]:
# Restrict the frame to the requested columns that are actually present,
# keeping the order in which demographic_columns lists them.
df = df.loc[:, [name for name in demographic_columns if name in df.columns]]
df

In [None]:
# Narrower subset: cognitive scores, demographics, lifestyle fields and the
# DEMENTED target.
# NOTE(review): df_filtered is not referenced again in the visible cells — the
# later validation step builds its own subset from df. Confirm whether this
# cell is still needed.
df_filtered = df[['NACCMMSE','SEX','ANIMALS','VEG','LOGIMEM','DEMENTED', 'NACCAGE','EDUC',  "TOBAC100", "SMOKYRS",
    "ALCOCCAS", "ALCFREQ","DIABETES", "INDEPEND", "RESIDENC", "MARISTAT", "NACCLIVS", "HANDED"]]

## Define the modelling columns and their validation rules

In [None]:
# 1) LIST of all variables used for modelling (features + target)
all_cols = [
    # Cognitive tests
    "NACCMMSE", "LOGIMEM", "ANIMALS", 'VEG',

    # Core demographics
    "SEX", "NACCAGE", "EDUC",

    # Lifestyle / history
    "TOBAC100", "SMOKYRS", "ALCOCCAS", "ALCFREQ",

    # Categorical variables
    "DIABETES", "INDEPEND", "RESIDENC",
    "MARISTAT", "NACCLIVS", "HANDED",

    # Target
    'DEMENTED',
]


# 2) VALIDATION RULES for each variable
# Each rule takes a pandas Series and returns a boolean Series that is True
# where the value is an accepted code / lies in the accepted range. Anything
# else (e.g. the special codes mentioned below) marks the row as invalid.
validation_rules = {

    # Cognitive tests
    "NACCMMSE":  lambda s: s.between(0, 30),
    "LOGIMEM":   lambda s: s.between(0, 25),
    "ANIMALS":   lambda s: s.between(0, 77),
    # VEG is later rescaled with the same 0-77 range as ANIMALS, so it needs
    # the same bound; without a rule, out-of-range codes reach training.
    "VEG":       lambda s: s.between(0, 77),

    # Sex (2 is recoded to 0 before the rules are applied)
    "SEX":       lambda s: s.isin([0, 1, 2]),

    # Numerical demographics
    "NACCAGE":   lambda s: s.between(10, 100),
    "EDUC":      lambda s: s.between(0, 36),

    # Lifestyle
    "TOBAC100":  lambda s: s.isin([0, 1]),
    "SMOKYRS":   lambda s: s.between(0, 88),
    "ALCOCCAS":  lambda s: s.isin([0, 1]),
    "ALCFREQ":   lambda s: s.between(0, 8),

    # Categorical with invalid codes 9, -4
    "DIABETES":  lambda s: s.isin([0, 1, 2, 3]),
    "INDEPEND":  lambda s: s.isin([1, 2, 3, 4]),
    "RESIDENC":  lambda s: s.isin([1, 2, 3, 4]),
    "MARISTAT":  lambda s: s.isin([1, 2, 3, 4, 5, 6]),
    "NACCLIVS":  lambda s: s.isin([1, 2, 3, 4, 5]),
    "HANDED":    lambda s: s.isin([1, 2, 3]),

    # Target: the classifier is binary, so only 0/1 labels are usable.
    "DEMENTED":  lambda s: s.isin([0, 1]),
}

# Every modelling column must have a rule, otherwise it is silently unchecked.
assert set(all_cols) == set(validation_rules), "all_cols and validation_rules disagree"


In [None]:
# Work on a copy restricted to the modelling columns.
df_sub = df[all_cols].copy()

# df may carry repeated column names (the column list used to build it can
# name a column twice). With a repeated name, df_sub[col] is a DataFrame
# rather than a Series, the rule below fails, and that column would be left
# unvalidated. Keep the first occurrence of each name before validating.
df_sub = df_sub.loc[:, ~df_sub.columns.duplicated()].copy()

# Recode SEX 2 -> 0 so the column is a 0/1 indicator.
df_sub["SEX"] = df_sub["SEX"].replace(2, 0)

# Row mask: a row survives only if it passes every rule.
mask = np.ones(len(df_sub), dtype=bool)

for col, rule in validation_rules.items():
    if col not in df_sub.columns:
        print(f"Warning: Column '{col}' not found in dataframe. Skipping.")
        continue

    try:
        result = rule(df_sub[col])
        # Missing values count as invalid; combine as a plain boolean array so
        # the in-place AND does not depend on pandas index alignment.
        mask &= result.fillna(False).to_numpy(dtype=bool)
    except Exception as e:
        # Best effort: report the failure and continue with the other rules.
        print(f" Error while validating {col}: {e}")
df_valid = df_sub[mask].copy()
df_valid.head()



In [None]:
# Drop repeated column names, keeping the first occurrence of each.
# The explicit copy makes df_valid an independent frame, so the in-place column
# assignments in the following cells do not trigger SettingWithCopyWarning.
df_valid = df_valid.loc[:, ~df_valid.columns.duplicated()].copy()


In [None]:
# Display the validated frame (rows that passed the validation rules).
df_valid

In [None]:
# Rescale the features to roughly [0, 1].
#
# All divisors are fixed constants rather than data-dependent maxima: the
# interactive prediction function hard-codes 88 (SMOKYRS), 8 (ALCFREQ),
# 110 (NACCAGE) and 36 (EDUC), so training must use exactly the same values or
# the model sees differently scaled inputs at inference time.

# This cell overwrites df_valid in place, so running it twice would rescale
# already-rescaled values. Raw ages are >= 10, rescaled ages are < 1.
if df_valid['NACCAGE'].max() <= 1:
    raise RuntimeError(
        "df_valid already looks normalized; re-run the validation cell before this one."
    )

# Cognitive scores are inverted: 0 = best possible score, 1 = worst.
df_valid['NACCMMSE'] = (30 - df_valid['NACCMMSE']) / 30
df_valid['ANIMALS']  = (77 - df_valid['ANIMALS']) / 77
df_valid['LOGIMEM']  = (25 - df_valid['LOGIMEM']) / 25
df_valid['VEG']      = (77 - df_valid['VEG']) / 77

# Lifestyle variables.
df_valid['SMOKYRS']   = df_valid['SMOKYRS'] / 88
df_valid['ALCOCCAS']  = df_valid['ALCOCCAS'].astype(float)  # already 0/1
df_valid['ALCFREQ']   = df_valid['ALCFREQ'] / 8
df_valid['TOBAC100']  = df_valid['TOBAC100'].astype(int)

# Demographics: age scaled up, education inverted (more years -> lower value).
df_valid['NACCAGE'] = df_valid['NACCAGE'] / 110
df_valid['EDUC'] = (36 - df_valid['EDUC']) / 36

# Ordinal codes mapped onto [0, 1].
df_valid['DIABETES'] = df_valid['DIABETES'] / 3
df_valid['INDEPEND'] = (df_valid['INDEPEND'] - 1) / 3
df_valid['RESIDENC'] = (df_valid['RESIDENC'] - 1) / 3

df_valid.head(10)


In [None]:
# Show the target column that the model will be trained to predict.
print(df_valid['DEMENTED'])

In [None]:
# Features and target
# The column order here defines the meaning of each weight and must match the
# order assumed by the prediction code.
feature_cols = [
    "NACCMMSE", "LOGIMEM", "ANIMALS", "VEG",
    "SEX", "NACCAGE", "EDUC",
    "TOBAC100", "SMOKYRS", "ALCOCCAS", "ALCFREQ",
    "DIABETES", "INDEPEND", "RESIDENC",
    "MARISTAT", "NACCLIVS", "HANDED",
]
# Force float so standardisation is never truncated by an integer dtype.
X = df_valid[feature_cols].to_numpy(dtype=float)  # shape (n_samples, n_features)
y = df_valid['DEMENTED'].values.reshape(-1,1)  # shape (n_samples, 1)

# Add bias term (intercept)
X = np.hstack([np.ones((X.shape[0],1)), X])  # shape (n_samples, n_features+1)

# Feature scaling (the bias column is left untouched)
X_mean = X[:,1:].mean(axis=0)
X_std = X[:,1:].std(axis=0)
# A constant column has zero standard deviation; dividing by it would fill the
# column with NaN. Use 1.0 there so the centred column is simply all zeros.
# X_std is stored with the fix so later scaling of new inputs is safe too.
X_std[X_std == 0] = 1.0
X[:,1:] = (X[:,1:] - X_mean) / X_std

In [None]:
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    Args:
        z: Scalar or array-like of real values.

    Returns:
        The element-wise sigmoid, with the same shape as ``z`` (a NumPy scalar
        for scalar input).
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) is always in (0, 1], so it cannot overflow, unlike exp(-z) for
    # large negative z.
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves real
    # arrays unchanged.
    return result[()]


In [None]:
def compute_loss(y, y_pred):
    """Mean binary cross-entropy between labels ``y`` and probabilities ``y_pred``.

    A small constant is added inside each logarithm so that predictions of
    exactly 0 or 1 do not produce ``log(0)``. The sum is divided by the number
    of rows in ``y``.
    """
    eps = 1e-8
    n_samples = y.shape[0]
    positive_term = y * np.log(y_pred + eps)
    negative_term = (1 - y) * np.log(1 - y_pred + eps)
    return -(1 / n_samples) * np.sum(positive_term + negative_term)


In [None]:
import matplotlib.pyplot as plt

# Batch gradient descent for logistic regression.

# Initialize weights (one per column of X, including the bias column)
n_features = X.shape[1]
weights = np.zeros((n_features,1))

# Hyperparameters
learning_rate = 0.01
iterations = 5000
record_every = 50    # store metrics for the plots every N iterations
print_every = 500    # print progress every N iterations
m = X.shape[0]

# History lists for plotting
loss_history = []
accuracy_history = []
iteration_steps = []

for i in range(iterations):
    # Forward pass
    z = np.dot(X, weights)
    y_pred = sigmoid(z)

    # Compute gradient of the mean cross-entropy loss
    gradient = (1/m) * np.dot(X.T, (y_pred - y))

    # Update weights
    weights -= learning_rate * gradient

    # Metrics are computed whenever they are recorded OR printed, so the
    # printed values are always the ones for the current iteration, even if
    # print_every is not a multiple of record_every. They describe the
    # predictions made with the weights from before this iteration's update.
    should_record = i % record_every == 0
    should_print = i % print_every == 0
    if should_record or should_print:
        current_loss = compute_loss(y, y_pred)
        y_pred_class_current = (y_pred >= 0.5).astype(int)
        current_accuracy = np.mean(y_pred_class_current == y)

    if should_record:
        loss_history.append(current_loss)
        accuracy_history.append(current_accuracy)
        iteration_steps.append(i)

    # Print loss occasionally
    if should_print:
        print(f"Iteration {i}, Loss: {current_loss:.4f}, Accuracy: {current_accuracy:.4f}")

# Side-by-side training curves: loss on the left, accuracy on the right.
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: loss
loss_ax.plot(iteration_steps, loss_history, label='Training Loss')
loss_ax.set_title('Loss over Iterations')
loss_ax.set_xlabel('Iterations')
loss_ax.set_ylabel('Loss')
loss_ax.grid(True)
loss_ax.legend()

# Right panel: accuracy
accuracy_ax.plot(iteration_steps, accuracy_history, label='Training Accuracy', color='orange')
accuracy_ax.set_title('Accuracy over Iterations')
accuracy_ax.set_xlabel('Iterations')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.grid(True)
accuracy_ax.legend()

fig.tight_layout()
plt.show()


In [None]:
# Score the training data with the fitted weights.
y_prob = sigmoid(X @ weights)

# Threshold the probabilities at 0.5 to get hard 0/1 labels.
y_pred_class = np.where(y_prob >= 0.5, 1, 0)

# Share of rows whose predicted label equals the true label.
accuracy = (y_pred_class == y).mean()
print("Training Accuracy:", accuracy)


In [None]:
import pickle

# Bundle everything needed to score new inputs: the fitted weights and the
# per-feature mean / standard deviation used for standardisation.
model_data = dict(weights=weights, X_mean=X_mean, X_std=X_std)

# Serialise the bundle to disk.
with open("dementia_model.p", "wb") as model_file:
    pickle.dump(model_data, model_file)
print("Model saved as dementia_model.p")


In [None]:
# Sanity check: report the shape of each array that makes up the model.
for label, array in (("X", X), ("weights", weights), ("X_mean", X_mean), ("X_std", X_std)):
    print(f"{label} shape:", array.shape)


In [None]:
# List the columns of df_valid.
# NOTE(review): the label says "X columns", but df_valid also holds the
# DEMENTED target, which is not part of X, and X has an extra leading bias column.
print("X columns:")
print(df_valid.columns)


In [None]:
# Input specification for predict_user_input, one entry per model feature and
# in the same order as the training feature matrix. Each entry is
#   (name, prompt, is_integer, low, high, kind, scale)
# where low/high is the accepted raw range and kind/scale describe how a raw
# answer is mapped onto the pre-standardisation model scale:
#   "inverted": (scale - raw) / scale
#   "ratio":    raw / scale
#   "shifted":  (raw - 1) / scale
#   "identity": raw
# The scales must equal the divisors applied to each column when the model was
# trained — verify against the normalization cell (notably 110 for NACCAGE).
_PREDICT_FEATURES = [
    ("NACCMMSE", "Please enter your memory & thinking test score (0–30): ", False, 0, 30, "inverted", 30),
    ("LOGIMEM", "Please enter your logical memory score (0–25): ", False, 0, 25, "inverted", 25),
    ("ANIMALS", "How many animals can you name within one minute? (0–77): ", False, 0, 77, "inverted", 77),
    ("VEG", "How many vegetables can you name within one minute? (0–77): ", False, 0, 77, "inverted", 77),

    ("SEX", "What is your sex? (0 = Female, 1 = Male): ", True, 0, 1, "identity", 1),
    ("NACCAGE", "What is your age? (10–100): ", False, 10, 100, "ratio", 110),
    ("EDUC", "How many years of formal education have you completed? (0–36): ", False, 0, 36, "inverted", 36),

    ("TOBAC100", "Have you smoked at least 100 cigarettes in your life? (0 = No, 1 = Yes): ", True, 0, 1, "identity", 1),
    ("SMOKYRS", "For how many total years have you smoked? (0–88): ", False, 0, 88, "ratio", 88),
    ("ALCOCCAS", "Do you drink alcohol occasionally? (0 = No, 1 = Yes): ", True, 0, 1, "identity", 1),
    ("ALCFREQ", "How often do you drink alcohol? (0 = Never to 8 = Daily): ", False, 0, 8, "ratio", 8),

    ("DIABETES", "What is your diabetes status? (0=None, 1=Adult onset, 2=Childhood onset, 3=Unknown): ", True, 0, 3, "ratio", 3),
    ("INDEPEND", "What is your level of independence? (1=Independent, 2=Some help, 3=Needs help, 4=Fully dependent): ", True, 1, 4, "shifted", 3),
    ("RESIDENC", "What is your living situation? (1=House, 2=Apartment, 3=Assisted living, 4=Nursing home): ", True, 1, 4, "shifted", 3),
    ("MARISTAT", "What is your marital status? (1=Married, 2=Widowed, 3=Divorced, 4=Separated, 5=Never married, 6=Other): ", True, 1, 6, "identity", 1),
    ("NACCLIVS", "Who do you currently live with? (1=Alone, 2=Spouse, 3=Family, 4=Friends, 5=Other): ", True, 1, 5, "identity", 1),
    ("HANDED", "Are you right-handed, left-handed, or ambidextrous? (1=Right, 2=Left, 3=Both): ", True, 1, 3, "identity", 1),
]


def _to_model_scale(value, kind, scale):
    """Map a raw answer onto the pre-standardisation model scale."""
    if kind == "inverted":
        return (scale - value) / scale
    if kind == "ratio":
        return value / scale
    if kind == "shifted":
        return (value - 1) / scale
    return value


def _to_raw_scale(value, kind, scale):
    """Inverse of _to_model_scale: map a model-scale value back to raw units."""
    if kind == "inverted":
        return scale - value * scale
    if kind == "ratio":
        return value * scale
    if kind == "shifted":
        return value * scale + 1
    return value


def predict_user_input(weights, X_mean, X_std, input_fn=None):
    """Interactively collect feature values and print a dementia prediction.

    For every feature the user is prompted for a raw value. Pressing ENTER,
    typing something unparsable, or typing a value outside the range shown in
    the prompt falls back to the training mean of that feature (converted back
    to raw units).

    Args:
        weights: Fitted weights, bias first, followed by one weight per feature.
        X_mean: Per-feature means of the (pre-standardisation) training features.
        X_std: Per-feature standard deviations of the same features.
        input_fn: Callable used to read one answer given a prompt string.
            Defaults to the built-in ``input``.

    Returns:
        None. The probability and the predicted class are printed.

    Raises:
        ValueError: If X_mean or X_std does not have one entry per feature.
    """
    if input_fn is None:
        input_fn = input

    X_mean = np.asarray(X_mean, dtype=float)
    X_std = np.asarray(X_std, dtype=float)
    n_expected = len(_PREDICT_FEATURES)
    if X_mean.shape != (n_expected,) or X_std.shape != (n_expected,):
        raise ValueError(
            f"X_mean and X_std must each have shape ({n_expected},), "
            f"got {X_mean.shape} and {X_std.shape}"
        )

    print("\nEnter values for prediction (press ENTER for mean):\n")

    # Training means expressed in raw units; used as the default answers.
    raw_means = np.array(
        [
            _to_raw_scale(X_mean[i], kind, scale)
            for i, (_, _, _, _, _, kind, scale) in enumerate(_PREDICT_FEATURES)
        ],
        dtype=float,
    )

    user_vals = []
    for i, (_, prompt, is_integer, low, high, _, _) in enumerate(_PREDICT_FEATURES):
        default_val = raw_means[i]
        raw = input_fn(f"{prompt} (default={default_val}): ")

        if raw.strip() == "":
            user_vals.append(default_val)
            continue

        try:
            value = int(raw) if is_integer else float(raw)
        except ValueError:
            # Only parse failures are expected here; anything else (e.g.
            # KeyboardInterrupt) must propagate.
            print("Invalid entry → using mean")
            user_vals.append(default_val)
            continue

        # The model was only trained on values inside these ranges. NaN fails
        # this comparison as well and is therefore rejected.
        if not (low <= value <= high):
            print("Value out of range → using mean")
            user_vals.append(default_val)
            continue

        user_vals.append(value)

    # Same rescaling as applied to the training data, in the same column order.
    processed = np.array(
        [
            _to_model_scale(user_vals[i], kind, scale)
            for i, (_, _, _, _, _, kind, scale) in enumerate(_PREDICT_FEATURES)
        ],
        dtype=float,
    )

    # Standardise with the training statistics; a zero standard deviation
    # (constant training column) would otherwise produce inf/NaN.
    safe_std = np.where(X_std == 0, 1.0, X_std)
    x_scaled = (processed - X_mean) / safe_std
    x_final = np.insert(x_scaled, 0, 1)  # prepend the bias term

    prob = sigmoid(np.dot(x_final, weights))
    prob = float(np.asarray(prob).item())

    cls = int(prob >= 0.5)

    print("\n---------------------------")
    print("Predicted Probability:", prob)
    print("Predicted Class:", cls)
    print("(1 = Demented, 0 = Not Demented)")
    print("---------------------------")
# Run one interactive prediction using the fitted weights and the scaling
# statistics computed from the training features.
predict_user_input(weights, X_mean, X_std)