Data Loading, Cleaning, Preprocessing and Splitting (Categorical Encoding)

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

# Load dataset
file_path = "final_preprocessed_dataset.csv"
data = pd.read_csv(file_path, low_memory=False)

print("Dataset loaded successfully.")
print(f"Dataset shape: {data.shape}")
print("First 5 rows of the dataset:")
print(data.head())
print("\nChecking for missing values:")
missing_values = data.isnull().mean() * 100
print(missing_values[missing_values > 0].sort_values(ascending=False))

# Define the columns to derive the target variable
target_column = "cognitive_status"  
target_columns = [
    'diagnostic_summary_changes_bcadas',
    'diagnostic_summary_changes_bcmmse',
    'diagnostic_summary_changes_bcmmsrec',
    'diagnostic_summary_changes_bcnmmms',
    'diagnostic_summary_changes_bcnonmem',
    'diagnostic_summary_changes_bccdr'
]

# Define a function to create the target variable
def create_target(df, columns):
    target = df[columns].apply(lambda x: 1 if 1 in x.values else 0, axis=1) # Check if any of the specified columns contain the value 1
    return target

if target_column not in data.columns:
    print(f"Target column '{target_column}' not found. Deriving it...")
    data[target_column] = create_target(data, target_columns)
    print(f"Derived column '{target_column}' based on selected columns.")

# Drop rows with missing target values
data = data.dropna(subset=[target_column])
print(f"Dataset shape after dropping rows with missing target: {data.shape}")

# Split features and target
X = data.drop(columns=[target_column])
y = data[target_column]

# Identify numeric and categorical columns
numeric_columns = X.select_dtypes(include=["float64", "int64"]).columns
categorical_columns = X.select_dtypes(include=["object"]).columns

print("\nNumeric columns:", numeric_columns)
print("Categorical columns:", categorical_columns)

# Handle preprocessing with ColumnTransformer
# - Impute missing values for numeric columns with the mean
# - Encode categorical columns with OneHotEncoder
# - Retain column names after transformation
preprocessor = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="mean"), numeric_columns),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_columns),
    ],
    remainder="drop",
)

# Apply the transformations
X_preprocessed = preprocessor.fit_transform(X)
feature_names = (
    numeric_columns.tolist()
    + preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_columns).tolist()
)

X_preprocessed = pd.DataFrame(X_preprocessed, columns=feature_names)

# Standardize features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_preprocessed), columns=feature_names)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Display final dataset information
print("\nFinal dataset after preprocessing:")
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train distribution:\n{y_train.value_counts()}")
print(f"y_test distribution:\n{y_test.value_counts()}")

# Step 1: Verify target variability and check if classes are missing
print("Unique values in the target variable:", y_train.unique())
print("Target variable distribution in training data:")
print(y_train.value_counts())

# Step 2: Check for target class imbalance
if len(y_train.unique()) == 1:
    print("The target variable has only one class. Investigate the dataset further.")
else:
    print("The target variable has multiple classes. Proceeding with the next steps.")

# Step 3: Perform feature selection to reduce dimensionality
# Remove low variance features
print("Performing Variance Threshold to remove low variance features...")
vt = VarianceThreshold(threshold=0.01)  # Keep features with variance > 0.01
X_train_reduced = vt.fit_transform(X_train)
X_test_reduced = vt.transform(X_test)

print("Shape after Variance Threshold:")
print("X_train:", X_train_reduced.shape)
print("X_test:", X_test_reduced.shape)

# Step 4: Apply PCA for further dimensionality reduction (optional)
print("Applying PCA for dimensionality reduction...")
pca = PCA(n_components=100)  # Keep 100 principal components
X_train_pca = pca.fit_transform(X_train_reduced)
X_test_pca = pca.transform(X_test_reduced)

print("Shape after PCA:")
print("X_train:", X_train_pca.shape)
print("X_test:", X_test_pca.shape)

# Step 5: Address class imbalance (if applicable)
# Apply SMOTE only if there are multiple classes
if len(y_train.unique()) > 1:
    print("Applying SMOTE to address class imbalance...")
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train_pca, y_train)
    print("Shape after SMOTE:")
    print("X_train_balanced:", X_train_balanced.shape)
    print("y_train_balanced distribution:")
    print(y_train_balanced.value_counts())
else:
    print("SMOTE not applied as there is only one class in the target variable.")

# Save preprocessed data for model training
np.save("X_train_preprocessed.npy", X_train_pca)
np.save("X_test_preprocessed.npy", X_test_pca)
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

print("Preprocessing steps completed. Data saved for model training.")

Dataset loaded successfully.
Dataset shape: (260951, 194)
First 5 rows of the dataset:
  diagnostic_summary_changes_phase diagnostic_summary_changes_ptid  rid  \
0                            ADNI2                      003_S_0908    1   
1                            ADNI1                      011_S_0010   10   
2                            ADNI1                      011_S_0010   10   
3                            ADNI1                      011_S_0010   10   
4                            ADNI1                      011_S_0010   10   

  diagnostic_summary_changes_viscode diagnostic_summary_changes_viscode2  \
0                                v11                                  bl   
1                                 bl                                  bl   
2                                 bl                                  bl   
3                                 bl                                  bl   
4                                 bl                                  bl   

  dia

In [2]:
print(data.columns)

Index(['diagnostic_summary_changes_phase', 'diagnostic_summary_changes_ptid',
       'rid', 'diagnostic_summary_changes_viscode',
       'diagnostic_summary_changes_viscode2',
       'diagnostic_summary_changes_examdate',
       'diagnostic_summary_changes_bcadas',
       'diagnostic_summary_changes_bcmmse',
       'diagnostic_summary_changes_bcmmsrec',
       'diagnostic_summary_changes_bcnmmms',
       ...
       'biomarker_samples_bicsftrns', 'biomarker_samples_bicsffroz',
       'biomarker_samples_bilppadate', 'biomarker_samples_bilpfldate',
       'biomarker_samples_bilpspdate', 'biomarker_samples_id',
       'biomarker_samples_siteid', 'biomarker_samples_userdate',
       'biomarker_samples_update_stamp', 'cognitive_status'],
      dtype='object', length=195)


 Model Building

In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import joblib

# Load the preprocessed data (assuming you saved it as .npy and .csv files)
X_train = np.load('X_train_preprocessed.npy')
X_test = np.load('X_test_preprocessed.npy')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

# Define and train the classifier (using RandomForest as an example)
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Save the trained model
joblib.dump(clf, 'cognitive_status_predictor_model.pkl')

print("Model saved successfully.")

  return fit_method(estimator, *args, **kwargs)


Accuracy: 0.9567
Precision: 0.9660
Recall: 0.9008
F1-Score: 0.9322
Model saved successfully.


In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import joblib
from xgboost import XGBClassifier

# Load the preprocessed data
X_train = np.load('X_train_preprocessed.npy')
X_test = np.load('X_test_preprocessed.npy')
y_train = pd.read_csv('y_train.csv').values.ravel()  # Reshaping y_train to 1D
y_test = pd.read_csv('y_test.csv').values.ravel()  # Reshaping y_test to 1D

# Define the models to try
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42)
}

# Store results
results = []

# Train and evaluate each model
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1
    })
    
    # Optionally save the best model
    if model_name == 'RandomForest':  # Save the random forest model as an example
        joblib.dump(model, 'cognitive_status_predictor_random_forest.pkl')
        print(f"Saved {model_name} model successfully.")

# Convert results to DataFrame and display
results_df = pd.DataFrame(results)
print("\nModel comparison:")
print(results_df)

# Find the best model based on F1-Score
best_model = results_df.loc[results_df['F1-Score'].idxmax()]
print(f"\nBest model based on F1-Score: {best_model['Model']} with F1-Score: {best_model['F1-Score']:.4f}")


Training RandomForest...
Saved RandomForest model successfully.

Training LogisticRegression...

Training GradientBoosting...

Training XGBoost...

Model comparison:
                Model  Accuracy  Precision    Recall  F1-Score
0        RandomForest  0.956736   0.965983  0.900777  0.932241
1  LogisticRegression  0.897607   0.860562  0.823533  0.841640
2    GradientBoosting  0.882087   0.863559  0.763802  0.810623
3             XGBoost  0.953459   0.947340  0.909708  0.928142

Best model based on F1-Score: RandomForest with F1-Score: 0.9322


Fine-tuning with GridSearchCV

Recommendations Framework

In [5]:
def categorize_risk(prediction_prob):
    if prediction_prob < 0.3:
        return "Low Risk"
    elif 0.3 <= prediction_prob < 0.7:
        return "Moderate Risk"
    else:
        return "High Risk"
recommendations = {
    "Low Risk": {
        "Treatment": ["Follow a Mediterranean diet", "Engage in social activities"],
        "Lifestyle": ["Play memory-enhancing games", "Exercise 30 minutes daily"]
    },
    "Moderate Risk": {
        "Treatment": ["Consult a primary care physician", "Start cognitive stimulation therapy"],
        "Lifestyle": ["Engage in aerobic exercises", "Avoid high-sugar diets"]
    },
    "High Risk": {
        "Treatment": ["Neurologist consultation", "Consider medication for symptom management"],
        "Lifestyle": ["Work with a caregiver", "Follow a structured daily routine"]
    }
}


In [6]:
def generate_recommendations(prediction_prob):
    risk_category = categorize_risk(prediction_prob)
    treatment_recs = recommendations[risk_category]["Treatment"]
    lifestyle_recs = recommendations[risk_category]["Lifestyle"]
    
    return risk_category, treatment_recs, lifestyle_recs

# Example usage
example_prob = 0.65  # Prediction probability for a sample patient
category, treatment, lifestyle = generate_recommendations(example_prob)
print(f"Risk Category: {category}")
print(f"Treatment Recommendations: {treatment}")
print(f"Lifestyle Recommendations: {lifestyle}")

Risk Category: Moderate Risk
Treatment Recommendations: ['Consult a primary care physician', 'Start cognitive stimulation therapy']
Lifestyle Recommendations: ['Engage in aerobic exercises', 'Avoid high-sugar diets']


In [12]:
import joblib

# Load the saved model
cognitive_status_predictor_random_forest = joblib.load('cognitive_status_predictor_random_forest.pkl')

In [13]:
def predict_and_recommend(model, X_test):
    predictions = model.predict_proba(X_test)[:, 1]  # Probability of 'High Risk'
    results = []

    for prob in predictions:
        category, treatment, lifestyle = generate_recommendations(prob)
        results.append({
            "Risk Probability": prob,
            "Risk Category": category,
            "Treatment Recommendations": treatment,
            "Lifestyle Recommendations": lifestyle
        })

    return pd.DataFrame(results)

# Example: Generate recommendations for test data
recommendation_results = predict_and_recommend(cognitive_status_predictor_random_forest, X_test_pca)
print(recommendation_results.head())


   Risk Probability Risk Category  \
0              0.96     High Risk   
1              0.94     High Risk   
2              0.10      Low Risk   
3              0.79     High Risk   
4              0.90     High Risk   

                           Treatment Recommendations  \
0  [Neurologist consultation, Consider medication...   
1  [Neurologist consultation, Consider medication...   
2  [Follow a Mediterranean diet, Engage in social...   
3  [Neurologist consultation, Consider medication...   
4  [Neurologist consultation, Consider medication...   

                           Lifestyle Recommendations  
0  [Work with a caregiver, Follow a structured da...  
1  [Work with a caregiver, Follow a structured da...  
2  [Play memory-enhancing games, Exercise 30 minu...  
3  [Work with a caregiver, Follow a structured da...  
4  [Work with a caregiver, Follow a structured da...  


In [14]:
recommendation_results.to_csv("recommendations.csv", index=False)