In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE

In [3]:
file_path = "/content/drive/MyDrive/2078_Akalya_Multiple Disease/2078_Akalya_Multiple Disease/Parkinsons.csv"
df = pd.read_csv(file_path)

In [4]:
X = df.drop(["name", "status"], axis=1)
y = df["status"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [6]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
pd.Series(y_train).value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
1,118
0,38


In [8]:
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

In [9]:
models = {
    "SVM": (SVC(probability=True, random_state=42), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }),

    "LogisticRegression": (LogisticRegression(max_iter=1000, random_state=42), {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']
    }),

    "RandomForest": (RandomForestClassifier(random_state=42), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5]
    }),

    "XGBoost": (XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2]
    })
}

In [10]:
results = {}

for name, (model, params) in models.items():
    print(f"\n🔹 Training {name}...")
    grid = GridSearchCV(model, params, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
    grid.fit(X_train_res, y_train_res)

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test_scaled)
    y_prob = best_model.predict_proba(X_test_scaled)[:, 1]

    results[name] = {
        "best_params": grid.best_params_,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_prob),
        "confusion_matrix": confusion_matrix(y_test, y_pred)
    }

    print("Best Params:", grid.best_params_)
    print(classification_report(y_test, y_pred))


🔹 Training SVM...
Best Params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.73      0.80      0.76        10
           1       0.93      0.90      0.91        29

    accuracy                           0.87        39
   macro avg       0.83      0.85      0.84        39
weighted avg       0.88      0.87      0.87        39


🔹 Training LogisticRegression...
Best Params: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
              precision    recall  f1-score   support

           0       0.64      0.90      0.75        10
           1       0.96      0.83      0.89        29

    accuracy                           0.85        39
   macro avg       0.80      0.86      0.82        39
weighted avg       0.88      0.85      0.85        39


🔹 Training RandomForest...
Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
              precision    recall  f1-score   support

           

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
results_df = pd.DataFrame(results).T
print("\nFinal Comparison:\n", results_df)


Final Comparison:
                                                           best_params  \
SVM                      {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}   
LogisticRegression  {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}   
RandomForest        {'max_depth': None, 'min_samples_split': 2, 'n...   
XGBoost             {'learning_rate': 0.2, 'max_depth': 3, 'n_esti...   

                    accuracy precision    recall        f1   roc_auc  \
SVM                 0.871795  0.928571  0.896552  0.912281  0.962069   
LogisticRegression  0.846154      0.96  0.827586  0.888889  0.910345   
RandomForest        0.897436  0.962963  0.896552  0.928571  0.965517   
XGBoost             0.923077  0.964286  0.931034  0.947368  0.982759   

                     confusion_matrix  
SVM                 [[8, 2], [3, 26]]  
LogisticRegression  [[9, 1], [5, 24]]  
RandomForest        [[9, 1], [3, 26]]  
XGBoost             [[9, 1], [2, 27]]  


In [12]:
# -*- coding: utf-8 -*-
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import time
import pickle
import os

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE

# --- 1. Load Dataset ---
file_path = "/content/drive/MyDrive/2078_Akalya_Multiple Disease/2078_Akalya_Multiple Disease/Parkinsons.csv"
df = pd.read_csv(file_path)

print("Initial shape:", df.shape)
print(df.head())
print(df.info())
print("\nMissing values per column:\n", df.isnull().sum())

# --- 2. Data Preprocessing ---
# Assuming no missing values; otherwise handle here e.g. df.fillna()

# Drop irrelevant columns
if "name" in df.columns:
    df = df.drop(columns=["name"])

# Encode target if needed (assuming status is 0/1 already)
target_column = 'status'
X = df.drop(columns=[target_column])
y = df[target_column]

# Label encode categorical columns if any (none in Parkinson dataset usually)
# Example:
# cat_cols = [col for col in X.columns if X[col].dtype == 'object']
# for col in cat_cols:
#     le = LabelEncoder()
#     X[col] = le.fit_transform(X[col].astype(str))

# --- 3. Normalization ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 4. Feature Selection: PCA ---
pca = PCA(n_components=0.95, random_state=42)  # Keep 95% variance
X_pca = pca.fit_transform(X_scaled)

# --- 5. Dataset Splitting ---
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, stratify=y, random_state=42)

# --- Handle imbalance with SMOTE ---
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("After SMOTE, class distribution:\n", pd.Series(y_train_res).value_counts())

# --- 6. Model Training ---
models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier()
}

SAVE_DIR = "parkinson_models"
os.makedirs(SAVE_DIR, exist_ok=True)

results = []

for name, model in models.items():
    print(f"\nTraining {name}...")
    start_time = time.time()
    model.fit(X_train_res, y_train_res)
    train_time = time.time() - start_time

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Cross validation on full data (PCA + scaled)
    cv_scores = cross_val_score(model, X_pca, y, cv=5, scoring='accuracy')
    cv_mean = cv_scores.mean()

    print(f"{name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, "
          f"F1: {f1:.4f}, ROC-AUC: {roc_auc if roc_auc else 'N/A'}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Cross-Validation Accuracy: {cv_mean:.4f}")
    print(f"Training Time: {train_time:.2f} seconds")

    # Save model
    with open(os.path.join(SAVE_DIR, f"{name}_model.pkl"), "wb") as f:
        pickle.dump(model, f)

    results.append({
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1": f1,
        "ROC_AUC": roc_auc,
        "CV_Accuracy": cv_mean,
        "Training_Time_sec": train_time
    })

# Save scaler and PCA
with open(os.path.join(SAVE_DIR, "scaler.pkl"), "wb") as f:
    pickle.dump(scaler, f)
with open(os.path.join(SAVE_DIR, "pca.pkl"), "wb") as f:
    pickle.dump(pca, f)

print("\nAll models and preprocessors saved.")

# --- 7. Prediction function ---
def load_artifacts(model_name: str):
    with open(os.path.join(SAVE_DIR, f"{model_name}_model.pkl"), "rb") as f:
        model = pickle.load(f)
    with open(os.path.join(SAVE_DIR, "scaler.pkl"), "rb") as f:
        scaler = pickle.load(f)
    with open(os.path.join(SAVE_DIR, "pca.pkl"), "rb") as f:
        pca = pickle.load(f)
    return model, scaler, pca

def predict_parkinsons(user_input: dict, model_name: str):
    """
    Predict Parkinson's status.
    user_input: dict of feature:value pairs, keys must match original feature names before PCA
    model_name: one of trained model names e.g. "XGBoost"
    """
    model, scaler, pca = load_artifacts(model_name)

    input_df = pd.DataFrame([user_input])

    # Make sure all features present
    for col in X.columns:
        if col not in input_df.columns:
            input_df[col] = 0  # or fill with mean if preferred

    input_df = input_df[X.columns]  # reorder columns

    # Preprocess: scale and PCA transform
    input_scaled = scaler.transform(input_df)
    input_pca = pca.transform(input_scaled)

    pred = model.predict(input_pca)[0]
    confidence = None
    if hasattr(model, "predict_proba"):
        confidence = model.predict_proba(input_pca)[0][1]

    return pred, confidence

# --- Example usage ---
if __name__ == "__main__":
    example_input = {
        # Add all required features here (copy from original df.columns)
        # Example:
        "MDVP:Fo(Hz)": 119.992,
        "MDVP:Fhi(Hz)": 157.302,
        "MDVP:Flo(Hz)": 74.997,
        "MDVP:Jitter(%)": 0.00784,
        "MDVP:Jitter(Abs)": 0.00007,
        "MDVP:RAP": 0.00370,
        "MDVP:PPQ": 0.00554,
        "Jitter:DDP": 0.01111,
        "MDVP:Shimmer": 0.04374,
        "MDVP:Shimmer(dB)": 0.426,
        "Shimmer:APQ3": 0.02182,
        "Shimmer:APQ5": 0.03130,
        "MDVP:APQ": 0.02971,
        "Shimmer:DDA": 0.06543,
        "NHR": 0.02211,
        "HNR": 21.031,
        "RPDE": 0.414783,
        "DFA": 0.815285,
        "spread1": -4.813029,
        "spread2": 0.266482,
        "D2": 2.302269,
        "PPE": 0.284654
    }

    pred, conf = predict_parkinsons(example_input, model_name="XGBoost")
    print(f"Prediction: {'Parkinsons (1)' if pred == 1 else 'No Parkinsons (0)'}")
    if conf is not None:
        print(f"Confidence: {conf:.2f}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Initial shape: (195, 24)
             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1  phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2  phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3  phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4  phon_R01_S01_5      116.014       141.781       110.655         0.01284   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0           0.00007   0.00370   0.00554     0.01109       0.04374  ...   
1           0.00008   0.00465   0.00696     0.01394       0.06134  ...   
2           0.00009   0.00544   0.00781     0.01633       0.05233  ...   
3           0.00009   0.00502   0.00698     0.01505       0.05492  ...   
4      

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost - Accuracy: 0.9231, Precision: 1.0000, Recall: 0.8966, F1: 0.9455, ROC-AUC: 0.9931034482758622
Confusion Matrix:
[[10  0]
 [ 3 26]]
Cross-Validation Accuracy: 0.8256
Training Time: 0.04 seconds

Training SVM...
SVM - Accuracy: 0.8462, Precision: 1.0000, Recall: 0.7931, F1: 0.8846, ROC-AUC: 0.9344827586206896
Confusion Matrix:
[[10  0]
 [ 6 23]]
Cross-Validation Accuracy: 0.8308
Training Time: 0.01 seconds

Training RandomForest...
RandomForest - Accuracy: 0.9487, Precision: 0.9655, Recall: 0.9655, F1: 0.9655, ROC-AUC: 0.9896551724137932
Confusion Matrix:
[[ 9  1]
 [ 1 28]]
Cross-Validation Accuracy: 0.8051
Training Time: 0.20 seconds

Training DecisionTree...
DecisionTree - Accuracy: 0.8462, Precision: 0.9600, Recall: 0.8276, F1: 0.8889, ROC-AUC: 0.8637931034482758
Confusion Matrix:
[[ 9  1]
 [ 5 24]]
Cross-Validation Accuracy: 0.7385
Training Time: 0.00 seconds

Training KNN...
KNN - Accuracy: 0.9231, Precision: 1.0000, Recall: 0.8966, F1: 0.9455, ROC-AUC: 0.9844827586206896
C

In [2]:
# -*- coding: utf-8 -*-
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import time
import pickle
import os

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

from imblearn.over_sampling import SMOTE

# --- 1. Load Dataset ---
file_path = "/content/drive/MyDrive/2078_Akalya_Multiple Disease/2078_Akalya_Multiple Disease/Parkinsons.csv"
df = pd.read_csv(file_path)

print("Initial shape:", df.shape)
print(df.head())
print(df.info())
print("\nMissing values per column:\n", df.isnull().sum())

# --- 2. Data Preprocessing ---
if "name" in df.columns:
    df = df.drop(columns=["name"])

target_column = 'status'
X = df.drop(columns=[target_column])
y = df[target_column]

# --- 3. Normalization ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 4. Feature Selection: PCA ---
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# --- 5. Dataset Splitting ---
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, stratify=y, random_state=42)

# --- Handle imbalance with SMOTE ---
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("After SMOTE, class distribution:\n", pd.Series(y_train_res).value_counts())

# --- 6. Model Training ---
models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier()
}

SAVE_DIR = "/content/drive/MyDrive/2078_Akalya_Multiple Disease/2078_Akalya_Multiple Disease/parkinson_models"
os.makedirs(SAVE_DIR, exist_ok=True)
print(f"Saving models to folder: {SAVE_DIR}")

results = []

for name, model in models.items():
    print(f"\nTraining {name}...")
    start_time = time.time()
    model.fit(X_train_res, y_train_res)
    train_time = time.time() - start_time

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    conf_matrix = confusion_matrix(y_test, y_pred)

    cv_scores = cross_val_score(model, X_pca, y, cv=5, scoring='accuracy')
    cv_mean = cv_scores.mean()

    print(f"{name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, "
          f"F1: {f1:.4f}, ROC-AUC: {roc_auc if roc_auc else 'N/A'}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Cross-Validation Accuracy: {cv_mean:.4f}")
    print(f"Training Time: {train_time:.2f} seconds")

    # Save model
    model_filename = f"{name}_model.pkl"
    with open(os.path.join(SAVE_DIR, model_filename), "wb") as f:
        pickle.dump(model, f)
    print(f"Saved {name} model to {model_filename}")

    results.append({
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1": f1,
        "ROC_AUC": roc_auc,
        "CV_Accuracy": cv_mean,
        "Training_Time_sec": train_time
    })

# Save scaler and PCA objects
with open(os.path.join(SAVE_DIR, "scaler.pkl"), "wb") as f:
    pickle.dump(scaler, f)
print("Saved scaler.pkl")

with open(os.path.join(SAVE_DIR, "pca.pkl"), "wb") as f:
    pickle.dump(pca, f)
print("Saved pca.pkl")

print("\nAll models and preprocessors saved successfully.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Initial shape: (195, 24)
             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1  phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2  phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3  phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4  phon_R01_S01_5      116.014       141.781       110.655         0.01284   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0           0.00007   0.00370   0.00554     0.01109       0.04374  ...   
1           0.00008   0.00465   0.00696     0.01394       0.06134  ...   
2           0.00009   0.00544   0.00781     0.01633       0.05233  ...   
3           0.00009   0.00502   0.00698     0.01505       0.05492  ...   
4      

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost - Accuracy: 0.9231, Precision: 1.0000, Recall: 0.8966, F1: 0.9455, ROC-AUC: 0.9931034482758622
Confusion Matrix:
[[10  0]
 [ 3 26]]
Cross-Validation Accuracy: 0.8256
Training Time: 0.09 seconds
Saved XGBoost model to XGBoost_model.pkl

Training SVM...
SVM - Accuracy: 0.8462, Precision: 1.0000, Recall: 0.7931, F1: 0.8846, ROC-AUC: 0.9344827586206896
Confusion Matrix:
[[10  0]
 [ 6 23]]
Cross-Validation Accuracy: 0.8308
Training Time: 0.02 seconds
Saved SVM model to SVM_model.pkl

Training RandomForest...
RandomForest - Accuracy: 0.9487, Precision: 0.9655, Recall: 0.9655, F1: 0.9655, ROC-AUC: 0.9896551724137932
Confusion Matrix:
[[ 9  1]
 [ 1 28]]
Cross-Validation Accuracy: 0.8051
Training Time: 0.30 seconds
Saved RandomForest model to RandomForest_model.pkl

Training DecisionTree...
DecisionTree - Accuracy: 0.8462, Precision: 0.9600, Recall: 0.8276, F1: 0.8889, ROC-AUC: 0.8637931034482758
Confusion Matrix:
[[ 9  1]
 [ 5 24]]
Cross-Validation Accuracy: 0.7385
Training Time: 0.00 