# Parkinson's Disease Model Training
## Training with Best Accuracy Optimization

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Libraries imported successfully!")

‚úÖ Libraries imported successfully!


## 1. Load and Explore Dataset

In [2]:
# Load the dataset
print("Loading Parkinson's disease dataset...")
df = pd.read_csv('../parkinsons.csv')

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

Loading Parkinson's disease dataset...
Dataset shape: (195, 24)

Columns: ['name', 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'status', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE']


Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [3]:
# Dataset information
print("Dataset Info:")
print(df.info())
print(f"\nMissing values:")
print(df.isnull().sum())
print(f"\nTarget distribution:")
print(df['status'].value_counts())
print(f"\nDataset statistics:")
print(df.describe())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-null

## 2. Prepare Data

In [4]:
# Drop the 'name' column as it's just an identifier
df = df.drop('name', axis=1)

# Separate features and target
X = df.drop('status', axis=1)
y = df['status']

print(f"Features shape: {X.shape}")
print(f"Features: {X.columns.tolist()}")

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\n‚úÖ Data prepared:")
print(f"  Training set: {X_train.shape}")
print(f"  Test set: {X_test.shape}")

Features shape: (195, 22)
Features: ['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE']

‚úÖ Data prepared:
  Training set: (156, 22)
  Test set: (39, 22)


## 3. Train and Evaluate Base Models

In [5]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM (RBF Kernel)": SVC(kernel='rbf', probability=True, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

results = {}

for name, model in models.items():
    print(f"\n{'='*60}")
    print(f"Training {name}...")
    
    # Train the model
    model.fit(X_train_scaled, y_train)
    
    # Predictions
    y_pred = model.predict(X_test_scaled)
    
    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Cross-validation score
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()
    
    results[name] = {'accuracy': accuracy, 'cv_mean': cv_mean, 'cv_std': cv_std}
    
    print(f"\n{name} Results:")
    print(f"  Test Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"  Cross-Val Accuracy: {cv_mean:.4f} (+/- {cv_std:.4f})")
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['No Parkinsons', 'Parkinsons']))


Training Logistic Regression...

Logistic Regression Results:
  Test Accuracy: 0.9231 (92.31%)
  Cross-Val Accuracy: 0.8335 (+/- 0.0550)

Classification Report:
               precision    recall  f1-score   support

No Parkinsons       0.89      0.80      0.84        10
   Parkinsons       0.93      0.97      0.95        29

     accuracy                           0.92        39
    macro avg       0.91      0.88      0.90        39
 weighted avg       0.92      0.92      0.92        39


Training Random Forest...

Random Forest Results:
  Test Accuracy: 0.9231 (92.31%)
  Cross-Val Accuracy: 0.8915 (+/- 0.0462)

Classification Report:
               precision    recall  f1-score   support

No Parkinsons       0.89      0.80      0.84        10
   Parkinsons       0.93      0.97      0.95        29

     accuracy                           0.92        39
    macro avg       0.91      0.88      0.90        39
 weighted avg       0.92      0.92      0.92        39


Training SVM (RBF Ker

In [6]:
# Summary of base models
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('accuracy', ascending=False)
print("\nüìä Base Model Comparison:")
print(results_df)


üìä Base Model Comparison:
                     accuracy   cv_mean    cv_std
Logistic Regression  0.923077  0.833468  0.054987
Random Forest        0.923077  0.891532  0.046194
SVM (RBF Kernel)     0.923077  0.871774  0.020465
Gradient Boosting    0.923077  0.917137  0.041984


## 4. Hyperparameter Tuning

In [7]:
# Random Forest Tuning
print("\nüîß Tuning Random Forest...")
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

rf_grid.fit(X_train_scaled, y_train)
rf_best = rf_grid.best_estimator_
rf_accuracy = accuracy_score(y_test, rf_best.predict(X_test_scaled))

print(f"\n‚úÖ Random Forest Best Params: {rf_grid.best_params_}")
print(f"‚úÖ Random Forest Tuned Accuracy: {rf_accuracy:.4f} ({rf_accuracy*100:.2f}%)")


üîß Tuning Random Forest...
Fitting 5 folds for each of 108 candidates, totalling 540 fits

‚úÖ Random Forest Best Params: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
‚úÖ Random Forest Tuned Accuracy: 0.9231 (92.31%)


In [8]:
# SVM Tuning
print("\nüîß Tuning SVM...")
svm_params = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1],
    'kernel': ['rbf', 'linear']
}

svm_grid = GridSearchCV(
    SVC(probability=True, random_state=42),
    svm_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

svm_grid.fit(X_train_scaled, y_train)
svm_best = svm_grid.best_estimator_
svm_accuracy = accuracy_score(y_test, svm_best.predict(X_test_scaled))

print(f"\n‚úÖ SVM Best Params: {svm_grid.best_params_}")
print(f"‚úÖ SVM Tuned Accuracy: {svm_accuracy:.4f} ({svm_accuracy*100:.2f}%)")


üîß Tuning SVM...
Fitting 5 folds for each of 40 candidates, totalling 200 fits

‚úÖ SVM Best Params: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
‚úÖ SVM Tuned Accuracy: 0.9487 (94.87%)


In [9]:
# Gradient Boosting Tuning
print("\nüîß Tuning Gradient Boosting...")
gb_params = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

gb_grid = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    gb_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

gb_grid.fit(X_train_scaled, y_train)
gb_best = gb_grid.best_estimator_
gb_accuracy = accuracy_score(y_test, gb_best.predict(X_test_scaled))

print(f"\n‚úÖ Gradient Boosting Best Params: {gb_grid.best_params_}")
print(f"‚úÖ Gradient Boosting Tuned Accuracy: {gb_accuracy:.4f} ({gb_accuracy*100:.2f}%)")


üîß Tuning Gradient Boosting...
Fitting 5 folds for each of 72 candidates, totalling 360 fits

‚úÖ Gradient Boosting Best Params: {'learning_rate': 0.2, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
‚úÖ Gradient Boosting Tuned Accuracy: 0.9487 (94.87%)


## 5. Select Best Model and Save

In [10]:
# Compare tuned models
tuned_models = {
    "Random Forest (Tuned)": (rf_best, rf_accuracy),
    "SVM (Tuned)": (svm_best, svm_accuracy),
    "Gradient Boosting (Tuned)": (gb_best, gb_accuracy)
}

final_best_name = max(tuned_models.items(), key=lambda x: x[1][1])[0]
final_best_model = tuned_models[final_best_name][0]
final_best_accuracy = tuned_models[final_best_name][1]

print("\n" + "="*60)
print("üèÜ FINAL BEST MODEL")
print("="*60)
print(f"\nBest Model: {final_best_name}")
print(f"Best Accuracy: {final_best_accuracy:.4f} ({final_best_accuracy*100:.2f}%)")

# Final evaluation
y_pred_final = final_best_model.predict(X_test_scaled)
print(f"\nüìã Final Classification Report:")
print(classification_report(y_test, y_pred_final, target_names=['No Parkinsons', 'Parkinsons']))
print(f"\nüìä Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_final))


üèÜ FINAL BEST MODEL

Best Model: SVM (Tuned)
Best Accuracy: 0.9487 (94.87%)

üìã Final Classification Report:
               precision    recall  f1-score   support

No Parkinsons       0.90      0.90      0.90        10
   Parkinsons       0.97      0.97      0.97        29

     accuracy                           0.95        39
    macro avg       0.93      0.93      0.93        39
 weighted avg       0.95      0.95      0.95        39


üìä Confusion Matrix:
[[ 9  1]
 [ 1 28]]


In [11]:
# Save the best model and scaler
model_filename = 'parkinsons_model.sav'
scaler_filename = 'parkinsons_scaler.sav'

pickle.dump(final_best_model, open(model_filename, 'wb'))
pickle.dump(scaler, open(scaler_filename, 'wb'))

print(f"\n‚úÖ Model saved to: {model_filename}")
print(f"‚úÖ Scaler saved to: {scaler_filename}")
print(f"\nModel Type: {type(final_best_model).__name__}")
print(f"Number of features: {X.shape[1]}")
print(f"Feature names: {X.columns.tolist()}")
print("\nüéâ Training complete!")


‚úÖ Model saved to: parkinsons_model.sav
‚úÖ Scaler saved to: parkinsons_scaler.sav

Model Type: SVC
Number of features: 22
Feature names: ['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE']

üéâ Training complete!
