In [1]:
import numpy as np
import matplotlib as matlib
import seaborn as sns
import pandas as pd

try:
    df=pd.read_csv("parkinsons.data")
    
except Exception as e:
    print(f'error opening file{e}')
    
df.min()
print(df['status'])
df.info()

0      1
1      1
2      1
3      1
4      1
      ..
190    0
191    0
192    0
193    0
194    0
Name: status, Length: 195, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14 

In [2]:
from sklearn.model_selection import train_test_split #for split
from sklearn.linear_model import LogisticRegression  #training 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report #for report
from sklearn.preprocessing import StandardScaler #scaling 
from sklearn.ensemble import RandomForestClassifier #traing 
from sklearn.svm import SVC #traning

from sklearn.model_selection import GridSearchCV #for hyperparameter..

In [3]:
#spliting data 

X = df.drop(columns=['name', 'status'])
y = df['status']

# Now split again
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)



In [4]:
#scalling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
#training
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_scaled, y_train)

In [6]:
y_pred = log_reg.predict(X_test_scaled)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9230769230769231

Confusion Matrix:
 [[ 8  2]
 [ 1 28]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.80      0.84        10
           1       0.93      0.97      0.95        29

    accuracy                           0.92        39
   macro avg       0.91      0.88      0.90        39
weighted avg       0.92      0.92      0.92        39



In [7]:
#traing more
svm_clf = SVC(kernel='rbf', random_state=42)

# Step 2: train
svm_clf.fit(X_train_scaled, y_train)

# Step 3: predict
y_pred_svm = svm_clf.predict(X_test_scaled)

# Step 4: evaluate
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))

Accuracy: 0.9230769230769231

Confusion Matrix:
 [[ 7  3]
 [ 0 29]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.70      0.82        10
           1       0.91      1.00      0.95        29

    accuracy                           0.92        39
   macro avg       0.95      0.85      0.89        39
weighted avg       0.93      0.92      0.92        39



In [8]:
#hyperparameter tunning

log_reg = LogisticRegression() 

#create a grid and define parameters
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000]
}

grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    cv=5,                # 5-fold cross-validation
    scoring='accuracy',  # Metric to optimize
    n_jobs=-1,           # Use all CPU cores for speed
    verbose=2
)  
# Step 4: Train with cross-validation
grid_search.fit(X_train_scaled, y_train)

# Step 5: Display the best results
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-validation Accuracy:", grid_search.best_score_)

# Step 6: Evaluate on test data
best_log_reg = grid_search.best_estimator_
test_acc = best_log_reg.score(X_test_scaled, y_test)
print("Test Accuracy:", test_acc)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'C': 100, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}
Best Cross-validation Accuracy: 0.871975806451613
Test Accuracy: 0.8974358974358975




In [74]:
svm_model = SVC(probability=True)
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf']
}

# Step 2: Perform Grid Search with Cross Validation
grid_svm = GridSearchCV(estimator=svm_model, param_grid=param_grid,
                        cv=5, verbose=2, n_jobs=-1)
grid_svm.fit(X_train_scaled, y_train)

# Step 3: Print best parameters and performance
print("Best Parameters:", grid_svm.best_params_)
print("Best Cross-validation Accuracy:", grid_svm.best_score_)

# Step 4: Evaluate on test data
y_pred_svm = grid_svm.predict(X_test_scaled)
print("Test Accuracy:", grid_svm.score(X_test_scaled, y_test))

# Step 5: Classification Report
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))

# Step 6: Confusion Matrix
cm = confusion_matrix(y_test, y_pred_svm)
print("\nConfusion Matrix:\n", cm)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best Cross-validation Accuracy: 0.9423387096774194
Test Accuracy: 0.9487179487179487

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.90      0.90        10
           1       0.97      0.97      0.97        29

    accuracy                           0.95        39
   macro avg       0.93      0.93      0.93        39
weighted avg       0.95      0.95      0.95        39


Confusion Matrix:
 [[ 9  1]
 [ 1 28]]


In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Define parameter grid for SVM
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4]  # only for poly kernel
}

# Create the SVM model
svm = SVC(probability=True, random_state=42)

# Grid search with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train_scaled, y_train)

best_svm = grid_search.best_estimator_

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-validation Accuracy:", grid_search.best_score_)

# Evaluate on test data
y_pred = best_svm.predict(X_test_scaled)
test_acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_acc)


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
Best Cross-validation Accuracy: 0.9296370967741936
Test Accuracy: 0.8974358974358975


In [13]:
import joblib, os, json

os.makedirs("artifacts", exist_ok=True)

# Save trained model
joblib.dump(best_svm, "artifacts/model.joblib")

# Save scaler
joblib.dump(scaler, "artifacts/scaler.pkl")

# Optional: Save feature order
feature_order = list(X_train.columns) if hasattr(X_train, "columns") else [f"feature_{i}" for i in range(X_train.shape[1])]
with open("artifacts/feature_order.json", "w") as f:
    json.dump(feature_order, f)

print("✅ Model, scaler, and feature order saved successfully!")




✅ Model, scaler, and feature order saved successfully!


In [14]:
import joblib
model = joblib.load("artifacts/model.joblib")
scaler = joblib.load("artifacts/scaler.pkl")

# get a sample row from your training data
sample = scaler.transform([[147.654, 160.123, 111.23, 0.005, 0.00004, 0.003, 0.004, 0.009, 0.030, 0.25, 0.02, 0.03, 0.04, 0.02, 0.01, 21, 0.4, 0.8, -5.3, 0.02, 1.8, 0.3]])
print(model.predict_proba(sample))


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


ValueError: X has 22 features, but StandardScaler is expecting 30 features as input.

In [15]:
import joblib
scaler = joblib.load("artifacts/scaler.pkl")
print("Scaler feature count:", len(scaler.mean_))


Scaler feature count: 22


In [16]:
print(X_train.shape)


(156, 22)
