5. Data Pre-processing

In [1]:
# import required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, classification_report
from sklearn.model_selection import StratifiedKFold

# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.neural_network import MLPClassifier


In [2]:
# load the processed dataset into dataframe
pst_df = pd.read_csv('data/processed_prostate.csv')


In [3]:
# view the pst_df dataframe

pst_df.head(10)

Unnamed: 0,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,0,23,12,151,954,0.143,0.278,0.242,0.079
1,1,9,13,133,1326,0.143,0.079,0.181,0.057
2,0,21,27,130,1203,0.125,0.16,0.207,0.06
3,0,14,16,78,386,0.07,0.284,0.26,0.097
4,0,9,19,135,1297,0.141,0.133,0.181,0.059
5,1,25,25,83,477,0.128,0.17,0.209,0.076
6,0,16,26,120,1040,0.095,0.109,0.179,0.057
7,0,15,18,90,578,0.119,0.165,0.22,0.075
8,0,19,24,88,520,0.127,0.193,0.235,0.074
9,0,25,11,84,476,0.119,0.24,0.203,0.082


In [4]:
# features and target 

X = pst_df.drop('diagnosis_result', axis=1)

y = pst_df['diagnosis_result']



In [None]:
#Train_test_split

X_train, X_test, y_train, y_test = train_test_split(X , y,  test_size=0.2, random_state=42)

In [13]:
# Define Model

models = {
  
      'Logistic Regression' : LogisticRegression(max_iter=1000),
      'Random Forest' : RandomForestClassifier(n_estimators=100),
      'Support Vector Machine' : SVC(probability=True),
      'KNN': KNeighborsClassifier(),
      'Neural Net' : MLPClassifier(max_iter=1000),
}

In [14]:
# Standard scaler instance

scaler = StandardScaler()

#Dictionary to hold result

results = []

# cross-validation setup

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [19]:
#evaluate the model

for name, model in models.items():

        # build pipeline

        pipeline = Pipeline([

                ('scaler', scaler),
                ('classifier', model)
            ])
    
        # fit on training data
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        y_proba = pipeline.predict_proba(X_test)[:, 1]


        acc = accuracy_score(y_pred, y_test)
        auc = roc_auc_score(y_test, y_proba)
        report = classification_report(y_test, y_pred, output_dict=True)

        # record result

        results.append({
            "model" : name,
            "Accuracy": acc,
            "Precision" : report['1']['precision'],
            "Recall"   :  report['1']['recall'],
            'F1-Score' : report['1']['f1-score'],
            "ROC-AUC" : auc
        })

In [20]:
results_df = pd.DataFrame(results).sort_values("ROC-AUC", ascending=False, ignore_index=True)
results_df

Unnamed: 0,model,Accuracy,Precision,Recall,F1-Score,ROC-AUC
0,Logistic Regression,0.95,1.0,0.875,0.933333,1.0
1,Random Forest,0.8,1.0,0.5,0.666667,0.989583
2,Support Vector Machine,0.9,1.0,0.75,0.857143,0.96875
3,KNN,0.85,1.0,0.625,0.769231,0.885417
4,Neural Net,0.8,1.0,0.5,0.666667,0.8125
5,Neural Net,0.75,0.8,0.5,0.615385,0.802083


In [22]:
import os
import joblib

# Create the best_model folder if it doesn't exist
model_dir = 'best_model'
os.makedirs(model_dir, exist_ok=True)

# Find the best model by ROC-AUC
best_model_row = results_df.loc[results_df['ROC-AUC'].idxmax()]
best_model_name = best_model_row['model']
print(f"\nBest Model: {best_model_name}")

# Get the corresponding model instance
best_model_instance = models[best_model_name]

# Rebuild the pipeline for the best model
best_pipeline = Pipeline([
    ('scaler', scaler),
    ('classifier', best_model_instance)
])

# Train the best pipeline on the full training data
best_pipeline.fit(X_train, y_train)

# Save the pipeline
model_path = os.path.join(model_dir, f'{best_model_name.replace(" ", "_").lower()}_pipeline.pkl')
joblib.dump(best_pipeline, model_path)

print(f"\nBest model saved to: {model_path}")



Best Model: Logistic Regression

Best model saved to: best_model/logistic_regression_pipeline.pkl
