<a href="https://colab.research.google.com/github/PanLuochuan/project1/blob/main/Modularization%EF%BC%88LR%2BRF%2BSVC%EF%BC%89.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

In [3]:
# Mount Google Drive at /content/drive so files stored there can be read.
# This import is only available inside the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Location of the dataset CSV on the mounted Drive.
DATA_PATH = '/content/drive/MyDrive/Diabetes.csv'

# Read the whole file into a DataFrame and preview its first rows as the cell output.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [15]:
# Count NaN entries per column.
# NOTE(review): this only detects NaN; the zero values visible in data.head()
# for columns such as Insulin and SkinThickness are not counted here — confirm
# whether those zeros encode missing measurements.
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Let's build a single modular workflow that combines model training, cross-validated hyperparameter search, and performance evaluation.

In [7]:
# Candidate estimators and the hyperparameter grid searched for each one.
# The outer keys are the display names used when results are reported; each
# entry holds the unfitted estimator under 'model' and its grid under 'params'.
model_params = {
    'Logistic Regression': dict(
        model=LogisticRegression(solver='liblinear', random_state=42),
        params=dict(C=[0.1, 1, 10, 100]),
    ),
    'Random Forest': dict(
        model=RandomForestClassifier(random_state=42),
        params=dict(
            n_estimators=[10, 50, 100],
            max_depth=[10, 20, 30],
        ),
    ),
    'SVC': dict(
        # The SVC is wrapped in a Pipeline behind a StandardScaler, so its grid
        # keys carry the 'svc__' step prefix.
        model=Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('svc', SVC(probability=True, random_state=42)),
        ]),
        params=dict(
            svc__C=[0.1, 1, 10, 100, 1000],
            svc__gamma=[0.001, 0.01, 0.1, 1, 10, 100],
            svc__kernel=['rbf'],
        ),
    ),
}

In [8]:
# Training model + cross-validation
def train_and_evaluate_model(model, params, X_train, y_train, X_test, y_test,
                             cv=5, scoring='accuracy'):
    """Tune ``model`` with a cross-validated grid search, then predict on the test set.

    Parameters
    ----------
    model : estimator
        Unfitted estimator (or Pipeline) handed to ``GridSearchCV``.
    params : dict
        Hyperparameter grid handed to ``GridSearchCV``.
    X_train, y_train
        Training features and labels used to fit the grid search.
    X_test
        Held-out features the selected estimator predicts on.
    y_test
        Accepted for a uniform call signature; not used inside this function.
    cv : default 5
        Forwarded to ``GridSearchCV`` as ``cv``.
    scoring : default 'accuracy'
        Forwarded to ``GridSearchCV`` as ``scoring``; lets the caller select
        hyperparameters on another metric (for example ``'recall'``) without
        editing this function.

    Returns
    -------
    tuple
        ``(best_model, best_params, predictions, probabilities)`` where
        ``probabilities`` is column 1 of ``best_model.predict_proba(X_test)``.
    """
    grid_search = GridSearchCV(
        model,
        params,
        cv=cv,
        scoring=scoring,
        return_train_score=False,
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    predictions = best_model.predict(X_test)
    # Keep only the second probability column; this assumes a two-class target
    # whose positive class is listed second.
    probabilities = best_model.predict_proba(X_test)[:, 1]
    return best_model, grid_search.best_params_, predictions, probabilities

In [9]:
# Performance evaluation
def evaluate_model(name, predictions, probabilities, y_test):
    """Score one model's test-set output, print a one-line summary and return the metrics.

    Parameters
    ----------
    name : str
        Label printed at the start of the summary line.
    predictions
        Predicted class labels, compared against ``y_test`` for accuracy,
        precision, recall and F1.
    probabilities
        Scores passed to ``roc_auc_score`` together with ``y_test``.
    y_test
        True labels.

    Returns
    -------
    dict
        Metric values keyed by 'Accuracy', 'Precision', 'Recall', 'F1 Score' and 'AUC'.
    """
    # The four label-based metrics share the same (y_true, y_pred) call shape.
    label_metrics = (accuracy_score, precision_score, recall_score, f1_score)
    accuracy, precision, recall, f1 = (
        metric(y_test, predictions) for metric in label_metrics
    )
    # AUC is computed from the scores rather than the hard labels.
    auc = roc_auc_score(y_test, probabilities)

    print(f"{name} - Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}, AUC: {auc:.2f}")

    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUC': auc,
    }


In [14]:
if __name__ == "__main__":
    # Separate the target column from the predictors.
    y = data['Outcome']
    X = data.drop('Outcome', axis=1)

    # Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Tune, predict and score every configured model, keyed by its display name.
    results = {}
    for name, spec in model_params.items():
        model, best_params, predictions, probabilities = train_and_evaluate_model(
            spec['model'], spec['params'], X_train, y_train, X_test, y_test
        )
        performance = evaluate_model(name, predictions, probabilities, y_test)
        results[name] = {'Best Params': best_params, 'Performance': performance}

    # Report the hyperparameters selected for each model.
    for model_name, details in results.items():
        print(f"\n{model_name} Best Params: {details['Best Params']}")

Logistic Regression - Accuracy: 0.74, Precision: 0.63, Recall: 0.62, F1 Score: 0.63, AUC: 0.80
Random Forest - Accuracy: 0.75, Precision: 0.63, Recall: 0.66, F1 Score: 0.65, AUC: 0.81
SVC - Accuracy: 0.76, Precision: 0.67, Recall: 0.59, F1 Score: 0.63, AUC: 0.80

Logistic Regression Best Params: {'C': 10}

Random Forest Best Params: {'max_depth': 20, 'n_estimators': 100}

SVC Best Params: {'svc__C': 10, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}


### Conclusion
Based on the output above, the three models (Logistic Regression, Random Forest and SVC) are compared on accuracy, precision, recall, F1 score and AUC.

Although the SVC model achieves the highest accuracy (0.76) and precision (0.67), Random Forest achieves the best recall (0.66), F1 score (0.65) and AUC (0.81), which shows that it is better balanced across all aspects of prediction performance. Considering the actual application requirements, Random Forest may be the best choice: in medical application scenarios that require a high recall rate, it is crucial to ensure that as many actual diabetes cases as possible are identified. Note that the differences between the models are small (a few percentage points on a single train/test split), so this ranking should be treated as indicative rather than conclusive.
