In [36]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Read the pre-split diabetes train and test sets from disk.
train_df = pd.read_csv('./dataset/train_data.csv', encoding='latin-1')
test_df = pd.read_csv('./dataset/test_data.csv', encoding='latin-1')

# Exploration tip: train_df.info() and train_df.isnull().sum() (and the same
# calls on test_df) show the column dtypes and the number of missing values.

# Separate the 'Outcome' target column from the remaining feature columns.
y_train = train_df['Outcome']
X_train = train_df.drop(columns='Outcome')

y_test = test_df['Outcome']
X_test = test_df.drop(columns='Outcome')

# Standardize the features. The scaler is fitted on the training features
# only and then reused, unchanged, for the test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Raw data

In [37]:
# Define a function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and print its test-set metrics.

    The model is fitted in place with ``model.fit(X_train, y_train)`` and then
    used to predict ``X_test``. The model's class name, the accuracy, the
    classification report and the confusion matrix (all computed against
    ``y_test``) are printed to stdout. Nothing is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    model_name = model.__class__.__name__
    print(f"Model: {model_name}")
    print("Accuracy:", accuracy_score(y_test, predictions))

    # Each remaining section is a heading line followed by the metric output.
    sections = (
        ("Classification Report:", classification_report),
        ("Confusion Matrix:", confusion_matrix),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, predictions))

    # Blank separator between consecutive model reports.
    print("\n")

# Build one estimator per classifier family, all with the same fixed seed.
models = [
    estimator_cls(random_state=42)
    for estimator_cls in (
        LogisticRegression,
        DecisionTreeClassifier,
        RandomForestClassifier,
        SVC,
        XGBClassifier,
    )
]

# Fit every model on the training data and print its test-set report.
for model in models:
    evaluate_model(model, X_train, X_test, y_train, y_test)

Model: LogisticRegression
Accuracy: 0.7359307359307359
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.79      0.80       151
           1       0.62      0.62      0.62        80

    accuracy                           0.74       231
   macro avg       0.71      0.71      0.71       231
weighted avg       0.74      0.74      0.74       231

Confusion Matrix:
[[120  31]
 [ 30  50]]


Model: DecisionTreeClassifier
Accuracy: 0.7056277056277056
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.71      0.76       151
           1       0.56      0.70      0.62        80

    accuracy                           0.71       231
   macro avg       0.69      0.70      0.69       231
weighted avg       0.73      0.71      0.71       231

Confusion Matrix:
[[107  44]
 [ 24  56]]


Model: RandomForestClassifier
Accuracy: 0.7575757575757576
Classification Report:
              prec

Drop rows with zero values

In [38]:
# Re-read the CSV files so this experiment starts from the untouched data.
train_df = pd.read_csv('./dataset/train_data.csv', encoding='latin-1')
test_df = pd.read_csv('./dataset/test_data.csv', encoding='latin-1')

# In these columns a zero is treated as a missing measurement: mark it as NaN.
columns_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
train_df[columns_with_zeros] = train_df[columns_with_zeros].replace(to_replace=0, value=np.nan)
test_df[columns_with_zeros] = test_df[columns_with_zeros].replace(to_replace=0, value=np.nan)

# Drop every row that contains a NaN, printing the train/test shapes before
# and after so the number of discarded rows is visible.
# NOTE: rows are dropped from the test set as well, so the metrics below are
# computed on fewer test rows than in the cells that keep every row.
print(train_df.shape, test_df.shape, sep="\n")
train_df = train_df.dropna(how='any')
test_df = test_df.dropna(how='any')
print(train_df.shape, test_df.shape, sep="\n")

# Separate the 'Outcome' target column from the remaining feature columns.
y_train = train_df['Outcome']
X_train = train_df.drop(columns='Outcome')

y_test = test_df['Outcome']
X_test = test_df.drop(columns='Outcome')

# Standardize the features; the scaler is fitted on the training set only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build one estimator per classifier family, all with the same fixed seed.
models = [
    estimator_cls(random_state=42)
    for estimator_cls in (
        LogisticRegression,
        DecisionTreeClassifier,
        RandomForestClassifier,
        SVC,
        XGBClassifier,
    )
]

# Fit every model on the reduced data and print its test-set report.
for model in models:
    evaluate_model(model, X_train, X_test, y_train, y_test)


(537, 9)
(231, 9)
(274, 9)
(118, 9)
Model: LogisticRegression
Accuracy: 0.7711864406779662
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.83        78
           1       0.69      0.60      0.64        40

    accuracy                           0.77       118
   macro avg       0.75      0.73      0.74       118
weighted avg       0.77      0.77      0.77       118

Confusion Matrix:
[[67 11]
 [16 24]]


Model: DecisionTreeClassifier
Accuracy: 0.711864406779661
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.76      0.78        78
           1       0.57      0.62      0.60        40

    accuracy                           0.71       118
   macro avg       0.68      0.69      0.69       118
weighted avg       0.72      0.71      0.71       118

Confusion Matrix:
[[59 19]
 [15 25]]


Model: RandomForestClassifier
Accuracy: 0.7542372881355932
Classification

Impute zero values with the median

In [39]:
# Re-read the CSV files so this experiment starts from the untouched data.
train_df = pd.read_csv('./dataset/train_data.csv', encoding='latin-1')
test_df = pd.read_csv('./dataset/test_data.csv', encoding='latin-1')

# In these columns a zero is treated as a missing measurement: mark it as NaN.
columns_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
train_df[columns_with_zeros] = train_df[columns_with_zeros].replace(to_replace=0, value=np.nan)
test_df[columns_with_zeros] = test_df[columns_with_zeros].replace(to_replace=0, value=np.nan)

# Missing values are imputed here rather than dropped (the alternative would
# be calling .dropna() on both frames). The medians are learned from the
# training set only and then applied to both the training and the test set.
imputer = SimpleImputer(strategy='median')
imputer.fit(train_df[columns_with_zeros])
train_df[columns_with_zeros] = imputer.transform(train_df[columns_with_zeros])
test_df[columns_with_zeros] = imputer.transform(test_df[columns_with_zeros])

# Separate the 'Outcome' target column from the remaining feature columns.
y_train = train_df['Outcome']
X_train = train_df.drop(columns='Outcome')

y_test = test_df['Outcome']
X_test = test_df.drop(columns='Outcome')

# Standardize the features; the scaler is fitted on the training set only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build one estimator per classifier family, all with the same fixed seed.
models = [
    estimator_cls(random_state=42)
    for estimator_cls in (
        LogisticRegression,
        DecisionTreeClassifier,
        RandomForestClassifier,
        SVC,
        XGBClassifier,
    )
]

# Fit every model on the imputed data and print its test-set report.
for model in models:
    evaluate_model(model, X_train, X_test, y_train, y_test)


Model: LogisticRegression
Accuracy: 0.7402597402597403
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.82      0.81       151
           1       0.64      0.59      0.61        80

    accuracy                           0.74       231
   macro avg       0.71      0.70      0.71       231
weighted avg       0.74      0.74      0.74       231

Confusion Matrix:
[[124  27]
 [ 33  47]]


Model: DecisionTreeClassifier
Accuracy: 0.6926406926406926
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.75      0.76       151
           1       0.55      0.59      0.57        80

    accuracy                           0.69       231
   macro avg       0.66      0.67      0.67       231
weighted avg       0.70      0.69      0.69       231

Confusion Matrix:
[[113  38]
 [ 33  47]]


Model: RandomForestClassifier
Accuracy: 0.7575757575757576
Classification Report:
              prec