In [72]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [87]:
# Read the diabetes data from a CSV file located in the working directory
dataset = pd.read_csv(filepath_or_buffer='diabetes.csv')
# Show the first three rows as a quick sanity check of the columns
dataset.head(n=3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [75]:
# Separate features and target variable.
# The features are obtained by dropping the target *by name* instead of slicing by
# position, so the target can never leak into X if 'Outcome' is not the last column.
X = dataset.drop(columns=['Outcome'])
y = dataset['Outcome']

In [76]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [77]:
# Column groups that are routed to the categorical / numerical pipelines
categorical_column = list()  # empty for now; list categorical feature names here if any
numerical_column = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]


In [78]:
# Numerical pipeline: fill missing values with the column median, then standardize.
# NOTE(review): the previewed rows contain 0 for Insulin / SkinThickness. If zeros
# stand for missing measurements, the imputer will not replace them (it fills NaN
# by default) -- confirm against the data source.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scalar', StandardScaler()),
])

# Categorical pipeline: fill missing values with the most frequent category, then
# one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero row instead of raising at transform time, so a test split containing a
# new category cannot crash the preprocessing step.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehotencoding', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [79]:
# Preprocessor: send each column group through its own pipeline.
# No `remainder` is given, so the ColumnTransformer default applies to columns
# that are in neither list.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_column),
    ('cat', cat_pipeline, categorical_column),
])

In [80]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to both splits (the tuple is evaluated left to right, so the fit
# happens before the test data is transformed).
# NOTE: this rebinds X_train / X_test to the transformed output, so re-running this
# cell without re-running the split feeds already-transformed data back in.
X_train, X_test = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_test),
)


In [81]:
# Model Training Automation
# Candidate classifiers keyed by the display name used in the results report.
# The tree-based models are randomized; seeding them (same seed as the train/test
# split) makes the reported scores reproducible across runs.
models = {
    'Random forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
}

In [82]:
def evaluate_model(X_train, y_train, X_test, y_test, models, metric=None):
    """Fit every model on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to each model's ``fit``.
    X_test, y_test
        Held-out features and labels used for scoring.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place.
    metric : callable, optional
        Scoring function called as ``metric(y_test, y_test_pred)``.
        Defaults to ``accuracy_score``.

    Returns
    -------
    dict
        ``{model name: test score}``, in the iteration order of ``models``.
    """
    # Resolve the default lazily so the signature does not depend on import order
    if metric is None:
        metric = accuracy_score

    report = {}
    # Iterate name/estimator pairs directly instead of indexing rebuilt key/value lists
    for name, model in models.items():
        # Train model
        model.fit(X_train, y_train)

        # Predict Testing data
        y_test_pred = model.predict(X_test)

        # Score the test predictions
        report[name] = metric(y_test, y_test_pred)

    return report


In [86]:
# Train and score every candidate model, then print the name -> score report
results = evaluate_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    models=models,
)
print(results)

{'Random forest': 0.7467532467532467, 'Logistic Regression': 0.7532467532467533, 'Decision Tree': 0.7662337662337663}
