In [1]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Step 2: Read the raw records from data.csv (a path relative to the
# notebook's working directory) into the DataFrame used by the cells below.
data = pd.read_csv(filepath_or_buffer="data.csv")

In [3]:
# Preview the first five rows to check column names and typical values.
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0


In [4]:
# Summarize the frame: column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               2000 non-null   int64  
 1   Glucose                   2000 non-null   int64  
 2   BloodPressure             2000 non-null   int64  
 3   SkinThickness             2000 non-null   int64  
 4   Insulin                   2000 non-null   int64  
 5   BMI                       2000 non-null   float64
 6   DiabetesPedigreeFunction  2000 non-null   float64
 7   Age                       2000 non-null   int64  
 8   Outcome                   2000 non-null   int64  
dtypes: float64(2), int64(7)
memory usage: 140.8 KB


In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Zeros in these measurement columns are treated as missing values, so they
# are converted to NaN before imputation.
columns_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[columns_with_zeros] = data[columns_with_zeros].replace(to_replace=0, value=np.nan)

# Fill every NaN with the median of its own column.
# NOTE(review): the medians are learned from the full dataset, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# imputed values -- consider fitting the imputer on the training split only.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer.fit(data[columns_with_zeros])
data[columns_with_zeros] = imputer.transform(data[columns_with_zeros])

In [6]:
# Standardize every feature column; the target column 'Outcome' is excluded
# from scaling and kept separately as the label vector y.
#
# The feature frame is selected by name once and reused for the column labels
# and the row index of X. This keeps X correctly labelled even if 'Outcome' is
# not the last column, and keeps X aligned with y even if `data` does not
# carry a default RangeIndex (e.g. after filtering rows).
#
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down in the notebook; fit it on the training split only if leakage
# of test-set statistics must be ruled out.
feature_frame = data.drop('Outcome', axis=1)

scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)

X = pd.DataFrame(
    scaled_features,
    columns=feature_frame.columns,
    index=feature_frame.index,
)
y = data['Outcome']

In [7]:
# Print the head of the scaled feature matrix, then the head of the target
# series, as plain text (one after the other).
print(X.head(), y.head(), sep="\n")

   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0    -0.515394  0.525974      -0.869310       0.631010 -0.177601  0.132630   
1    -1.120495 -1.242888       0.804774       0.192542 -0.189864  0.772550   
2    -1.120495  0.755271      -0.032268      -0.026692 -0.177601  1.607229   
3    -1.120495  0.427704      -0.367085       1.398329  1.343080  1.342914   
4    -0.817945  0.558731      -0.869310       1.288712  4.163696  1.120333   

   DiabetesPedigreeFunction       Age  
0                 -1.063246  1.180424  
1                 -0.735551 -0.856326  
2                  0.491759 -0.177409  
3                 -0.327478 -0.771462  
4                  0.201161 -1.026055  
0    1
1    0
2    1
3    1
4    0
Name: Outcome, dtype: int64


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Candidate classifiers keyed by the display name used in the printed reports.
# Every estimator that accepts a seed gets random_state=42 for repeatability.
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(random_state=42, probability=True),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Fit each candidate on the training partition, predict the held-out
# partition, and print its accuracy followed by the per-class report.
# NOTE(review): in the recorded output the Random Forest scores far above the
# other models (0.98 vs 0.80-0.85); check the dataset for duplicated rows that
# could end up in both the training and the test partition.
for name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"{name} Accuracy: {accuracy:.2f}")
    print(f"Classification Report for {name}:\n{report}")

Logistic Regression Accuracy: 0.80
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.81      0.90      0.85       263
           1       0.75      0.61      0.67       137

    accuracy                           0.80       400
   macro avg       0.78      0.75      0.76       400
weighted avg       0.79      0.80      0.79       400

Random Forest Accuracy: 0.98
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       263
           1       0.98      0.98      0.98       137

    accuracy                           0.98       400
   macro avg       0.98      0.98      0.98       400
weighted avg       0.98      0.98      0.98       400

Support Vector Machine Accuracy: 0.85
Classification Report for Support Vector Machine:
              precision    recall  f1-score   support

           0       0.86      0.92      0.89       2

In [10]:
import pickle

# Save the fitted Random Forest classifier (file name and content unchanged).
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(models['Random Forest'], file)

# The classifier was trained on imputed and standardized features, so it can
# only score new records after the same fitted preprocessing has been applied.
# Persist those fitted objects, plus the column lists they expect, next to the
# model so that inference code does not have to re-derive them from the
# training data.
preprocessing = {
    'columns_with_zeros': list(columns_with_zeros),  # columns where 0 means missing
    'feature_columns': list(X.columns),              # column order the model was trained on
    'imputer': imputer,                              # fitted median imputer
    'scaler': scaler,                                # fitted StandardScaler
}
with open('random_forest_preprocessing.pkl', 'wb') as file:
    pickle.dump(preprocessing, file)

# NOTE: pickle files execute code when loaded -- only load these artefacts
# from a trusted location.
print("Model saved as random_forest_model.pkl")
print("Preprocessing saved as random_forest_preprocessing.pkl")

Model saved as random_forest_model.pkl
