In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
dataset=pd.read_csv("diabetics.csv")
dataset

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,6,148,72,35,0,33.6,0.627,50,1
1,1,1,85,66,29,0,26.6,0.351,31,0
2,2,8,183,64,0,0,23.3,0.672,32,1
3,3,1,89,66,23,94,28.1,0.167,21,0
4,4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...,...
763,763,10,101,76,48,180,32.9,0.171,63,0
764,764,2,122,70,27,0,36.8,0.340,27,0
765,765,5,121,72,23,112,26.2,0.245,30,0
766,766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
X = dataset[['Pregnancies', 'Glucose', 'Insulin', 'BMI', 'Age']]  # Drop the target column
y = dataset["Outcome"]  # Target column

In [4]:
X

Unnamed: 0,Pregnancies,Glucose,Insulin,BMI,Age
0,6,148,0,33.6,50
1,1,85,0,26.6,31
2,8,183,0,23.3,32
3,1,89,94,28.1,21
4,0,137,168,43.1,33
...,...,...,...,...,...
763,10,101,180,32.9,63
764,2,122,0,36.8,27
765,5,121,112,26.2,30
766,1,126,0,30.1,47


In [5]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=0)

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy', 'log_loss'],

}

rf = RandomForestClassifier()
grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=3, scoring='accuracy')

# Fit the grid search
grid.fit(X_train, y_train)

# Output the best parameters and score
print("Best parameters found: ", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

# Get the best estimator and make predictions
best_rf = grid.best_estimator_

# Predict on the test data
y_pred = best_rf.predict(X_test)

# Optionally, you can evaluate performance, for example:
from sklearn.metrics import accuracy_score
print("Test set accuracy: {:.2f}".format(accuracy_score(y_test, y_pred)))

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters found:  {'criterion': 'log_loss', 'max_features': 'log2', 'n_estimators': 300}
Best cross-validation score: 0.75
Test set accuracy: 0.80


In [8]:
from sklearn.metrics import confusion_matrix

# Use y_test (true labels) instead of 'dep'
cm = confusion_matrix(y_test, y_pred)

print(cm)


[[140  17]
 [ 30  44]]


In [9]:
from sklearn.metrics import classification_report

# Use y_test (true labels) instead of 'dep'
clf_report = classification_report(y_test, y_pred)

print(clf_report)


              precision    recall  f1-score   support

           0       0.82      0.89      0.86       157
           1       0.72      0.59      0.65        74

    accuracy                           0.80       231
   macro avg       0.77      0.74      0.75       231
weighted avg       0.79      0.80      0.79       231



In [10]:
import pickle

# Save the best model as a .sav file
with open('best_rf_model.sav', 'wb') as model_file:
    pickle.dump(best_rf, model_file)

print("Model saved as best_rf_model.sav")

Model saved as best_rf_model.sav
