In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [2]:
data = pd.read_csv('diabetes_dataset.csv')

In [3]:
x = data.drop(columns=['Diabetic'])
y = data['Diabetic']

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.7, random_state = 40)

In [5]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [6]:
random_forest = RandomForestClassifier(n_estimators=80, random_state = 42)
random_forest.fit(x_train_scaled, y_train)
y_pred = random_forest.predict(x_test_scaled)

In [7]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9397590361445783


In [8]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       483
           1       0.90      0.88      0.89       181

    accuracy                           0.94       664
   macro avg       0.93      0.92      0.92       664
weighted avg       0.94      0.94      0.94       664



In [9]:
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[465  18]
 [ 22 159]]


In [10]:
cv_scores = cross_val_score(random_forest, x, y, cv=5)
print("\nCross-Validation Scores:", cv_scores)
print("Mean Accuracy with cross-validation:", cv_scores.mean())


Cross-Validation Scores: [0.97894737 0.93684211 0.95789474 0.97354497 0.98412698]
Mean Accuracy with cross-validation: 0.9662712336396547


In [11]:
input_data = {
    'BMI': 21,
    'Sleep': 10,
    'SoundSleep': 10,
    'Pregancies': 0,
    'Age': 1,
    'highBP': 1,
    'Smoking': 0,
    'BPLevel': 2,
    'Pdiabetes': 0,
    'Stress': 1,
    'Family_Diabetes': 0,
    'PhysicallyActive': 3,
    'Alcohol': 1,
    'UriationFreq': 0,
    'JunkFood': 0,
    'Gender': 1,
    'RegularMedicine': 0
}

input_df = pd.DataFrame([input_data])
scaled_input = scaler.transform(input_df)
predictions = random_forest.predict(scaled_input)
print("Predicted outcome:", predictions)
prediction_proba = random_forest.predict_proba(scaled_input)
diabetes_proba = prediction_proba[0][1]
output = ('{0:.{1}f}'.format(diabetes_proba, 2))
print(f"Probability of diabetes happening:{output}%.")

Predicted outcome: [1]
Probability of diabetes happening:0.69%.


In [12]:
joblib.dump(random_forest, 'rf_model_1.joblib')
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']