In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib

# --- Configuration ---------------------------------------------------------
import os  # local to this cell; only needed for the optional path override

# Dataset location. The default is the original author's local path; set the
# DIABETES_CSV environment variable to point at your own copy instead of
# editing the notebook.
DATA_PATH = os.environ.get(
    "DIABETES_CSV",
    r"C:\SUDHA\Personal portfolio project\datasets\diabetes.csv",
)
TARGET_COLUMN = "Outcome"             # label column; every other column is a feature
MODEL_PATH = "diabetes_rf_model.pkl"  # written relative to the working directory
SEED = 42                             # shared by the split and the forest
TEST_SIZE = 0.2                       # fraction of rows held out for evaluation

# --- Load the dataset ------------------------------------------------------
data = pd.read_csv(DATA_PATH)

# Inspect the data. DataFrame.info() prints its report itself and returns
# None, so wrapping it in print() would emit a stray "None" line.
data.info()
# Handle missing values or outliers if necessary

# --- Split features and target ---------------------------------------------
X = data.drop(TARGET_COLUMN, axis=1)
y = data[TARGET_COLUMN]

# Hold out TEST_SIZE of the rows for evaluation.
# NOTE(review): consider stratify=y so both splits keep the same class ratio;
# left unchanged here so the recorded metrics stay reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# --- Train the Random Forest model -----------------------------------------
rf_model = RandomForestClassifier(n_estimators=100, random_state=SEED)
rf_model.fit(X_train, y_train)

# --- Make predictions ------------------------------------------------------
y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)[:, 1]  # class-1 probability, for ROC-AUC

# --- Evaluate the model ----------------------------------------------------
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)  # scored on probabilities, not hard labels

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC-AUC: {roc_auc:.2f}")

# --- Save the model --------------------------------------------------------
joblib.dump(rf_model, MODEL_PATH)
print(f"Model saved as '{MODEL_PATH}'")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None
Accuracy: 0.72
Precision: 0.61
Recall: 0.62
F1 Score: 0.61
ROC-AUC: 0.81
Model saved as 'diabetes_rf_model.pkl'


In [6]:
pip install pandas scikit-learn joblib

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Score the held-out rows: class-1 (diabetic) probability and hard labels
y_prob = rf_model.predict_proba(X_test)[:, 1]
y_pred = rf_model.predict(X_test)

# Side-by-side table of truth vs. model output. The target is passed as a
# bare array, so the table gets a fresh 0..n-1 index rather than the
# original row labels.
comparison = pd.DataFrame(
    {'Actual': y_test.values, 'Predicted': y_pred, 'Probability': y_prob}
)

# Show only the first 10 rows
print(comparison.iloc[:10])

# Summary metrics; ROC-AUC is scored on probabilities, the rest on hard labels
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC-AUC: {roc_auc:.2f}")

# Rows where the predicted label disagrees with the true label
is_wrong = comparison['Actual'].ne(comparison['Predicted'])
misclassified = comparison.loc[is_wrong]
print("\nMisclassified examples:")
print(misclassified)

   Actual  Predicted  Probability
0       0          0         0.31
1       0          0         0.32
2       0          0         0.17
3       0          0         0.25
4       0          0         0.39
5       0          1         0.53
6       0          0         0.01
7       0          1         0.73
8       0          1         0.62
9       0          1         0.55
Accuracy: 0.72
Precision: 0.61
Recall: 0.62
F1 Score: 0.61
ROC-AUC: 0.81

Misclassified examples:
     Actual  Predicted  Probability
5         0          1         0.53
7         0          1         0.73
8         0          1         0.62
9         0          1         0.55
10        1          0         0.29
11        0          1         0.82
12        1          0         0.29
15        1          0         0.41
24        0          1         0.60
28        1          0         0.11
34        0          1         0.60
37        1          0         0.49
42        0          1         0.73
43        1          0  

In [9]:
import numpy as np
import joblib

# Load the saved model.
# The relative path resolves against the current working directory; this is
# the file written by the joblib.dump call in the training cell.
# NOTE: joblib files are pickle-based, so only load files you trust.
rf_model = joblib.load("diabetes_rf_model.pkl")

# Feature columns the model was trained on, in training order (every dataset
# column except the "Outcome" target).
_FEATURE_COLUMNS = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]


# Define a function to take user input and make a prediction
def predict_diabetes(model=None):
    """Prompt for one patient's measurements and print the model's verdict.

    Eight values are read from standard input in the order of the training
    columns. If any of them cannot be parsed as a number, an error message is
    printed and no prediction is made.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
            Defaults to the notebook-level ``rf_model``.

    Returns:
        None. The result is printed.

    Raises:
        Any exception raised by the model itself is propagated unchanged; in
        particular a ``ValueError`` from the model is no longer reported as
        invalid user input.
    """
    import pandas as pd  # local import so this cell also runs on its own

    if model is None:
        model = rf_model

    print("Enter the following details:")

    # Only the parsing of user input is guarded: a ValueError here really
    # does mean "the user typed something that is not a number".
    try:
        pregnancies = int(input("Pregnancies: "))
        glucose = float(input("Glucose Level: "))
        blood_pressure = float(input("Blood Pressure: "))
        skin_thickness = float(input("Skin Thickness: "))
        insulin = float(input("Insulin Level: "))
        bmi = float(input("BMI (Body Mass Index): "))
        diabetes_pedigree = float(input("Diabetes Pedigree Function: "))
        age = int(input("Age: "))
    except ValueError:
        print("\nInvalid input! Please enter numerical values for all symptoms.")
        return

    # Build a one-row frame with the training column names. The model was
    # fitted on a named DataFrame, so a bare array would trigger
    # scikit-learn's "X does not have valid feature names" warning and skip
    # its column-order check.
    features = pd.DataFrame(
        [[pregnancies, glucose, blood_pressure, skin_thickness,
          insulin, bmi, diabetes_pedigree, age]],
        columns=_FEATURE_COLUMNS,
    )

    # Hard label plus the predicted probability of class 1 (diabetic)
    prediction = model.predict(features)
    probability = model.predict_proba(features)[:, 1][0]

    # Report the probability of whichever class was predicted
    if prediction[0] == 1:
        print(f"\nThe person is likely to have diabetes with a confidence of {probability:.2%}.")
    else:
        print(f"\nThe person is unlikely to have diabetes with a confidence of {(1 - probability):.2%}.")

# Call the function: it prompts for the eight inputs on stdin and prints the
# prediction (or an invalid-input message); nothing is returned.
predict_diabetes()

Enter the following details:

The person is likely to have diabetes with a confidence of 78.00%.


