In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
df = pd.read_csv(r'D:\Datasets\Lesson_03_Supervised_Learning_ Regression_and_its_Application\diabetes_dataset.csv')

In [3]:
df.head()

Unnamed: 0,PatientID,Age,Gender,BMI,BloodPressure,Insulin,Glucose,DiabetesPedigreeFunction,Outcome
0,1,52,0,30.239636,149,113,115,0.312886,1
1,2,53,1,30.932781,71,39,104,0.389526,0
2,3,21,0,45.092714,161,67,151,1.205869,0
3,4,39,0,47.79553,101,165,184,0.595201,1
4,5,22,1,20.718108,160,158,116,0.974248,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   PatientID                 100 non-null    int64  
 1   Age                       100 non-null    int64  
 2   Gender                    100 non-null    int64  
 3   BMI                       100 non-null    float64
 4   BloodPressure             100 non-null    int64  
 5   Insulin                   100 non-null    int64  
 6   Glucose                   100 non-null    int64  
 7   DiabetesPedigreeFunction  100 non-null    float64
 8   Outcome                   100 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 7.2 KB


In [5]:
df.describe()

Unnamed: 0,PatientID,Age,Gender,BMI,BloodPressure,Insulin,Glucose,DiabetesPedigreeFunction,Outcome
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,50.5,49.27,0.49,32.452396,124.06,150.37,133.71,1.142923,0.45
std,29.011492,17.750718,0.502418,9.357709,31.921248,73.840896,36.500531,0.64187,0.5
min,1.0,21.0,0.0,18.663323,71.0,15.0,70.0,0.148171,0.0
25%,25.75,36.0,0.0,24.294953,96.5,87.0,102.75,0.600629,0.0
50%,50.5,50.5,0.0,31.260253,122.5,159.5,133.0,1.069903,0.0
75%,75.25,62.5,1.0,40.080581,153.5,205.25,164.0,1.570069,1.0
max,100.0,79.0,1.0,49.427988,178.0,273.0,198.0,2.467637,1.0


In [6]:
df.isnull().sum()

PatientID                   0
Age                         0
Gender                      0
BMI                         0
BloodPressure               0
Insulin                     0
Glucose                     0
DiabetesPedigreeFunction    0
Outcome                     0
dtype: int64

In [7]:
# Separate features (X) and the target variable (y)
X = df.drop(columns=['Outcome'])  # Drop the target column
y = df['Outcome']  # Target variable

# Check the shape of X and y
print("Features shape:", X.shape)
print("Target shape:", y.shape)


Features shape: (100, 8)
Target shape: (100,)


In [10]:
# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the training and test sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Training set shape: (80, 8) (80,)
Test set shape: (20, 8) (20,)


In [11]:
# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [12]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training set
rf_model.fit(X_train_scaled, y_train)


In [13]:
# Make predictions on the test set
y_pred = rf_model.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Classification Report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)


Accuracy: 0.55
Confusion Matrix:
 [[7 4]
 [5 4]]
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.64      0.61        11
           1       0.50      0.44      0.47         9

    accuracy                           0.55        20
   macro avg       0.54      0.54      0.54        20
weighted avg       0.55      0.55      0.55        20



In [14]:
# Example: Tune the model with different parameters
rf_model_tuned = RandomForestClassifier(n_estimators=150, max_depth=10, random_state=42)
rf_model_tuned.fit(X_train_scaled, y_train)

# Predict and evaluate again
y_pred_tuned = rf_model_tuned.predict(X_test_scaled)
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
print("Tuned Model Accuracy:", accuracy_tuned)


Tuned Model Accuracy: 0.55
