In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier  #The model needed for the prediction

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt


In [11]:
# Read the diabetes dataset from the Excel workbook into a DataFrame
df = pd.read_excel(io='diabetes.xls')

# Preview the first five rows to sanity-check the columns and values
df.head(n=5)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [12]:
# Count the missing values in each column (isna is the alias of isnull)
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [13]:
# Class balance of the target column: number of rows per 'Outcome' label
# (0 = not diabetic, 1 = diabetic)
df.loc[:, 'Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

Here we standardize the features and split the data into training and testing sets for the model.

In [14]:

# Separate the feature columns from the target column (y)
features = df.drop(columns='Outcome')
y = df['Outcome']

# Split into training and testing sets BEFORE scaling, so the scaler never
# sees the held-out rows. Fitting the scaler on the full dataset would leak
# test-set statistics (mean / std) into the preprocessing step.
# The split is driven by the row count and random_state, so splitting the raw
# DataFrame selects the same rows as splitting an already-scaled array.
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=1
)

# Standardize the features: learn mean/std from the training rows only,
# then apply that same transformation to both sets.
# The scaler is fit on a DataFrame, so it can later transform new inputs
# supplied as a DataFrame with the same column names.
data_scaling = StandardScaler()
x_train = data_scaling.fit_transform(features_train)
x_test = data_scaling.transform(features_test)

# Kept for backward compatibility with code that expects these names:
# the full feature matrix expressed on the training-set scale.
standardized_data = data_scaling.transform(features)
x = standardized_data

# Train the RandomForest model (fixed seed for reproducible results)
model = RandomForestClassifier(random_state=1)
model.fit(x_train, y_train)

# Make predictions on the training data
y_train_pred = model.predict(x_train)

# Make predictions on the test data
y_test_pred = model.predict(x_test)



In [15]:
# Score the predictions: fraction of correctly classified rows in each split
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

# Report both scores, rounded to two decimals (training first, then test)
print(f"Training Data Accuracy: {train_accuracy:.2f}")
print(f"Test Data Accuracy: {test_accuracy:.2f}")

Training Data Accuracy: 1.00
Test Data Accuracy: 0.81


In [16]:
# Per-class precision, recall, F1 and support on the held-out test set
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.82      0.89      0.85        99
           1       0.77      0.65      0.71        55

    accuracy                           0.81       154
   macro avg       0.79      0.77      0.78       154
weighted avg       0.80      0.81      0.80       154



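A test accuracy of 0.81 (reasonably close to 1) is considered sufficient to predict the result. Note, however, that the training accuracy is 1.00 and the recall for the diabetic class (1) is only 0.65, so the model overfits the training data and misses about a third of the diabetic cases in the test set.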

In [17]:
# Prediction for new data: one person's measurements, listed in the same
# order as the feature columns of df (every column except 'Outcome')
data = (
    5,      # Pregnancies
    166,    # Glucose
    72,     # BloodPressure
    19,     # SkinThickness
    175,    # Insulin
    25.8,   # BMI
    0.587,  # DiabetesPedigreeFunction
    51,     # Age
)

In [18]:

# Wrap the single sample in a one-row DataFrame that carries the feature
# column names (all columns of df except the target)
feature_names = df.columns.drop('Outcome')
input_data_df = pd.DataFrame([data], columns=feature_names)

# Apply the already-fitted scaler to the new sample
std_data = data_scaling.transform(input_data_df)

# Predict the outcome and show the raw predicted label array
prediction = model.predict(std_data)
print(prediction)

# Translate the label into a readable message (0 = not diabetic)
print('The person is diabetic' if prediction[0] != 0 else 'The person is not diabetic')

[1]
The person is diabetic


# End