In [1]:
import pandas as pd
import numpy as np

In [2]:
import time
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.metrics import confusion_matrix , classification_report , accuracy_score
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.impute import SimpleImputer

In [4]:
from sklearn.ensemble import RandomForestClassifier

In [5]:
%matplotlib inline

In [6]:
# Load the dataset from 'diabetes.csv' (path is relative to the current working directory).
df = pd.read_csv('diabetes.csv')

In [7]:
# Preview the first rows to sanity-check the column names and value ranges.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
# True if at least one cell anywhere in the frame is null (NaN / None).
df.isna().to_numpy().any()

False

In [None]:
# Pairwise plots of the columns of df, with points coloured by the 'Outcome' label.
sns.pairplot(df , hue='Outcome')

In [None]:
# Pairwise correlation between all columns of the dataset.
corr = df.corr()

# Large canvas so the annotated cells of the heatmap stay legible.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr, annot=True, cmap='viridis', ax=ax)

In [None]:
# Class balance: count the rows carrying each 'Outcome' label.
diabetes_true_count = int((df['Outcome'] == 1).sum())
diabetes_false_count = int((df['Outcome'] == 0).sum())

In [None]:
# Report both class sizes, one per line.
print(
    f'Number of Diabetic peoples: {diabetes_true_count}',
    f'Number of Non Diabetic peoples: {diabetes_false_count}',
    sep='\n',
)

# Splitting the data into training and testing data

In [None]:
# Features are every column except the label; the label is 'Outcome'.
x = df.drop(columns='Outcome')
y = df['Outcome']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=101,
)

# Checking for meaningless values, e.g. age = 0

In [None]:
print(f'Number of rows: {len(df)}')

# A zero in any of these columns is reported as a missing measurement.
for column in (
    'Glucose',
    'BloodPressure',
    'Insulin',
    'BMI',
    'SkinThickness',
    'Age',
    'DiabetesPedigreeFunction',
):
    zero_count = int((df[column] == 0).sum())
    print(f"Number of missing {column} values: {zero_count}")

In [None]:
# Preview the test features before the zero values are imputed.
x_test.head()

# Filling those values with average values

In [None]:
# Treat 0 as "missing" and replace it with the column mean.
imputer = SimpleImputer(missing_values=0 , strategy='mean')

# Zero pregnancies is a legitimate value, not a missing one, so keep the raw
# column aside and restore it after imputation.
x_train_preg = x_train['Pregnancies'].tolist()
x_test_preg = x_test['Pregnancies'].tolist()

# Learn the column means from the training set only. Rebuilding the DataFrame
# also resets the row index to 0..n-1.
x_train = pd.DataFrame(imputer.fit_transform(x_train) , columns=x_train.columns)
x_train['Pregnancies'] = x_train_preg

# Reuse the training-set means for the test set (transform, NOT fit_transform):
# re-fitting on the test data would fill it with different statistics than the
# ones the model was trained with and leak test information into preprocessing.
x_test = pd.DataFrame(imputer.transform(x_test) , columns=x_test.columns)
x_test['Pregnancies'] = x_test_preg

In [None]:
# Renumber the targets 0..n-1 so their index lines up with the rebuilt feature frames.
y_train.index = np.arange(len(y_train))
y_test.index = np.arange(len(y_test))

In [None]:
# Preview the imputed test features (first rows only, rather than dumping the whole frame).
x_test.head()

# Using RandomForest model for prediction

In [None]:
# Training the model: a random forest of 50 trees with a fixed seed for repeatable results.
random_forest_model = RandomForestClassifier(n_estimators=50 , random_state=1)

# Fit on plain NumPy arrays (`.values`) rather than on the DataFrame/Series objects.
random_forest_model.fit(x_train.values , y_train.values)

In [None]:
# Testing the model: predict a label for every row of the held-out test features.
predictions = random_forest_model.predict(x_test.values)

In [None]:
# accuracy_score returns a fraction in [0, 1]; the percent format spec scales it
# by 100 so the printed number actually matches the trailing '%' sign.
print(f'Accuracy score is: {accuracy_score(y_test , predictions):.2%}')

In [None]:
# Compute both summaries first, then print them separated by a blank line.
conf_matrix = confusion_matrix(y_test, predictions)
class_report = classification_report(y_test, predictions)

print(f'Confusion matrix of predictions:- \n{conf_matrix}', end='\n\n')
print(f'Classification report of predictions:- \n{class_report}')

In [None]:
def predictor(Pregnancies , Glucose , BloodPressure , SkinThickness , Insulin , BMI,  DiabetesPedigreeFunction , Age , model=None):
    """Classify a single patient and return a human-readable verdict.

    Parameters
    ----------
    Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI,
    DiabetesPedigreeFunction, Age :
        The patient's measurements. They are passed to the model as one row,
        in exactly this order.
    model : optional
        Any object with a ``predict(rows)`` method that returns a sequence of
        labels. Defaults to the notebook-level ``random_forest_model``.

    Returns
    -------
    str
        A message for label 0 (not diabetic) or label 1 (diabetic).

    Raises
    ------
    ValueError
        If the model returns a label other than 0 or 1, instead of silently
        returning ``None``.

    Notes
    -----
    NOTE(review): assumes the model was trained on exactly these eight
    features in this order -- verify the training frame carries no extra
    column (e.g. a saved index column).
    """
    # Resolve the global lazily so the function can be defined before training.
    classifier = random_forest_model if model is None else model

    # The model expects a 2-D input: one row holding the eight features.
    x_data = [[Pregnancies , Glucose , BloodPressure , SkinThickness , Insulin , BMI , DiabetesPedigreeFunction , Age]]

    label = classifier.predict(x_data)[0]

    if label == 0:
        return "\nCongrats.... you are not diabetic but you sould have a consultaion with doctor"

    if label == 1:
        return "It seems you are diabetic and you sould have a consultaion with doctor"

    raise ValueError(f"Unexpected class label from model: {label!r}")

## Mean value of every column

In [None]:
# (display label, column name) pairs, printed in this order.
mean_columns = (
    ('Glucose', 'Glucose'),
    ('BloodPressure', 'BloodPressure'),
    ('BMI', 'BMI'),
    ('Insulin', 'Insulin'),
    ('Skin-Thickness', 'SkinThickness'),
    ('Diabetes-Pedigree-Function', 'DiabetesPedigreeFunction'),
)
for label, column in mean_columns:
    print(f"Mean of {label}: {np.mean(df[column])}")

In [None]:
# Inputs from the user; int()/float() raise ValueError on non-numeric text.
age = int(input('Age: '))
glucose = int(input('glucose concentration: '))
bp = int(input('Blood pressure: '))
number_of_pregnancies = int(input('Number of pregnancies: '))
insulin = float(input('Insulin : '))
skin_thickness = int(input('Skin Thickness : '))
bmi = float(input('Body mass index : '))
DPF = float(input('Diabetes Pedigree Fucntion : '))

# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted.
start_time = time.perf_counter()

# Send the user data to the predictor function, in the argument order it expects.
print(predictor(number_of_pregnancies ,glucose , bp , skin_thickness , insulin , bmi , DPF , age))

print(f"\n Total time taken in prediction:  {time.perf_counter()-start_time}")