In [30]:
# importing the required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [31]:
# Load the diabetes dataset from a CSV file in the working directory.
Diabetes = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [32]:
# Separate the target (dependent variable) from the predictors (independent variables).

# Target: the 'Outcome' column.
y = Diabetes['Outcome']

# Predictors: every remaining column, in their original order.
x = Diabetes.drop(columns='Outcome')

In [33]:
# Create the training and testing datasets: 25% of the rows are held out for
# testing, and the fixed random_state keeps the split identical across runs.

(
    x_train,
    x_test,
    y_train,
    y_test,
) = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [34]:
# Build the logistic regression classifier and fit it on the training split.
# fit() returns the estimator itself, so the fitted model is bound directly.

Classification_log = LogisticRegression(
    solver='liblinear',
    random_state=0,
).fit(x_train, y_train)

# Predict the class labels of the held-out test rows with the fitted model.

y_prediction_log = Classification_log.predict(x_test)



In [35]:
# Accuracy of the logistic regression model: the fraction of test samples whose
# predicted label equals the true label (a classification metric, not MAPE).

print(
    "ACCURACY OF Logistic Regression Model: ",
    metrics.accuracy_score(y_true=y_test, y_pred=y_prediction_log),
)

# Classification report (per-class precision, recall, f1-score and support)
# for the logistic regression predictions.

class_rep_log = classification_report(y_true=y_test, y_pred=y_prediction_log)

print(class_rep_log)

ACCURACY OF Logistic Regression Model:  0.75
              precision    recall  f1-score   support

           0       0.80      0.81      0.81       123
           1       0.66      0.64      0.65        69

    accuracy                           0.75       192
   macro avg       0.73      0.73      0.73       192
weighted avg       0.75      0.75      0.75       192



In [36]:
# Creating a random forest classification model with 1000 trees.
# random_state is fixed so that the fitted forest, its predictions and its
# feature importances are reproducible on a fresh Restart & Run All; without a
# seed every run trains a different forest. Previously recorded scores came
# from an unseeded run, so re-running may shift them slightly.

Classification_model = RandomForestClassifier(n_estimators=1000, random_state=42)


# Fitting the training data to the classification model

Classification_model.fit(x_train,y_train)


# Predicting the class labels of the test dataset using the fitted model

y_prediction_rf = Classification_model.predict(x_test)

In [37]:
# Accuracy of the random forest model: the fraction of test samples whose
# predicted label equals the true label (a classification metric, not MAPE).

print(
    "ACCURACY OF THE Random Forest MODEL: ",
    metrics.accuracy_score(y_true=y_test, y_pred=y_prediction_rf),
)

# Classification report (per-class precision, recall, f1-score and support)
# for the random forest predictions.

class_rep_rf = classification_report(y_true=y_test, y_pred=y_prediction_rf)

print(class_rep_rf)

ACCURACY OF THE Random Forest MODEL:  0.7395833333333334
              precision    recall  f1-score   support

           0       0.80      0.80      0.80       123
           1       0.64      0.64      0.64        69

    accuracy                           0.74       192
   macro avg       0.72      0.72      0.72       192
weighted avg       0.74      0.74      0.74       192



In [38]:
# Feature importances reported by the fitted random forest, labelled with the
# predictor column names and ordered from most to least important.

feature_imp = (
    pd.Series(Classification_model.feature_importances_, index=x.columns)
    .sort_values(ascending=False)
)

feature_imp

Glucose                     0.267336
BMI                         0.162708
Age                         0.141397
DiabetesPedigreeFunction    0.118491
BloodPressure               0.086207
Pregnancies                 0.079908
Insulin                     0.074315
SkinThickness               0.069640
dtype: float64

**Question 2**

In [39]:
# XGBoost classification: create the model.
# NOTE(review): use_label_encoder appears to be deprecated in newer XGBoost
# releases; confirm against the installed version before removing it.

Classification_xgb = xgb.XGBClassifier(use_label_encoder=False)


# Fit the XGBoost model on the training split.

Classification_xgb.fit(X=x_train, y=y_train)


# Predict the class labels of the test rows with the fitted XGBoost model.

y_pred_xgb = Classification_xgb.predict(X=x_test)



In [40]:
# Accuracy of the XGBoost model: the fraction of test samples whose predicted
# label equals the true label (a classification metric, not MAPE).

print(
    "ACCURACY OF XGBoost Model: ",
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred_xgb),
)

# Classification report (per-class precision, recall, f1-score and support)
# for the XGBoost predictions.

class_rep_xgb = classification_report(y_true=y_test, y_pred=y_pred_xgb)

print(class_rep_xgb)

ACCURACY OF XGBoost Model:  0.75
              precision    recall  f1-score   support

           0       0.83      0.77      0.80       123
           1       0.64      0.71      0.67        69

    accuracy                           0.75       192
   macro avg       0.73      0.74      0.73       192
weighted avg       0.76      0.75      0.75       192



In [41]:
# Feature importances reported by the fitted XGBoost classifier, labelled with
# the predictor column names and ordered from most to least important.

feature_imp2 = (
    pd.Series(Classification_xgb.feature_importances_, index=x.columns)
    .sort_values(ascending=False)
)

feature_imp2

Glucose                     0.263345
Age                         0.141789
BMI                         0.129308
Insulin                     0.115281
DiabetesPedigreeFunction    0.098113
SkinThickness               0.091657
BloodPressure               0.085577
Pregnancies                 0.074930
dtype: float32