In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt
#pickle and joblib are used to save your trained model
import pickle
#from sklearn.externals import joblib
import joblib
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
data=pd.read_csv("Diabetes.csv")
data

Unnamed: 0,RW,IR,SSPG,CC
0,0.81,124,55,0
1,0.95,117,76,0
2,0.94,143,105,0
3,1.04,199,108,0
4,1.00,240,143,0
...,...,...,...,...
85,0.95,748,122,1
86,1.06,320,253,1
87,0.98,188,211,1
88,1.16,607,271,1


In [3]:
data.isnull().sum()

RW       0
IR       0
SSPG     0
CC       0
dtype: int64

# Numerical features

In [4]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!='object']

print("Number of Numeric Variables ",len(numerical_features))
data[numerical_features]

Number of Numeric Variables  4


Unnamed: 0,RW,IR,SSPG,CC
0,0.81,124,55,0
1,0.95,117,76,0
2,0.94,143,105,0
3,1.04,199,108,0
4,1.00,240,143,0
...,...,...,...,...
85,0.95,748,122,1
86,1.06,320,253,1
87,0.98,188,211,1
88,1.16,607,271,1


#  Removing Outliers

In [5]:
dataset=data.copy()
for feature in numerical_features:
    
    if 0 in dataset[feature].unique():
        pass
    else:
        
        
        q1=dataset[feature].quantile(0.25)
        q3=dataset[feature].quantile(0.75)
        iqr=q3-q1
        lower_lim=q1-1.5*iqr
        upper_lim=q3+1.5*iqr
        
        dataset=dataset[(dataset[feature]<=upper_lim) & (dataset[feature]>=lower_lim)]
        
       

# Handling Ibalancing (oversampled Technique)

In [6]:
new_data=dataset.copy()

In [7]:

item_1=new_data[new_data["CC"]==1]
item_0=new_data[new_data["CC"]==0]


item_0.shape





(74, 4)

In [8]:
item_1.shape

(12, 4)

In [9]:
# 0 has more counts

#maximise 1 values and make it equal to the 0 
#So we pass 74 to data_over 1 as total data_over_0 are74

data_over_1=item_1.sample(74,replace=True)
data_over=pd.concat([data_over_1,item_0],axis=0)

data_over.CC.value_counts()
#Hence data is already Sampled almost 



1    74
0    74
Name: CC, dtype: int64

In [10]:
data_over

Unnamed: 0,RW,IR,SSPG,CC
58,0.99,151,122,1
89,1.18,297,220,1
86,1.06,320,253,1
58,0.99,151,122,1
70,1.10,344,270,1
...,...,...,...,...
77,0.95,156,159,0
78,0.74,221,103,0
79,0.84,199,59,0
80,0.89,76,108,0


In [11]:

x=data_over.drop(['CC'],axis=1)
y=data_over["CC"]

# Splitting Data

In [12]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.15,random_state=15,stratify=y)

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

In [14]:
model=GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=10, random_state=1)
model.fit(x_train, y_train)
model.score(x_test, y_test)


0.9565217391304348

In [15]:
pred=model.predict(x_test)

In [16]:
y_test

5     0
84    1
10    0
86    1
47    0
89    1
28    0
65    1
87    1
23    0
67    0
86    1
82    1
70    1
17    0
82    1
12    0
27    0
31    0
76    1
0     0
32    0
61    1
Name: CC, dtype: int64

# Classification Report

In [17]:

from sklearn.metrics import classification_report
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       0.92      1.00      0.96        11

    accuracy                           0.96        23
   macro avg       0.96      0.96      0.96        23
weighted avg       0.96      0.96      0.96        23

