In [1]:
#Load Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Load the diabetes data from a CSV file given by a relative path
dataset = pd.read_csv(filepath_or_buffer='./diabetes.csv')
# Show the first five rows as a quick sanity check of the columns
dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Target is the 'Outcome' column; features are every remaining column
Y = dataset['Outcome']
x = dataset.drop(columns=['Outcome'])

# Split into train and test sets: 20% of the rows are held out for testing,
# the split is stratified on Y, and the seed is fixed so it is repeatable
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=100,
)

In [4]:
# Class balance of the training labels before any resampling
print('Current - Class Split')
# Count the training rows carrying each label value
num_zeros = y_train.eq(0).sum()
num_ones = y_train.eq(1).sum()
print('Class 0 -', num_zeros)
print('Class 1 -', num_ones)

Current - Class Split
Class 0 - 400
Class 1 - 214


In [6]:
# Oversampling: grow the Outcome == 1 group of the training data
from sklearn.utils import resample

# Put the training features and labels back into a single frame so that
# whole rows can be selected by class
X = pd.concat([x_train, y_train], axis=1)

# Split the training rows by label (Outcome 0 is the larger group here)
NoDiabetes = X.loc[X['Outcome'] == 0]
Diabetes = X.loc[X['Outcome'] == 1]

# Draw Outcome == 1 rows with replacement until their count equals the
# number of Outcome == 0 rows; the fixed seed keeps the draw repeatable
Diabetes_upsampled = resample(
    Diabetes,
    replace=True,
    n_samples=len(NoDiabetes),
    random_state=100,
)

# Stack the untouched Outcome == 0 rows on top of the resampled Outcome == 1 rows
upsampled = pd.concat([NoDiabetes, Diabetes_upsampled])

# Class balance of the oversampled training data
print('Oversampling - Class Split')
num_zeros = upsampled['Outcome'].eq(0).sum()
num_ones = upsampled['Outcome'].eq(1).sum()
print('Class 0 -', num_zeros)
print('Class 1 -', num_ones)

Oversampling - Class Split
Class 0 - 400
Class 1 - 400


In [7]:
# Undersampling: shrink the Outcome == 0 group of the training data

# Put the training features and labels back into a single frame so that
# whole rows can be selected by class
X = pd.concat([x_train, y_train], axis=1)

# Split the training rows by label (Outcome 1 is the smaller group here)
NoDiabetes = X.loc[X['Outcome'] == 0]
Diabetes = X.loc[X['Outcome'] == 1]

# Draw Outcome == 0 rows without replacement, keeping only as many as there
# are Outcome == 1 rows; the fixed seed keeps the draw repeatable
NoDiabetes_downsampled = resample(
    NoDiabetes,
    replace=False,
    n_samples=len(Diabetes),
    random_state=100,
)

# Stack the reduced Outcome == 0 rows on top of the untouched Outcome == 1 rows
downsampled = pd.concat([NoDiabetes_downsampled, Diabetes])

# Class balance of the undersampled training data
print('Undersampling - Class Split')
num_zeros = downsampled['Outcome'].eq(0).sum()
num_ones = downsampled['Outcome'].eq(1).sum()
print('Class 0 -', num_zeros)
print('Class 1 -', num_ones)

Undersampling - Class Split
Class 0 - 214
Class 1 - 214
