# Techniques to handle imbalanced data

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the PIMA Indians diabetes dataset from the Kaggle input directory.
# Alternative short column names, kept for reference (not applied):
# column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
diabetes_csv_path = '../input/pima-indians-diabetes-database/diabetes.csv'
data = pd.read_csv(diabetes_csv_path)

# Preview the first rows as the cell output.
data.head()

In [None]:
# Bar chart of how many rows fall in each Outcome class.
# The column is named via `x=` rather than passed positionally: seaborn 0.11
# deprecated positional vectors, and from 0.12 the first positional argument
# is interpreted as `data`, so the original call no longer counts per class.
sns.countplot(x='Outcome', data=data)

In [None]:
# Count the rows of each class by looking the label up explicitly.
# value_counts() orders its result by frequency (most frequent first), so
# unpacking it positionally would silently swap the two counts whenever
# class 1 is the majority, and would raise if the column did not contain
# exactly two distinct values.
outcome_counts = data['Outcome'].value_counts()
class_count_0 = int(outcome_counts.get(0, 0))
class_count_1 = int(outcome_counts.get(1, 0))

# Separate the rows of each class into their own DataFrame.
class_0 = data[data['Outcome'] == 0]
class_1 = data[data['Outcome'] == 1]

# Print the shape (rows, columns) of each class subset.
print('class 0 : ', class_0.shape)
print('class 1 : ', class_1.shape)

# Techniques
1. Random Under-Sampling
2. Random Over-Sampling

In [None]:
# Random Under-Sampling: draw (without replacement) as many class-0 rows as
# there are class-1 rows.
# - The target size is taken from class_1 itself, so it cannot be affected by
#   the ordering of a value_counts() result.
# - random_state pins the draw so the subset is identical on every run
#   (same seed as the imbalanced-learn samplers used later in this notebook).
class_0_under = class_0.sample(n=len(class_1), random_state=10)

# Print the shape of each class after under-sampling.
print("class_0_under : ",class_0_under.shape)
print('class 1 : ', class_1.shape)

In [None]:
# Random Over-Sampling: draw class-1 rows with replacement until there are as
# many of them as class-0 rows.
# - The target size is taken from class_0 itself, so it cannot be affected by
#   the ordering of a value_counts() result.
# - random_state pins the draw so the duplicated rows are identical on every
#   run (same seed as the imbalanced-learn samplers used later in this notebook).
class_1_over = class_1.sample(n=len(class_0), replace=True, random_state=10)

# Print the shape of each class after over-sampling.
print('class 0 : ', class_0.shape)
print("class_1_over : ",class_1_over.shape)

In [None]:
# Sanity check: show the type of the under-sampled subset as the cell output
# (presumably a pandas DataFrame, since it was produced by .sample() on a
# DataFrame slice -- confirm against the displayed result).
type(class_0_under)

# Balance data with the imbalanced-learn python module
Let’s apply some of these resampling techniques, using the Python library imbalanced-learn.
It is compatible with scikit-learn and is part of scikit-learn-contrib projects.

Install the imbalanced-learn package (imported as `imblearn`):

1. Command : pip install -U imbalanced-learn

2. Command : conda install -c conda-forge imbalanced-learn (I have used this.)

In [None]:
#import imblearn

*RandomUnderSampler* is a fast and easy way to balance the data by randomly selecting a subset of data for the targeted classes. Under-sample the majority class(es) by randomly picking samples with or without replacement.


In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Features are every column except the target. The target is kept as a
# one-column DataFrame so the resampled labels can be addressed by column name.
X = data.drop(['Outcome'],axis=1)
y = data[['Outcome']]

# Fixed seed so the same majority-class rows are kept on every run.
rus = RandomUnderSampler(random_state=10)
X_rus, y_rus = rus.fit_resample(X,y)

# Plot the class distribution after under-sampling.
# The column is named via `x=`: seaborn 0.11 deprecated positional vectors and
# from 0.12 the first positional argument is interpreted as `data`.
sns.countplot(x='Outcome', data=y_rus)

In [None]:
# Class distribution of the original (un-resampled) target, for comparison.
# The column is named via `x=`: seaborn 0.11 deprecated positional vectors and
# from 0.12 the first positional argument is interpreted as `data`.
sns.countplot(x='Outcome', data=y)

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Features are every column except the target. The target is kept as a
# one-column DataFrame so the resampled labels can be addressed by column name.
X = data.drop(['Outcome'],axis=1)
y = data[['Outcome']]

# Fixed seed so the same minority-class rows are duplicated on every run.
ros = RandomOverSampler(random_state=10)
X_ros, y_ros = ros.fit_resample(X,y)

# Plot the class distribution after over-sampling.
# The column is named via `x=`: seaborn 0.11 deprecated positional vectors and
# from 0.12 the first positional argument is interpreted as `data`.
sns.countplot(x='Outcome', data=y_ros)

# Synthetic Minority Oversampling Technique (SMOTE)
This technique generates synthetic data for the minority class.

SMOTE (Synthetic Minority Oversampling Technique) works by randomly picking a point from the minority class and computing the k-nearest neighbors for this point. The synthetic points are added between the chosen point and its neighbors.

In [None]:
from imblearn.over_sampling import SMOTE

# Features are every column except the target. The target is kept as a
# one-column DataFrame so the resampled labels can be addressed by column name.
X = data.drop(['Outcome'],axis=1)
y = data[['Outcome']]

# Fixed seed so the generated synthetic samples are the same on every run.
smote = SMOTE(random_state=10)
X_smote, y_smote = smote.fit_resample(X,y)

# Plot the class distribution after SMOTE.
# The column is named via `x=`: seaborn 0.11 deprecated positional vectors and
# from 0.12 the first positional argument is interpreted as `data`.
sns.countplot(x='Outcome', data=y_smote)

# NearMiss
NearMiss is an under-sampling technique. Instead of over-sampling the minority class, it uses a distance-based rule to choose which majority-class samples to keep, reducing the majority class to the size of the minority class.

In [None]:
from imblearn.under_sampling import NearMiss

# Features are every column except the target. The target is kept as a
# one-column DataFrame so the resampled labels can be addressed by column name.
X = data.drop(['Outcome'],axis=1)
y = data[['Outcome']]

# NearMiss with its default settings.
nm = NearMiss()
X_nm, y_nm = nm.fit_resample(X,y)

# Plot the class distribution after NearMiss under-sampling.
# The column is named via `x=`: seaborn 0.11 deprecated positional vectors and
# from 0.12 the first positional argument is interpreted as `data`.
sns.countplot(x='Outcome', data=y_nm)