# Handling Imbalanced Data

In [1]:
import numpy as np
import pandas as pd

## Making the dataset

In [2]:
#setting a seed
np.random.seed(912)

#creating a df with two categories

n = 1000        #total sample

#category split is 9:1. category 0 has 900 samples other has 100
c0_ratio = 0.9

nc0= int(n * c0_ratio) #category 0 sample points

nc1 = n - nc0       ##category 1 sample points
(nc0, nc1)

(900, 100)

In [3]:
category0 = pd.DataFrame({
    "feature1" : np.random.normal(0,1, nc0),
    "feature2" : np.random.normal(0,1, nc0),
    "target" : [0] * nc0
})

category1 = pd.DataFrame({
    "feature1" : np.random.normal(0,1, nc1),
    "feature2" : np.random.normal(0,1, nc1),
    "target" : [1] * nc1
})

In [4]:
imbalanced_df = pd.concat([category0, category1], axis = 0)     #will concatenate by the columns means new rows will be added

imbalanced_df       #since we didn't include reset_index method this will have the index 0 to 899 then again 0 to 99

Unnamed: 0,feature1,feature2,target
0,-0.213950,0.197360,0
1,1.888450,1.847285,0
2,-0.672371,-0.926437,0
3,0.256054,-0.382164,0
4,-1.126118,-0.593033,0
...,...,...,...
95,0.774179,-0.677380,1
96,-0.986109,0.568897,1
97,0.074162,-1.317835,1
98,-1.490417,-1.334180,1


In [5]:
imbalanced_df.reset_index(drop = True)      #reset the index from 0 to 999

Unnamed: 0,feature1,feature2,target
0,-0.213950,0.197360,0
1,1.888450,1.847285,0
2,-0.672371,-0.926437,0
3,0.256054,-0.382164,0
4,-1.126118,-0.593033,0
...,...,...,...
995,0.774179,-0.677380,1
996,-0.986109,0.568897,1
997,0.074162,-1.317835,1
998,-1.490417,-1.334180,1


In [8]:
imbalanced_df["target"].value_counts()

target
0    900
1    100
Name: count, dtype: int64

### Upsampling or Over-Sampling


> It's a Resampling technique used for handling imbalanced data. Theory can be learnt from Statistical Sample survey books.


The process to increase the lesser sample category to the same no

In [None]:
df_minority = imbalanced_df[imbalanced_df['target'] == 1]
df_majority = imbalanced_df[imbalanced_df['target'] == 0]
df_minority

Unnamed: 0,feature1,feature2,target
0,0.136127,-0.436375,1
1,0.339332,0.303134,1
2,-0.741131,0.107946,1
3,0.036458,-0.152518,1
4,0.219315,-0.102317,1
...,...,...,...
95,0.468409,0.578012,1
96,0.553956,0.779218,1
97,-0.635448,0.492569,1
98,0.387791,0.996613,1


In [None]:
df_majority

Unnamed: 0,feature1,feature2,target
0,-0.543580,-1.056542,0
1,1.196298,-1.207045,0
2,2.189726,0.140938,0
3,-1.786581,-0.621431,0
4,1.014798,0.810934,0
...,...,...,...
895,0.254952,0.770188,0
896,-0.496955,-0.162211,0
897,0.692185,0.476835,0
898,-1.395967,-0.622788,0


In [None]:
# PERFORMING UPSAMPLING
from sklearn.utils import resample

In [None]:
df_minority_upsampled = resample(df_minority, replace = True,  #sample with replacement
                                n_samples = len(df_majority),   # to match the sample points
                                random_state = 95
                                )

In [None]:
df_minority_upsampled.shape

(900, 3)

In [None]:
df_minority_upsampled['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,900


In [1]:
print("The lazy brown fox jumps over the lazy dog. 1 2 3  6 4556 3 2 556")

The lazy brown fox jumps over the lazy dog. 1 2 3  6 4556 3 2 556


In [None]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled]).reset_index(drop = True)
df_upsampled

Unnamed: 0,feature1,feature2,target
0,-0.543580,-1.056542,0
1,1.196298,-1.207045,0
2,2.189726,0.140938,0
3,-1.786581,-0.621431,0
4,1.014798,0.810934,0
...,...,...,...
1795,1.932263,0.868800,1
1796,0.330800,0.165834,1
1797,-0.510546,0.319147,1
1798,1.364416,0.180245,1


In [None]:
df_upsampled.shape

(1800, 3)

In [None]:
df_upsampled['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,900
1,900




> **Now we have successfully convert the imbalanced data to a balanced data**



### Down-Sampling or Under-Sampling

In [None]:
df_majority_downsampled = resample(df_majority,
                                   replace = False,      #THIS IS THE MAIN DIFFERENCE. SAMPLE WITHOUT REPLACEMENT, BY THIS WE WILL LOSE DATA
                                   n_samples = len(df_minority),
                                   random_state = 95
                                   )

In [None]:
df_majority_downsampled.shape

(100, 3)

In [None]:
df_majority_downsampled['target'].value_counts()


Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,100


In [None]:
df_downsampled = pd.concat([df_majority_downsampled, df_minority]).reset_index(drop = True)
df_downsampled

Unnamed: 0,feature1,feature2,target
0,-0.457544,0.288744,0
1,-1.506558,-0.343864,0
2,0.301445,-0.417578,0
3,0.254952,0.770188,0
4,0.342426,0.042588,0
...,...,...,...
195,0.468409,0.578012,1
196,0.553956,0.779218,1
197,-0.635448,0.492569,1
198,0.387791,0.996613,1


In [None]:
df_downsampled.shape

(200, 3)

In [None]:
df_downsampled['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,100
1,100


## SMOTE