In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame(np.random.randint(0,100,size=(100, 3)), columns=['f1','f2','f3'])

In [3]:
df

Unnamed: 0,f1,f2,f3
0,6,46,55
1,71,90,8
2,24,1,92
3,69,41,31
4,25,51,82
...,...,...,...
95,3,63,15
96,85,80,0
97,33,4,97
98,56,85,18


In [4]:
df_label = pd.DataFrame(np.random.randint(1,4,size=(100,1)), columns=['l1'])

In [5]:
df_label

Unnamed: 0,l1
0,2
1,3
2,3
3,3
4,2
...,...
95,1
96,3
97,2
98,1


In [6]:
df_label.l1.value_counts()

1    37
2    33
3    30
Name: l1, dtype: int64

In [7]:
df['label'] = df_label

In [8]:
df

Unnamed: 0,f1,f2,f3,label
0,6,46,55,2
1,71,90,8,3
2,24,1,92,3
3,69,41,31,3
4,25,51,82,2
...,...,...,...,...
95,3,63,15,1
96,85,80,0,3
97,33,4,97,2
98,56,85,18,1


In [9]:
class_1, class_2, class_3 = df.label.value_counts()

In [10]:
c3 = df[df['label'] == 3]
c2 = df[df['label'] == 2]
c1 = df[df['label'] == 1]

In [11]:
df_1_under = c1.sample(class_3)
df_2_under = c2.sample(class_3)

In [12]:
undersampled_df = pd.concat([df_1_under,df_2_under,c3],axis=0)

In [13]:
undersampled_df.label.value_counts()

3    30
2    30
1    30
Name: label, dtype: int64

In [14]:
df_2_over = c2.sample(class_1, replace=True)
df_3_over = c3.sample(class_1, replace=True)

In [16]:
oversampled_df = pd.concat([c1,df_2_over,df_3_over], axis=0)

In [17]:
oversampled_df.label.value_counts()

3    37
2    37
1    37
Name: label, dtype: int64

In [18]:
def undersample(df):
    classes = df.label.value_counts().to_dict()
    least_class_amount = min(classes.values())
    classes_list = []
    for key in classes:
        classes_list.append(df[df['label'] == key]) 
    classes_sample = []
    for i in range(0,len(classes_list)-1):
        classes_sample.append(classes_list[i].sample(least_class_amount))
    df_maybe = pd.concat(classes_sample)
    final_df = pd.concat([df_maybe,classes_list[-1]], axis=0)
    final_df = final_df.reset_index(drop=True)
    return final_df

In [19]:
new_df = undersample(df)

In [20]:
new_df

Unnamed: 0,f1,f2,f3,label
0,5,35,17,1
1,11,95,35,1
2,26,53,12,1
3,17,92,73,1
4,31,99,41,1
...,...,...,...,...
85,83,66,22,3
86,57,79,45,3
87,49,97,44,3
88,48,96,37,3


In [21]:
new_df.label.value_counts()

3    30
2    30
1    30
Name: label, dtype: int64

In [22]:
def oversample(df):
    classes = df.label.value_counts().to_dict()
    most = max(classes.values())
    classes_list = []
    for key in classes:
        classes_list.append(df[df['label'] == key]) 
    classes_sample = []
    for i in range(1,len(classes_list)):
        classes_sample.append(classes_list[i].sample(most, replace=True))
    df_maybe = pd.concat(classes_sample)
    final_df = pd.concat([df_maybe,classes_list[0]], axis=0)
    final_df = final_df.reset_index(drop=True)
    return final_df

In [23]:
over_df = oversample(df)

In [24]:
over_df

Unnamed: 0,f1,f2,f3,label
0,48,13,16,2
1,80,34,48,2
2,7,14,87,2
3,58,32,93,2
4,48,75,98,2
...,...,...,...,...
106,31,99,41,1
107,7,30,37,1
108,34,87,73,1
109,3,63,15,1


In [25]:
over_df.label.value_counts()

3    37
2    37
1    37
Name: label, dtype: int64