# Oversampling 

In [11]:
from sklearn.utils import resample
import pandas as pd
from sklearn.model_selection import train_test_split


In [12]:
# Toy imbalanced dataset: nine rows, of which only the last two carry the
# positive label (Class == 1).
df = pd.DataFrame(
    {
        'age': list(range(10, 100, 10)),
        'Class': [0] * 7 + [1] * 2,
    }
)

In [13]:
# Split the frame into the label column and the feature matrix
y = df['Class']
X = df.drop(columns=['Class'])

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=27,
)

# Re-attach the labels to the training features: only training rows are
# resampled, the test rows are left untouched
X = pd.concat([X_train, y_train], axis=1)

# Partition the training rows by class label
not_fraud = X.loc[X['Class'] == 0]
fraud = X.loc[X['Class'] == 1]

# Draw minority rows with replacement until there are as many of them as
# majority rows (seeded for reproducible results)
fraud_upsampled = resample(
    fraud,
    replace=True,
    n_samples=not_fraud.shape[0],
    random_state=27,
)

# Stack the majority rows on top of the upsampled minority rows
upsampled = pd.concat([not_fraud, fraud_upsampled], axis=0)

# Show the class counts after upsampling
upsampled['Class'].value_counts()

1    5
0    5
Name: Class, dtype: int64

In [21]:
# Inspect the rebalanced training rows. A bare last expression lets the
# notebook use the DataFrame's rich display instead of print()'s plain text.
upsampled

   age  Class
1   20      0
5   60      0
6   70      0
0   10      0
3   40      0
7   80      1
7   80      1
7   80      1
7   80      1
7   80      1


# Undersampling

In [19]:
# Reuses the `not_fraud` / `fraud` training partitions built in the
# oversampling cell

# Draw, without replacement, as many majority rows as there are minority rows
# (seeded for reproducible results)
not_fraud_downsampled = resample(
    not_fraud,
    replace=False,
    n_samples=fraud.shape[0],
    random_state=27,
)

# Stack the reduced majority sample on top of the untouched minority rows
downsampled = pd.concat([not_fraud_downsampled, fraud], axis=0)

# Show the class counts after downsampling
downsampled['Class'].value_counts()

1    1
0    1
Name: Class, dtype: int64

In [20]:
# Inspect the downsampled training rows. A bare last expression lets the
# notebook use the DataFrame's rich display instead of print()'s plain text.
downsampled

   age  Class
6   70      0
7   80      1


In [None]:
# SMOTE: synthesise new minority rows by interpolating between existing
# minority samples, rather than duplicating rows as plain upsampling does.
from imblearn.over_sampling import SMOTE

# Separate input features and target
y = df.Class
X = df.drop('Class', axis=1)

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

# NOTE(review): SMOTE's default k_neighbors=5 needs at least 6 minority rows in
# the training split. The toy frame has only two positive rows in total, so
# this cell is expected to raise on it — confirm on real data, or pass a
# smaller k_neighbors once the training split has at least two minority rows.

# `sampling_strategy` replaces the `ratio` argument, which imbalanced-learn no
# longer accepts; 1.0 requests a minority/majority ratio of 1 after resampling.
sm = SMOTE(random_state=27, sampling_strategy=1.0)

# `fit_resample` replaces the removed `fit_sample` method. Only the training
# split is resampled, so no synthetic rows end up in X_test / y_test.
X_train, y_train = sm.fit_resample(X_train, y_train)