In [1]:
# Necessary imports
import numpy as np
import pandas as pd

from utils.load_dataset import load_dataset
from utils.split_x_y import split_x_y
from utils.under_sampling import random_under_sampling, near_miss_under_sampling
from utils.over_sampling import random_over_sampling
from utils.smote import smote_nc
from utils.save_dataset import save_dataset

In [2]:
# Load the training dataframes, one per class-balance level used below (1, 3, 5, 25, 50).
def _load_train(balance):
    """Load the 'train' dataset for the given ``balance`` value via ``load_dataset``."""
    return load_dataset(dataset_type='train', balance=balance)


train_df_one = _load_train(1)
train_df_three = _load_train(3)
train_df_five = _load_train(5)
train_df_twenty_five = _load_train(25)
train_df_fifty = _load_train(50)

In [3]:
# Sanity check for correct loading.
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly: wrapping it in print() only appends a stray "None" line per frame.
for loaded_frame in (
    train_df_one,
    train_df_three,
    train_df_five,
    train_df_twenty_five,
    train_df_fifty,
):
    loaded_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234428 entries, 0 to 481091
Data columns (total 12 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   itemID                        234428 non-null  float64
 1   size                          234428 non-null  float64
 2   color                         234428 non-null  float64
 3   manufacturerID                234428 non-null  float64
 4   price                         234428 non-null  float64
 5   customerID                    234428 non-null  float64
 6   salutation                    234428 non-null  float64
 7   state                         234428 non-null  float64
 8   returnShipment                234428 non-null  int64  
 9   deliveryTime                  234428 non-null  float64
 10  ageOfAccountUntilTimeOfOrder  234428 non-null  float64
 11  age                           234428 non-null  float64
dtypes: float64(11), int64(1)
memory usage: 23.3 

In [4]:
# Random under sampling
# Balances the class distribution by randomly discarding majority class examples
# until majority and minority class instances are balanced out.
def _random_under(frame):
    """Apply ``random_under_sampling`` to ``frame`` with 'returnShipment' as the target.

    Returns whatever ``random_under_sampling`` returns; it is unpacked below into
    a features object (X) and a labels object (y).
    """
    return random_under_sampling(df=frame, target='returnShipment')


# 1% class balance
train_one_X_random_under, train_one_y_random_under = _random_under(train_df_one)

# 3% class balance
train_three_X_random_under, train_three_y_random_under = _random_under(train_df_three)

# 5% class balance
train_five_X_random_under, train_five_y_random_under = _random_under(train_df_five)

# 25% class balance
train_twenty_five_X_random_under, train_twenty_five_y_random_under = _random_under(train_df_twenty_five)

# 50% class balance
train_fifty_X_random_under, train_fifty_y_random_under = _random_under(train_df_fifty)

In [5]:
# Checking new dataframes: column summary of each resampled feature frame,
# followed by the class counts of its matching label object.
# DataFrame.info() prints on its own and returns None, so it is called directly
# instead of being wrapped in print() (which would emit an extra "None" line).
for resampled_X, resampled_y in (
    (train_one_X_random_under, train_one_y_random_under),
    (train_three_X_random_under, train_three_y_random_under),
    (train_five_X_random_under, train_five_y_random_under),
    (train_twenty_five_X_random_under, train_twenty_five_y_random_under),
    (train_fifty_X_random_under, train_fifty_y_random_under),
):
    resampled_X.info()
    print(resampled_y.value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4688 entries, 0 to 4687
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   itemID                        4688 non-null   float64
 1   size                          4688 non-null   float64
 2   color                         4688 non-null   float64
 3   manufacturerID                4688 non-null   float64
 4   price                         4688 non-null   float64
 5   customerID                    4688 non-null   float64
 6   salutation                    4688 non-null   float64
 7   state                         4688 non-null   float64
 8   deliveryTime                  4688 non-null   float64
 9   ageOfAccountUntilTimeOfOrder  4688 non-null   float64
 10  age                           4688 non-null   float64
dtypes: float64(11)
memory usage: 403.0 KB
None
returnShipment
0                 2344
1                 2344
dtype: int64
<cla

In [6]:
# Save new datasets
def _save_random_under(features, labels, balance):
    """Join features and labels column-wise, save the result with technique 'ru', and return it."""
    combined = pd.concat([features, labels], axis=1)
    save_dataset(df=combined, dataset_type='train', balance=balance, technique='ru')
    return combined


train_one_random_under = _save_random_under(
    train_one_X_random_under, train_one_y_random_under, 1
)
train_three_random_under = _save_random_under(
    train_three_X_random_under, train_three_y_random_under, 3
)
train_five_random_under = _save_random_under(
    train_five_X_random_under, train_five_y_random_under, 5
)
train_twenty_five_random_under = _save_random_under(
    train_twenty_five_X_random_under, train_twenty_five_y_random_under, 25
)
train_fifty_random_under = _save_random_under(
    train_fifty_X_random_under, train_fifty_y_random_under, 50
)

In [7]:
# Near Miss under sampling
# NearMiss is a family of under-sampling methods that pick majority class examples
# based on their distance to minority class examples.
def _near_miss_under(frame):
    """Apply ``near_miss_under_sampling`` to ``frame`` with 'returnShipment' as the target.

    Returns whatever ``near_miss_under_sampling`` returns; it is unpacked below
    into a features object (X) and a labels object (y).
    """
    return near_miss_under_sampling(df=frame, target='returnShipment')


# 1% class balance
train_one_X_near_miss_under, train_one_y_near_miss_under = _near_miss_under(train_df_one)

# 3% class balance
train_three_X_near_miss_under, train_three_y_near_miss_under = _near_miss_under(train_df_three)

# 5% class balance
train_five_X_near_miss_under, train_five_y_near_miss_under = _near_miss_under(train_df_five)

# 25% class balance
train_twenty_five_X_near_miss_under, train_twenty_five_y_near_miss_under = _near_miss_under(train_df_twenty_five)

# 50% class balance
train_fifty_X_near_miss_under, train_fifty_y_near_miss_under = _near_miss_under(train_df_fifty)

In [8]:
# Checking new dataframes: column summary of each NearMiss feature frame,
# followed by the class counts of its matching label object.
# DataFrame.info() prints on its own and returns None, so it is called directly
# instead of being wrapped in print() (which would emit an extra "None" line).
for near_miss_X, near_miss_y in (
    (train_one_X_near_miss_under, train_one_y_near_miss_under),
    (train_three_X_near_miss_under, train_three_y_near_miss_under),
    (train_five_X_near_miss_under, train_five_y_near_miss_under),
    (train_twenty_five_X_near_miss_under, train_twenty_five_y_near_miss_under),
    (train_fifty_X_near_miss_under, train_fifty_y_near_miss_under),
):
    near_miss_X.info()
    print(near_miss_y.value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4688 entries, 0 to 4687
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   itemID                        4688 non-null   float64
 1   size                          4688 non-null   float64
 2   color                         4688 non-null   float64
 3   manufacturerID                4688 non-null   float64
 4   price                         4688 non-null   float64
 5   customerID                    4688 non-null   float64
 6   salutation                    4688 non-null   float64
 7   state                         4688 non-null   float64
 8   deliveryTime                  4688 non-null   float64
 9   ageOfAccountUntilTimeOfOrder  4688 non-null   float64
 10  age                           4688 non-null   float64
dtypes: float64(11)
memory usage: 403.0 KB
None
returnShipment
0                 2344
1                 2344
dtype: int64
<cla

In [9]:
# Save new datasets
def _save_near_miss_under(features, labels, balance):
    """Join features and labels column-wise, save the result with technique 'nmu', and return it."""
    combined = pd.concat([features, labels], axis=1)
    save_dataset(df=combined, dataset_type='train', balance=balance, technique='nmu')
    return combined


train_one_near_miss_under = _save_near_miss_under(
    train_one_X_near_miss_under, train_one_y_near_miss_under, 1
)
train_three_near_miss_under = _save_near_miss_under(
    train_three_X_near_miss_under, train_three_y_near_miss_under, 3
)
train_five_near_miss_under = _save_near_miss_under(
    train_five_X_near_miss_under, train_five_y_near_miss_under, 5
)
train_twenty_five_near_miss_under = _save_near_miss_under(
    train_twenty_five_X_near_miss_under, train_twenty_five_y_near_miss_under, 25
)
train_fifty_near_miss_under = _save_near_miss_under(
    train_fifty_X_near_miss_under, train_fifty_y_near_miss_under, 50
)

In [10]:
# Random Over Sampling
# Over-sampling enlarges the minority class by randomly replicating its instances,
# giving the minority class a higher representation in the sample.
def _random_over(frame):
    """Apply ``random_over_sampling`` to ``frame`` with 'returnShipment' as the target.

    Returns whatever ``random_over_sampling`` returns; it is unpacked below into
    a features object (X) and a labels object (y).
    """
    return random_over_sampling(df=frame, target='returnShipment')


# 1% class balance
train_one_X_random_over, train_one_y_random_over = _random_over(train_df_one)

# 3% class balance
train_three_X_random_over, train_three_y_random_over = _random_over(train_df_three)

# 5% class balance
train_five_X_random_over, train_five_y_random_over = _random_over(train_df_five)

# 25% class balance
train_twenty_five_X_random_over, train_twenty_five_y_random_over = _random_over(train_df_twenty_five)

# 50% class balance
train_fifty_X_random_over, train_fifty_y_random_over = _random_over(train_df_fifty)

In [11]:
# Checking new dataframes: column summary of each over-sampled feature frame,
# followed by the class counts of its matching label object.
# DataFrame.info() prints on its own and returns None, so it is called directly
# instead of being wrapped in print() (which would emit an extra "None" line).
for over_sampled_X, over_sampled_y in (
    (train_one_X_random_over, train_one_y_random_over),
    (train_three_X_random_over, train_three_y_random_over),
    (train_five_X_random_over, train_five_y_random_over),
    (train_twenty_five_X_random_over, train_twenty_five_y_random_over),
    (train_fifty_X_random_over, train_fifty_y_random_over),
):
    over_sampled_X.info()
    print(over_sampled_y.value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 464168 entries, 0 to 464167
Data columns (total 11 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   itemID                        464168 non-null  float64
 1   size                          464168 non-null  float64
 2   color                         464168 non-null  float64
 3   manufacturerID                464168 non-null  float64
 4   price                         464168 non-null  float64
 5   customerID                    464168 non-null  float64
 6   salutation                    464168 non-null  float64
 7   state                         464168 non-null  float64
 8   deliveryTime                  464168 non-null  float64
 9   ageOfAccountUntilTimeOfOrder  464168 non-null  float64
 10  age                           464168 non-null  float64
dtypes: float64(11)
memory usage: 39.0 MB
None
returnShipment
0                 232084
1                 2320

In [12]:
# Save new datasets
def _save_random_over(features, labels, balance):
    """Join features and labels column-wise, save the result with technique 'ro', and return it."""
    combined = pd.concat([features, labels], axis=1)
    save_dataset(df=combined, dataset_type='train', balance=balance, technique='ro')
    return combined


train_one_random_over = _save_random_over(
    train_one_X_random_over, train_one_y_random_over, 1
)
train_three_random_over = _save_random_over(
    train_three_X_random_over, train_three_y_random_over, 3
)
train_five_random_over = _save_random_over(
    train_five_X_random_over, train_five_y_random_over, 5
)
train_twenty_five_random_over = _save_random_over(
    train_twenty_five_X_random_over, train_twenty_five_y_random_over, 25
)
train_fifty_random_over = _save_random_over(
    train_fifty_X_random_over, train_fifty_y_random_over, 50
)

In [13]:
# SMOTE
# Resamples each training set through the project's smote_nc helper, passing the
# column positions that should be treated as categorical.

# Indices of categorical features
# NOTE(review): the recorded run of this cell ended with
# "MemoryError: Unable to allocate 65.2 GiB". This list marks every position except 6
# as categorical. In the 12-column frame printed by the loading sanity check, index 8
# is 'returnShipment' (the target) and index 11 is 'age'; the resampled feature frames
# shown earlier have only 11 columns (positions 0-10). Presumably treating
# high-cardinality columns (e.g. itemID, customerID, price) as categorical is what
# inflates memory use — TODO confirm the intended indices against smote_nc.
categorical_features = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11]

# 1% class balance
train_one_X_smote, train_one_y_smote = smote_nc(df=train_df_one, target='returnShipment', categorical_features=categorical_features)

# 3% class balance
# Fixed: the second target was previously 'train_three_X_smote' as well, so the labels
# overwrote the features and 'train_three_y_smote' was never defined.
train_three_X_smote, train_three_y_smote = smote_nc(df=train_df_three, target='returnShipment', categorical_features=categorical_features)

# 5% class balance
train_five_X_smote, train_five_y_smote = smote_nc(df=train_df_five, target='returnShipment', categorical_features=categorical_features)

# 25% class balance
train_twenty_five_X_smote, train_twenty_five_y_smote = smote_nc(df=train_df_twenty_five, target='returnShipment', categorical_features=categorical_features)

# 50% class balance
train_fifty_X_smote, train_fifty_y_smote = smote_nc(df=train_df_fifty, target='returnShipment', categorical_features=categorical_features)

MemoryError: Unable to allocate 65.2 GiB for an array with shape (185805, 1, 47085) and data type float64