In [113]:
# Necessary imports
import numpy as np
import pandas as pd

from datetime import datetime
from dateutil.relativedelta import relativedelta

In [114]:
# Data paths
train_path = '../data/orders_train.txt'

In [115]:
# Load csv data into dataframes
train_df = pd.read_csv(train_path, sep=';', index_col=0)

In [116]:
# Replace all '?' values with NaN
train_df = train_df.replace('?', np.nan)

# Drop all columns with NaN values
train_df = train_df.dropna()

# Create new column 'age'
train_df['age'] = train_df['dateOfBirth'].apply(lambda x: relativedelta(pd.to_datetime('now'), datetime.strptime(x, '%Y-%m-%d')).years)

# Drop column 'dateOfBirth'
train_df = train_df.drop(columns=['dateOfBirth'])

# One Hot Encoding for categorical columns (need to split the creation because of memory allocation)
train_df = pd.get_dummies(data=train_df, columns=['itemID'])
train_df = pd.get_dummies(data=train_df, columns=['size'])
train_df = pd.get_dummies(data=train_df, columns=['color'])
train_df = pd.get_dummies(data=train_df, columns=['manufacturerID'])
train_df = pd.get_dummies(data=train_df, columns=['customerID'])
train_df = pd.get_dummies(data=train_df, columns=['salutation'])
train_df = pd.get_dummies(data=train_df, columns=['state'])
train_df = pd.get_dummies(data=train_df, columns=['age'])

MemoryError: Unable to allocate 1.09 GiB for an array with shape (2954, 396495) and data type uint8

In [104]:
train_df.head()

Unnamed: 0_level_0,itemID,size,color,manufacturerID,price,customerID,salutation,state,creationDate,returnShipment,...,deliveryDate_2013-07-05,deliveryDate_2013-07-08,deliveryDate_2013-07-10,deliveryDate_2013-07-11,deliveryDate_2013-07-12,deliveryDate_2013-07-15,deliveryDate_2013-07-17,deliveryDate_2013-07-18,deliveryDate_2013-07-19,deliveryDate_2013-07-22
orderItemID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,186,m,denim,25,69.9,794,Mrs,Baden-Wuerttemberg,2011-04-25,0,...,0,0,0,0,0,0,0,0,0,0
2,71,9+,ocher,21,69.95,794,Mrs,Baden-Wuerttemberg,2011-04-25,1,...,0,0,0,0,0,0,0,0,0,0
3,71,9+,curry,21,69.95,794,Mrs,Baden-Wuerttemberg,2011-04-25,1,...,0,0,0,0,0,0,0,0,0,0
5,151,39,black,53,29.9,825,Mrs,Rhineland-Palatinate,2011-02-16,0,...,0,0,0,0,0,0,0,0,0,0
6,598,xxl,brown,87,89.9,825,Mrs,Rhineland-Palatinate,2011-02-16,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Split dataframe on classes 
train_df_0 = train_df[train_df.returnShipment == 0]
train_df_1 = train_df[train_df.returnShipment == 1]

In [47]:
# Sanity check split datasets
print(train_df_0.info())
print(train_df_1.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 249001 entries, 1 to 481092
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   orderDate       249001 non-null  object 
 1   deliveryDate    249001 non-null  object 
 2   itemID          249001 non-null  int64  
 3   size            249001 non-null  object 
 4   color           249001 non-null  object 
 5   manufacturerID  249001 non-null  int64  
 6   price           249001 non-null  float64
 7   customerID      249001 non-null  int64  
 8   salutation      249001 non-null  object 
 9   dateOfBirth     249001 non-null  object 
 10  state           249001 non-null  object 
 11  creationDate    249001 non-null  object 
 12  returnShipment  249001 non-null  int64  
dtypes: float64(1), int64(4), object(8)
memory usage: 26.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 232091 entries, 2 to 481088
Data columns (total 13 columns):
 #   Column         

In [48]:
# Generate new datasets with 1%, 3%, 5%, 25% and 50% class balance
# (249 001 * 100 / 99) = 251 516 -> 1% of 251 516 = 2 515
# (249 001 * 100 / 97) = 256 702 -> 3% of 256 702 = 7 701
# (249 001 * 100 / 95) = 262 106 -> 5% of 262 106 = 13 105
# (249 001 * 100 / 75) = 332 001 -> 25% of 332 001 = 83 000
# (232 091 * 100 / 50) = 464 182 -> 50% of 464 182 = 232 091 

# Filter dataframes for the correct number of instances
one_percent = train_df_1.sample(n=2515)
three_percent = train_df_1.sample(n=7701)
five_percent = train_df_1.sample(n=13105)
twentyfive_percent = train_df_1.sample(n=83000)
fifty_percent = train_df_0.sample(n=232091)

# Generate the new datasets
train_df_one = train_df_0.append(one_percent)
train_df_three = train_df_0.append(three_percent)
train_df_five = train_df_0.append(five_percent)
train_df_twentyfive = train_df_0.append(twentyfive_percent)
train_df_fifty = train_df_1.append(fifty_percent)

In [49]:
# Sanity check new datasets
print(train_df_one['returnShipment'].value_counts())
print(train_df_three['returnShipment'].value_counts())
print(train_df_five['returnShipment'].value_counts())
print(train_df_twentyfive['returnShipment'].value_counts())
print(train_df_fifty['returnShipment'].value_counts())

0    249001
1      2515
Name: returnShipment, dtype: int64
0    249001
1      7701
Name: returnShipment, dtype: int64
0    249001
1     13105
Name: returnShipment, dtype: int64
0    249001
1     83000
Name: returnShipment, dtype: int64
1    232091
0    232091
Name: returnShipment, dtype: int64


In [50]:
# Sort new datasets by index (orderItemID) column
train_df_one = train_df_one.sort_values(by='orderItemID')
train_df_three = train_df_three.sort_values(by='orderItemID')
train_df_five = train_df_five.sort_values(by='orderItemID')
train_df_twentyfive = train_df_twentyfive.sort_values(by='orderItemID')
train_df_fifty = train_df_fifty.sort_values(by='orderItemID')

In [51]:
# Save new datasets
train_df_one.to_csv('../data/train_one_percent.csv')
train_df_three.to_csv('../data/train_three_percent.csv')
train_df_five.to_csv('../data/train_five_percent.csv')
train_df_twentyfive.to_csv('../data/train_twentyfive_percent.csv')
train_df_fifty.to_csv('../data/train_fifty_percent.csv')