In [1]:
# Necessary imports
import numpy as np
import pandas as pd

from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelBinarizer

from utils.calculate_delivery_time import calculate_delivery_time
from utils.calculate_account_age_until_time_of_order import calculate_account_age_until_time_of_order
from utils.split_x_y import split_x_y
from utils.save_dataset import save_dataset

In [2]:
# Locations of the raw order files, relative to the notebook's working directory
train_path, test_features_path, test_realclass_path = (
    '../data/orders_train.txt',
    '../data/orders_test_features.txt',
    '../data/orders_test_realclass.txt',
)

In [3]:
# Read the three semicolon-separated files into dataframes
train_df, test_features_df, test_realclass_df = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (train_path, test_features_path, test_realclass_path)
)

# Put the test features and the test labels side by side (column-wise concat)
test_df = pd.concat([test_features_df, test_realclass_df], axis=1)

In [4]:
# Remove the 'orderItemID' column, then turn every '?' placeholder into NaN
train_df = train_df.drop(columns='orderItemID').replace('?', np.nan)
test_df = test_df.drop(columns='orderItemID').replace('?', np.nan)

In [5]:
# Drop all training rows with NaN values in column 'color'.
# The explicit .copy() makes train_df an independent frame, so the column
# assignments below do not write into a slice of the previous frame
# (which is what triggers pandas' SettingWithCopyWarning).
train_df = train_df[train_df['color'].notna()].copy()

# Calculate delivery time; -1 if no delivery date is given; -1 if delivery date is before order date
train_df['deliveryTime'] = train_df.apply(lambda x: calculate_delivery_time(x['orderDate'], x['deliveryDate']), axis=1)
test_df['deliveryTime'] = test_df.apply(lambda x: calculate_delivery_time(x['orderDate'], x['deliveryDate']), axis=1)

# Calculate age of account until time of order
train_df['ageOfAccountUntilTimeOfOrder'] = train_df.apply(lambda x: calculate_account_age_until_time_of_order(x['creationDate'], x['orderDate']), axis=1)
test_df['ageOfAccountUntilTimeOfOrder'] = test_df.apply(lambda x: calculate_account_age_until_time_of_order(x['creationDate'], x['orderDate']), axis=1)

# Drop columns 'orderDate', 'deliveryDate' and 'creationDate'
train_df = train_df.drop(columns=['orderDate', 'deliveryDate', 'creationDate'])
test_df = test_df.drop(columns=['orderDate', 'deliveryDate', 'creationDate'])

# Missing value imputation -> replace all remaining NaN values with the sentinel -1
train_df = train_df.fillna(-1)
test_df = test_df.fillna(-1)

# Single reference timestamp, evaluated once, so every row of both frames is
# aged against the same instant (previously 'now' was re-evaluated per row).
# NOTE(review): the age is relative to the run date, so the saved datasets
# change depending on when the notebook is executed -- confirm this is intended.
reference_time = pd.to_datetime('now')


def _age_in_years(date_of_birth):
    """Return the age in whole years as a string, or -1 for the missing-value sentinel.

    NOTE(review): valid ages are str while the sentinel is int, so the 'age'
    column ends up with mixed types (object dtype) -- kept as in the original.
    """
    if date_of_birth == -1:
        return -1
    return str(relativedelta(reference_time, datetime.strptime(date_of_birth, '%Y-%m-%d')).years)


# Create new column 'age'
train_df['age'] = train_df['dateOfBirth'].apply(_age_in_years)
test_df['age'] = test_df['dateOfBirth'].apply(_age_in_years)

# Drop column 'dateOfBirth'
train_df = train_df.drop(columns=['dateOfBirth'])
test_df = test_df.drop(columns=['dateOfBirth'])

In [6]:
# One hot encoding for categorical columns
# train_df = pd.get_dummies(train_df, columns=['itemID', 'size', 'color', 'manufacturerID', 'salutation', 'state'])
# test_df = pd.get_dummies(test_df, columns=['itemID', 'size', 'color', 'manufacturerID', 'salutation', 'state'])
# test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

In [6]:
# Ordinal Encoding for categorical columns
# TODO: cast values to string again -> age, deliveryTime and ageOfAccountUntilTimeOfOrder, itemID, manufacturerID and customerID == continuous
categorical_columns = ['itemID', 'size', 'color', 'manufacturerID', 'customerID', 'salutation', 'state']

# One encoder per column, fitted on the training data only. Categories that
# appear only in the test data are encoded as -1 (unknown_value).
ordinal_encoders = {}
for column in categorical_columns:
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    encoder.fit(train_df[column].values.reshape(-1, 1))

    # transform() returns an (n, 1) array; ravel() flattens it to the 1-D shape of a column
    train_df[column] = encoder.transform(train_df[column].values.reshape(-1, 1)).ravel()
    test_df[column] = encoder.transform(test_df[column].values.reshape(-1, 1)).ravel()

    ordinal_encoders[column] = encoder

# Keep the original per-column encoder names bound for any code that refers to them
(oe_item_id, oe_size, oe_color, oe_manufacturer_id,
 oe_customer_id, oe_salutation, oe_state) = (ordinal_encoders[column] for column in categorical_columns)

In [7]:
# Inspect the first 50 rows of both feature-engineered dataframes
print(train_df.head(n=50), test_df.head(n=50), sep='\n')

    itemID   size  color  manufacturerID   price  customerID  salutation  \
0    185.0  115.0   42.0            24.0   69.90       422.0         3.0   
1     70.0  102.0   68.0            20.0   69.95       422.0         3.0   
2     70.0  102.0   35.0            20.0   69.95       422.0         3.0   
3     21.0  115.0   49.0            13.0   39.90       430.0         3.0   
4    150.0   59.0   17.0            52.0   29.90       445.0         3.0   
5    590.0  120.0   22.0            86.0   89.90       445.0         3.0   
6     14.0   59.0   17.0             0.0  129.90       445.0         3.0   
7     31.0  120.0   22.0             2.0   21.90       464.0         3.0   
8     31.0  120.0   78.0             2.0   21.90       464.0         3.0   
9     56.0  120.0   49.0             2.0   39.90       464.0         3.0   
10     1.0  120.0   65.0             1.0   39.90       464.0         3.0   
11   257.0   59.0   17.0             0.0  119.90       464.0         3.0   
12   595.0  

In [8]:
# Separate the training rows by class label 'returnShipment' (0 vs. 1)
train_df_0 = train_df.loc[train_df['returnShipment'] == 0]
train_df_1 = train_df.loc[train_df['returnShipment'] == 1]

In [9]:
# Sanity check split datasets.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only adds a stray "None" line.
train_df_0.info()
train_df_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 248865 entries, 0 to 481091
Data columns (total 12 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   itemID                        248865 non-null  float64
 1   size                          248865 non-null  float64
 2   color                         248865 non-null  float64
 3   manufacturerID                248865 non-null  float64
 4   price                         248865 non-null  float64
 5   customerID                    248865 non-null  float64
 6   salutation                    248865 non-null  float64
 7   state                         248865 non-null  float64
 8   returnShipment                248865 non-null  int64  
 9   deliveryTime                  248865 non-null  int64  
 10  ageOfAccountUntilTimeOfOrder  248865 non-null  int64  
 11  age                           248865 non-null  object 
dtypes: float64(8), int64(3), object(1)
memory us

In [10]:
# Each split should contain exactly one class label
print(
    train_df_0['returnShipment'].value_counts(),
    train_df_1['returnShipment'].value_counts(),
    sep='\n',
)

0    248865
Name: returnShipment, dtype: int64
1    232084
Name: returnShipment, dtype: int64


In [11]:
# Generate new datasets with 1%, 3%, 5%, 25% and 50% class balance
# (232 084 * 100 / 99) = 234 428 -> 1% of 234 428 = 2 344
# (232 084 * 100 / 97) = 239 262 -> 3% of 239 262 = 7 178
# (232 084 * 100 / 95) = 244 299 -> 5% of 244 299 = 12 214
# (232 084 * 100 / 75) = 309 445 -> 25% of 309 445 = 77 361
# (232 084 * 100 / 50) = 464 168 -> 50% of 464 168 = 232 084

# Fixed seed so that Restart & Run All reproduces exactly the same samples
random_seed = 42

# Filter dataframes for the correct number of instances
# (class 0 is downsampled to 232 084 rows, the number of class-1 rows)
train_df_0 = train_df_0.sample(n=232084, random_state=random_seed)

one_percent = train_df_1.sample(n=2344, random_state=random_seed)
three_percent = train_df_1.sample(n=7178, random_state=random_seed)
five_percent = train_df_1.sample(n=12214, random_state=random_seed)
twenty_five_percent = train_df_1.sample(n=77361, random_state=random_seed)
fifty_percent = train_df_1.sample(n=232084, random_state=random_seed)

# Generate the new datasets
# pd.concat replaces DataFrame.append, which is deprecated since pandas 1.4
# and removed in pandas 2.0; the row-wise result is the same.
train_df_one = pd.concat([train_df_0, one_percent])
train_df_three = pd.concat([train_df_0, three_percent])
train_df_five = pd.concat([train_df_0, five_percent])
train_df_twenty_five = pd.concat([train_df_0, twenty_five_percent])
train_df_fifty = pd.concat([train_df_0, fifty_percent])

In [12]:
# Verify the class counts of every newly assembled dataset
print(
    train_df_one['returnShipment'].value_counts(),
    train_df_three['returnShipment'].value_counts(),
    train_df_five['returnShipment'].value_counts(),
    train_df_twenty_five['returnShipment'].value_counts(),
    train_df_fifty['returnShipment'].value_counts(),
    sep='\n',
)

0    232084
1      2344
Name: returnShipment, dtype: int64
0    232084
1      7178
Name: returnShipment, dtype: int64
0    232084
1     12214
Name: returnShipment, dtype: int64
0    232084
1     77361
Name: returnShipment, dtype: int64
0    232084
1    232084
Name: returnShipment, dtype: int64


In [13]:
# Restore index order in every new dataset (the class-1 rows were added at the end)
(train_df_one, train_df_three, train_df_five,
 train_df_twenty_five, train_df_fifty) = (
    dataset.sort_index()
    for dataset in (train_df_one, train_df_three, train_df_five,
                    train_df_twenty_five, train_df_fifty)
)

In [14]:
# Persist every ordinal-encoded training set together with its class-balance label
for balance, dataset in (
    (1, train_df_one),
    (3, train_df_three),
    (5, train_df_five),
    (25, train_df_twenty_five),
    (50, train_df_fifty),
):
    save_dataset(df=dataset, dataset_type='train', encoding='oe', balance=balance)

# The test set is saved once, without a balance argument
save_dataset(df=test_df, dataset_type='test', encoding='oe')