In [19]:
# Necessary imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [12]:
# Data paths
train_path = '../data/orders_train.txt'
test_X_path = '../data/orders_test_features.txt'
test_y_path = '../data/orders_test_realclass.txt'

In [13]:
# Load csv data into dataframes
train_df = pd.read_csv(train_path, sep=';')
train_y = pd.DataFrame({'orderItemID': train_df['returnShipment'].index, 'returnShipment': train_df['returnShipment'].values}).set_index('orderItemID')
train_X = train_df.drop('returnShipment', axis=1)
test_X = pd.read_csv(test_X_path, sep=';')
test_y = pd.read_csv(test_y_path, sep=';')
test_y = test_y.drop(columns=['orderItemID'])
test_df = pd.concat([test_X, test_y], axis=1)

In [14]:
# Replace all '?' values with NaN
train_df = train_df.replace('?', np.nan)
test_df = test_df.replace('?', np.nan)

In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481092 entries, 0 to 481091
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   orderItemID     481092 non-null  int64  
 1   orderDate       481092 non-null  object 
 2   deliveryDate    441673 non-null  object 
 3   itemID          481092 non-null  int64  
 4   size            481092 non-null  object 
 5   color           480949 non-null  object 
 6   manufacturerID  481092 non-null  int64  
 7   price           481092 non-null  float64
 8   customerID      481092 non-null  int64  
 9   salutation      481092 non-null  object 
 10  dateOfBirth     432203 non-null  object 
 11  state           481092 non-null  object 
 12  creationDate    481092 non-null  object 
 13  returnShipment  481092 non-null  int64  
dtypes: float64(1), int64(5), object(8)
memory usage: 51.4+ MB


In [16]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50078 entries, 0 to 50077
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   orderItemID     50078 non-null  int64  
 1   orderDate       50078 non-null  object 
 2   deliveryDate    45810 non-null  object 
 3   itemID          50078 non-null  int64  
 4   size            50078 non-null  object 
 5   color           50078 non-null  object 
 6   manufacturerID  50078 non-null  int64  
 7   price           50078 non-null  float64
 8   customerID      50078 non-null  int64  
 9   salutation      50078 non-null  object 
 10  dateOfBirth     44909 non-null  object 
 11  state           50078 non-null  object 
 12  creationDate    50078 non-null  object 
 13  returnShipment  50078 non-null  int64  
dtypes: float64(1), int64(5), object(8)
memory usage: 5.3+ MB


In [17]:
train_df.nunique()

orderItemID       481092
orderDate            365
deliveryDate         327
itemID              3007
size                 122
color                 87
manufacturerID       165
price                379
customerID         59754
salutation             5
dateOfBirth        14308
state                 16
creationDate         775
returnShipment         2
dtype: int64

In [18]:
test_df.nunique()

orderItemID       50078
orderDate            30
deliveryDate         72
itemID             1283
size                100
color                66
manufacturerID      119
price               167
customerID        12068
salutation            5
dateOfBirth        6998
state                16
creationDate        777
returnShipment        2
dtype: int64

In [4]:
# train_X info
train_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481092 entries, 0 to 481091
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   orderItemID     481092 non-null  int64  
 1   orderDate       481092 non-null  object 
 2   deliveryDate    481092 non-null  object 
 3   itemID          481092 non-null  int64  
 4   size            481092 non-null  object 
 5   color           481092 non-null  object 
 6   manufacturerID  481092 non-null  int64  
 7   price           481092 non-null  float64
 8   customerID      481092 non-null  int64  
 9   salutation      481092 non-null  object 
 10  dateOfBirth     481092 non-null  object 
 11  state           481092 non-null  object 
 12  creationDate    481092 non-null  object 
dtypes: float64(1), int64(4), object(8)
memory usage: 47.7+ MB


In [5]:
# train_y info
train_y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 481092 entries, 0 to 481091
Data columns (total 1 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   returnShipment  481092 non-null  int64
dtypes: int64(1)
memory usage: 7.3 MB


In [6]:
# test_X info
test_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50078 entries, 0 to 50077
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   orderItemID     50078 non-null  int64  
 1   orderDate       50078 non-null  object 
 2   deliveryDate    50078 non-null  object 
 3   itemID          50078 non-null  int64  
 4   size            50078 non-null  object 
 5   color           50078 non-null  object 
 6   manufacturerID  50078 non-null  int64  
 7   price           50078 non-null  float64
 8   customerID      50078 non-null  int64  
 9   salutation      50078 non-null  object 
 10  dateOfBirth     50078 non-null  object 
 11  state           50078 non-null  object 
 12  creationDate    50078 non-null  object 
dtypes: float64(1), int64(4), object(8)
memory usage: 5.0+ MB


In [7]:
# test_y info
test_y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50078 entries, 0 to 50077
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   orderItemID     50078 non-null  int64
 1   returnShipment  50078 non-null  int64
dtypes: int64(2)
memory usage: 782.6 KB


In [8]:
# Count of classes
train_y.nunique()

returnShipment    2
dtype: int64

In [9]:
# Class distribution of the train data
train_y.returnShipment.value_counts()

0    249001
1    232091
Name: returnShipment, dtype: int64

In [20]:
test_y.returnShipment.value_counts()

0    25049
1    25029
Name: returnShipment, dtype: int64