### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
# Silence pandas' SettingWithCopyWarning for the whole notebook; later cells
# assign into frames that were produced by boolean filtering.
pd.options.mode.chained_assignment = None


# Directory prefixes (with trailing slash) that are string-concatenated with
# file names when reading the raw CSVs and reading/writing the pickled frames.
root_csv = '../csv files/'
root_pickle = '../pickle files/'

### Creating a function to reduce the memory usage of a dataframe

In [2]:
def reduce_mem_usage(train_data):
    """Downcast the numeric columns of a dataframe to shrink its memory footprint.

    The dataframe is modified in place and also returned.

    * Categorical columns are converted to ordered categoricals.
    * Signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness whose range (bounds inclusive) holds the column's
      min and max.
    * Float columns are cast to float16 if their min/max fit its range, else
      float32, else float64. NOTE: float16 keeps only ~3 significant decimal
      digits, so values are rounded even when they are within range.
    * Every other dtype (object/string, bool, datetime, nullable extension
      dtypes, ...) is left untouched.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame to shrink; mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = train_data.memory_usage().sum() / 1024**2

    # Candidate targets, narrowest first, so the first match is the smallest fit.
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in train_data.columns:
        col_type = train_data[col].dtype

        if isinstance(col_type, pd.CategoricalDtype):
            train_data[col] = train_data[col].cat.as_ordered()
            continue

        # Only plain numpy integer/float columns are downcast. Anything else
        # (bool, object, datetime, extension dtypes) must not be coerced to
        # float, which is what a blanket "not int -> float" rule would do.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = train_data[col].min()
        c_max = train_data[col].max()

        if col_type.kind in ('i', 'u'):
            # min()/max() of a zero-length column are NaN; nothing to decide on.
            if pd.isna(c_min) or pd.isna(c_max):
                continue
            lo, hi = int(c_min), int(c_max)
            candidates = signed_ints if col_type.kind == 'i' else unsigned_ints
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= lo and hi <= bounds.max:
                    train_data[col] = train_data[col].astype(candidate)
                    break
        else:
            # NaN/inf extremes fail both range checks and fall through to float64.
            half, single = np.finfo(np.float16), np.finfo(np.float32)
            if half.min <= c_min and c_max <= half.max:
                train_data[col] = train_data[col].astype(np.float16)
            elif single.min <= c_min and c_max <= single.max:
                train_data[col] = train_data[col].astype(np.float32)
            else:
                train_data[col] = train_data[col].astype(np.float64)

    end_mem = train_data.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-byte starting size.
    decrease_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f'Memory usage of dataframe is {start_mem:.2f} MB --> {end_mem:.2f} MB (Decreased by {decrease_pct:.1f}%)')

    return train_data

### Reading the csv files

In [3]:
# Raw CSV tables, loaded in a fixed order and unpacked into one name per file.
orders, departments, aisles, products, order_products_train, order_products_prior = (
    pd.read_csv(root_csv + csv_name)
    for csv_name in (
        'orders.csv',
        'departments.csv',
        'aisles.csv',
        'products.csv',
        'order_products_train.csv',
        'order_products_prior.csv',
    )
)

# Pre-computed feature tables stored as pickles.
# NOTE(review): presumably written by earlier feature-engineering notebooks - confirm.
product_features, user_features, user_product_features = (
    pd.read_pickle(root_pickle + pickle_name)
    for pickle_name in (
        'product_features.pkl',
        'user_features.pkl',
        'user_product_features.pkl',
    )
)

### Merging data with orders to prepare final train dataset

In [4]:
# Inner join: keep only orders that have rows in the train order-products table.
train_orders = pd.merge(orders, order_products_train, how='inner', on='order_id')
train_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,1187899,1,train,11,4,8,14.0,196,1,1
1,1187899,1,train,11,4,8,14.0,25133,2,1
2,1187899,1,train,11,4,8,14.0,38928,3,1
3,1187899,1,train,11,4,8,14.0,26405,4,1
4,1187899,1,train,11,4,8,14.0,39657,5,1


### Removing unnecessary columns from dataset

In [5]:
# Rebind to the reduced frame instead of dropping the columns in place.
train_orders = train_orders.drop(columns=['eval_set', 'add_to_cart_order', 'order_id'])

### Finding Unique users in data

In [6]:
# Distinct user ids that appear in the train orders, in order of first appearance.
train_users = train_orders['user_id'].unique()
train_users[:20]

array([ 1,  2,  5,  7,  8,  9, 10, 13, 14, 17, 18, 21, 23, 24, 27, 29, 30,
       34, 37, 38], dtype=int64)

### Merging user product features with dataset

In [7]:
# Restrict the user-product features to users that have a train order.
is_train_user = user_product_features['user_id'].isin(train_users)

# Outer join on (user, product): pairs present on only one side are kept,
# with NaN in the columns that come from the other side.
df = pd.merge(
    user_product_features.loc[is_train_user],
    train_orders,
    how='outer',
    on=['user_id', 'product_id'],
)
df.head()

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,is_reorder_1,order_number,order_dow,order_hour_of_day,days_since_prior_order,reordered
0,1,196,10.0,9.0,0.899902,1.400391,17.59375,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
1,1,10258,9.0,8.0,0.888672,3.333984,19.5625,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
2,1,10326,1.0,0.0,0.0,5.0,28.0,5.0,0.0,0.0,0.0,,,,,
3,1,12427,10.0,9.0,0.899902,3.300781,17.59375,10.0,1.0,1.0,1.0,,,,,
4,1,13032,3.0,2.0,0.666504,6.332031,21.671875,10.0,1.0,0.0,0.0,11.0,4.0,8.0,14.0,1.0


### Handling missing values

In [8]:
# Order-level columns that are NaN for (user, product) pairs that only exist
# in the user-product features side of the outer merge.
order_context_cols = ['order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']

# Fill each gap with that user's mean of the column. The result is assigned
# back to the frame explicitly: `df.col.fillna(..., inplace=True)` operates on
# a temporary Series and leaves `df` unchanged under pandas Copy-on-Write.
# One column at a time keeps the temporary per-user mean Series small.
for context_col in order_context_cols:
    df[context_col] = df[context_col].fillna(
        df.groupby('user_id')[context_col].transform('mean')
    )

### Removing products that a user bought for the first time in their last order

In [9]:
print("Checking reordered count\n",df.reordered.value_counts())
print("\nChecking null values for reordered column in dataframe: ",df.reordered.isnull().sum())

# Remove products that a user bought for the first time in their last order
# (reordered == 0). Rows where `reordered` is NaN are kept, since NaN != 0.
df = df[df.reordered != 0]

# Checking shape of dataframe
print("\nChecking shape of dataframe: ",df.shape)

# The remaining NaNs are products the user did not buy in their last order,
# so label them 0. Assign the result back explicitly: a chained
# `df.reordered.fillna(0, inplace=True)` does not modify `df` under pandas
# Copy-on-Write.
df['reordered'] = df['reordered'].fillna(0)

# Cheap sanity check that the target column is now fully populated.
assert df['reordered'].notna().all(), "reordered still contains null values"

# Checking final dataframe
df.head(5)

Checking reordered count
 1.0    828824
0.0    555793
Name: reordered, dtype: int64

Checking null values for reordered column in dataframe:  7645837

Checking shape of dataframe:  (8474661, 16)


Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,is_reorder_1,order_number,order_dow,order_hour_of_day,days_since_prior_order,reordered
0,1,196,10.0,9.0,0.899902,1.400391,17.59375,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
1,1,10258,9.0,8.0,0.888672,3.333984,19.5625,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
2,1,10326,1.0,0.0,0.0,5.0,28.0,5.0,0.0,0.0,0.0,11.0,4.0,8.0,14.0,0.0
3,1,12427,10.0,9.0,0.899902,3.300781,17.59375,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,0.0
4,1,13032,3.0,2.0,0.666504,6.332031,21.671875,10.0,1.0,0.0,0.0,11.0,4.0,8.0,14.0,1.0


### Merging product and user features in dataset

In [10]:
# Attach product-level, then user-level features; left joins keep every
# existing (user, product) row.
df = (
    df
    .merge(product_features, how='left', on='product_id')
    .merge(user_features, how='left', on='user_id')
)

# reduce_mem_usage downcasts in place and returns the same frame.
df = reduce_mem_usage(df)
df.head()

Memory usage of dataframe is 1697.23 MB --> 1244.64 MB (Decreased by 26.7%)


Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,...,total_reorders_by_user,reorder_propotion_by_user,average_order_size,reorder_in_order,orders_3,orders_2,orders_1,reorder_3,reorder_2,reorder_1
0,1,196,10.0,9.0,0.899902,1.400391,17.59375,10.0,1.0,1.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
1,1,10258,9.0,8.0,0.888672,3.333984,19.5625,10.0,1.0,1.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
2,1,10326,1.0,0.0,0.0,5.0,28.0,5.0,0.0,0.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
3,1,12427,10.0,9.0,0.899902,3.300781,17.59375,10.0,1.0,1.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
4,1,13032,3.0,2.0,0.666504,6.332031,21.671875,10.0,1.0,0.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504


### Getting the shape of final data and checking for null values

In [11]:
# Report the (rows, columns) of the assembled dataset.
print("\nChecking shape of dataframe: ",df.shape)

# Null count per column, largest first.
null_counts = df.isna().sum()
null_counts.sort_values(ascending=False)


Checking shape of dataframe:  (8474661, 69)


user_id                     0
department_unique_users     0
order_dow_avg               0
department_4                0
department_3                0
                           ..
aisle_reorder_percentage    0
aisle_unique_users          0
aisle_0                     0
aisle_1                     0
reorder_1                   0
Length: 69, dtype: int64

### Storing the final data in pickle file for faster processing

In [13]:
# Persist the final dataset next to the other pickled feature tables.
pd.to_pickle(df, root_pickle + 'Finaldata.pkl')

### Checking the data of final dataframe

In [14]:
# Read the pickle back to confirm that it round-trips.
final_data_path = root_pickle + 'Finaldata.pkl'
df2 = pd.read_pickle(final_data_path)
df2.head()

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,...,total_reorders_by_user,reorder_propotion_by_user,average_order_size,reorder_in_order,orders_3,orders_2,orders_1,reorder_3,reorder_2,reorder_1
0,1,196,10.0,9.0,0.899902,1.400391,17.59375,10.0,1.0,1.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
1,1,10258,9.0,8.0,0.888672,3.333984,19.5625,10.0,1.0,1.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
2,1,10326,1.0,0.0,0.0,5.0,28.0,5.0,0.0,0.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
3,1,12427,10.0,9.0,0.899902,3.300781,17.59375,10.0,1.0,1.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
4,1,13032,3.0,2.0,0.666504,6.332031,21.671875,10.0,1.0,0.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
