## Create Combinations of Datasets to Facilitate Feature Engineering

In [1]:
# Import python libraries to read datasets and transform them.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import seaborn as sns

In [2]:
# Load the six raw CSV tables from the current working directory.
df_ais = pd.read_csv('aisles.csv')                  # aisles dataset
df_dep = pd.read_csv('departments.csv')             # product department/category dataset
df_prd = pd.read_csv('products.csv')                # product details dataset
df_ord = pd.read_csv('orders.csv')                  # order details dataset
df_opp = pd.read_csv('order_products__prior.csv')   # ordered products, prior set
df_opt = pd.read_csv('order_products__train.csv')   # ordered products, train set

In [3]:
df_ais.head()

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [4]:
df_dep.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [5]:
df_prd.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [6]:
df_ord.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [7]:
df_opp.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [8]:
df_opt.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


#### Combining Product Details

In [9]:
# Attach the aisle row and then the department row to every product.
# Both are left joins, so every row of df_prd is kept.
df_pd = df_prd.merge(df_ais, how='left', on='aisle_id')
df_pd = df_pd.merge(df_dep, how='left', on='department_id')

In [10]:
# Replace every space in the product name with a hyphen.
# regex=False forces a literal substitution, so the result does not depend on
# the default of `regex`, which differs between pandas versions.
df_pd['product_name'] = df_pd['product_name'].str.replace(' ', '-', regex=False)

In [11]:
df_pd.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,Chocolate-Sandwich-Cookies,61,19,cookies cakes,snacks
1,2,All-Seasons-Salt,104,13,spices seasonings,pantry
2,3,Robust-Golden-Unsweetened-Oolong-Tea,94,7,tea,beverages
3,4,Smart-Ones-Classic-Favorites-Mini-Rigatoni-Wit...,38,1,frozen meals,frozen
4,5,Green-Chile-Anytime-Sauce,5,13,marinades meat preparation,pantry


In [12]:
# Persist the combined product table.
# A forward slash is used (as in the other to_pickle calls in this notebook):
# a backslash is only a directory separator on Windows, elsewhere it would
# become part of the file name.
df_pd.to_pickle('input/product_detail.p')

#### Combining All Data

In [12]:
# Stack the prior and train order-product rows into a single frame,
# discarding the original row labels in favour of a fresh 0..n-1 index.
df_ordPrd_all = pd.concat([df_opp, df_opt], ignore_index=True)

In [13]:
# Left-join the product details onto every order line, then preview the result.
# NOTE: this rebinds df_ordPrd_all, so re-running the cell merges the details a second time.
df_ordPrd_all = df_ordPrd_all.merge(df_pd, how='left', on='product_id')
df_ordPrd_all.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,2,33120,1,1,Organic-Egg-Whites,86,16,eggs,dairy eggs
1,2,28985,2,1,Michigan-Organic-Kale,83,4,fresh vegetables,produce
2,2,9327,3,0,Garlic-Powder,104,13,spices seasonings,pantry
3,2,45918,4,1,Coconut-Butter,19,13,oils vinegars,pantry
4,2,30035,5,0,Natural-Sweetener,17,13,baking ingredients,pantry


In [14]:
# Add the order-level columns from df_ord to every order line (left join on order_id).
df_all = pd.merge(df_ordPrd_all, df_ord, how='left', on='order_id')

In [15]:
df_all.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,Organic-Egg-Whites,86,16,eggs,dairy eggs,202279,prior,3,5,9,8.0
1,2,28985,2,1,Michigan-Organic-Kale,83,4,fresh vegetables,produce,202279,prior,3,5,9,8.0
2,2,9327,3,0,Garlic-Powder,104,13,spices seasonings,pantry,202279,prior,3,5,9,8.0
3,2,45918,4,1,Coconut-Butter,19,13,oils vinegars,pantry,202279,prior,3,5,9,8.0
4,2,30035,5,0,Natural-Sweetener,17,13,baking ingredients,pantry,202279,prior,3,5,9,8.0


In [16]:
# Number of orders the customer placed after the current order:
# the largest order_number seen for the user in df_all minus this row's order_number.
# The string alias 'max' is passed instead of np.max because newer pandas
# versions deprecate handing NumPy reducers to transform.
df_all['SucceedingOrdCnt'] = (
    df_all.groupby('user_id')['order_number'].transform('max')
    - df_all['order_number']
)

In [None]:
# Persist the fully merged order-line table.
# A forward slash is used (as in the other to_pickle calls in this notebook):
# a backslash is only a directory separator on Windows, elsewhere it would
# become part of the file name.
df_all.to_pickle('input/all_data.p')

#### Combining All Orders Data

In [None]:
# Collapse the order lines to one row per order_id whose product_name cell
# holds a Python list of that order's product names.
df_PrdLst = df_all.groupby('order_id')['product_name'].apply(list).reset_index()

In [None]:
# Start from the orders table and attach each order's list of product names
# (left join keeps every order, including those with no product list).
df_OrdDet = df_ord.merge(df_PrdLst, how='left', on='order_id').reset_index(drop=True)
# Attach SucceedingOrdCnt per order, de-duplicated from the line-level df_all.
df_OrdDet = df_OrdDet.merge(
    df_all[['order_id','SucceedingOrdCnt']].drop_duplicates(),
    how='left',
    on='order_id',
)
# Running sum of days_since_prior_order within each user, following the current row order.
# NOTE(review): presumably relies on df_ord being ordered by order_number within each user - confirm.
df_OrdDet['days_since_first_order'] = df_OrdDet.groupby('user_id')['days_since_prior_order'].cumsum()

In [None]:
# Define helper functions that compare an order's products with another order's: products not present in the other order (diffPrd) and products present in both (samePrd).
def diffPrd(items1, items2):
    """Return the items of ``items1`` that do not appear in ``items2``.

    Parameters
    ----------
    items1, items2 : list or float
        Collections of hashable items. A float is treated as "no list"
        (presumably the NaN left by a shift or an unmatched merge).

    Returns
    -------
    list or float
        ``items1`` itself, unchanged, when either argument is a float;
        otherwise a new list that keeps the order and duplicates of ``items1``.
    """
    if isinstance(items1, float) or isinstance(items2, float):
        return items1
    # Build the lookup once so each membership test is O(1) instead of a scan of items2.
    other = set(items2)
    return [i1 for i1 in items1 if i1 not in other]

def samePrd(items1, items2):
    """Return the items of ``items1`` that also appear in ``items2``.

    Parameters
    ----------
    items1, items2 : list or float
        Collections of hashable items. A float is treated as "no list"
        (presumably the NaN left by a shift or an unmatched merge).

    Returns
    -------
    list
        An empty list when either argument is a float; otherwise a new list
        that keeps the order and duplicates of ``items1``.
    """
    if isinstance(items1, float) or isinstance(items2, float):
        return []
    # Build the lookup once so each membership test is O(1) instead of a scan of items2.
    other = set(items2)
    return [i1 for i1 in items1 if i1 in other]

In [None]:
# prev_prd: the product list from the preceding row of the same user (missing for the user's first row).
# diff_prd / same_prd: products of this order that are absent from / present in prev_prd,
# computed row by row with the helper functions defined above.
df_OrdDet['prev_prd'] = df_OrdDet.groupby('user_id')['product_name'].shift(1)
df_OrdDet['diff_prd'] = df_OrdDet.apply(
    lambda row: diffPrd(row['product_name'], row['prev_prd']),
    axis=1,
)
df_OrdDet['same_prd'] = df_OrdDet.apply(
    lambda row: samePrd(row['product_name'], row['prev_prd']),
    axis=1,
)

In [None]:
df_OrdDet.head()

In [None]:
df_OrdDet.to_pickle('input/all_order_details.p')

#### Label Reorders

In [18]:
# Label reorders and previous order id
# Keep the order lines that have more than one later order by the same user.
# .copy() gives lt1 its own data, so the in-place edits and column assignment
# applied to it in later cells do not raise SettingWithCopyWarning.
lt1 = df_all[df_all['SucceedingOrdCnt']>1].copy()

In [19]:
# Keep only the last row for each (user_id, product_id) pair.
# Rebinding the result instead of using inplace=True avoids mutating a frame
# that was produced by filtering df_all.
lt1 = lt1.drop_duplicates(subset=['user_id','product_id'], keep='last')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [20]:
# Order the rows by user and then product.
# Rebinding the result instead of using inplace=True avoids mutating a frame
# that was produced by filtering df_all.
lt1 = lt1.sort_values(['user_id','product_id'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [22]:
# Order lines with exactly one later order by the same user AND reordered == 1,
# selected with a single combined boolean mask.
lt0_y1 = df_all.loc[(df_all['SucceedingOrdCnt']==1) & (df_all['reordered']==1)]

In [23]:
# Order the rows by user and then product.
lt0_y1 = lt0_y1.sort_values(['user_id','product_id'])

In [24]:
# Composite "<user_id> <product_id>" string key for each row of lt1.
lt1['key'] = lt1['user_id'].astype(str) + ' ' + lt1['product_id'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [25]:
# Composite "<user_id> <product_id>" string key for each row of lt0_y1.
lt0_y1['key'] = lt0_y1['user_id'].astype(str) + ' ' + lt0_y1['product_id'].astype(str)

In [26]:
# Rows of lt1 whose user/product key does not occur anywhere in lt0_y1.
lt0_y0 = lt1.loc[~lt1['key'].isin(lt0_y1['key'])]

In [27]:
# Remove order_id; a later cell merges a different order_id back in per user.
# Rebinding the result instead of dropping in place avoids the
# SettingWithCopyWarning raised when mutating a slice of lt1.
lt0_y0 = lt0_y0.drop(columns='order_id')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [28]:
# Attach to each row the order_id of the same user's order that has exactly
# one later order (SucceedingOrdCnt == 1), matched on user_id.
lt0_y0 = lt0_y0.merge(
    df_all.loc[df_all['SucceedingOrdCnt']==1, ['user_id','order_id']].drop_duplicates(),
    how='left',
    on='user_id',
)

In [29]:
lt0_y0.head()

Unnamed: 0,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,SucceedingOrdCnt,key,order_id
0,10326,5,0,Organic-Fuji-Apples,24,4,fresh fruits,produce,1,prior,5,4,15,28.0,6,1 10326,2550362
1,13176,4,0,Bag-of-Organic-Bananas,24,4,fresh fruits,produce,1,prior,2,3,7,15.0,9,1 13176,2550362
2,14084,2,0,Organic-Unsweetened-Vanilla-Almond-Milk,91,16,soy lactosefree,dairy eggs,1,prior,1,2,8,,10,1 14084,2550362
3,17122,6,0,Honeycrisp-Apples,24,4,fresh fruits,produce,1,prior,5,4,15,28.0,6,1 17122,2550362
4,26088,4,0,Aged-White-Cheddar-Popcorn,23,19,popcorn jerky,snacks,1,prior,1,2,8,,10,1 26088,2550362


In [30]:
# These user/product pairs were filtered to be absent from lt0_y1, so label them 0.
lt0_y0 = lt0_y0.assign(reordered=0)

In [32]:
# Stack the reordered==1 rows and the reordered==0 rows into one label table
# with a fresh 0..n-1 index; columns keep their existing order (sort=False).
label = pd.concat([lt0_y1, lt0_y0], sort=False, ignore_index=True)

In [33]:
# Order the label rows by user and then product.
label = label.sort_values(['user_id','product_id'])

In [34]:
# Renumber the rows 0..n-1 and discard the previous index.
label = label.reset_index(drop=True)

In [35]:
col=['order_id','product_id','reordered']

In [36]:
print(label[col].isnull().sum())

order_id      0
product_id    0
reordered     0
dtype: int64


In [37]:
label[col].to_pickle('input/label.p')

In [38]:
label.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,SucceedingOrdCnt,key
0,2550362,196,1,1,Soda,77,7,soft drinks,beverages,1,prior,10,4,8,30.0,1,1 196
1,2550362,10258,6,1,Pistachios,117,19,nuts seeds dried fruit,snacks,1,prior,10,4,8,30.0,1,1 10258
2,2550362,10326,5,0,Organic-Fuji-Apples,24,4,fresh fruits,produce,1,prior,5,4,15,28.0,6,1 10326
3,2550362,12427,9,1,Original-Beef-Jerky,23,19,popcorn jerky,snacks,1,prior,10,4,8,30.0,1,1 12427
4,2550362,13032,8,1,Cinnamon-Toast-Crunch,121,14,cereal,breakfast,1,prior,10,4,8,30.0,1,1 13032
