In [1]:
!pip install PyDrive

Collecting PyDrive
[?25l  Downloading https://files.pythonhosted.org/packages/52/e0/0e64788e5dd58ce2d6934549676243dc69d982f198524be9b99e9c2a4fd5/PyDrive-1.3.1.tar.gz (987kB)
[K     |████████████████████████████████| 993kB 268kB/s 
Building wheels for collected packages: PyDrive
  Building wheel for PyDrive (setup.py) ... [?25l[?25hdone
  Created wheel for PyDrive: filename=PyDrive-1.3.1-cp36-none-any.whl size=27437 sha256=46da146a97b2c46a86dde3dd36c32b092f87507fd0574893050c7bd51a4a75ad
  Stored in directory: /root/.cache/pip/wheels/fa/d2/9a/d3b6b506c2da98289e5d417215ce34b696db856643bad779f4
Successfully built PyDrive
Installing collected packages: PyDrive
Successfully installed PyDrive-1.3.1


In [0]:
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
download = drive.CreateFile({'id': '1QruGYmwRJKPCj_hZrvD64pmtiPWK3lPM'})
download.GetContentFile('insta_cart_data.tar')

In [5]:
!tar -xvf insta_cart_data.tar


instacart_2017_05_01/
instacart_2017_05_01/._aisles.csv
instacart_2017_05_01/aisles.csv
instacart_2017_05_01/._departments.csv
instacart_2017_05_01/departments.csv
instacart_2017_05_01/._order_products__prior.csv
instacart_2017_05_01/order_products__prior.csv
instacart_2017_05_01/._order_products__train.csv
instacart_2017_05_01/order_products__train.csv
instacart_2017_05_01/._orders.csv
instacart_2017_05_01/orders.csv
instacart_2017_05_01/._products.csv
instacart_2017_05_01/products.csv


# Data Wrangling

In [0]:
# We import the needed packages.
import pandas as pd



In [0]:
# read data in data frames
# folder name
folder_str = 'instacart_2017_05_01/'
files = {'ais': 'aisles.csv', 
         'dpt': 'departments.csv', 
         'opp': 'order_products__prior.csv', 
         'opt': 'order_products__train.csv', 
         'ord': 'orders.csv', 
         'prd': 'products.csv'}

df = {}
for key in files.keys():
  file_name = folder_str + files[key]
  df[key] = pd.read_csv(file_name)

__Checking the dataframes__

In [8]:
# aisle frame
df['ais'].head()

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [9]:
# set index
df['ais'].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 2 columns):
aisle_id    134 non-null int64
aisle       134 non-null object
dtypes: int64(1), object(1)
memory usage: 2.2+ KB


In [10]:
# departments frame
df['dpt'].head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [11]:
df['dpt'].set_index('department_id', inplace=True)
#df['dpt']['department'] = df['dpt']['department'].astype('category')
df['dpt'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21 entries, 1 to 21
Data columns (total 1 columns):
department    21 non-null object
dtypes: object(1)
memory usage: 336.0+ bytes


In [12]:
# products frame
df['prd'].head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [13]:
df['prd'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49688 entries, 0 to 49687
Data columns (total 4 columns):
product_id       49688 non-null int64
product_name     49688 non-null object
aisle_id         49688 non-null int64
department_id    49688 non-null int64
dtypes: int64(3), object(1)
memory usage: 1.5+ MB


In [14]:
# orders frame
df['ord'].head(20)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


In [15]:
df['ord'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
order_id                  int64
user_id                   int64
eval_set                  object
order_number              int64
order_dow                 int64
order_hour_of_day         int64
days_since_prior_order    float64
dtypes: float64(1), int64(5), object(1)
memory usage: 182.7+ MB


In [16]:
# change eval_set to category
df['ord']['eval_set'] = df['ord']['eval_set'].astype('category')

# check values
df['ord']['eval_set'].unique()

[prior, train, test]
Categories (3, object): [prior, train, test]

In [17]:
# check for duplicate orders
df['ord']['order_id'].duplicated(keep='first').count

<bound method Series.count of 0          False
1          False
2          False
3          False
4          False
5          False
6          False
7          False
8          False
9          False
10         False
11         False
12         False
13         False
14         False
15         False
16         False
17         False
18         False
19         False
20         False
21         False
22         False
23         False
24         False
25         False
26         False
27         False
28         False
29         False
           ...  
3421053    False
3421054    False
3421055    False
3421056    False
3421057    False
3421058    False
3421059    False
3421060    False
3421061    False
3421062    False
3421063    False
3421064    False
3421065    False
3421066    False
3421067    False
3421068    False
3421069    False
3421070    False
3421071    False
3421072    False
3421073    False
3421074    False
3421075    False
3421076    False
3421077    False
3421078    False
3

The orders dataframe does not contain any duplicate orders.

How many orders of each type?

In [18]:
orders = df['ord'].eval_set.unique()
for order in orders:
  count_type = df['ord'][df['ord']['eval_set']==order]['eval_set'].count()
  print(order, ': we count', count_type)

prior : we count 3214874
train : we count 131209
test : we count 75000


In [19]:
# check all users order # 1 is nan:
d = df['ord'][['order_number', 'days_since_prior_order']]
not_num = d[(d['order_number'] == 1)].dropna().count()

print('First order numeric values: ', not_num[1])

# check all users orders in later number are numbers
num_before = d[(d['order_number'] > 1)].shape
num_after = d[(d['order_number'] > 1) ].dropna().shape
print('shape before dropna: ', num_before)
print('shape after dropna: ', num_after)

First order numeric values:  0
shape before dropna:  (3214874, 2)
shape after dropna:  (3214874, 2)


All orders with order number 1 have NaN for days since prior order. 

All orders with number higher than 1 are numbers. No row was dropped as shown by the shape of the data before and after the attepmt to drop non-numerical values.

In [20]:
# orders prior and train
df['opp'].head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [21]:
print(df['opp'].info())
print(df['opp'].count())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
order_id             int64
product_id           int64
add_to_cart_order    int64
reordered            int64
dtypes: int64(4)
memory usage: 989.8 MB
None
order_id             32434489
product_id           32434489
add_to_cart_order    32434489
reordered            32434489
dtype: int64


In [22]:
df['opt'].head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [23]:
print(df['opt'].info())
print(df['opt'].count())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384617 entries, 0 to 1384616
Data columns (total 4 columns):
order_id             1384617 non-null int64
product_id           1384617 non-null int64
add_to_cart_order    1384617 non-null int64
reordered            1384617 non-null int64
dtypes: int64(4)
memory usage: 42.3 MB
None
order_id             1384617
product_id           1384617
add_to_cart_order    1384617
reordered            1384617
dtype: int64


__Combining data__

In [24]:
# combining prior and train dataframes
combine = pd.concat([df['opp'], df['opt']], sort = True)
print(combine.info())
print(combine.count())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33819106 entries, 0 to 1384616
Data columns (total 4 columns):
add_to_cart_order    int64
order_id             int64
product_id           int64
reordered            int64
dtypes: int64(4)
memory usage: 1.3 GB
None
add_to_cart_order    33819106
order_id             33819106
product_id           33819106
reordered            33819106
dtype: int64


In [25]:
# merging orders and combine dataframes
data_all = pd.merge(left=df['ord'], right=combine, on='order_id')
data_all.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,add_to_cart_order,product_id,reordered
0,2539329,1,prior,1,2,8,,1,196,0
1,2539329,1,prior,1,2,8,,2,14084,0
2,2539329,1,prior,1,2,8,,3,12427,0
3,2539329,1,prior,1,2,8,,4,26088,0
4,2539329,1,prior,1,2,8,,5,26405,0


In [26]:
data_all.count()

order_id                  33819106
user_id                   33819106
eval_set                  33819106
order_number              33819106
order_dow                 33819106
order_hour_of_day         33819106
days_since_prior_order    31741038
add_to_cart_order         33819106
product_id                33819106
reordered                 33819106
dtype: int64

In [27]:
#merging products, aisles and department dataframes
products = pd.merge(left=df['prd'], right=df['ais'], on='aisle_id')
products = pd.merge(left=products, right=df['dpt'], on='department_id')
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,78,Nutter Butter Cookie Bites Go-Pak,61,19,cookies cakes,snacks
2,102,Danish Butter Cookies,61,19,cookies cakes,snacks
3,172,Gluten Free All Natural Chocolate Chip Cookies,61,19,cookies cakes,snacks
4,285,Mini Nilla Wafers Munch Pack,61,19,cookies cakes,snacks


In [28]:
products.count()

product_id       49688
product_name     49688
aisle_id         49688
department_id    49688
aisle            49688
department       49688
dtype: int64

In [29]:
# combining data_all and products one dataframe
data_all = pd.merge(left=data_all, right=products, on='product_id')
data_all.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,add_to_cart_order,product_id,reordered,product_name,aisle_id,department_id,aisle,department
0,2539329,1,prior,1,2,8,,1,196,0,Soda,77,7,soft drinks,beverages
1,2398795,1,prior,2,3,7,15.0,1,196,1,Soda,77,7,soft drinks,beverages
2,473747,1,prior,3,3,12,21.0,1,196,1,Soda,77,7,soft drinks,beverages
3,2254736,1,prior,4,4,7,29.0,1,196,1,Soda,77,7,soft drinks,beverages
4,431534,1,prior,5,4,15,28.0,1,196,1,Soda,77,7,soft drinks,beverages


In [30]:
data_all.count()

order_id                  33819106
user_id                   33819106
eval_set                  33819106
order_number              33819106
order_dow                 33819106
order_hour_of_day         33819106
days_since_prior_order    31741038
add_to_cart_order         33819106
product_id                33819106
reordered                 33819106
product_name              33819106
aisle_id                  33819106
department_id             33819106
aisle                     33819106
department                33819106
dtype: int64

All columns have 33,819,106 entries with except of days_since_prior_order, which has NaN values for first order.

In [31]:
# set index for unique identification of each row.
data_all.set_index(['user_id', 'order_number', 'add_to_cart_order'], inplace=True, drop=True)
data_all.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,order_id,eval_set,order_dow,order_hour_of_day,days_since_prior_order,product_id,reordered,product_name,aisle_id,department_id,aisle,department
user_id,order_number,add_to_cart_order,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,1,2539329,prior,2,8,,196,0,Soda,77,7,soft drinks,beverages
1,2,1,2398795,prior,3,7,15.0,196,1,Soda,77,7,soft drinks,beverages
1,3,1,473747,prior,3,12,21.0,196,1,Soda,77,7,soft drinks,beverages
1,4,1,2254736,prior,4,7,29.0,196,1,Soda,77,7,soft drinks,beverages
1,5,1,431534,prior,4,15,28.0,196,1,Soda,77,7,soft drinks,beverages
1,6,1,3367565,prior,2,7,19.0,196,1,Soda,77,7,soft drinks,beverages
1,7,1,550135,prior,1,9,20.0,196,1,Soda,77,7,soft drinks,beverages
1,8,2,3108588,prior,1,14,14.0,196,1,Soda,77,7,soft drinks,beverages
1,9,4,2295261,prior,1,16,0.0,196,1,Soda,77,7,soft drinks,beverages
1,10,1,2550362,prior,4,8,30.0,196,1,Soda,77,7,soft drinks,beverages


In [32]:
# sort index
data_all.sort_index(inplace=True)
data_all.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,order_id,eval_set,order_dow,order_hour_of_day,days_since_prior_order,product_id,reordered,product_name,aisle_id,department_id,aisle,department
user_id,order_number,add_to_cart_order,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,1,2539329,prior,2,8,,196,0,Soda,77,7,soft drinks,beverages
1,1,2,2539329,prior,2,8,,14084,0,Organic Unsweetened Vanilla Almond Milk,91,16,soy lactosefree,dairy eggs
1,1,3,2539329,prior,2,8,,12427,0,Original Beef Jerky,23,19,popcorn jerky,snacks
1,1,4,2539329,prior,2,8,,26088,0,Aged White Cheddar Popcorn,23,19,popcorn jerky,snacks
1,1,5,2539329,prior,2,8,,26405,0,XL Pick-A-Size Paper Towel Rolls,54,17,paper goods,household
1,2,1,2398795,prior,3,7,15.0,196,1,Soda,77,7,soft drinks,beverages
1,2,2,2398795,prior,3,7,15.0,10258,0,Pistachios,117,19,nuts seeds dried fruit,snacks
1,2,3,2398795,prior,3,7,15.0,12427,1,Original Beef Jerky,23,19,popcorn jerky,snacks
1,2,4,2398795,prior,3,7,15.0,13176,0,Bag of Organic Bananas,24,4,fresh fruits,produce
1,2,5,2398795,prior,3,7,15.0,26088,1,Aged White Cheddar Popcorn,23,19,popcorn jerky,snacks


__Test data__
is saved as a separate dataframe, because it does not have any items ordered in it.

In [33]:
data_test = df['ord'][df['ord']['eval_set'] == 'test']
data_test.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
38,2774568,3,test,13,5,15,11.0
44,329954,4,test,6,3,12,30.0
53,1528013,6,test,4,3,16,22.0
96,1376945,11,test,8,6,11,8.0
102,1356845,12,test,6,1,20,30.0


Save the dataframes for further processing.

In [0]:
data_all.to_csv('instacart_combined_data.csv')


In [0]:
data_test.to_csv('data_test.csv')