In [1]:
import pandas as pd
import numpy as np

In [2]:
departments_df = pd.read_csv("data/departments.csv")
aisles_df = pd.read_csv("data/aisles.csv")
orders_df = pd.read_csv("data/orders.csv")
products_df = pd.read_csv("data/products.csv")
order_products_prior_df = pd.read_csv("data/order_products__prior.csv")


### About the following table:

This table has been randomly generated where each user_id is related to a name, location (Address, City and State) , Pin-Code and email address.

In [3]:
random_us_addresses_df = pd.read_csv("data/random_us_addresses.csv", header= None)

### Function to check for NaN values in each column for a table

In [4]:
def checking_NaN(table):
    for column in table.columns:
        null_values = table[f"{column}"].isnull().sum()
        print(f"{column} - {null_values}")

### Checking departments_df for any violations in data and cleaning it

In [5]:
departments_df

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol
5,6,international
6,7,beverages
7,8,pets
8,9,dry goods pasta
9,10,bulk


### Checking aisles_df for any violations in data and cleaning it

In [6]:
aisles_df

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation
...,...,...
129,130,hot cereal pancake mixes
130,131,dry pasta
131,132,beauty
132,133,muscles joints pain relief


Checking if any duplicate ids or aisles are present

In [7]:
aisles_df[aisles_df.duplicated(subset=["aisle_id"])]

Unnamed: 0,aisle_id,aisle


In [8]:
aisles_df[aisles_df.duplicated(subset=["aisle"])]

Unnamed: 0,aisle_id,aisle


Checking if any aisle_id or aisle is null

In [9]:
checking_NaN(aisles_df)

aisle_id - 0
aisle - 0


### Checking orders_df for any violations in data and cleaning it

In [10]:
orders_df

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


Checking for NaN Values in each column

In [11]:
checking_NaN(orders_df)


order_id - 0
user_id - 0
eval_set - 0
order_number - 0
order_dow - 0
order_hour_of_day - 0
days_since_prior_order - 206209


In [12]:
orders_df = orders_df.replace(np.nan,0)

In [13]:
orders_df

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,0.0
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [14]:
orders_df = orders_df.astype({"days_since_prior_order":int})

In [15]:
orders_df.days_since_prior_order = orders_df.days_since_prior_order.replace(0,"\\N")

In [16]:
orders_df

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,\N
1,2398795,1,prior,2,3,7,15
2,473747,1,prior,3,3,12,21
3,2254736,1,prior,4,4,7,29
4,431534,1,prior,5,4,15,28
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29
3421079,1854736,206209,prior,11,4,10,30
3421080,626363,206209,prior,12,1,12,18
3421081,2977660,206209,prior,13,1,12,7


Check if combined (order_id, user_id) value is duplicated. This is done since they will be made the primary keys for this table

In [17]:
orders_df[orders_df.duplicated(subset=["order_id", "user_id"])]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order


In [18]:
def cleaning_orders_df():
    # Dropping eval set column
    orders_df.drop(columns=["eval_set"], inplace=True)
    # Making all the nan values in days_since_prior_order replace with 0
    # orders_df.fillna(0, inplace=True)

In [19]:
cleaning_orders_df()

In [20]:
orders_df

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,\N
1,2398795,1,2,3,7,15
2,473747,1,3,3,12,21
3,2254736,1,4,4,7,29
4,431534,1,5,4,15,28
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29
3421079,1854736,206209,11,4,10,30
3421080,626363,206209,12,1,12,18
3421081,2977660,206209,13,1,12,7


### Checking products_df for any violations in data and cleaning it

In [21]:
products_df

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5
49684,49685,En Croute Roast Hazelnut Cranberry,42,1
49685,49686,Artisan Baguette,112,3
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8


Checking for NaN values

In [22]:
checking_NaN(products_df)

product_id - 0
product_name - 0
aisle_id - 0
department_id - 0


Checking for duplicate values in product_id as it will be the primary key

In [23]:
products_df[products_df.duplicated(subset=["product_id"])]

Unnamed: 0,product_id,product_name,aisle_id,department_id


Mapping department_df to product_df (department_id)

In [24]:
products_df['department_id'] = products_df.department_id.map(departments_df.set_index('department_id').department)

In [25]:
products_df

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,snacks
1,2,All-Seasons Salt,104,pantry
2,3,Robust Golden Unsweetened Oolong Tea,94,beverages
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,frozen
4,5,Green Chile Anytime Sauce,5,pantry
...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,alcohol
49684,49685,En Croute Roast Hazelnut Cranberry,42,frozen
49685,49686,Artisan Baguette,112,bakery
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,pets


Mapping aisle_df to product_df (aisle_id)

In [26]:
products_df['aisle_id'] = products_df.aisle_id.map(aisles_df.set_index('aisle_id').aisle)

In [27]:
products_df

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,cookies cakes,snacks
1,2,All-Seasons Salt,spices seasonings,pantry
2,3,Robust Golden Unsweetened Oolong Tea,tea,beverages
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,frozen meals,frozen
4,5,Green Chile Anytime Sauce,marinades meat preparation,pantry
...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",spirits,alcohol
49684,49685,En Croute Roast Hazelnut Cranberry,frozen vegan vegetarian,frozen
49685,49686,Artisan Baguette,bread,bakery
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,cat food care,pets


### Checking order_products_prior_df for any violations in data and cleaning it

In [28]:
order_products_prior_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
...,...,...,...,...
32434484,3421083,39678,6,1
32434485,3421083,11352,7,0
32434486,3421083,4600,8,0
32434487,3421083,24852,9,1


In [29]:
order_products_prior_df[order_products_prior_df.duplicated(subset=["order_id", "product_id"])]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


Checking for NaN values

In [30]:
checking_NaN(order_products_prior_df)

order_id - 0
product_id - 0
add_to_cart_order - 0
reordered - 0


### Checking random_us_addresses_df for any violations in data and cleaning it

In [31]:
random_us_addresses_df

Unnamed: 0,0,1,2,3,4,5,6
0,0,Kelly Norman,530 Kevin Plain,Beckstad,DC,10028,kellynorman530@gordon-carey.com
1,1,Susan Hernandez,3210 Stuart Union,Port Spencer,VA,88043,susanhernandez3210@ayala.com
2,2,David Roach,0687 Matthew Circles,Mallorybury,PW,85734,davidroach0687@chang.com
3,3,Brandon Brown,71413 Dickson Hills Apt. 762,Glendachester,PA,61726,brandonbrown71413@hanna-cruz.com
4,4,Joshua French,2949 Samantha Wall Suite 216,New Davidview,WI,18740,joshuafrench2949@sanchez.com
...,...,...,...,...,...,...,...
3999995,3999995,Pamela Waller,02886 Sara Cove Suite 663,Smithland,NH,9812,pamelawaller02886@serrano-duke.org
3999996,3999996,Jeffrey Lamb,1849 Erin Divide,Port Kelliville,SC,58072,jeffreylamb1849@dillon.com
3999997,3999997,Crystal Walsh,02038 Jason Alley,Steventown,WI,29788,crystalwalsh02038@jennings.biz
3999998,3999998,Joe Davidson,055 Jordan Pass,Guzmanfurt,SC,25399,joedavidson055@parker-schmitt.com


Naming the columns

In [32]:
random_us_addresses_df.rename(columns= {
    0: "user_id",
    1: "user_name",
    2: "user_address",
    3: "city",
    4: "state",
    5: "pincode",
    6: "user_email",
}, inplace= True )

In [33]:
random_us_addresses_df

Unnamed: 0,user_id,user_name,user_address,city,state,pincode,user_email
0,0,Kelly Norman,530 Kevin Plain,Beckstad,DC,10028,kellynorman530@gordon-carey.com
1,1,Susan Hernandez,3210 Stuart Union,Port Spencer,VA,88043,susanhernandez3210@ayala.com
2,2,David Roach,0687 Matthew Circles,Mallorybury,PW,85734,davidroach0687@chang.com
3,3,Brandon Brown,71413 Dickson Hills Apt. 762,Glendachester,PA,61726,brandonbrown71413@hanna-cruz.com
4,4,Joshua French,2949 Samantha Wall Suite 216,New Davidview,WI,18740,joshuafrench2949@sanchez.com
...,...,...,...,...,...,...,...
3999995,3999995,Pamela Waller,02886 Sara Cove Suite 663,Smithland,NH,9812,pamelawaller02886@serrano-duke.org
3999996,3999996,Jeffrey Lamb,1849 Erin Divide,Port Kelliville,SC,58072,jeffreylamb1849@dillon.com
3999997,3999997,Crystal Walsh,02038 Jason Alley,Steventown,WI,29788,crystalwalsh02038@jennings.biz
3999998,3999998,Joe Davidson,055 Jordan Pass,Guzmanfurt,SC,25399,joedavidson055@parker-schmitt.com


In [34]:
checking_NaN(random_us_addresses_df)

user_id - 0
user_name - 0
user_address - 0
city - 0
state - 0
pincode - 0
user_email - 0


### Making order collection

In [35]:
def datetime(x, y):
    return {"order_dow": x, "order_hour_of_day": y}

Adding datetime column

In [36]:
orders_df["datetime"] = orders_df.apply(lambda row: datetime(row["order_dow"], row["order_hour_of_day"]), axis=1)

In [37]:
orders_df.drop(columns={"order_dow", "order_hour_of_day"}, inplace=True)

In [38]:
orders_df

Unnamed: 0,order_id,user_id,order_number,days_since_prior_order,datetime
0,2539329,1,1,\N,"{'order_dow': 2, 'order_hour_of_day': 8}"
1,2398795,1,2,15,"{'order_dow': 3, 'order_hour_of_day': 7}"
2,473747,1,3,21,"{'order_dow': 3, 'order_hour_of_day': 12}"
3,2254736,1,4,29,"{'order_dow': 4, 'order_hour_of_day': 7}"
4,431534,1,5,28,"{'order_dow': 4, 'order_hour_of_day': 15}"
...,...,...,...,...,...
3421078,2266710,206209,10,29,"{'order_dow': 5, 'order_hour_of_day': 18}"
3421079,1854736,206209,11,30,"{'order_dow': 4, 'order_hour_of_day': 10}"
3421080,626363,206209,12,18,"{'order_dow': 1, 'order_hour_of_day': 12}"
3421081,2977660,206209,13,7,"{'order_dow': 1, 'order_hour_of_day': 12}"


Add products column

In [39]:
order_products_prior_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
...,...,...,...,...
32434484,3421083,39678,6,1
32434485,3421083,11352,7,0
32434486,3421083,4600,8,0
32434487,3421083,24852,9,1


In [40]:
grouped_series= order_products_prior_df.groupby('order_id').apply(lambda x: {'product_id': x['product_id'].tolist(), 'reordered': x['reordered'].tolist()})

In [41]:
grouped_df = grouped_series.to_frame()

In [42]:
grouped_df.reset_index(inplace=True)

In [43]:
grouped_df.rename(columns={0: "products"}, inplace=True)

In [44]:
grouped_df

Unnamed: 0,order_id,products
0,2,"{'product_id': [33120, 28985, 9327, 45918, 300..."
1,3,"{'product_id': [33754, 24838, 17704, 21903, 17..."
2,4,"{'product_id': [46842, 26434, 39758, 27761, 10..."
3,5,"{'product_id': [13176, 15005, 47329, 27966, 23..."
4,6,"{'product_id': [40462, 15873, 41897], 'reorder..."
...,...,...
3214869,3421079,"{'product_id': [30136], 'reordered': [0]}"
3214870,3421080,"{'product_id': [27845, 4932, 18811, 41950, 317..."
3214871,3421081,"{'product_id': [38185, 12218, 32299, 3060, 205..."
3214872,3421082,"{'product_id': [17279, 12738, 16797, 43352, 32..."


Mapping to order_id

In [45]:
orders_df["products"] = orders_df.order_id.map(grouped_df.set_index('order_id').products)

In [46]:
orders_df

Unnamed: 0,order_id,user_id,order_number,days_since_prior_order,datetime,products
0,2539329,1,1,\N,"{'order_dow': 2, 'order_hour_of_day': 8}","{'product_id': [196, 14084, 12427, 26088, 2640..."
1,2398795,1,2,15,"{'order_dow': 3, 'order_hour_of_day': 7}","{'product_id': [196, 10258, 12427, 13176, 2608..."
2,473747,1,3,21,"{'order_dow': 3, 'order_hour_of_day': 12}","{'product_id': [196, 12427, 10258, 25133, 3045..."
3,2254736,1,4,29,"{'order_dow': 4, 'order_hour_of_day': 7}","{'product_id': [196, 12427, 10258, 25133, 2640..."
4,431534,1,5,28,"{'order_dow': 4, 'order_hour_of_day': 15}","{'product_id': [196, 12427, 10258, 25133, 1032..."
...,...,...,...,...,...,...
3421078,2266710,206209,10,29,"{'order_dow': 5, 'order_hour_of_day': 18}","{'product_id': [9405, 6846, 15700, 26503, 4121..."
3421079,1854736,206209,11,30,"{'order_dow': 4, 'order_hour_of_day': 10}","{'product_id': [6846, 9405, 41213, 24852, 1934..."
3421080,626363,206209,12,18,"{'order_dow': 1, 'order_hour_of_day': 12}","{'product_id': [6846, 24852, 9405, 19348, 4121..."
3421081,2977660,206209,13,7,"{'order_dow': 1, 'order_hour_of_day': 12}","{'product_id': [24852, 9405, 16168, 39216, 141..."


### Loading data to MongoDB

In [47]:
products_df.rename(columns={"product_id": "_id", "product_name":"name", "aisle_id":"aisle", "department_id":"department"}, inplace=True)
random_us_addresses_df.rename(columns={"user_id":"_id", "user_address":"address"}, inplace=True)
orders_df.rename(columns={"order_id": "_id"}, inplace=True)

In [48]:
products_df.replace("\\N", None, inplace=True)
random_us_addresses_df.replace("\\N", None, inplace=True)
orders_df.replace("\\N", None, inplace=True)

In [52]:
products_dict = products_df.to_dict(orient='records')
users_dict = random_us_addresses_df.to_dict(orient='records')
orders_dict = orders_df.to_dict(orient='records')

In [53]:
from pymongo import MongoClient
def connect_to_mongodb():
    client = MongoClient("mongodb://localhost:27017")
    mydb = client["instacart"]
    orders = mydb["Orders"]
    products = mydb["Products"]
    users = mydb["Users"]
    return orders, products, users

orders, products, users = connect_to_mongodb()

In [54]:
def process_data(data):
    processed_data = []
    for documents in data:
        doc = {k: v for k, v in documents.items() if v is not None}
        processed_data.append(doc)
    return processed_data

In [55]:
products_data = process_data(products_dict)
orders_data = process_data(orders_dict)
users_data = process_data(users_dict)

In [56]:
products.delete_many({})
orders.delete_many({})
users.delete_many({})

products.insert_many(products_data)
orders.insert_many(orders_data)
users.insert_many(users_data)

BulkWriteError: batch op errors occurred, full error: {'writeErrors': [{'index': 0, 'code': 121, 'errmsg': 'Document failed validation', 'errInfo': {'failingDocumentId': 2539329, 'details': {'operatorName': '$and', 'clausesNotSatisfied': [{'index': 0, 'details': {'operatorName': '$jsonSchema', 'schemaRulesNotSatisfied': [{'operatorName': 'properties', 'propertiesNotSatisfied': [{'propertyName': 'products', 'details': [{'operatorName': 'bsonType', 'specifiedAs': {'bsonType': 'array'}, 'reason': 'type did not match', 'consideredValue': {'product_id': [196, 14084, 12427, 26088, 26405], 'reordered': [0, 0, 0, 0, 0]}, 'consideredType': 'object'}]}]}]}}]}}, 'op': {'_id': 2539329, 'user_id': 1, 'order_number': 1, 'datetime': {'order_dow': 2, 'order_hour_of_day': 8}, 'products': {'product_id': [196, 14084, 12427, 26088, 26405], 'reordered': [0, 0, 0, 0, 0]}}}], 'writeConcernErrors': [], 'nInserted': 0, 'nUpserted': 0, 'nMatched': 0, 'nModified': 0, 'nRemoved': 0, 'upserted': []}