In [45]:
import pandas as pd
import numpy as np

In [46]:
# Load the five raw tables from the local data/ directory.
# The target names and the paths are listed in the same order, so each
# name is bound to the frame read from the path at the same position.
(
    departments_df,
    aisles_df,
    orders_df,
    products_df,
    order_products_prior_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/departments.csv",
        "data/aisles.csv",
        "data/orders.csv",
        "data/products.csv",
        "data/order_products__prior.csv",
    )
)


### About the following table:

This table has been randomly generated. Each user_id is associated with a name, a location (address, city and state), a pin code and an email address.

In [47]:
# The file has no header row, so pandas labels the columns 0..6.
# Column 5 (the pin code) is read as text instead of being inferred as an
# integer: integer parsing cannot keep leading zeros, and the frame displayed
# further down contains a 4-digit value (9812) among otherwise 5-digit codes.
# TODO confirm against the raw file that such codes start with a zero.
random_us_addresses_df = pd.read_csv(
    "data/random_us_addresses.csv",
    header=None,
    dtype={5: str},
)

### Function to check for NaN values in each column for a table

In [48]:
def checking_NaN(table):
    """Print the number of null values found in each column of ``table``.

    One line is printed per column, in column order, using the format
    ``<column label> - <null count>``.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame to inspect. Column labels may be of any type (strings,
        integers such as those produced by ``header=None``, ...).

    Returns
    -------
    None
        The counts are only printed, nothing is returned.
    """
    # Count nulls for all columns at once and iterate over the resulting
    # Series. The labels are never used for a lookup, so non-string labels
    # (which the previous f-string lookup turned into a KeyError) and
    # repeated labels are both handled.
    for column, null_values in table.isnull().sum().items():
        print(f"{column} - {null_values}")

### Checking departments_df for any violations in data and cleaning it

In [49]:
departments_df

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol
5,6,international
6,7,beverages
7,8,pets
8,9,dry goods pasta
9,10,bulk


### Checking aisles_df for any violations in data and cleaning it

In [50]:
aisles_df

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation
...,...,...
129,130,hot cereal pancake mixes
130,131,dry pasta
131,132,beauty
132,133,muscles joints pain relief


Checking if any duplicate ids or aisles are present

In [51]:
aisles_df[aisles_df.duplicated(subset=["aisle_id"])]

Unnamed: 0,aisle_id,aisle


In [52]:
aisles_df[aisles_df.duplicated(subset=["aisle"])]

Unnamed: 0,aisle_id,aisle


Checking if any aisle_id or aisle is null

In [53]:
checking_NaN(aisles_df)

aisle_id - 0
aisle - 0


### Checking orders_df for any violations in data and cleaning it

In [54]:
orders_df

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


Checking for NaN Values in each column

In [55]:
checking_NaN(orders_df)


order_id - 0
user_id - 0
eval_set - 0
order_number - 0
order_dow - 0
order_hour_of_day - 0
days_since_prior_order - 206209


Check whether any (order_id, user_id) combination is duplicated. This is done because the two columns together will form the primary key of this table.

In [62]:
orders_df[orders_df.duplicated(subset=["order_id", "user_id"])]

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order


In [56]:
def cleaning_orders_df(orders=None):
    """Remove the ``eval_set`` column from the orders table, in place.

    The function is safe to run more than once: if ``eval_set`` has already
    been removed, the call does nothing instead of raising ``KeyError``.

    Parameters
    ----------
    orders : pandas.DataFrame, optional
        Frame to clean. When omitted, the notebook-level ``orders_df`` is
        cleaned, which is what a call without arguments has always done.

    Returns
    -------
    None
        The frame is modified in place.
    """
    target = orders_df if orders is None else orders
    # errors="ignore" makes re-running this cell harmless once the column is gone.
    target.drop(columns=["eval_set"], inplace=True, errors="ignore")
    # NaN values in days_since_prior_order are left untouched: the step that
    # replaced them with 0 (fillna(0, inplace=True)) is disabled.

In [57]:
cleaning_orders_df()

In [63]:
orders_df

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


### Checking products_df for any violations in data and cleaning it

In [64]:
products_df

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5
49684,49685,En Croute Roast Hazelnut Cranberry,42,1
49685,49686,Artisan Baguette,112,3
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8


Checking for NaN values

In [65]:
checking_NaN(products_df)

product_id - 0
product_name - 0
aisle_id - 0
department_id - 0


Checking for duplicate values in product_id as it will be the primary key

In [66]:
products_df[products_df.duplicated(subset=["product_id"])]

Unnamed: 0,product_id,product_name,aisle_id,department_id


### Checking order_products_prior_df for any violations in data and cleaning it

In [67]:
order_products_prior_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
...,...,...,...,...
32434484,3421083,39678,6,1
32434485,3421083,11352,7,0
32434486,3421083,4600,8,0
32434487,3421083,24852,9,1


In [68]:
order_products_prior_df[order_products_prior_df.duplicated(subset=["order_id", "product_id"])]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


Checking for NaN values

In [69]:
checking_NaN(order_products_prior_df)

order_id - 0
product_id - 0
add_to_cart_order - 0
reordered - 0


### Checking random_us_addresses_df for any violations in data and cleaning it

In [70]:
random_us_addresses_df

Unnamed: 0,0,1,2,3,4,5,6
0,0,Kelly Norman,530 Kevin Plain,Beckstad,DC,10028,kellynorman530@gordon-carey.com
1,1,Susan Hernandez,3210 Stuart Union,Port Spencer,VA,88043,susanhernandez3210@ayala.com
2,2,David Roach,0687 Matthew Circles,Mallorybury,PW,85734,davidroach0687@chang.com
3,3,Brandon Brown,71413 Dickson Hills Apt. 762,Glendachester,PA,61726,brandonbrown71413@hanna-cruz.com
4,4,Joshua French,2949 Samantha Wall Suite 216,New Davidview,WI,18740,joshuafrench2949@sanchez.com
...,...,...,...,...,...,...,...
3999995,3999995,Pamela Waller,02886 Sara Cove Suite 663,Smithland,NH,9812,pamelawaller02886@serrano-duke.org
3999996,3999996,Jeffrey Lamb,1849 Erin Divide,Port Kelliville,SC,58072,jeffreylamb1849@dillon.com
3999997,3999997,Crystal Walsh,02038 Jason Alley,Steventown,WI,29788,crystalwalsh02038@jennings.biz
3999998,3999998,Joe Davidson,055 Jordan Pass,Guzmanfurt,SC,25399,joedavidson055@parker-schmitt.com


Naming the columns

In [73]:
# Replace the positional column labels 0..6 with descriptive names.
# enumerate() pairs each name with its integer position, which yields the
# {position: name} mapping that rename() expects.
random_us_addresses_df.rename(
    columns=dict(
        enumerate(
            [
                "user_id",
                "user_name",
                "user_address",
                "city",
                "state",
                "pincode",
                "user_email",
            ]
        )
    ),
    inplace=True,
)

In [74]:
random_us_addresses_df

Unnamed: 0,user_id,user_name,user_address,city,state,pincode,user_email
0,0,Kelly Norman,530 Kevin Plain,Beckstad,DC,10028,kellynorman530@gordon-carey.com
1,1,Susan Hernandez,3210 Stuart Union,Port Spencer,VA,88043,susanhernandez3210@ayala.com
2,2,David Roach,0687 Matthew Circles,Mallorybury,PW,85734,davidroach0687@chang.com
3,3,Brandon Brown,71413 Dickson Hills Apt. 762,Glendachester,PA,61726,brandonbrown71413@hanna-cruz.com
4,4,Joshua French,2949 Samantha Wall Suite 216,New Davidview,WI,18740,joshuafrench2949@sanchez.com
...,...,...,...,...,...,...,...
3999995,3999995,Pamela Waller,02886 Sara Cove Suite 663,Smithland,NH,9812,pamelawaller02886@serrano-duke.org
3999996,3999996,Jeffrey Lamb,1849 Erin Divide,Port Kelliville,SC,58072,jeffreylamb1849@dillon.com
3999997,3999997,Crystal Walsh,02038 Jason Alley,Steventown,WI,29788,crystalwalsh02038@jennings.biz
3999998,3999998,Joe Davidson,055 Jordan Pass,Guzmanfurt,SC,25399,joedavidson055@parker-schmitt.com


In [75]:
checking_NaN(random_us_addresses_df)

user_id - 0
user_name - 0
user_address - 0
city - 0
state - 0
pincode - 0
user_email - 0


### Making CSVs to upload to database

In [80]:
# Export the departments table; index=False keeps the DataFrame's row index out of the file.
# NOTE(review): this file name has no "_df" suffix, unlike the other exports in this section.
# Confirm that the database loader expects this name.
departments_df.to_csv("departments.csv",index=False)

In [81]:
# Export the aisles table; index=False keeps the DataFrame's row index out of the file.
aisles_df.to_csv("aisles_df.csv", index=False)

In [82]:
# Export the order/product rows; index=False keeps the DataFrame's row index out of the file.
order_products_prior_df.to_csv("order_products_prior_df.csv", index=False)

In [83]:
# Export the orders table; index=False keeps the DataFrame's row index out of the file.
# Missing days_since_prior_order values are not filled in this notebook, so they are
# written as empty fields (to_csv's default representation of NaN).
orders_df.to_csv("orders_df.csv", index=False)

In [84]:
# Export the products table; index=False keeps the DataFrame's row index out of the file.
products_df.to_csv("products_df.csv", index=False)

In [85]:
# Export the user address table; index=False keeps the DataFrame's row index out of the file.
random_us_addresses_df.to_csv("random_us_addresses_df.csv", index=False)