In [1]:
# Dependencies
import pandas as pd
from sqlalchemy import create_engine
import requests
import json
import numpy as np
from bs4 import BeautifulSoup
from splinter import Browser
from selenium import webdriver

## Data cleaning

In [2]:
# Loading all csv files into dataframes
sets_df = pd.read_csv("RebrickableCSVs/sets.csv")
parts_df = pd.read_csv("RebrickableCSVs/parts.csv")
colors_df = pd.read_csv("RebrickableCSVs/colors.csv")
themes_df = pd.read_csv("RebrickableCSVs/themes.csv")
elements_df = pd.read_csv("RebrickableCSVs/elements.csv")
inventories_df = pd.read_csv("RebrickableCSVs/inventories.csv")
inv_sets_df = pd.read_csv("RebrickableCSVs/inventory_sets.csv")
inv_parts_df = pd.read_csv("RebrickableCSVs/inventory_parts.csv")
category_df = pd.read_csv("RebrickableCSVs/part_categories.csv")
minifigs_df = pd.read_csv("RebrickableCSVs/minifigs.csv")
inv_minifigs_df = pd.read_csv("RebrickableCSVs/inventory_minifigs.csv")

sets_df.head()

Unnamed: 0,set_num,name,year,theme_id,num_parts
0,001-1,Gears,1965,1,43
1,0011-2,Town Mini-Figures,1978,84,12
2,0011-3,Castle 2 for 1 Bonus Offer,1987,199,0
3,0012-1,Space Mini-Figures,1979,143,12
4,0013-1,Space Mini-Figures,1979,143,12


In [3]:
# Extracting only 2020 sets
sets_df = sets_df.loc[sets_df['year']==2020]
sets_df.drop('year', axis=1, inplace=True)
print(f"Total sets in 2020: {len(sets_df.index)}")

Total sets in 2020: 789


In [4]:
# Extracting inventory_sets with sets in year 2020
temp_df = pd.merge(sets_df, inv_sets_df, how='inner', on='set_num')
inv_sets_df = temp_df[['inventory_id','set_num']]

print(f"Total inventory_sets in 2020: {len(inv_sets_df.index)}")
inv_sets_df.head()

Total inventory_sets in 2020: 116


Unnamed: 0,inventory_id,set_num
0,77818,41900-1
1,77818,41902-1
2,77818,41905-1
3,77818,41908-1
4,77814,60247-1


In [5]:
# Extracting inventories with sets in year 2020
inventories_df = inventories_df.rename(columns={'id':'inventory_id'})
temp_df = pd.merge(inv_sets_df, inventories_df, how='inner', on='set_num')

print(f"Total inventory in 2020: {len(inventories_df.index)}")
inventories_df.head()

Total inventory in 2020: 26964


Unnamed: 0,inventory_id,version,set_num
0,1,1,7922-1
1,3,1,3931-1
2,4,1,6942-1
3,15,1,5158-1
4,16,1,903-1


In [6]:
# Checking for duplicate rows
inventories_df.loc[inventories_df.duplicated(keep='first') == True]

Unnamed: 0,inventory_id,version,set_num


In [7]:
inventories_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26964 entries, 0 to 26963
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   inventory_id  26964 non-null  int64 
 1   version       26964 non-null  int64 
 2   set_num       26964 non-null  object
dtypes: int64(2), object(1)
memory usage: 632.1+ KB


In [8]:
# Extracting inventory_parts with sets in year 2020
temp_df = pd.merge(inv_parts_df, inventories_df, how='inner', on='inventory_id')
inv_parts_df = temp_df[['inventory_id','part_num','color_id']]
print(f"Total inventory_parts rows: {len(inv_parts_df.index)}")
inv_parts_df.head()

Total inventory_parts rows: 856758


Unnamed: 0,inventory_id,part_num,color_id
0,1,48379c01,72
1,1,48395,7
2,1,mcsport6,25
3,1,paddle,0
4,3,2343,47


In [9]:
# Checking for duplicate rows
inv_parts_df.loc[inv_parts_df.duplicated(keep='first') == True]

Unnamed: 0,inventory_id,part_num,color_id
17,3,33291,191
25,3,6141,27
26,3,6141,29
39,4,6141,42
74,22,3024,0
...,...,...,...
856468,79594,3713,71
856473,79594,4274,1
856508,79595,2780,0
856510,79595,32002,72


In [10]:
# Removing duplicated rows while joining
inv_parts_df = inv_parts_df.drop_duplicates(keep='first')
print(f"Total inventory_parts in 2020: {len(inv_parts_df.index)}")
# Checking for duplicate rows again
parts_df.loc[parts_df.duplicated(keep='first') == True]

Total inventory_parts in 2020: 811079


Unnamed: 0,part_num,name,part_cat_id,part_material


In [11]:
# Extracting parts with sets in year 2020
temp_df = pd.merge(parts_df, inv_parts_df, how='inner', on='part_num')
parts_df = temp_df[['part_num','name','part_cat_id']]
parts_df = parts_df.rename(columns={'part_cat_id':'category_id'})
print(f"Total parts rows: {len(parts_df.index)}")
parts_df.head()

Total parts rows: 811079


Unnamed: 0,part_num,name,category_id
0,3434,Sticker Sheet for Set 653-1,58
1,4219,"Sticker Sheet for Set 939-1 with flags for AU, IE",58
2,4229,Sticker Sheet for Set 295-1,58
3,4284,Sticker Sheet for Set 723-2,58
4,4285,Sticker Sheet for Set 725-2,58


In [12]:
# Checking for duplicate rows
parts_df.loc[parts_df.duplicated(keep='first') == True]

Unnamed: 0,part_num,name,category_id
11,004632,"Sticker Sheet for Sets 369-1, 575-2",58
17,004848,Sticker Sheet for Set 394-1,58
20,04324,"Sticker Sheet for Sets 600-2, 6600-1",58
23,0901,Baseplate 16 x 30 with Set 080 Yellow House Print,1
25,0902,Baseplate 16 x 24 with Set 080 Small White Hou...,1
...,...,...,...
811073,zbb022,Wheel 68mm Znap Propeller (9 x 2),29
811074,zbb022,Wheel 68mm Znap Propeller (9 x 2),29
811075,zbb022,Wheel 68mm Znap Propeller (9 x 2),29
811076,zbb022,Wheel 68mm Znap Propeller (9 x 2),29


In [13]:
# Removing duplicated rows while joining
parts_df = parts_df.drop_duplicates(keep='first')
print(f"Total parts in 2020: {len(parts_df.index)}")
# Checking for duplicate rows again
parts_df.loc[parts_df.duplicated(keep='first') == True]

Total parts in 2020: 35328


Unnamed: 0,part_num,name,category_id


In [14]:
# Extracting colors within sets in year 2020
colors_df = colors_df.rename(columns={'id':'color_id'})
temp_df = pd.merge(colors_df, inv_parts_df, how='inner', on='color_id')
colors_df = temp_df[['color_id','name','rgb','is_trans']]
print(f"Total colors rows: {len(colors_df.index)}")
print(f"Unique rows: {colors_df['name'].nunique()}")

Total colors rows: 811079
Unique rows: 174


In [15]:
# Removing duplicated rows while joining
colors_df = colors_df.drop_duplicates(keep='first')
print(f"Total colors in 2020: {len(colors_df.index)}")
# Checking for duplicate rows
colors_df.loc[colors_df.duplicated(keep='first') == True]

Total colors in 2020: 174


Unnamed: 0,color_id,name,rgb,is_trans


In [16]:
# Converting is_trans to Boolean datatype
colors_df['is_trans'] = colors_df['is_trans'].map({'t': True, 'f': False})
colors_df.head()

Unnamed: 0,color_id,name,rgb,is_trans
0,-1,[Unknown],0033B2,False
10,0,Black,05131D,False
150959,1,Blue,0055BF,False
189103,2,Green,237841,False
206273,3,Dark Turquoise,008F9B,False


In [17]:
colors_df['is_trans'].dtype

dtype('bool')

In [18]:
# Extracting themes with sets in year 2020
themes_df = themes_df.rename(columns={'id':'theme_id','name':'theme'})
temp_df = pd.merge(sets_df, themes_df, how='inner', on='theme_id')
themes_df = temp_df[['theme_id','theme']]

print(f"Total themes rows: {len(themes_df.index)}")
themes_df.head()

Total themes rows: 789


Unnamed: 0,theme_id,theme
0,497,Books
1,497,Books
2,497,Books
3,497,Books
4,497,Books


In [19]:
# Removing duplicated rows while joining
themes_df = themes_df.drop_duplicates(keep='first')
print(f"Total themes in 2020: {len(themes_df.index)}")
# Checking for duplicate rows
themes_df.loc[themes_df.duplicated(keep='first') == True]

Total themes in 2020: 86


Unnamed: 0,theme_id,theme


In [20]:
# Extracting elements with sets in year 2020
temp_df = pd.merge(elements_df, parts_df, how='inner', on='part_num')
elements_df = temp_df[['element_id','part_num','color_id']]
print(f"Total elements in 2020: {len(elements_df.index)}")
elements_df.head()

Total elements in 2020: 54237


Unnamed: 0,element_id,part_num,color_id
0,4275423,53657,1004
1,4278354,53657,41
2,4275414,53657,45
3,4275411,53657,118
4,4275220,53657,191


In [21]:
# Checking for duplicate rows
elements_df.loc[elements_df.duplicated(keep='first') == True]

Unnamed: 0,element_id,part_num,color_id


In [22]:
# Extracting inventory_minifigs within sets in year 2020
temp_df = pd.merge(inv_minifigs_df, inventories_df, how='inner', on='inventory_id')
inv_minifigs_df = temp_df[['inventory_id','fig_num']]

print(f"Total inventory_minifigs in 2020: {len(inv_minifigs_df.index)}")
inv_minifigs_df.head()

Total inventory_minifigs in 2020: 15997


Unnamed: 0,inventory_id,fig_num
0,3,fig-001549
1,4,fig-000764
2,19,fig-000555
3,25,fig-000574
4,26,fig-000842


In [23]:
# Checking for duplicate rows
inv_minifigs_df.loc[inv_minifigs_df.duplicated(keep='first') == True]

Unnamed: 0,inventory_id,fig_num


In [24]:
# Extracting minifigs within sets in year 2020
temp_df = pd.merge(minifigs_df, inv_minifigs_df, how='inner', on='fig_num')
minifigs_df = temp_df[['fig_num','name','num_parts']]

print(f"Total minifigs rows: {len(minifigs_df.index)}")
minifigs_df.head()

Total minifigs rows: 15997


Unnamed: 0,fig_num,name,num_parts
0,fig-000001,Toy Store Employee,4
1,fig-000002,Customer Kid,4
2,fig-000003,"Assassin Droid, White",8
3,fig-000004,Basic Figure,4
4,fig-000004,Basic Figure,4


In [25]:
# Removing duplicated rows while joining
minifigs_df = minifigs_df.drop_duplicates(keep='first')
print(f"Total minifigs in 2020: {len(minifigs_df.index)}")
# Checking for duplicate rows
minifigs_df.loc[minifigs_df.duplicated(keep='first') == True]

Total minifigs in 2020: 10482


Unnamed: 0,fig_num,name,num_parts


In [26]:
category_df = category_df.rename(columns={'id':'category_id','name':'category'})
temp_df = pd.merge(category_df, parts_df, how='inner', on='category_id')
category_df = temp_df[['category_id','category']]
print(f"Total category rows: {len(category_df.index)}")
category_df.head()

Total category rows: 35328


Unnamed: 0,category_id,category
0,1,Baseplates
1,1,Baseplates
2,1,Baseplates
3,1,Baseplates
4,1,Baseplates


In [27]:
# Removing duplicated rows while joining
category_df = category_df.drop_duplicates(keep='first')
print(f"Total category in 2020: {len(category_df.index)}")
# Checking for duplicate rows
category_df.loc[category_df.duplicated(keep='first') == True]

Total category in 2020: 65


Unnamed: 0,category_id,category


## Adding more data through API

## Inserting to Database

In [28]:
# Create database connection
connection_string = "postgres:postgres@localhost:5432/Lego_db"
engine = create_engine(f'postgresql://{connection_string}')

In [29]:
# Confirm tables
engine.table_names()

['themes',
 'parts',
 'colors',
 'elements',
 'parts_category',
 'minifigs',
 'inventories',
 'inventory_sets',
 'inventory_parts',
 'inventory_minifigs',
 'sets']

In [30]:
# themes
themes_df.to_sql(name='themes', con=engine, if_exists='replace', index=False)
pd.read_sql_query('select * from themes', con=engine).head()

Unnamed: 0,theme_id,theme
0,497,Books
1,155,Modular Buildings
2,673,Creator Expert
3,607,Ghostbusters
4,227,Christmas


In [31]:
# sets
sets_df.to_sql(name='sets', con=engine, if_exists='replace', index=False)
pd.read_sql_query('select * from sets', con=engine).head()

Unnamed: 0,set_num,name,theme_id,num_parts
0,0241401208-1,Cute Ideas,497,0
1,0744023726-1,Disney Princess: Enchanted Treasury,497,4
2,0744023734-1,LEGO Minifigure: A Visual History New Edition,497,5
3,0744024471-1,100 Ways to Rebuild the World,497,0
4,10270-1,Bookshop,155,2504


In [32]:
# parts
parts_df.to_sql(name='parts', con=engine, if_exists='replace', index=False)
pd.read_sql_query('select * from parts', con=engine).head()

Unnamed: 0,part_num,name,category_id
0,3434,Sticker Sheet for Set 653-1,58
1,4219,"Sticker Sheet for Set 939-1 with flags for AU, IE",58
2,4229,Sticker Sheet for Set 295-1,58
3,4284,Sticker Sheet for Set 723-2,58
4,4285,Sticker Sheet for Set 725-2,58


In [33]:
# colors
colors_df.to_sql(name='colors', con=engine, if_exists='replace', index=False)
pd.read_sql_query('select * from colors', con=engine).head()

Unnamed: 0,color_id,name,rgb,is_trans
0,-1,[Unknown],0033B2,False
1,0,Black,05131D,False
2,1,Blue,0055BF,False
3,2,Green,237841,False
4,3,Dark Turquoise,008F9B,False


In [34]:
# elements
elements_df.to_sql(name='elements', con=engine, if_exists='replace', index=False)
pd.read_sql_query('select * from elements', con=engine).head()

Unnamed: 0,element_id,part_num,color_id
0,4275423,53657,1004
1,4278354,53657,41
2,4275414,53657,45
3,4275411,53657,118
4,4275220,53657,191


In [35]:
# parts_category
category_df.to_sql(name='parts_category', con=engine, if_exists='replace', index=False)
pd.read_sql_query('select * from parts_category', con=engine).head()

Unnamed: 0,category_id,category
0,1,Baseplates
1,3,Bricks Sloped
2,4,"Duplo, Quatro and Primo"
3,5,Bricks Special
4,6,Bricks Wedged


In [36]:
# minifigs
minifigs_df.to_sql(name='minifigs', con=engine, if_exists='replace', index=False)
pd.read_sql_query('select * from minifigs', con=engine).head()

Unnamed: 0,fig_num,name,num_parts
0,fig-000001,Toy Store Employee,4
1,fig-000002,Customer Kid,4
2,fig-000003,"Assassin Droid, White",8
3,fig-000004,Basic Figure,4
4,fig-000005,Captain America with Short Legs,3


In [37]:
# Use pandas to load csv converted DataFrames into database
inventories_df.to_sql(name='inventories', con=engine, if_exists='replace', index=False)
pd.read_sql_query('select * from Inventories', con=engine).head()

Unnamed: 0,inventory_id,version,set_num
0,1,1,7922-1
1,3,1,3931-1
2,4,1,6942-1
3,15,1,5158-1
4,16,1,903-1


In [38]:
# inventory sets
inv_sets_df.to_sql(name='inventory_sets', con=engine, if_exists='replace', index=False)
pd.read_sql_query('select * from inventory_sets', con=engine).head()

Unnamed: 0,inventory_id,set_num
0,77818,41900-1
1,77818,41902-1
2,77818,41905-1
3,77818,41908-1
4,77814,60247-1


In [39]:
# inventory parts
inv_parts_df.to_sql(name='inventory_parts', con=engine, if_exists='replace', index=False)
pd.read_sql_query('select * from inventory_parts', con=engine).head()

Unnamed: 0,inventory_id,part_num,color_id
0,1,48379c01,72
1,1,48395,7
2,1,mcsport6,25
3,1,paddle,0
4,3,2343,47


In [40]:
# inventory minifigs
inv_minifigs_df.to_sql(name='inventory_minifigs', con=engine, if_exists='replace', index=False)
pd.read_sql_query('select * from inventory_minifigs', con=engine).head()

Unnamed: 0,inventory_id,fig_num
0,3,fig-001549
1,4,fig-000764
2,19,fig-000555
3,25,fig-000574
4,26,fig-000842


### Using Aggregate function count 
To add missing column number of minifigs in each set

In [41]:
# Calculating number of minifigs in each set
query = "select i.set_num, count(m.fig_num) as num_figs \
         from inventories as i \
         inner join inventory_minifigs as m \
            on i.inventory_id = m.inventory_id \
         group by set_num"
minifig_count_df = pd.read_sql_query(query, con=engine)
minifig_count_df

Unnamed: 0,set_num,num_figs
0,75178-1,5
1,6676-1,2
2,70429-1,3
3,71018-4,1
4,41380-1,2
...,...,...
6206,71009-11,1
6207,871082300512100119-1,1
6208,951805-1,2
6209,40335-1,1


In [42]:
sets_df = pd.read_sql_query('select * from sets', con=engine)
sets_df

Unnamed: 0,set_num,name,theme_id,num_parts
0,0241401208-1,Cute Ideas,497,0
1,0744023726-1,Disney Princess: Enchanted Treasury,497,4
2,0744023734-1,LEGO Minifigure: A Visual History New Edition,497,5
3,0744024471-1,100 Ways to Rebuild the World,497,0
4,10270-1,Bookshop,155,2504
...,...,...,...,...
784,TEDDYBEAR-1,Teddy Bear,232,67
785,TOUCAN2020-1,Toucan,621,22
786,TOWEL-1,Cars Microfiber Towel,501,0
787,TRADINGCARD-3,Create the World Trading Cards: Living Amazingly,501,137


In [43]:
sets_df = sets_df.merge(minifig_count_df, how='inner', on='set_num')
sets_df

Unnamed: 0,set_num,name,theme_id,num_parts,num_figs
0,0744023726-1,Disney Princess: Enchanted Treasury,497,4,1
1,0744023734-1,LEGO Minifigure: A Visual History New Edition,497,5,1
2,10270-1,Bookshop,155,2504,5
3,10273-1,Haunted House,673,3232,8
4,10275-1,Elf Club House,227,1197,4
...,...,...,...,...,...
385,9781913399085-1,Star Wars: Amazing Starships,497,5,1
386,9783960805113-1,Rätselspass für Dino-Fans,497,6,1
387,MINIFIGCOL-1,Cool & Cute Minifigure Collection,408,15,3
388,MINIFIGCOL2-1,Halloween Minifigure Collection,408,15,3


In [44]:
sets_df.to_sql(name='sets', con=engine, if_exists='replace', index=False)
pd.read_sql_query('select * from sets', con=engine).head()

Unnamed: 0,set_num,name,theme_id,num_parts,num_figs
0,0744023726-1,Disney Princess: Enchanted Treasury,497,4,1
1,0744023734-1,LEGO Minifigure: A Visual History New Edition,497,5,1
2,10270-1,Bookshop,155,2504,5
3,10273-1,Haunted House,673,3232,8
4,10275-1,Elf Club House,227,1197,4
