In [1]:
# Dependencies
import pandas as pd
from sqlalchemy import create_engine


## Data cleaning

In [31]:
# Loading all csv files into dataframes
sets_df = pd.read_csv("RebrickableCSVs/sets.csv")
parts_df = pd.read_csv("RebrickableCSVs/parts.csv")
colors_df = pd.read_csv("RebrickableCSVs/colors.csv")
themes_df = pd.read_csv("RebrickableCSVs/themes.csv")
elements_df = pd.read_csv("RebrickableCSVs/elements.csv")
inventories_df = pd.read_csv("RebrickableCSVs/inventories.csv")
inv_sets_df = pd.read_csv("RebrickableCSVs/inventory_sets.csv")
inv_parts_df = pd.read_csv("RebrickableCSVs/inventory_parts.csv")
category_df = pd.read_csv("RebrickableCSVs/part_categories.csv")
minifigs_df = pd.read_csv("RebrickableCSVs/minifigs.csv")
inv_minifigs_df = pd.read_csv("RebrickableCSVs/inventory_minifigs.csv")

sets_df.head()

Unnamed: 0,part_num,name,part_cat_id,part_material
0,3434,Sticker Sheet for Set 653-1,58,Cardboard/Paper
1,4219,"Sticker Sheet for Set 939-1 with flags for AU, IE",58,Plastic
2,4229,Sticker Sheet for Set 295-1,58,Plastic
3,4284,Sticker Sheet for Set 723-2,58,Plastic
4,4285,Sticker Sheet for Set 725-2,58,Plastic


In [3]:
# Extracting only 2020 sets
sets_df = sets_df.loc[sets_df['year']==2020]
sets_df.drop('year', axis=1, inplace=True)
print(f"Total sets in 2020: {len(sets_df.index)}")

Total sets in 2020: 789


In [4]:
# Extracting inventories with sets in year 2020
temp_df = pd.merge(sets_df, inventories_df, how='inner', on='set_num')
inventories_df = temp_df[['id','version','set_num']]
inventories_df = inventories_df.rename(columns={'id':'inventory_id'})
print(f"Total inventory in 2020: {len(inventories_df.index)}")
inventories_df.head()

Total inventory in 2020: 875


Unnamed: 0,inventory_id,version,set_num
0,74939,1,0241401208-1
1,75721,1,0744023726-1
2,75724,1,0744023734-1
3,75722,1,0744024471-1
4,43322,1,10270-1


In [5]:
# Checking the row with same set_num
inventories_df.loc[inventories_df['set_num']=='71026-18']

Unnamed: 0,inventory_id,version,set_num
526,43617,1,71026-18
527,43650,2,71026-18


In [6]:
# Checking for duplicate rows
inventories_df.loc[inventories_df.duplicated(keep='first') == True]

Unnamed: 0,inventory_id,version,set_num


In [7]:
inventories_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 875 entries, 0 to 874
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   inventory_id  875 non-null    int64 
 1   version       875 non-null    int64 
 2   set_num       875 non-null    object
dtypes: int64(2), object(1)
memory usage: 27.3+ KB


In [8]:
# Extracting inventory_parts with sets in year 2020
temp_df = pd.merge(inv_parts_df, inventories_df, how='inner', on='inventory_id')
inv_parts_df = temp_df[['inventory_id','part_num','color_id']]
print(f"Total inventory_parts rows: {len(inv_parts_df.index)}")
inv_parts_df.head()

Total inventory_parts rows: 60599


Unnamed: 0,inventory_id,part_num,color_id
0,34280,11954,322
1,34280,14720,71
2,34280,14769pr0080,15
3,34280,15458,0
4,34280,15535,0


In [9]:
# Checking for duplicate rows
inv_parts_df.loc[inv_parts_df.duplicated(keep='first') == True]

Unnamed: 0,inventory_id,part_num,color_id
11,34280,2780,0
37,34280,3713,71
105,34280,98138pr0062,15
107,34280,98138pr0076,0
215,37032,11253,71
...,...,...,...
59807,78388,85861,70
59817,78388,87994,308
60421,79137,25269,3
60427,79137,3024,3


In [10]:
# Removing duplicated rows while joining
inv_parts_df = inv_parts_df.drop_duplicates(keep='first')
print(f"Total inventory_parts in 2020: {len(inv_parts_df.index)}")
# Checking for duplicate rows again
parts_df.loc[parts_df.duplicated(keep='first') == True]

Total inventory_parts in 2020: 56585


Unnamed: 0,part_num,name,part_cat_id,part_material


In [11]:
# Extracting parts with sets in year 2020
temp_df = pd.merge(parts_df, inv_parts_df, how='inner', on='part_num')
parts_df = temp_df[['part_num','name','part_cat_id']]
parts_df = parts_df.rename(columns={'part_cat_id':'category_id'})
print(f"Total parts rows: {len(parts_df.index)}")
parts_df.head()

Total parts rows: 56585


Unnamed: 0,part_num,name,category_id
0,10050,Weapon Sword (Uruk-hai),27
1,10050,Weapon Sword (Uruk-hai),27
2,10050,Weapon Sword (Uruk-hai),27
3,10050,Weapon Sword (Uruk-hai),27
4,10050,Weapon Sword (Uruk-hai),27


In [12]:
# Checking for duplicate rows
parts_df.loc[parts_df.duplicated(keep='first') == True]

Unnamed: 0,part_num,name,category_id
1,10050,Weapon Sword (Uruk-hai),27
2,10050,Weapon Sword (Uruk-hai),27
3,10050,Weapon Sword (Uruk-hai),27
4,10050,Weapon Sword (Uruk-hai),27
5,10050,Weapon Sword (Uruk-hai),27
...,...,...,...
56566,upn0038pr0002,"Animal, Sloth with Tan Belly, Bright Light Yel...",28
56567,upn0038pr0002,"Animal, Sloth with Tan Belly, Bright Light Yel...",28
56568,upn0038pr0002,"Animal, Sloth with Tan Belly, Bright Light Yel...",28
56569,upn0038pr0002,"Animal, Sloth with Tan Belly, Bright Light Yel...",28


In [13]:
# Removing duplicated rows while joining
parts_df = parts_df.drop_duplicates(keep='first')
print(f"Total parts in 2020: {len(parts_df.index)}")
# Checking for duplicate rows again
parts_df.loc[parts_df.duplicated(keep='first') == True]

Total parts in 2020: 3635


Unnamed: 0,part_num,name,category_id


In [14]:
# Extracting colors within sets in year 2020
colors_df = colors_df.rename(columns={'id':'color_id'})
temp_df = pd.merge(colors_df, inv_parts_df, how='inner', on='color_id')
colors_df = temp_df[['color_id','name','rgb','is_trans']]
print(f"Total colors rows: {len(colors_df.index)}")
print(f"Unique rows: {colors_df['name'].nunique()}")

Total colors rows: 56585
Unique rows: 76


In [15]:
# Removing duplicated rows while joining
colors_df = colors_df.drop_duplicates(keep='first')
print(f"Total colors in 2020: {len(colors_df.index)}")
# Checking for duplicate rows
colors_df.loc[colors_df.duplicated(keep='first') == True]

Total colors in 2020: 76


Unnamed: 0,color_id,name,rgb,is_trans


In [16]:
# Converting is_trans to Boolean datatype
colors_df['is_trans'] = colors_df['is_trans'].map({'t': True, 'f': False})
colors_df.head()

Unnamed: 0,color_id,name,rgb,is_trans
0,0,Black,05131D,False
9158,1,Blue,0055BF,False
10488,2,Green,237841,False
11353,3,Dark Turquoise,008F9B,False
11773,4,Red,C91A09,False


In [17]:
colors_df['is_trans'].dtype

dtype('bool')

In [18]:
# Extracting themes with sets in year 2020
themes_df = themes_df.rename(columns={'id':'theme_id','name':'theme'})
temp_df = pd.merge(sets_df, themes_df, how='inner', on='theme_id')
themes_df = temp_df[['theme_id','theme']]

print(f"Total themes rows: {len(themes_df.index)}")
themes_df.head()

Total themes rows: 789


Unnamed: 0,theme_id,theme
0,497,Books
1,497,Books
2,497,Books
3,497,Books
4,497,Books


In [19]:
# Removing duplicated rows while joining
themes_df = themes_df.drop_duplicates(keep='first')
print(f"Total themes in 2020: {len(themes_df.index)}")
# Checking for duplicate rows
themes_df.loc[themes_df.duplicated(keep='first') == True]

Total themes in 2020: 86


Unnamed: 0,theme_id,theme


In [20]:
# Extracting elements with sets in year 2020
temp_df = pd.merge(elements_df, parts_df, how='inner', on='part_num')
elements_df = temp_df[['element_id','part_num','color_id']]
print(f"Total elements in 2020: {len(elements_df.index)}")
elements_df.head()

Total elements in 2020: 23126


Unnamed: 0,element_id,part_num,color_id
0,6194308,92926,71
1,4626650,92926,2
2,4599973,92926,71
3,4626648,92926,71
4,4626645,92926,72


In [21]:
# Checking for duplicate rows
elements_df.loc[elements_df.duplicated(keep='first') == True]

Unnamed: 0,element_id,part_num,color_id


In [22]:
# Extracting inventory_minifigs within sets in year 2020
temp_df = pd.merge(inv_minifigs_df, inventories_df, how='inner', on='inventory_id')
inv_minifigs_df = temp_df[['inventory_id','fig_num']]

print(f"Total inventory_minifigs in 2020: {len(inv_minifigs_df.index)}")
inv_minifigs_df.head()

Total inventory_minifigs in 2020: 1081


Unnamed: 0,inventory_id,fig_num
0,34280,fig-009497
1,34280,fig-009498
2,37032,fig-001939
3,37032,fig-003501
4,37032,fig-003502


In [23]:
# Checking for duplicate rows
inv_minifigs_df.loc[inv_minifigs_df.duplicated(keep='first') == True]

Unnamed: 0,inventory_id,fig_num


In [24]:
# Extracting minifigs within sets in year 2020
temp_df = pd.merge(minifigs_df, inv_minifigs_df, how='inner', on='fig_num')
minifigs_df = temp_df[['fig_num','name','num_parts']]

print(f"Total minifigs rows: {len(minifigs_df.index)}")
minifigs_df.head()

Total minifigs rows: 1081


Unnamed: 0,fig_num,name,num_parts
0,fig-000001,Toy Store Employee,4
1,fig-000002,Customer Kid,4
2,fig-000006,Lloyd Avatar,5
3,fig-000007,Lloyd / Digi Lloyd - Katana Holders,5
4,fig-000007,Lloyd / Digi Lloyd - Katana Holders,5


In [25]:
# Removing duplicated rows while joining
minifigs_df = minifigs_df.drop_duplicates(keep='first')
print(f"Total minifigs in 2020: {minifigs_df['fig_num'].count()}")
# Checking for duplicate rows
minifigs_df.loc[minifigs_df.duplicated(keep='first') == True]

Total minifigs in 2020: 839


Unnamed: 0,fig_num,name,num_parts


In [26]:
inv_sets_df = inv_sets_df[['inventory_id','set_num']]
inv_sets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2965 entries, 0 to 2964
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   inventory_id  2965 non-null   int64 
 1   set_num       2965 non-null   object
dtypes: int64(1), object(1)
memory usage: 46.5+ KB


In [27]:
category_df = category_df.rename(columns={'id':'category_id'})
category_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65 entries, 0 to 64
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category_id  65 non-null     int64 
 1   name         65 non-null     object
dtypes: int64(1), object(1)
memory usage: 1.1+ KB


## Inserting to Database

In [28]:
# Create database connection
connection_string = "postgres:postgres@localhost:5432/Lego_db"
engine = create_engine(f'postgresql://{connection_string}')

In [29]:
# Confirm tables
engine.table_names()

['inventories',
 'inventory_sets',
 'sets',
 'themes',
 'inventory_minifigs',
 'minifigs',
 'inventory_parts',
 'parts',
 'colors',
 'parts_category',
 'elements']

In [30]:
# themes
themes_df.to_sql(name='themes', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from themes', con=engine).head()

IntegrityError: (psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "themes_pkey"
DETAIL:  Key (theme_id)=(497) already exists.

[SQL: INSERT INTO themes (theme_id, theme) VALUES (%(theme_id)s, %(theme)s)]
[parameters: ({'theme_id': 497, 'theme': 'Books'}, {'theme_id': 155, 'theme': 'Modular Buildings'}, {'theme_id': 673, 'theme': 'Creator Expert'}, {'theme_id': 607, 'theme': 'Ghostbusters'}, {'theme_id': 227, 'theme': 'Christmas'}, {'theme_id': 504, 'theme': 'Duplo'}, {'theme_id': 653, 'theme': 'DC Comics'}, {'theme_id': 640, 'theme': 'Disney Princess'}  ... displaying 10 of 86 total bound parameter sets ...  {'theme_id': 498, 'theme': 'Technic'}, {'theme_id': 232, 'theme': 'Valentine'})]
(Background on this error at: http://sqlalche.me/e/13/gkpj)

In [None]:
# sets
sets_df.to_sql(name='sets', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from sets', con=engine).head()

In [None]:
# parts
parts_df.to_sql(name='parts', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from parts', con=engine).head()

In [None]:
# colors
colors_df.to_sql(name='colors', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from colors', con=engine).head()

In [None]:
# elements
elements_df.to_sql(name='elements', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from elements', con=engine).head()

In [None]:
# parts_category
category_df.to_sql(name='parts_category', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from parts_category', con=engine).head()

In [None]:
# minifigs
minifigs_df.to_sql(name='minifigs', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from minifigs', con=engine).head()

In [None]:
# Use pandas to load csv converted DataFrames into database
inventories_df.to_sql(name='inventories', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from Inventories', con=engine).head()

In [None]:
# inventory sets
inv_sets_df.to_sql(name='inventory_sets', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from inventory_sets', con=engine).head()

In [None]:
# inventory parts
inv_parts_df.to_sql(name='inventory_parts', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from inventory_parts', con=engine).head()

In [None]:
# inventory minifigs
inv_minifigs_df.to_sql(name='inventory_minifigs', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from inventory_minifigs', con=engine).head()

In [None]:
pd.read_sql_query('select count(fignum) from minifigs join sets', con=engine).head()