In [1]:
# Dependencies
import pandas as pd
from sqlalchemy import create_engine


## Data cleaning

In [2]:
# Loading all csv files into dataframes
sets_df = pd.read_csv("Resources/sets.csv")
parts_df = pd.read_csv("Resources/parts.csv")
colors_df = pd.read_csv("Resources/colors.csv")
themes_df = pd.read_csv("Resources/themes.csv")
elements_df = pd.read_csv("Resources/elements.csv")
inventories_df = pd.read_csv("Resources/inventories.csv")
inv_sets_df = pd.read_csv("Resources/inventory_sets.csv")
inv_parts_df = pd.read_csv("Resources/inventory_parts.csv")
category_df = pd.read_csv("Resources/part_categories.csv")
minifigs_df = pd.read_csv("Resources/minifigs.csv")
inv_minifigs_df = pd.read_csv("Resources/inventory_minifigs.csv")

sets_df.head()

Unnamed: 0,set_num,name,year,theme_id,num_parts
0,001-1,Gears,1965,1,43
1,0011-2,Town Mini-Figures,1978,84,12
2,0011-3,Castle 2 for 1 Bonus Offer,1987,199,0
3,0012-1,Space Mini-Figures,1979,143,12
4,0013-1,Space Mini-Figures,1979,143,12


In [3]:
# Extracting only 2020 sets
sets_df = sets_df.loc[sets_df['year']==2020]
sets_df.drop('year', axis=1, inplace=True)
print(f"Total sets in 2020: {len(sets_df.index)}")

Total sets in 2020: 338


In [4]:
# Extracting inventories with sets in year 2020
temp_df = pd.merge(sets_df, inventories_df, how='inner', on='set_num')
inventories_df = temp_df[['id','version','set_num']]
inventories_df = inventories_df.rename(columns={'id':'inventory_id'})
print(f"Total inventory in 2020: {len(inventories_df.index)}")
inventories_df.head()

Total inventory in 2020: 339


Unnamed: 0,inventory_id,version,set_num
0,43322,1,10270-1
1,47893,1,10271-1
2,45169,1,10272-1
3,43351,1,10909-1
4,42591,1,10913-1


In [5]:
# Checking the row with same set_num
inventories_df.loc[inventories_df['set_num']=='71026-18']

Unnamed: 0,inventory_id,version,set_num
228,43617,1,71026-18
229,43650,2,71026-18


In [6]:
# Checking for duplicate rows
inventories_df.loc[inventories_df.duplicated(keep='first') == True]

Unnamed: 0,inventory_id,version,set_num


In [7]:
inventories_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 339 entries, 0 to 338
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   inventory_id  339 non-null    int64 
 1   version       339 non-null    int64 
 2   set_num       339 non-null    object
dtypes: int64(2), object(1)
memory usage: 10.6+ KB


In [8]:
# Extracting inventory_parts with sets in year 2020
temp_df = pd.merge(inv_parts_df, inventories_df, how='inner', on='inventory_id')
inv_parts_df = temp_df[['inventory_id','part_num','color_id']]
print(f"Total inventory_parts rows: {len(inv_parts_df.index)}")
inv_parts_df.head()

Total inventory_parts rows: 21144


Unnamed: 0,inventory_id,part_num,color_id
0,34280,11954,322
1,34280,14720,71
2,34280,14769pr0080,15
3,34280,15458,0
4,34280,15535,0


In [9]:
# Checking for duplicate rows
inv_parts_df.loc[inv_parts_df.duplicated(keep='first') == True]

Unnamed: 0,inventory_id,part_num,color_id
215,37032,11253,71
251,37032,3024,19
253,37032,3024,28
268,37032,3070b,15
270,37032,3070bpr0166,71
...,...,...,...
20603,47630,2780,0
20632,47630,3070b,0
20656,47630,4274,71
20663,47630,6141,46


In [10]:
# Removing duplicated rows while joining
inv_parts_df = inv_parts_df.drop_duplicates(keep='first')
print(f"Total inventory_parts in 2020: {len(inv_parts_df.index)}")
# Checking for duplicate rows again
parts_df.loc[parts_df.duplicated(keep='first') == True]

Total inventory_parts in 2020: 20196


Unnamed: 0,part_num,name,part_cat_id,part_material_id


In [11]:
# Extracting parts with sets in year 2020
temp_df = pd.merge(parts_df, inv_parts_df, how='inner', on='part_num')
parts_df = temp_df[['part_num','name','part_cat_id']]
parts_df = parts_df.rename(columns={'part_cat_id':'category_id'})
print(f"Total parts rows: {len(parts_df.index)}")
parts_df.head()

Total parts rows: 20196


Unnamed: 0,part_num,name,category_id
0,10050,Weapon Sword (Uruk-hai),27
1,10050,Weapon Sword (Uruk-hai),27
2,10111pr0075,"Duplo Figure with Headset and Cap Yellow, with...",57
3,10169,Bag / Sack with Handle,27
4,10169,Bag / Sack with Handle,27


In [12]:
# Checking for duplicate rows
parts_df.loc[parts_df.duplicated(keep='first') == True]

Unnamed: 0,part_num,name,category_id
1,10050,Weapon Sword (Uruk-hai),27
4,10169,Bag / Sack with Handle,27
7,10172,Equipment Trophy Cup Small,27
12,10197,Technic Pin Connector Hub with 2 Perpendicular...,12
13,10197,Technic Pin Connector Hub with 2 Perpendicular...,12
...,...,...,...
20182,99930,Minifig Hair Smooth Combed Sideways,65
20183,99930,Minifig Hair Smooth Combed Sideways,65
20184,99930,Minifig Hair Smooth Combed Sideways,65
20185,99930,Minifig Hair Smooth Combed Sideways,65


In [13]:
# Removing duplicated rows while joining
parts_df = parts_df.drop_duplicates(keep='first')
print(f"Total parts in 2020: {len(parts_df.index)}")
# Checking for duplicate rows again
parts_df.loc[parts_df.duplicated(keep='first') == True]

Total parts in 2020: 2971


Unnamed: 0,part_num,name,category_id


In [14]:
# Extracting colors within sets in year 2020
colors_df = colors_df.rename(columns={'id':'color_id'})
temp_df = pd.merge(colors_df, inv_parts_df, how='inner', on='color_id')
colors_df = temp_df[['color_id','name','rgb','is_trans']]
print(f"Total colors rows: {len(colors_df.index)}")
print(f"Unique rows: {colors_df['name'].nunique()}")

Total colors rows: 20196
Unique rows: 68


In [15]:
# Removing duplicated rows while joining
colors_df = colors_df.drop_duplicates(keep='first')
print(f"Total colors in 2020: {len(colors_df.index)}")
# Checking for duplicate rows
colors_df.loc[colors_df.duplicated(keep='first') == True]

Total colors in 2020: 68


Unnamed: 0,color_id,name,rgb,is_trans


In [16]:
# Converting is_trans to Boolean datatype
colors_df['is_trans'] = colors_df['is_trans'].map({'t': True, 'f': False})
colors_df.head()

Unnamed: 0,color_id,name,rgb,is_trans
0,0,Black,05131D,False
3219,1,Blue,0055BF,False
3798,2,Green,237841,False
4159,3,Dark Turquoise,008F9B,False
4355,4,Red,C91A09,False


In [17]:
colors_df['is_trans'].dtype

dtype('bool')

In [18]:
# Extracting themes with sets in year 2020
themes_df = themes_df.rename(columns={'id':'theme_id','name':'theme'})
temp_df = pd.merge(sets_df, themes_df, how='inner', on='theme_id')
themes_df = temp_df[['theme_id','theme']]

print(f"Total themes rows: {len(themes_df.index)}")
themes_df.head()

Total themes rows: 338


Unnamed: 0,theme_id,theme
0,155,Modular Buildings
1,155,Modular Buildings
2,673,Creator Expert
3,673,Creator Expert
4,673,Creator Expert


In [19]:
# Removing duplicated rows while joining
themes_df = themes_df.drop_duplicates(keep='first')
print(f"Total themes in 2020: {len(themes_df.index)}")
# Checking for duplicate rows
themes_df.loc[themes_df.duplicated(keep='first') == True]

Total themes in 2020: 51


Unnamed: 0,theme_id,theme


In [20]:
# Extracting elements with sets in year 2020
temp_df = pd.merge(elements_df, parts_df, how='inner', on='part_num')
elements_df = temp_df[['element_id','part_num','color_id']]
print(f"Total elements in 2020: {len(elements_df.index)}")
elements_df.head()

Total elements in 2020: 19999


Unnamed: 0,element_id,part_num,color_id
0,6194308,92926,71
1,4626650,92926,2
2,4599973,92926,71
3,4626648,92926,71
4,4626645,92926,72


In [21]:
# Checking for duplicate rows
elements_df.loc[elements_df.duplicated(keep='first') == True]

Unnamed: 0,element_id,part_num,color_id


In [22]:
# Extracting inventory_minifigs within sets in year 2020
temp_df = pd.merge(inv_minifigs_df, inventories_df, how='inner', on='inventory_id')
inv_minifigs_df = temp_df[['inventory_id','fig_num']]

print(f"Total inventory_minifigs in 2020: {len(inv_minifigs_df.index)}")
inv_minifigs_df.head()

Total inventory_minifigs in 2020: 398


Unnamed: 0,inventory_id,fig_num
0,34280,fig-009497
1,34280,fig-009498
2,37032,fig-001939
3,37032,fig-003501
4,37032,fig-003502


In [23]:
# Checking for duplicate rows
inv_minifigs_df.loc[inv_minifigs_df.duplicated(keep='first') == True]

Unnamed: 0,inventory_id,fig_num


In [24]:
# Extracting minifigs within sets in year 2020
temp_df = pd.merge(minifigs_df, inv_minifigs_df, how='inner', on='fig_num')
minifigs_df = temp_df[['fig_num','name','num_parts']]

print(f"Total minifigs rows: {len(minifigs_df.index)}")
minifigs_df.head()

Total minifigs rows: 398


Unnamed: 0,fig_num,name,num_parts
0,fig-000001,Toy Store Employee,4
1,fig-000002,Customer Kid,4
2,fig-000006,Lloyd Avatar,5
3,fig-000007,Lloyd / Digi Lloyd - Katana Holders,5
4,fig-000007,Lloyd / Digi Lloyd - Katana Holders,5


In [25]:
# Removing duplicated rows while joining
minifigs_df = minifigs_df.drop_duplicates(keep='first')
print(f"Total minifigs in 2020: {minifigs_df['fig_num'].count()}")
# Checking for duplicate rows
minifigs_df.loc[minifigs_df.duplicated(keep='first') == True]

Total minifigs in 2020: 361


Unnamed: 0,fig_num,name,num_parts


In [26]:
inv_sets_df = inv_sets_df[['inventory_id','set_num']]
inv_sets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3303 entries, 0 to 3302
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   inventory_id  3303 non-null   int64 
 1   set_num       3303 non-null   object
dtypes: int64(1), object(1)
memory usage: 51.7+ KB


In [27]:
category_df = category_df.rename(columns={'id':'category_id'})
category_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65 entries, 0 to 64
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category_id  65 non-null     int64 
 1   name         65 non-null     object
dtypes: int64(1), object(1)
memory usage: 1.1+ KB


## Inserting to Database

In [34]:
# Create database connection
connection_string = "postgres:postgres@localhost:5432/Lego_db"
engine = create_engine(f'postgresql://{connection_string}')

In [35]:
# Confirm tables
engine.table_names()

['inventories',
 'inventory_sets',
 'sets',
 'themes',
 'inventory_minifigs',
 'minifigs',
 'inventory_parts',
 'parts',
 'colors',
 'parts_category',
 'elements']

In [36]:
# themes
themes_df.to_sql(name='themes', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from themes', con=engine).head()

Unnamed: 0,theme_id,theme
0,155,Modular Buildings
1,673,Creator Expert
2,504,Duplo
3,621,Classic
4,435,Ninjago


In [39]:
# sets
sets_df.to_sql(name='sets', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from sets', con=engine).head()

Unnamed: 0,set_num,name,theme_id,num_parts
0,10270-1,Bookshop,155,2504
1,10271-1,Fiat 500,673,944
2,10272-1,Old Trafford - Manchester United,673,3898
3,10909-1,Heart Box,504,80
4,10913-1,Brick Box,504,65


In [40]:
# parts
parts_df.to_sql(name='parts', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from parts', con=engine).head()

Unnamed: 0,part_num,name,category_id
0,10050,Weapon Sword (Uruk-hai),27
1,10111pr0075,"Duplo Figure with Headset and Cap Yellow, with...",57
2,10169,Bag / Sack with Handle,27
3,10169pr0001,Bag / Sack with Handle and '$' / Dollar Sign P...,27
4,10172,Equipment Trophy Cup Small,27


In [45]:
# colors
colors_df.to_sql(name='colors', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from colors', con=engine).head()

Unnamed: 0,color_id,name,rgb,is_trans
0,0,Black,05131D,False
1,1,Blue,0055BF,False
2,2,Green,237841,False
3,3,Dark Turquoise,008F9B,False
4,4,Red,C91A09,False


In [46]:
# elements
elements_df.to_sql(name='elements', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from elements', con=engine).head()

Unnamed: 0,element_id,part_num,color_id
0,6194308,92926,71
1,4626650,92926,2
2,4599973,92926,71
3,4626648,92926,71
4,4626645,92926,72


In [47]:
# parts_category
category_df.to_sql(name='parts_category', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from parts_category', con=engine).head()

Unnamed: 0,category_id,name
0,1,Baseplates
1,3,Bricks Sloped
2,4,"Duplo, Quatro and Primo"
3,5,Bricks Special
4,6,Bricks Wedged


In [41]:
# minifigs
minifigs_df.to_sql(name='minifigs', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from minifigs', con=engine).head()

Unnamed: 0,fig_num,name,num_parts
0,fig-000001,Toy Store Employee,4
1,fig-000002,Customer Kid,4
2,fig-000006,Lloyd Avatar,5
3,fig-000007,Lloyd / Digi Lloyd - Katana Holders,5
4,fig-001669,"Olivia - Bright Light Yellow Vest, Dark Pink S...",4


In [37]:
# Use pandas to load csv converted DataFrames into database
inventories_df.to_sql(name='inventories', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from Inventories', con=engine).head()

Unnamed: 0,inventory_id,version,set_num
0,43322,1,10270-1
1,47893,1,10271-1
2,45169,1,10272-1
3,43351,1,10909-1
4,42591,1,10913-1


In [42]:
# inventory sets
inv_sets_df.to_sql(name='inventory_sets', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from inventory_sets', con=engine).head()

Unnamed: 0,inventory_id,set_num
0,35,75911-1
1,35,75912-1
2,39,75048-1
3,39,75053-1
4,50,4515-1


In [43]:
# inventory parts
inv_parts_df.to_sql(name='inventory_parts', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from inventory_parts', con=engine).head()

Unnamed: 0,inventory_id,part_num,color_id
0,34280,11954,322
1,34280,14720,71
2,34280,14769pr0080,15
3,34280,15458,0
4,34280,15535,0


In [44]:
# inventory minifigs
inv_minifigs_df.to_sql(name='inventory_minifigs', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from inventory_minifigs', con=engine).head()

Unnamed: 0,inventory_id,fig_num
0,34280,fig-009497
1,34280,fig-009498
2,37032,fig-001939
3,37032,fig-003501
4,37032,fig-003502


In [None]:
pd.read_sql_query('select count(fignum) from minifigs join sets', con=engine).head()