In [1]:
#####
###
##
#    SYNOPSIS
#
#    We are a small data analytics firm. Our client approached us with an urgent project to collect
#    information about this year’s LEGO sets. They are part of the manufacturing process of LEGO pieces and
#    want to do predictive data modelling to find out which materials will be in higher demand next year,
#    based on this year’s information, so that they can optimise their manufacturing process.
#
#    They have requested the information be put into database format so that it can be accessed in a way
#    that they are already familiar with.
#
#    Based on the timeframe of 1 week, we have assigned the following team members to this project:
#
#    CONTRIBUTORS
#    
#    Sylvia Broadbent @github/Supasyl
#    Cicily George @github/CicilyGeorge
#    Daniel Sobral @github/D0SO
#    John Bingley @github/JB-DA
#
#    Source and output can be found (with access) on https://github.com/Supasyl/ETL_project
#
##
###
#####


### SETTINGS
##
# Dependencies
import json
import os

import pandas as pd
import requests
from sqlalchemy import create_engine
from sqlalchemy import inspect

# Toggle for the validation previews in the 'VIEW DATA' section; they are displayed only when this is exactly 'yes'
viewHeaders = 'no' #for displaying dataframes, yes/no
# Number of rows passed to .head() for each preview table
headSize = 5 #rows to show if above line = yes


### LOAD DATA FROM CSV
##
# Load pre-downloaded information
# Each CSV is read in turn; the paths are listed in the same order as the names they are unpacked into
(
    df_colors,
    df_elements,
    df_inventories,
    df_inventory_minifigs,
    df_inventory_parts,
    df_inventory_sets,
    df_minifigs,
    df_part_categories,
    df_part_relationships,
    df_parts,
    df_sets,
    df_themes,
) = map( pd.read_csv, [
    'data_raw/colors.csv',
    'data_raw/elements.csv',
    'data_raw/inventories.csv',
    'data_raw/inventory_minifigs.csv',
    'data_raw/inventory_parts.csv',
    'data_raw/inventory_sets.csv',
    'data_raw/minifigs.csv',
    'data_raw/part_categories.csv',
    'data_raw/part_relationships.csv',
    'data_raw/parts.csv',
    'data_raw/sets.csv',
    'data_raw/themes.csv',
])


### LOAD DATA FROM API
##
# Left as proof of code only. Results were stored to api_sets_2020.json
# API requires ' to be used in URL, converted to %27 for ease of use
# Comment out code block and api_key before publishing

# api_key = "<your Brickset API key>"  # redacted - never commit a real key, even inside a comment
# url = "https://brickset.com/api/v3.asmx/getSets?"
# query_url = f"{url}apiKey={api_key}&userHash=&params={{ %27year%27:%272020%27, %27pageSize%27 : 900 }}"

# response = requests.get( query_url )

# with open( 'api_sets_2020.json', 'w' ) as ii:
#     json.dump( response.json(), ii )


### CLEAN & JOIN DATA
##
# Keep only the sets released in 2020; the 'year' column is constant after filtering, so it is dropped
released_in_2020 = df_sets[ 'year' ] == 2020
df_clean_sets = df_sets.loc[ released_in_2020 ].drop( columns = 'year' )

# Inventories
# Rename the key first so it lines up with the 'inventory_id' column used by the other inventory tables
df_inventories = df_inventories.rename( columns = { 'id' : 'inventory_id' })
df_clean_inventories = (
    df_clean_sets
    .merge( df_inventories, how = 'inner', on = 'set_num' )
    .loc[ :, [ 'inventory_id', 'version', 'set_num' ]]
)

# Inventory Sets
# Only rows whose set_num is one of the 2020 sets survive the inner join
df_clean_inventory_sets = (
    df_clean_sets
    .merge( df_inventory_sets, how = 'inner', on = 'set_num' )
    .loc[ :, [ 'inventory_id', 'set_num', 'quantity' ]]
)

# Themes
# 'name' is renamed so it cannot collide with the sets' own 'name' column during the join
df_themes = df_themes.rename( columns = { 'id' : 'theme_id', 'name' : 'theme_name' })
df_clean_themes = (
    df_themes
    .merge( df_clean_sets, how = 'inner', on = 'theme_id' )
    .loc[ :, [ 'theme_id', 'theme_name', 'parent_id' ]]
    .drop_duplicates() # one row per theme; the join yields one row per matching set
)

# Inventory Minifigs
# Restrict the minifig inventory lines to the inventories kept above
df_clean_inventory_minifigs = (
    df_clean_inventories
    .merge( df_inventory_minifigs, how = 'inner', on = 'inventory_id' )
    .loc[ :, [ 'inventory_id', 'fig_num', 'quantity' ]]
)

# Minifigs
# A figure can appear in several inventories, so the joined rows are de-duplicated
df_clean_minifigs = (
    df_clean_inventory_minifigs
    .merge( df_minifigs, how = 'inner', on = 'fig_num' )
    .loc[ :, [ 'fig_num', 'name', 'num_parts' ]]
    .drop_duplicates()
)

# Inventory Parts
# Keep only the part lines that belong to the inventories retained above.
# The 't'/'f' flags are converted to booleans with .assign(), which returns a new frame; assigning the
# column onto a [[...]] selection of the merge result instead triggers pandas' SettingWithCopyWarning.
df_clean_inventory_parts = (
    df_inventory_parts
    .merge( df_clean_inventories, how = 'inner', on = 'inventory_id' )
    .loc[ :, [ 'inventory_id', 'part_num', 'color_id', 'quantity', 'is_spare' ]]
    .assign( is_spare = lambda frame : frame[ 'is_spare' ].map({ 't' : True, 'f' : False }))
)

# Colours
# Keep only the colours actually used by the retained inventory parts.
df_colors = df_colors.rename( columns = { 'id' : 'color_id' })
df_clean_colors = (
    df_clean_inventory_parts
    .merge( df_colors, how = 'inner', on = 'color_id' )
    .loc[ :, [ 'color_id', 'name', 'rgb', 'is_trans' ]]
    # The join yields one row per inventory part line; collapse to one row per colour before converting
    .drop_duplicates()
    # .assign() returns a new frame, so the 't'/'f' -> boolean conversion never writes into a slice
    # of the merge result (which would trigger pandas' SettingWithCopyWarning)
    .assign( is_trans = lambda frame : frame[ 'is_trans' ].map({ 't' : True, 'f' : False }))
)

# Parts
# One row per part referenced by the retained inventory part lines
df_clean_parts = (
    df_clean_inventory_parts
    .merge( df_parts, how = 'inner', on = 'part_num' )
    .loc[ :, [ 'part_num', 'name', 'part_cat_id' ]]
    .drop_duplicates()
)

# Elements
# Every element row whose part_num is one of the retained parts
df_clean_elements = (
    df_clean_parts
    .merge( df_elements, how = 'inner', on = 'part_num' )
    .loc[ :, [ 'element_id', 'part_num', 'color_id' ]]
    .drop_duplicates()
)

# Part Categories
# 'name' is renamed so it cannot collide with the parts' own 'name' column during the join
df_part_categories = df_part_categories.rename( columns = { 'id' : 'part_cat_id', 'name' : 'part_name' })
df_clean_part_categories = (
    df_clean_parts
    .merge( df_part_categories, how = 'inner', on = 'part_cat_id' )
    .loc[ :, [ 'part_cat_id', 'part_name' ]]
    .drop_duplicates()
)

# Part Relationships
# The child side of each relationship is matched against the retained parts
df_part_relationships = df_part_relationships.rename( columns = { 'child_part_num' : 'part_num' })
df_clean_part_relationships = (
    df_clean_parts
    .merge( df_part_relationships, how = 'inner', on = 'part_num' )
    .loc[ :, [ 'rel_type', 'part_num', 'parent_part_num' ]]
    .drop_duplicates()
)

# API Data Load
# Read the API results that were saved to disk earlier instead of calling the API again
with open( 'api_sets_2020.json', 'r' ) as api_file:
    api_payload = json.load( api_file )

api_sets_2020 = pd.DataFrame( api_payload[ 'sets' ]) # one row per entry under the 'sets' key

# Keep only the desired columns, rename 'number' to 'set_num' and append '-1' so the
# values match the set_num format used in 'sets'
df_api_sets_2020 = (
    api_sets_2020[[ 'number', 'rating', 'reviewCount' ]]
    .rename( columns = { 'number' : 'set_num' })
    .assign( set_num = lambda frame : frame[ 'set_num' ] + '-1' )
)

# Merge API data with Sets Dataframe
# NOTE(review): this inner join runs after the other tables were filtered against df_clean_sets, so any
# set without an API match is dropped here while its inventories remain - confirm this is intended
df_clean_sets = df_clean_sets.merge( df_api_sets_2020, how = 'inner', on = 'set_num' )





### VIEW DATA (Validation Purposes Only)
##
# Uses value under 'SETTINGS' at top of file
if viewHeaders == 'yes':

    # ( caption, dataframe ) pairs, listed in the order they are displayed
    previews = [
        ( 'Table: Colours', df_clean_colors ),
        ( 'Table: Elements', df_clean_elements ),
        ( 'Table: Inventories', df_clean_inventories ),
        ( 'Table: Inventory Mini-figures', df_clean_inventory_minifigs ),
        ( 'Table: Inventory Parts', df_clean_inventory_parts ),
        ( 'Table: Inventory Sets', df_clean_inventory_sets ),
        ( 'Table: Mini-figures', df_clean_minifigs ),
        ( 'Table: Part Categories', df_clean_part_categories ),
        ( 'Table: Part Relationships', df_clean_part_relationships ),
        ( 'Table: Parts', df_clean_parts ),
        ( 'Table: Sets', df_clean_sets ),
        ( 'Table: Themes', df_clean_themes ),
    ]

    # Show the first headSize rows of every table, each under its own caption
    for caption, frame in previews:
        display( frame.head( headSize ).style.set_caption( caption ))
    # END IF


### LEGACY CODE
##
#

Unnamed: 0,color_id,name,rgb,is_trans
0,322,Medium Azure,36AEBF,False
841,71,Light Bluish Gray,A0A5A9,False
8522,15,White,FFFFFF,False
15199,0,Black,05131D,False
24919,19,Tan,E4CD9E,False


Unnamed: 0,element_id,part_num,color_id
0,6031916,11954,0
1,6108906,11954,1
2,6036771,11954,4
3,6022952,11954,14
4,6164386,11954,73


Unnamed: 0,inventory_id,version,set_num
0,74939,1,0241401208-1
1,75721,1,0744023726-1
2,75724,1,0744023734-1
3,75722,1,0744024471-1
4,43322,1,10270-1


Unnamed: 0,inventory_id,fig_num,quantity
0,75721,fig-002113,1
1,75724,fig-010692,1
2,43322,fig-008333,1
3,43322,fig-008334,1
4,43322,fig-008335,1


Unnamed: 0,inventory_id,part_num,color_id,quantity,is_spare
0,34280,11954,322,2,False
1,34280,14720,71,4,False
2,34280,14769pr0080,15,2,False
3,34280,15458,0,2,False
4,34280,15535,0,4,False


Unnamed: 0,inventory_id,set_num,quantity
0,77818,41900-1,1
1,77818,41902-1,1
2,77818,41905-1,1
3,77818,41908-1,1
4,77814,60247-1,1


Unnamed: 0,fig_num,name,num_parts
0,fig-002113,"Moana - Red/Pink Shirt, Tan Skirt",4
4,fig-010692,"Classic Spaceman, Orange with Airtanks",5
5,fig-008333,"Kid, Dark Blue Torso, Dark Green Legs, Blue Cap, Lime Scarf",5
6,fig-008334,"Medium Lavender Torso, Medium Blue Legs, Black Hair",4
7,fig-008335,"Medium Dark Flesh Torso, Dark Blue Legs, Reddish Brown Hair",4


Unnamed: 0,part_cat_id,part_name
0,40,Technic Panels
29,55,Technic Beams Special
42,67,Tiles Round and Curved
204,15,Tiles Special
255,52,Technic Gears


Unnamed: 0,rel_type,part_num,parent_part_num
0,A,11954,62531
1,P,14769pr0080,14769
2,M,18575,32269
3,M,32062,3704
4,A,3738,3034


Unnamed: 0,part_num,name,part_cat_id
0,11954,Technic Panel Curved 11 x 3 with 10 Pin Holes through Panel Surface,40
5,14720,Technic Beam 3 x 5 Thick [90° Offset Centre Beam Holes],55
25,14769pr0080,Tile Round 2 x 2 with Black Spiral Print,67
26,15458,Technic Panel 3 X 11 x 1,40
39,15535,Tile 2 x 2 Round with Hole,15


Unnamed: 0,set_num,name,theme_id,num_parts,rating,reviewCount
0,10270-1,Bookshop,155,2504,4.3,3
1,10271-1,Fiat 500,673,960,4.1,0
2,10272-1,Old Trafford - Manchester United,673,3898,4.3,0
3,10273-1,Haunted House,673,3232,4.3,0
4,10274-1,ECTO-1,607,2350,0.0,0


Unnamed: 0,theme_id,theme_name,parent_id
0,1,Technic,
13,6,Airport,5.0
14,7,Construction,5.0
16,13,Riding Cycle,5.0
17,22,Creator,


In [2]:
### PUSH TO DATABASE
##
# Connect to database
# The connection URL can be supplied through the LEGO_DB_URL environment variable so that real
# credentials do not have to live in the notebook; the fallback is the local development database.
engine = create_engine( os.environ.get( 'LEGO_DB_URL', 'postgresql://postgres:postgres@localhost:5432/Lego_db' ))

# Check table names
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
# inspect() is used because Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0.
print( inspect( engine ).get_table_names() )

# Load dataframes into database
# ( table name, dataframe ) pairs in load order.
# NOTE(review): the order presumably loads referenced tables before the tables that point at them -
# confirm against the database schema before reordering.
tables_to_load = [
    ( 'themes', df_clean_themes ),
    ( 'sets', df_clean_sets ),
    ( 'inventories', df_clean_inventories ),
    ( 'inventory_sets', df_clean_inventory_sets ),
    ( 'minifigs', df_clean_minifigs ),
    ( 'inventory_minifigs', df_clean_inventory_minifigs ),
    ( 'part_categories', df_clean_part_categories ),
    ( 'colors', df_clean_colors ),
    ( 'parts', df_clean_parts ),
    # ( 'part_relationships', df_clean_part_relationships ),
    ( 'elements', df_clean_elements ),
    ( 'inventory_parts', df_clean_inventory_parts ),
]

# A single transaction for the whole load: it is committed when the block exits normally and rolled
# back if any insert fails, so a failed run does not leave some tables appended and others not.
# The connection is also closed on exit instead of being left open.
with engine.begin() as connection:
    for table_name, frame in tables_to_load:
        frame.to_sql( name = table_name, con = connection, if_exists = 'append', index = False )


# Query records in database
pd.read_sql_query('select * from sets', con=engine).head()

Unnamed: 0,set_num,name,theme_id,num_parts,rating,reviewCount
0,10270-1,Bookshop,155,2504,4.3,3
1,10271-1,Fiat 500,673,960,4.1,0
2,10272-1,Old Trafford - Manchester United,673,3898,4.3,0
3,10273-1,Haunted House,673,3232,4.3,0
4,10274-1,ECTO-1,607,2350,0.0,0
