In [2]:
#####
###
##
#    SYNOPSIS
#
#    As a small data analytics firm, our client approached us with an urgent project to collect information
#    about this year’s LEGO sets. They are part of the manufacturing process of LEGO pieces; and want to do 
#    predictive data modelling to find out what materials will be more in demand next year, based on this
#    year’s information, so that they can optimise their manufacturing process.
#
#    They have requested the information be put into database format so that it can be accessed in a way
#    that they are already familiar with.
#
#    Based on the timeframe of 1 week, we have assigned the following team members to this project:
#
#    CONTRIBUTORS
#    
#    Sylvia Broadbent @github/Supasyl
#    Cicily George @github/CicilyGeorge
#    Daniel Sobral @github/D0SO
#    John Bingley @github/JB-DA
#
#    Source and output can be found (with access) on https://github.com/Supasyl/ETL_project
#
##
###
#####


### SETTINGS
##
# Dependencies
import pandas as pd
from sqlalchemy import create_engine
import requests
import json

viewHeaders = 'yes' #for displaying dataframes, yes/no
headSize = 5 #rows to show if above line = yes


### LOAD DATA FROM CSV
##
# Load pre-downloaded information
df_colors = pd.read_csv( 'data_raw/colors.csv' )
df_elements = pd.read_csv( 'data_raw/elements.csv' )
df_inventories = pd.read_csv( 'data_raw/inventories.csv' )
df_inventory_minifigs = pd.read_csv( 'data_raw/inventory_minifigs.csv' )
df_inventory_parts = pd.read_csv( 'data_raw/inventory_parts.csv' )
df_inventory_sets = pd.read_csv( 'data_raw/inventory_sets.csv' )
df_minifigs = pd.read_csv( 'data_raw/minifigs.csv' )
df_parts = pd.read_csv( 'data_raw/parts.csv' )
df_sets = pd.read_csv( 'data_raw/sets.csv' )
df_themes = pd.read_csv( 'data_raw/themes.csv' )


### LOAD DATA FROM API
##
# Left as proof of code only. Results were stored to sets2020.json
# API requires ' to be used in URL, converted to %27 for ease of use
# Comment out code block and api_key before publishing

# api_key = "3-DLfb-T3ZA-qQLjW"
# url = "https://brickset.com/api/v3.asmx/getSets?"
# query_url = f"{url}apiKey={api_key}&userHash=&params={{ %27year%27:%272020%27, %27pageSize%27 : 900 }}"

# response = requests.get( query_url )

# with open( 'api_sets_2020.json', 'w' ) as ii:
#     json.dump( response.json(), ii )






### CLEAN & JOIN DATA
##
# Looking at only data from 2020 sets
df_clean_sets = df_sets.loc[ df_sets[ 'year' ] == 2020 ].copy()
df_clean_sets.drop( 'year', axis = 1, inplace = True )


# Inventories
df_inventories = df_inventories.rename( columns = { 'id' : 'inventory_id' })
df_temp = pd.merge( df_clean_sets, df_inventories, how = 'inner', on = 'set_num' )
df_clean_inventories = df_temp[[ 'inventory_id', 'version', 'set_num' ]]

# Inventory Sets
df_temp = pd.merge( df_clean_sets, df_inventory_sets, how = 'inner', on = 'set_num' )
df_clean_inventory_sets = df_temp[[ 'inventory_id', 'set_num', 'quantity' ]]

# Themes
df_themes = df_themes.rename( columns = { 'id' : 'theme_id', 'name' : 'theme_name' })
df_temp = pd.merge( df_themes, df_clean_sets, how = 'inner', on = 'theme_id' )
df_clean_themes = df_temp[[ 'theme_id', 'theme_name', 'parent_id' ]]
df_clean_themes = df_clean_themes.drop_duplicates(keep='first')

# Inventory Minifigs
df_temp = pd.merge( df_clean_inventories, df_inventory_minifigs, how = 'inner', on = 'inventory_id' )
df_clean_inventory_minifigs = df_temp[[ 'inventory_id', 'fig_num', 'quantity' ]]

# Minifigs
df_temp = pd.merge( df_clean_inventory_minifigs, df_minifigs, how = 'inner', on = 'fig_num' )
df_clean_minifigs = df_temp[[ 'fig_num', 'name', 'num_parts' ]]
df_clean_minifigs = df_clean_minifigs.drop_duplicates(keep='first')

# Inventory Parts
df_temp = pd.merge( df_inventory_parts, df_clean_inventories, how = 'inner', on = 'inventory_id' )
df_clean_inventory_parts = df_temp[[ 'inventory_id', 'part_num', 'color_id', 'quantity', 'is_spare' ]]
df_clean_inventory_parts[ 'is_spare' ] = df_clean_inventory_parts[ 'is_spare' ].map({ 't' : True, 'f' : False })
df_clean_inventory_parts = df_clean_inventory_parts.drop_duplicates(keep='first')

# Colours
df_colors = df_colors.rename( columns = { 'id' : 'color_id' })
df_temp = pd.merge( df_clean_inventory_parts, df_colors, how = 'inner', on = 'color_id' )
df_clean_colors = df_temp[[ 'color_id', 'name', 'rgb', 'is_trans' ]]
df_clean_colors[ 'is_trans' ] = df_clean_colors[ 'is_trans' ].map({ 't' : True, 'f' : False })
df_clean_colors = df_clean_colors.drop_duplicates(keep='first')

# Parts
df_temp = pd.merge( df_clean_inventory_parts, df_parts, how = 'inner', on = 'part_num' )
df_clean_parts = df_temp[[ 'part_num', 'name' ]] # , 'part_cat_id'
df_clean_parts = df_clean_parts.drop_duplicates(keep='first')

# Elements
df_temp = pd.merge( df_clean_parts, df_elements, how = 'inner', on = 'part_num' )
df_clean_elements = df_temp[[ 'element_id', 'part_num', 'color_id' ]]
df_temp = pd.merge( df_clean_colors, df_clean_elements, how = 'inner', on = 'color_id' )
df_clean_elements = df_temp[[ 'element_id', 'part_num', 'color_id' ]]
df_clean_elements = df_clean_elements.drop_duplicates(keep='first')





# API Data Load
with open( 'api_sets_2020.json', 'r' ) as jj: #open pre-made api results
    json_d = json.load( jj )

api_sets_2020 = pd.DataFrame( json_d[ 'sets' ]) #load to dataframe

df_api_sets_2020 = api_sets_2020[[ 'number', 'rating', 'reviewCount' ]] #show only desired columns

# Set API column number to match Lego ID number
df_api_sets_2020 = df_api_sets_2020.rename( columns = { 'number' : 'set_num','reviewCount':'review_count' })
df_api_sets_2020.set_num = df_api_sets_2020.set_num + '-1' #append -1 to column to match data in 'sets'

# Merge API data with Sets Dataframe

df_clean_sets = pd.merge( df_clean_sets, df_api_sets_2020, how = 'left', on = 'set_num' )
df_clean_sets = df_clean_sets.drop_duplicates(keep='first')
df_clean_sets = df_clean_sets.drop_duplicates(['set_num'],keep='first')


### PUSH TO DATABASE
##
# Connect to database
# connection_string = "postgres:postgres@localhost:5432/Lego_db"
# engine = create_engine(f'postgresql://{connection_string}')
engine = create_engine(f"postgresql://postgres:postgres@localhost:5432/Lego_db")
engine.begin()
con = engine.connect()

# Check table names
engine.table_names()

# Load dataframes into database
df_clean_themes.to_sql( name = 'themes', con = engine, if_exists = 'append', index = False )
df_clean_sets.to_sql( name = 'sets', con = engine, if_exists = 'append', index = False )
df_clean_inventories.to_sql( name = 'inventories', con = engine, if_exists = 'append', index = False )
df_clean_inventory_sets.to_sql( name = 'inventory_sets', con = engine, if_exists = 'append', index = False )
df_clean_minifigs.to_sql( name = 'minifigs', con = engine, if_exists = 'append', index = False )
df_clean_inventory_minifigs.to_sql( name = 'inventory_minifigs', con = engine, if_exists = 'append', index = False )
# df_clean_part_categories.to_sql( name = 'part_categories', con = engine, if_exists = 'append', index = False )
df_clean_colors.to_sql( name = 'colors', con = engine, if_exists = 'append', index = False )
df_clean_parts.to_sql( name = 'parts', con = engine, if_exists = 'append', index = False )
# df_clean_part_relationships.to_sql( name = 'part_relationships', con = engine, if_exists = 'append', index = False )
df_clean_elements.to_sql( name = 'elements', con = engine, if_exists = 'append', index = False )
df_clean_inventory_parts.to_sql( name = 'inventory_parts', con = engine, if_exists = 'append', index = True )




# Query records in database for test purpose
# Query for total no.of pieces in a set
query = "select a.set_num, sum(b.quantity) as no_of_pieces\
    from inventories as a, inventory_parts as b\
    where a.inventory_id = b.inventory_id\
    group by a.set_num;"
pd.read_sql_query(query, con=engine).head()


# Query for total no.of minifigs in a set
query = "select a.set_num, sum(b.quantity) as num_figs\
    from inventories as a, inventory_minifigs as b\
    where a.inventory_id = b.inventory_id\
    group by a.set_num;"
pd.read_sql_query(query, con=engine).head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,set_num,num_figs
0,71709-1,3
1,41391-1,2
2,41405-1,5
3,75978-1,17
4,75966-1,4


In [None]:

### VIEW DATA (Validation Purposes Only)
##
# Uses value under 'SETTINGS' at top of file
if viewHeaders == 'yes':
    
    display( df_clean_colors \
        .head( headSize ) \
        .style.set_caption( 'Table: Colours' ))
    
    display( df_clean_elements \
        .head( headSize ) \
        .style.set_caption( 'Table: Elements' ))
    
    display( df_clean_inventories \
        .head( headSize ) \
        .style.set_caption( 'Table: Inventories' ))
    
    display( df_clean_inventory_minifigs \
        .head( headSize ) \
        .style.set_caption( 'Table: Inventory Mini-figures' ))
    
    display( df_clean_inventory_parts \
        .head( headSize ) \
        .style.set_caption( 'Table: Inventory Parts' ))
    
    display( df_clean_inventory_sets \
        .head( headSize ) \
        .style.set_caption( 'Table: Inventory Sets' ))
    
    display( df_clean_minifigs \
        .head( headSize ) \
        .style.set_caption( 'Table: Mini-figures' ))
    
    display( df_clean_part_categories \
        .head( headSize ) \
        .style.set_caption( 'Table: Part Categories' ))
    
    display( df_clean_part_relationships \
        .head( headSize ) \
        .style.set_caption( 'Table: Part Relationships' ))
    
    display( df_clean_parts \
        .head( headSize ) \
        .style.set_caption( 'Table: Parts' ))
    
    display( df_clean_sets \
        .head( headSize ) \
        .style.set_caption( 'Table: Sets' ))
    
    display( df_clean_themes \
        .head( headSize ) \
        .style.set_caption( 'Table: Themes' ))
    # END IF


### LEGACY CODE
##
#