# Kaggle Lego DB Files Overview

Source: https://www.kaggle.com/rtatman/lego-database

Workflow: 1        
Goal: List and inspect all files available from this dataset.    

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Notebook display configuration.
from IPython.core.interactiveshell import InteractiveShell
# Echo the result of every top-level expression in a cell, not only the last
# one, so a cell can show both `df.head()` and a later expression's output.
InteractiveShell.ast_node_interactivity = "all"
# Never truncate long cell values (e.g. part names) when rendering frames.
# `None` is the supported "unlimited" value since pandas 1.0; the old `-1`
# sentinel was deprecated there and raises ValueError in pandas 2.x.
# Pre-1.0 pandas rejects `None`, so fall back to `-1` only in that case.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

## Import Files

In [3]:
# Directory holding the CSV files of the Kaggle Lego database; change it here
# to point every path below at a different location.
DATA_DIR = 'data'

# One path constant per CSV file of the dataset.
THEME_FILE = os.path.join(DATA_DIR, 'themes.csv')
SET_FILE = os.path.join(DATA_DIR, 'sets.csv')
PART_CATEGORY_FILE = os.path.join(DATA_DIR, 'part_categories.csv')
PART_FILE = os.path.join(DATA_DIR, 'parts.csv')
COLOR_FILE = os.path.join(DATA_DIR, 'colors.csv')
INVENTORY_FILE = os.path.join(DATA_DIR, 'inventories.csv')
INVENTORY_SET_FILE = os.path.join(DATA_DIR, 'inventory_sets.csv')
INVENTORY_PART_FILE = os.path.join(DATA_DIR, 'inventory_parts.csv')

In [4]:
# Load the themes table, then preview its first rows and its column summary.
theme_df = pd.read_csv(filepath_or_buffer=THEME_FILE)
theme_df.head(n=5)
theme_df.info()

Unnamed: 0,id,name,parent_id
0,1,Technic,
1,2,Arctic Technic,1.0
2,3,Competition,1.0
3,4,Expert Builder,1.0
4,5,Model,1.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 3 columns):
id           614 non-null int64
name         614 non-null object
parent_id    503 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 14.5+ KB


In [5]:
# Load the sets table, then preview its first rows and its column summary.
set_df = pd.read_csv(filepath_or_buffer=SET_FILE)
set_df.head(n=5)
set_df.info()

Unnamed: 0,set_num,name,year,theme_id,num_parts
0,00-1,Weetabix Castle,1970,414,471
1,0011-2,Town Mini-Figures,1978,84,12
2,0011-3,Castle 2 for 1 Bonus Offer,1987,199,2
3,0012-1,Space Mini-Figures,1979,143,12
4,0013-1,Space Mini-Figures,1979,143,12


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11673 entries, 0 to 11672
Data columns (total 5 columns):
set_num      11673 non-null object
name         11673 non-null object
year         11673 non-null int64
theme_id     11673 non-null int64
num_parts    11673 non-null int64
dtypes: int64(3), object(2)
memory usage: 456.1+ KB


In [6]:
# Load the part-categories table, then preview its first rows and its column summary.
partcateg_df = pd.read_csv(filepath_or_buffer=PART_CATEGORY_FILE)
partcateg_df.head(n=5)
partcateg_df.info()

Unnamed: 0,id,name
0,1,Baseplates
1,2,Bricks Printed
2,3,Bricks Sloped
3,4,"Duplo, Quatro and Primo"
4,5,Bricks Special


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 2 columns):
id      57 non-null int64
name    57 non-null object
dtypes: int64(1), object(1)
memory usage: 992.0+ bytes


In [7]:
# Load the parts table, then preview its first rows and its column summary.
part_df = pd.read_csv(filepath_or_buffer=PART_FILE)
part_df.head(n=5)
part_df.info()

Unnamed: 0,part_num,name,part_cat_id
0,0687b1,Set 0687 Activity Booklet 1,17
1,0901,Baseplate 16 x 30 with Set 080 Yellow House Print,1
2,0902,Baseplate 16 x 24 with Set 080 Small White House Print,1
3,0903,Baseplate 16 x 24 with Set 080 Red House Print,1
4,0904,Baseplate 16 x 24 with Set 080 Large White House Print,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25993 entries, 0 to 25992
Data columns (total 3 columns):
part_num       25993 non-null object
name           25993 non-null object
part_cat_id    25993 non-null int64
dtypes: int64(1), object(2)
memory usage: 609.3+ KB


In [8]:
# Load the colors table, then preview its first rows and its column summary.
color_df = pd.read_csv(filepath_or_buffer=COLOR_FILE)
color_df.head(n=5)
color_df.info()

Unnamed: 0,id,name,rgb,is_trans
0,-1,Unknown,0033B2,f
1,0,Black,05131D,f
2,1,Blue,0055BF,f
3,2,Green,237841,f
4,3,Dark Turquoise,008F9B,f


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135 entries, 0 to 134
Data columns (total 4 columns):
id          135 non-null int64
name        135 non-null object
rgb         135 non-null object
is_trans    135 non-null object
dtypes: int64(1), object(3)
memory usage: 4.3+ KB


In [9]:
# Load the inventories table, then preview its first rows and its column summary.
inventory_df = pd.read_csv(filepath_or_buffer=INVENTORY_FILE)
inventory_df.head(n=5)
inventory_df.info()

Unnamed: 0,id,version,set_num
0,1,1,7922-1
1,3,1,3931-1
2,4,1,6942-1
3,15,1,5158-1
4,16,1,903-1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11681 entries, 0 to 11680
Data columns (total 3 columns):
id         11681 non-null int64
version    11681 non-null int64
set_num    11681 non-null object
dtypes: int64(2), object(1)
memory usage: 273.9+ KB


In [10]:
# Load the inventory-sets table, then preview its first rows and its column summary.
inventoryset_df = pd.read_csv(filepath_or_buffer=INVENTORY_SET_FILE)
inventoryset_df.head(n=5)
inventoryset_df.info()

Unnamed: 0,inventory_id,set_num,quantity
0,35,75911-1,1
1,35,75912-1,1
2,39,75048-1,1
3,39,75053-1,1
4,50,4515-1,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2846 entries, 0 to 2845
Data columns (total 3 columns):
inventory_id    2846 non-null int64
set_num         2846 non-null object
quantity        2846 non-null int64
dtypes: int64(2), object(1)
memory usage: 66.8+ KB


In [11]:
# Load the inventory-parts table, then preview its first rows and its column summary.
inventorypart_df = pd.read_csv(filepath_or_buffer=INVENTORY_PART_FILE)
inventorypart_df.head(n=5)
inventorypart_df.info()

Unnamed: 0,inventory_id,part_num,color_id,quantity,is_spare
0,1,48379c01,72,1,f
1,1,48395,7,1,f
2,1,mcsport6,25,1,f
3,1,paddle,0,1,f
4,3,11816pr0005,78,1,f


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 580251 entries, 0 to 580250
Data columns (total 5 columns):
inventory_id    580251 non-null int64
part_num        580251 non-null object
color_id        580251 non-null int64
quantity        580251 non-null int64
is_spare        580251 non-null object
dtypes: int64(3), object(2)
memory usage: 22.1+ MB
