# Kaggle Lego DB Themes

Source: https://www.kaggle.com/rtatman/lego-database/download    

Workflow: 2      
Goal: To investigate the themes used in this dataset.    

Results: Not good. Themes are broken up into parent and normal themes. The parent themes don't seem useful, like Advent. Also I found a Darth Maul set with 1.8k pieces, which is a statue/bust (not a regular scale set), but it only has parent theme of Star Wars. So assigning the parent theme for the 3.5k sets which are missing it is not doable.

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_colwidth', -1)

## Import Files

In [3]:
THEME_FILE = 'data/themes.csv'
SET_FILE = 'data/sets.csv'
PART_CATEGORY_FILE = 'data/part_categories.csv'
PART_FILE = 'data/parts.csv'
COLOR_FILE = 'data/colors.csv'
INVENTORY_FILE = 'data/inventories.csv'
INVENTORY_SET_FILE = 'data/inventory_sets.csv'
INVENTORY_PART_FILE = 'data/inventory_parts.csv'

In [4]:
theme_df = pd.read_csv(THEME_FILE)
theme_df = theme_df.rename(columns={'id': 'theme_id', 'name': 'theme_name'})
theme_df.head()
theme_df.info()

Unnamed: 0,theme_id,theme_name,parent_id
0,1,Technic,
1,2,Arctic Technic,1.0
2,3,Competition,1.0
3,4,Expert Builder,1.0
4,5,Model,1.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 3 columns):
theme_id      614 non-null int64
theme_name    614 non-null object
parent_id     503 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 14.5+ KB


In [5]:
set_df = pd.read_csv(SET_FILE)
set_df.head()
set_df.info()

Unnamed: 0,set_num,name,year,theme_id,num_parts
0,00-1,Weetabix Castle,1970,414,471
1,0011-2,Town Mini-Figures,1978,84,12
2,0011-3,Castle 2 for 1 Bonus Offer,1987,199,2
3,0012-1,Space Mini-Figures,1979,143,12
4,0013-1,Space Mini-Figures,1979,143,12


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11673 entries, 0 to 11672
Data columns (total 5 columns):
set_num      11673 non-null object
name         11673 non-null object
year         11673 non-null int64
theme_id     11673 non-null int64
num_parts    11673 non-null int64
dtypes: int64(3), object(2)
memory usage: 456.1+ KB


In [6]:
# investigate duplicate themes 

doop_df = theme_df.loc[theme_df.duplicated(subset=['theme_name'])]
doop_df.head()
doop_df.info()

Unnamed: 0,theme_id,theme_name,parent_id
23,24,Airport,23.0
25,26,Construction,23.0
26,27,Race,23.0
27,28,Harbor,23.0
29,30,Traffic,23.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 212 entries, 23 to 613
Data columns (total 3 columns):
theme_id      212 non-null int64
theme_name    212 non-null object
parent_id     203 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.6+ KB


In [7]:
# names are duplicated, but theme_id is not

doop_df = theme_df.loc[theme_df.duplicated(subset=['theme_id', 'theme_name'])]
doop_df.head()
doop_df.info()

Unnamed: 0,theme_id,theme_name,parent_id


<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 3 columns):
theme_id      0 non-null int64
theme_name    0 non-null object
parent_id     0 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 0.0+ bytes


In [8]:
theme_df.groupby(['theme_name']).count().sort_values('theme_id', ascending=False).head(10)

Unnamed: 0_level_0,theme_id,parent_id
theme_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Supplemental,18,18
Fire,12,12
Airport,11,11
Harbor,9,9
Traffic,9,9
Police,8,8
Castle,8,7
Construction,8,8
Creator,6,5
Train,6,5


In [9]:
theme_df.loc[theme_df['theme_name']=='Fire']

Unnamed: 0,theme_id,theme_name,parent_id
8,9,Fire,5.0
35,36,Fire,23.0
46,47,Fire,38.0
57,58,Fire,52.0
73,74,Fire,67.0
97,98,Fire,94.0
107,108,Fire,105.0
282,283,Fire,280.0
294,295,Fire,290.0
375,376,Fire,373.0


## Investigate Parent Themes

In [10]:
# assume that parent themes have parent_id = NaN

parent_df = theme_df.loc[theme_df['parent_id'].isna()].copy()
parent_df = parent_df.rename(columns={'theme_id': 'parent_own_id', 'name': 'parent_name'})
parent_df.head()
parent_df.shape

Unnamed: 0,parent_own_id,theme_name,parent_id
0,1,Technic,
21,22,Creator,
49,50,Town,
111,112,Racers,
125,126,Space,


(111, 3)

In [11]:
# there's 614 themes altogether
# there's 111 parent themes

child_df = theme_df.loc[theme_df['parent_id'].isna() == False]
child_df.head()
child_df.shape

Unnamed: 0,theme_id,theme_name,parent_id
1,2,Arctic Technic,1.0
2,3,Competition,1.0
3,4,Expert Builder,1.0
4,5,Model,1.0
5,6,Airport,5.0


(503, 3)

In [12]:
# join parent with children and investigate

outer_df = pd.merge(parent_df, child_df, how='outer', left_on='parent_own_id', right_on='parent_id')
outer_df.head()
outer_df.info()

Unnamed: 0,parent_own_id,theme_name_x,parent_id_x,theme_id,theme_name_y,parent_id_y
0,1.0,Technic,,2.0,Arctic Technic,1.0
1,1.0,Technic,,3.0,Competition,1.0
2,1.0,Technic,,4.0,Expert Builder,1.0
3,1.0,Technic,,5.0,Model,1.0
4,1.0,Technic,,16.0,RoboRiders,1.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 564 entries, 0 to 563
Data columns (total 6 columns):
parent_own_id    408 non-null float64
theme_name_x     408 non-null object
parent_id_x      0 non-null float64
theme_id         503 non-null float64
theme_name_y     503 non-null object
parent_id_y      503 non-null float64
dtypes: float64(4), object(2)
memory usage: 30.8+ KB


In [13]:
left_right_df = outer_df.loc[(outer_df['parent_own_id'].isna() == False) & (outer_df['theme_id'].isna() == False)]
left_right_df.head()
left_right_df.shape

Unnamed: 0,parent_own_id,theme_name_x,parent_id_x,theme_id,theme_name_y,parent_id_y
0,1.0,Technic,,2.0,Arctic Technic,1.0
1,1.0,Technic,,3.0,Competition,1.0
2,1.0,Technic,,4.0,Expert Builder,1.0
3,1.0,Technic,,5.0,Model,1.0
4,1.0,Technic,,16.0,RoboRiders,1.0


(347, 6)

In [14]:
# some parents do not have children themes

left_df = outer_df.loc[(outer_df['parent_own_id'].isna() == False) & (outer_df['theme_id'].isna() == True)]
left_df.head()
left_df.shape

Unnamed: 0,parent_own_id,theme_name_x,parent_id_x,theme_id,theme_name_y,parent_id_y
128,263.0,Pirates of the Caribbean,,,,
133,269.0,Cars,,,,
134,270.0,Ben 10,,,,
135,271.0,Prince of Persia,,,,
136,272.0,SpongeBob SquarePants,,,,


(61, 6)

In [15]:
# some children don't have parent themes

right_df = outer_df.loc[(outer_df['parent_own_id'].isna() == True) & (outer_df['theme_id'].isna() == False)]
right_df.head()
right_df.shape

Unnamed: 0,parent_own_id,theme_name_x,parent_id_x,theme_id,theme_name_y,parent_id_y
408,,,,6.0,Airport,5.0
409,,,,7.0,Construction,5.0
410,,,,8.0,Farm,5.0
411,,,,9.0,Fire,5.0
412,,,,10.0,Harbor,5.0


(156, 6)

In [16]:
left_right_df.shape[0] + left_df.shape[0] + right_df.shape[0]

564

In [17]:
# looks like some parent_id = NaN themes do not have any children
# try another approach to parent themes
# leave the theme table as is, but join parent on the right

parent_id_df = theme_df.loc[theme_df['parent_id'].isna() == False, ['parent_id']]
parent_id_df.head()
parent_id_df.shape

Unnamed: 0,parent_id
1,1.0
2,1.0
3,1.0
4,1.0
5,5.0


(503, 1)

In [18]:
parent_id_df['parent_id'].value_counts().head()
parent_id_df['parent_id'].value_counts().tail()

324.0    38
535.0    24
507.0    20
126.0    20
158.0    19
Name: parent_id, dtype: int64

252.0    1
276.0    1
524.0    1
454.0    1
302.0    1
Name: parent_id, dtype: int64

In [19]:
theme_df.loc[theme_df['theme_id'] == 158]

Unnamed: 0,theme_id,theme_name,parent_id
157,158,Star Wars,


In [20]:
parent_id_df['parent_id'].unique().shape

(78,)

In [21]:
theme_parent_df = pd.merge(theme_df, theme_df, how='left', left_on='parent_id', right_on='theme_id')
theme_parent_df.head()

Unnamed: 0,theme_id_x,theme_name_x,parent_id_x,theme_id_y,theme_name_y,parent_id_y
0,1,Technic,,,,
1,2,Arctic Technic,1.0,1.0,Technic,
2,3,Competition,1.0,1.0,Technic,
3,4,Expert Builder,1.0,1.0,Technic,
4,5,Model,1.0,1.0,Technic,


In [27]:
theme_full_df = theme_parent_df.rename(columns={'theme_id_x': 'theme_id', 'theme_name_x': 'theme_name', 
                                                'parent_id_x': 'parent_id', 'theme_name_y': 'parent_name'})
theme_full_df = theme_full_df[['theme_id', 'theme_name', 'parent_id', 'parent_name']]
theme_full_df.head()
theme_full_df.info()

Unnamed: 0,theme_id,theme_name,parent_id,parent_name
0,1,Technic,,
1,2,Arctic Technic,1.0,Technic
2,3,Competition,1.0,Technic
3,4,Expert Builder,1.0,Technic
4,5,Model,1.0,Technic


<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 0 to 613
Data columns (total 4 columns):
theme_id       614 non-null int64
theme_name     614 non-null object
parent_id      503 non-null float64
parent_name    503 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 24.0+ KB


## Join With Sets

Goal: Investigate if there are sets with parent themes.

In [28]:
theme_set_df = pd.merge(set_df, theme_full_df, how='left', left_on='theme_id', right_on='theme_id')
theme_set_df.head()
theme_set_df.info()

Unnamed: 0,set_num,name,year,theme_id,num_parts,theme_name,parent_id,parent_name
0,00-1,Weetabix Castle,1970,414,471,Castle,411.0,Legoland
1,0011-2,Town Mini-Figures,1978,84,12,Supplemental,67.0,Classic Town
2,0011-3,Castle 2 for 1 Bonus Offer,1987,199,2,Lion Knights,186.0,Castle
3,0012-1,Space Mini-Figures,1979,143,12,Supplemental,126.0,Space
4,0013-1,Space Mini-Figures,1979,143,12,Supplemental,126.0,Space


<class 'pandas.core.frame.DataFrame'>
Int64Index: 11673 entries, 0 to 11672
Data columns (total 8 columns):
set_num        11673 non-null object
name           11673 non-null object
year           11673 non-null int64
theme_id       11673 non-null int64
num_parts      11673 non-null int64
theme_name     11673 non-null object
parent_id      8046 non-null float64
parent_name    8046 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 820.8+ KB


In [None]:
# theme_set_df['theme'] = theme_set_df

In [37]:
# all sets have a theme, but not a parent theme

theme_set_df.loc[theme_set_df['parent_id'].isna()].shape

(3627, 8)

In [30]:
theme_set_df.groupby(['parent_name', 'theme_name']).count().sort_values('parent_name', ascending=True)[['set_num']]

Unnamed: 0_level_0,Unnamed: 1_level_0,set_num
parent_name,theme_name,Unnamed: 2_level_1
4 Juniors,Pirates,11
4 Juniors,Supplemental,5
9V,My Own Creation,2
9V,My Own Train,43
9V,World City,14
Advent,Star Wars,5
Advent,Pirates,1
Advent,Friends,5
Advent,Creator,4
Advent,Classic Basic,1


In [34]:
theme_set_df.loc[theme_set_df['parent_name']=='Advent'].shape

(33, 8)

In [36]:
theme_set_df.loc[theme_set_df['set_num'].str.contains('10018')]

Unnamed: 0,set_num,name,year,theme_id,num_parts,theme_name,parent_id,parent_name
44,10018-1,Darth Maul,2001,158,1868,Star Wars,,
