# Brickset Set Filter

Workflow: 2.1    
Goal: Start with the full cleaned Brickset sets file (brickset_set_clean.csv) and remove the sets that are not appropriate for regression analysis, because including them would mean comparing apples to oranges.     
Result: Creates the file brickset_filter.csv.

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats

In [2]:
# Display the result of every expression statement in a cell, not just the last one,
# so a single cell can show e.g. a .head() preview followed by further output.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
%load_ext autoreload
%autoreload 2         # reload custom py file eveytime a new cell is run

import brickset_helper as lh

## Import Data

In [4]:
# Input: the full cleaned Brickset sets file that this notebook filters
SET_FILE = 'data/brickset_set_clean.csv'

In [5]:
# Load the cleaned sets, then preview the first rows and summarize columns / non-null counts
set_df = pd.read_csv(SET_FILE)
set_df.head()
set_df.info()

Unnamed: 0,set_no,name,url,theme_group,theme,subtheme,year,piece_cnt,minifig_cnt,inventory_url,minifig_url,price_store,price_new,price_used,rating_value,rating_votes,tags,set_type,packaging
0,722-1,"Universal Building Set, 7+",/sets/722-1/Universal-Building-Set-7,Basic,Basic,,1980.0,301.0,0.0,,,20.0,200.0,78.0,4.4,5.0,"4X4, Aircraft, Articulated Lorry, Boat, Car, C...",Normal,
1,733-1,"Universal Building Set, 7+",/sets/733-1/Universal-Building-Set-7,Basic,Basic,,1980.0,533.0,0.0,,,39.0,566.0,109.0,4.5,2.0,"Articulated Lorry, Base, Baseplate, Crane, For...",Normal,
2,744-1,"Universal Building Set with Motor, 7+",/sets/744-1/Universal-Building-Set-with-Motor-7,Basic,Basic,,1980.0,537.0,0.0,,,59.0,252.0,97.0,5.0,2.0,"8 Wheel Vehicle, Crane, Electric Motor, Mobile...",Normal,
3,820-1,Red Plates Parts Pack,/sets/820-1/Red-Plates-Parts-Pack,Basic,Basic,Supplementaries,1980.0,34.0,0.0,,,4.75,,,,,,Normal,
4,822-1,Blue Plates Parts Pack,/sets/822-1/Blue-Plates-Parts-Pack,Basic,Basic,Supplementaries,1980.0,34.0,0.0,,,4.75,29.0,5.0,,,,Normal,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13754 entries, 0 to 13753
Data columns (total 19 columns):
set_no           13754 non-null object
name             13754 non-null object
url              13754 non-null object
theme_group      13754 non-null object
theme            13754 non-null object
subtheme         10040 non-null object
year             13754 non-null float64
piece_cnt        10858 non-null float64
minifig_cnt      13754 non-null float64
inventory_url    6958 non-null object
minifig_url      6264 non-null object
price_store      8378 non-null float64
price_new        8516 non-null float64
price_used       7015 non-null float64
rating_value     6813 non-null float64
rating_votes     6813 non-null float64
tags             10265 non-null object
set_type         13753 non-null object
packaging        8919 non-null object
dtypes: float64(8), object(11)
memory usage: 2.0+ MB


In [6]:
# Keep only the columns that may be needed downstream
# (the columns not listed here -- url, inventory_url, minifig_url -- are dropped)

keep_cols = [
    'set_no', 'name', 'theme_group', 'theme', 'subtheme', 'year',
    'piece_cnt', 'minifig_cnt',
    'price_store', 'price_new', 'price_used',
    'rating_value', 'rating_votes',
    'tags', 'set_type', 'packaging',
]
set_df = set_df[keep_cols]
set_df.head()

Unnamed: 0,set_no,name,theme_group,theme,subtheme,year,piece_cnt,minifig_cnt,price_store,price_new,price_used,rating_value,rating_votes,tags,set_type,packaging
0,722-1,"Universal Building Set, 7+",Basic,Basic,,1980.0,301.0,0.0,20.0,200.0,78.0,4.4,5.0,"4X4, Aircraft, Articulated Lorry, Boat, Car, C...",Normal,
1,733-1,"Universal Building Set, 7+",Basic,Basic,,1980.0,533.0,0.0,39.0,566.0,109.0,4.5,2.0,"Articulated Lorry, Base, Baseplate, Crane, For...",Normal,
2,744-1,"Universal Building Set with Motor, 7+",Basic,Basic,,1980.0,537.0,0.0,59.0,252.0,97.0,5.0,2.0,"8 Wheel Vehicle, Crane, Electric Motor, Mobile...",Normal,
3,820-1,Red Plates Parts Pack,Basic,Basic,Supplementaries,1980.0,34.0,0.0,4.75,,,,,,Normal,
4,822-1,Blue Plates Parts Pack,Basic,Basic,Supplementaries,1980.0,34.0,0.0,4.75,29.0,5.0,,,,Normal,


## Filter based on different columns

In [7]:
# Filter by theme group to get rid of clothing, books, basic-size sets, etc.

remove_group_ls = ['Basic', 'Educational', 'Miscellaneous', 'Pre-school', 'Vintage themes']
# `~` inverts the boolean mask: keep rows whose theme_group is NOT in the removal list
# (idiomatic replacement for comparing the mask with `== False`; same rows are selected)
set_df = set_df.loc[~set_df['theme_group'].isin(remove_group_ls)]
set_df.shape

(7117, 16)

In [8]:
# Also remove the theme groups that have a different piece-size ratio (not minifig scale)

remove_group_ls = ['Constraction', 'Girls', 'Junior', 'Technical']
# `~` inverts the boolean mask: keep rows whose theme_group is NOT in the removal list
# (idiomatic replacement for comparing the mask with `== False`; same rows are selected)
set_df = set_df.loc[~set_df['theme_group'].isin(remove_group_ls)]
set_df.shape

(5397, 16)

In [9]:
# DISABLED on purpose: filter that would remove sets without any minifig (not minifig scale)

#set_df = set_df.loc[set_df['minifig_cnt']>=1]
#set_df.shape

# Enabling this would cut out about 1700 sets.
# It's actually OK to keep them: their piece-size ratio is comparable with minifig-scale sets (unlike technic, basic, etc.)

In [10]:
# Restrict to 'Normal' set types (collections are redundant with the sets they include)

is_normal_set = set_df['set_type'].eq('Normal')
set_df = set_df.loc[is_normal_set]
set_df.shape

(4686, 16)

In [11]:
# Restrict to sets sold in a box (polybag, blister pack and bucket sets are different from boxes)

is_boxed = set_df['packaging'].eq('Box')
set_df = set_df.loc[is_boxed]
set_df.shape

(3595, 16)

In [12]:
# Narrow the frame to the columns used for analysis
# (tags, set_type and packaging are dropped at this point)

analysis_cols = [
    'set_no', 'name', 'theme_group', 'theme', 'subtheme', 'year',
    'piece_cnt', 'minifig_cnt',
    'price_store', 'price_new', 'price_used',
    'rating_value', 'rating_votes',
]
set_df = set_df[analysis_cols]
set_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3595 entries, 43 to 13605
Data columns (total 13 columns):
set_no          3595 non-null object
name            3595 non-null object
theme_group     3595 non-null object
theme           3595 non-null object
subtheme        2985 non-null object
year            3595 non-null float64
piece_cnt       3591 non-null float64
minifig_cnt     3595 non-null float64
price_store     3122 non-null float64
price_new       3474 non-null float64
price_used      3405 non-null float64
rating_value    3262 non-null float64
rating_votes    3262 non-null float64
dtypes: float64(8), object(5)
memory usage: 393.2+ KB


In [13]:
# Spot check: count the sets remaining in each theme group after filtering

theme_group_counts = set_df['theme_group'].value_counts()
theme_group_counts

Modern day          1090
Licensed            1042
Action/Adventure     642
Historical           346
Model making         314
Racing               161
Name: theme_group, dtype: int64

In [14]:
# Optional spot check of the per-theme counts (left disabled)
#set_df['theme'].value_counts()

## Export to csv

In [15]:
# Write the filtered sets to csv (without the index column), then report the file size in MB
save_path = r'data/brickset_filter.csv'
set_df.to_csv(save_path, index=False)
size_mb = os.path.getsize(save_path) / 1e6
f'{round(size_mb, 2)} mb'

'0.34 mb'