## 1. environment setup

In [1]:
import numpy as np
import pandas as pd

## 2. ETL

In [3]:
# Read the two input files: the LEGO sets table into `df` and the
# parent-themes table into `pt`.
df, pt = (pd.read_csv(path) for path in ('./lego_sets.csv', './parent_themes.csv'))

In [4]:
# Join each set to its parent-theme record by matching the set's
# `parent_theme` value against the theme table's `name`, then discard the
# `name_y` column produced by the merge (it is not used afterwards).
df = (
    df.merge(pt, left_on='parent_theme', right_on='name')
      .drop(columns='name_y')
)


In [11]:
# Rows with a missing `set_num` are treated as invalid: report how many
# there are relative to the whole table, then keep only the valid rows.
nulls_total = df['set_num'].isna().sum()
rows_total = len(df)
print(f'There are {nulls_total} invalid rows vs. {rows_total} total')

df = df[df['set_num'].notna()]

# Report (without removing) rows whose `set_num` appears more than once.
# keep=False flags every member of a duplicate group, not only the repeats.
duplicated = df['set_num'].duplicated(keep=False).sum()
print(f'There are {duplicated} rows that are duplicated')

There are 153 invalid rows vs. 11986 total


## query 1: percentage of licensed LEGO sets that are Star Wars themed

In [13]:
# Licensed sets only, and the Star Wars subset of those.
# `== True` is kept deliberately: it treats missing flags as "not licensed".
licensed = df[df['is_licensed'] == True]
star_wars = licensed[licensed['parent_theme'] == 'Star Wars']

licensed_total = len(licensed)
star_wars_total = len(star_wars)

# Whole-number percentage of licensed sets that are Star Wars, truncated
# towards zero. Integer floor division is exact, whereas
# int(a / b * 100) can lose a point to float rounding
# (e.g. 29 / 100 * 100 == 28.999999999999996 -> 28).
# NOTE: the denominator is the number of licensed sets, not all sets.
q1 = star_wars_total * 100 // licensed_total

print(f'In this data set, {q1}% of the lego are Star Wars')

In this data set, 51% of the lego are Star Wars


## query 2: first year in which Star Wars was beaten as the licensed theme with the most sets

In [38]:
# Count licensed sets per (year, parent_theme). size() counts rows
# directly, so no helper column of ones is needed, and groupby already
# returns the groups ordered by year. The builtin `sorted` is no longer
# shadowed by a DataFrame.
grouped = (
    licensed.groupby(by=['year', 'parent_theme'])
            .size()
            .reset_index(name='count')
)

# For each year keep the row of the theme with the most sets
# (idxmax returns the first such row when several themes tie).
indices = grouped.groupby(by='year')['count'].idxmax()
result = grouped.loc[indices]

# Years whose top theme is not Star Wars, earliest first. Select the year
# by column name rather than by position.
beaten = result[result['parent_theme'] != 'Star Wars']
if beaten.empty:
    # No such year: report it instead of failing with an IndexError.
    q2 = None
    print('Star Wars was never beaten')
else:
    q2 = beaten['year'].iloc[0]
    print(f'In {q2}, Star Wars was beaten')

In 2017, Star Wars was beaten


## query 3. unique sets per year

In [50]:
# Helper column of ones so that a per-year sum yields the number of sets.
# Copy first so the new column is added to an independent frame.
df = df.copy()
df['count'] = 1

# cleaning: keep one row per set number and discard rows without one.
# Each step builds on the previous result; filtering `df` again here would
# silently throw the de-duplication away.
df2 = df.drop_duplicates(subset=['set_num'], keep='first')
df2 = df2[df2['set_num'].notna()]

# query: restrict the sum to numeric columns so that text columns are not
# concatenated (or rejected) during the aggregation.
sets_per_year = df2.groupby(by='year').sum(numeric_only=True).reset_index()

# display result: iterate over the two needed columns directly instead of
# building a Series per row with iterrows().
for year, sets in zip(sets_per_year['year'], sets_per_year['count']):
    print(f'{int(round(year))} has {int(round(sets))} lego sets')

1950 has 7 lego sets
1953 has 4 lego sets
1954 has 14 lego sets
1955 has 28 lego sets
1956 has 12 lego sets
1957 has 21 lego sets
1958 has 42 lego sets
1959 has 4 lego sets
1960 has 3 lego sets
1961 has 17 lego sets
1962 has 40 lego sets
1963 has 18 lego sets
1964 has 11 lego sets
1965 has 10 lego sets
1966 has 89 lego sets
1967 has 21 lego sets
1968 has 25 lego sets
1969 has 69 lego sets
1970 has 29 lego sets
1971 has 45 lego sets
1972 has 38 lego sets
1973 has 68 lego sets
1974 has 39 lego sets
1975 has 31 lego sets
1976 has 68 lego sets
1977 has 92 lego sets
1978 has 73 lego sets
1979 has 82 lego sets
1980 has 88 lego sets
1981 has 79 lego sets
1982 has 76 lego sets
1983 has 57 lego sets
1984 has 76 lego sets
1985 has 139 lego sets
1986 has 123 lego sets
1987 has 209 lego sets
1988 has 68 lego sets
1989 has 114 lego sets
1990 has 85 lego sets
1991 has 106 lego sets
1992 has 115 lego sets
1993 has 111 lego sets
1994 has 128 lego sets
1995 has 128 lego sets
1996 has 144 lego sets
1997