# Let's dive into the Lego world!


###  Reading Data

In [None]:
# Switch the kernel's working directory to the dataset folder; the CSV files read
# later in the notebook are opened by bare file name, so they resolve against it.
# NOTE(review): machine-specific absolute Windows path, and `cd` without `%` relies
# on IPython automagic being enabled - confirm before sharing the notebook.
cd C:\\Lego-Datasets\\datasets

In [None]:
# Import pandas
import pandas as pd

# Lift pandas' display limits so printed frames show every row and every column.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

# Load the four tables by relative file name (resolved against the current working
# directory). Dataset location on local system - 'C:\\Lego-Datasets\\datasets'
colors, sets, themes, inventory_parts = map(
    pd.read_csv,
    ('colors.csv', 'sets.csv', 'themes.csv', 'inventory_parts.csv'),
)


### Exploratory Data Analysis

In [None]:
def check_data(df):
    """Print a quick exploratory summary of a DataFrame.

    The sections are printed in this order: ``df.info()``, the first five
    rows, the number of duplicated rows, null counts per column, unique-value
    counts per column, and ``df.describe()``.

    Parameters
    ----------
    df : pandas.DataFrame
        The table to summarise.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("Dataset info:", '\n')
    # df.info() writes its report itself and returns None, so it must not be
    # wrapped in print() - doing so appended a stray "None" to the output.
    df.info()
    print()
    print("Initial 5 Rows: \n", df.head(), '\n')
    print("Duplicate Values: \n", df.duplicated().sum(), '\n')
    print("Null Values: \n", df.isna().sum(), '\n')
    print('Unique values per column: \n', df.nunique(), '\n')
    print("Descriptive statistics: \n", df.describe())

In [None]:
# Print the check_data summary for the colors table.
check_data(colors)

In [None]:
# Print the check_data summary for the sets table.
check_data(sets)

In [None]:
# Print the check_data summary for the themes table.
check_data(themes)

In [None]:
# Print the check_data summary for the inventory_parts table.
check_data(inventory_parts)

In [None]:
# Checking color distribution in inventory: how many inventory_parts rows use each color_id

import seaborn as sns
import matplotlib.pyplot as plt
print('Figure 1.')
color_usage = inventory_parts['color_id'].value_counts()
# The backend is pinned to matplotlib because a later cell sets
# pd.options.plotting.backend to 'plotly' for the whole session; without the pin,
# re-running this cell afterwards would no longer go through matplotlib.
ax = color_usage.plot(kind='bar', figsize=(20, 10), backend='matplotlib',
                      title='Inventory part rows per color_id')
ax.set_xlabel('color_id')
# Trailing semicolon keeps the Text repr out of the cell output.
ax.set_ylabel('Number of inventory_parts rows');

##### The graph shows that a few colors are used far more often across all sets than the other colors

## What is the number of Lego sets released per year?

In [None]:

# Number of sets released in each year: `sets_by_year`
# (a count of set_num per year, not an average)
sets_by_year = sets.groupby('year')['set_num'].count()

# Plot the trend in the number of sets released per year
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})
print('Figure 2.')
# The backend is pinned to matplotlib because a later cell sets
# pd.options.plotting.backend to 'plotly'; the pin keeps this cell rendering the
# same way if it is re-run afterwards.
lines = sets_by_year.plot.line(backend='matplotlib', title='Lego sets released per year')
lines.set_xlabel('Year')
# Trailing semicolon keeps the Text repr out of the cell output.
lines.set_ylabel('Number of sets');

In [None]:
# Despite the `avg_` prefix this holds the per-year set counts; the average
# number of sets per year is the mean taken on the last line.
avg_sets_by_year = sets.groupby('year')['set_num'].count()
avg_sets_by_year.mean()

## What is the average number of parts per Lego set, by year?

In [None]:
# Average number of parts per set for each year, rounded to 2 decimals: `parts_by_year`
parts_by_year = sets.groupby('year')['num_parts'].mean().round(2)

# Plot the trend in the average number of parts per set by year
print('Figure 3.')
# The backend is pinned to matplotlib because a later cell sets
# pd.options.plotting.backend to 'plotly'; the pin keeps this cell rendering the
# same way if it is re-run afterwards.
lines = parts_by_year.plot.line(backend='matplotlib',
                                title='Average number of parts per set by year')
lines.set_xlabel('Year')
# Trailing semicolon keeps the Text repr out of the cell output.
lines.set_ylabel('Average parts per set');

In [None]:
# Mean of the yearly averages: every year carries equal weight regardless of how
# many sets it has, so this is not the same as the mean of num_parts over all sets.
parts_by_year.mean()

## How has the number of themes shipped varied over the years?

In [None]:
# themes_by_year: number of distinct themes (unique theme_id values) shipped in each year
themes_by_year = sets.groupby('year')['theme_id'].nunique()
print('Figure 4.')
# The backend is pinned to matplotlib because a later cell sets
# pd.options.plotting.backend to 'plotly'; the pin keeps this cell rendering the
# same way if it is re-run afterwards.
lines = themes_by_year.plot.line(backend='matplotlib',
                                 title='Distinct Lego themes shipped per year')
lines.set_xlabel('Year')
# Trailing semicolon keeps the Text repr out of the cell output.
lines.set_ylabel('Number of themes');

In [None]:
# Average, across years, of the number of distinct themes shipped per year.
themes_by_year.mean()

## How many unique Lego brick colors are available?


In [None]:
# Count the distinct color names; dropna=False keeps a missing name (if any)
# in the count, matching the length of the array of unique values.
num_colors = colors['name'].nunique(dropna=False)
# Report the count
print("No.of.unique values :", num_colors)


## What is the distribution of transparent vs. non-transparent colors?

In [None]:
# colors_summary: for each is_trans value, the non-null count of every other column

by_transparency = colors.groupby('is_trans')
colors_summary = by_transparency.count()
colors_summary


##  What are the 5 most popular colors used in Lego parts?

In [None]:
# Join every inventory_parts row to its color record (color_id matched against the
# colors id) so the color name is available for ranking the most-used colors.
mergedf = inventory_parts.merge(colors, left_on='color_id', right_on='id')
mergedf.head()

In [None]:
# Top 5 colors: total quantity per color name, largest first, as a one-column frame
top_5_colors = (
    mergedf.groupby(['name'])['quantity']
    .sum()
    .sort_values(ascending=False)
    .head()
    .to_frame()
    .rename(columns={'quantity': 'Quantity'})
)
display(top_5_colors)


In [None]:
# Creating visualization to display the data using plotly
print('Figure 5.')
import plotly.express as px
# NOTE(review): this switches the pandas plotting backend for the whole kernel
# session; the px.bar call below is made directly and does not depend on it.
pd.options.plotting.backend='plotly'
# Bar fill colors, applied in the order the rows appear in top_5_colors.
# Stored under its own name so the `colors` DataFrame loaded from colors.csv is
# not overwritten (re-running the merge cell would otherwise break).
# NOTE(review): presumably hand-picked to approximate the five color names -
# confirm they still match if the data changes.
bar_colors = ['black', 'lightsteelblue', 'white', 'darkslategray', 'red']
fig = px.bar(top_5_colors, x=top_5_colors.index, y='Quantity', color=top_5_colors.index,
             color_discrete_sequence=bar_colors,
             height=700, width=900,
             title='Five most popular Lego colors',
             # The key has to match the plotted column name ('Quantity') for the
             # axis label to be replaced.
             labels={'Quantity': 'Total quantity'},
             template='plotly_dark')
fig.update_xaxes(title=None)
fig.update_traces(marker_line_width=1, marker_line_color='white')
fig.update_layout(showlegend=False)

fig.show()

## What are the top 10 Lego themes?

In [None]:
# Join each set to its theme record (theme_id matched against the themes id) so
# sets can be grouped by theme name; the theme's name column is referred to as
# 'name_y' after the merge.
merge_set_theme = sets.merge(themes, left_on='theme_id', right_on='id')
merge_set_theme.head()

In [None]:
# Count the sets per theme name and keep the 10 themes with the most sets.
# rename(index={...}) maps row labels rather than the index name, so it never
# changed the 'name_y' header; rename_axis is what relabels the index as 'Theme'.
top_10_themes = (
    merge_set_theme.groupby(['name_y'])['set_num']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
    .rename(columns={'set_num': 'Number_of_sets'})
    .rename_axis('Theme')
)
display(top_10_themes)

In [None]:
# Creating visualization to display the data using plotly

print('Figure 6.')
fig = px.bar(top_10_themes,
             y=top_10_themes.index,
             x='Number_of_sets',
             color=top_10_themes.index,
             # The bars are colored by theme name (categorical), so a continuous
             # color scale is not applied; pass the reversed Rainbow colors as a
             # discrete sequence instead.
             color_discrete_sequence=px.colors.sequential.Rainbow_r,
             template='plotly_dark',
             # Both 'name_y' and 'Theme' are mapped so the axis/legend title reads
             # 'Theme Name' whichever of the two the index is called.
             labels={'Number_of_sets': 'Number of Sets',
                     'name_y': 'Theme Name',
                     'Theme': 'Theme Name'},
             title='Top 10 Lego Themes'
            )

fig.show()