In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
%matplotlib inline
from pandas_profiling import ProfileReport
import pandas.util.testing as tm

In [None]:
# Print the full path of every file found under the Kaggle input directory.
import os

for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

In [None]:
# Path of the video-game sales CSV (read into `df_data` below).
data_file_path = "/kaggle/input/videogamesales/vgsales.csv"
# Path of the developer -> country lookup CSV (read into `df_region` below).
company_region_path = "/kaggle/input/videogamescompaniesregions/video-games-developers.csv"
# Name of the derived column that holds each game's total sales.
total_sales_column = "Total_Sales"

In [None]:
# Defining all our palette colours (hex strings).
primary_blue = "#496595"
primary_blue2 = "#85a1c1"
primary_blue3 = "#3f4d63"
primary_grey = "#c6ccd8"
primary_black = "#202022"
primary_bgcolor = "#f4f0ea"

# Third entry of Plotly's built-in qualitative "Plotly" colour sequence.
primary_green = px.colors.qualitative.Plotly[2]

# Use the palette background colour for every matplotlib axes.
plt.rcParams['axes.facecolor'] = primary_bgcolor

# Collect the palette and preview it as a strip of colour swatches.
colors = [primary_blue, primary_blue2, primary_blue3, primary_grey, primary_black, primary_bgcolor, primary_green]
sns.palplot(sns.color_palette(colors))

<h2>Loading data and cleaning</h2>

In [None]:
df_data = pd.read_csv(data_file_path)
df_data.head()

In [None]:
df_data.Publisher.unique()

In [None]:
df_region = pd.read_csv(company_region_path)
df_region.head()

In [None]:
df_region.Developer.unique()

`left_on`: label or list, or array-like
Column or index level names to join on in the left DataFrame. Can also be an array or list of arrays of the length of the left DataFrame. These arrays are treated as if they are columns.

`right_on`: label or list, or array-like
Column or index level names to join on in the right DataFrame. Can also be an array or list of arrays of the length of the right DataFrame. These arrays are treated as if they are columns.

In [None]:
# Attach a country to every game by matching its `Publisher` against the
# `Developer` column of the region table.
# The lookup is de-duplicated on `Developer` first: a left merge emits one row
# per matching right-hand row, so a developer listed more than once would
# duplicate its games and inflate every count/sum computed afterwards.
# `validate` makes pandas raise if the right-hand keys are ever not unique.
publisher_countries = df_region[['Developer', 'Country']].drop_duplicates(subset='Developer')
df = pd.merge(
    df_data,
    publisher_countries,
    left_on='Publisher',
    right_on='Developer',
    how='left',
    validate='many_to_one',
)
df.head()

In [None]:
df.columns

In [None]:
final_profile = ProfileReport(df, title='Video games report')

In [None]:
final_profile

In [None]:
df.columns

In [None]:
# Build the total-sales column used by the aggregations below.
# If the dataset has a `Total_Shipped` column it is added to `Global_Sales`
# (missing values counted as 0); otherwise `Global_Sales` is used unchanged.
# The region list / suffix that used to be assigned in the `else` branch were
# never used here and are defined in the regional-aggregation cell, right
# before their first use.
if 'Total_Shipped' in df.columns:
    df[total_sales_column] = df['Total_Shipped'].fillna(0) + df['Global_Sales'].fillna(0)
else:
    df[total_sales_column] = df['Global_Sales']

In [None]:
df.head()

In [None]:
df.isna().sum()

In [None]:
df.shape

In [None]:
# Working frame for the analysis: only rows with a known release year,
# ordered from oldest to newest.
# The copy is taken after filtering so `df_copy` is an independent frame; the
# previous `df.copy()` was discarded by the very next assignment, which bound
# `df_copy` to a slice of `df` instead.
df_copy = (
    df[df['Year'].notna()]
    .copy()
    .sort_values('Year', ascending=True)
)

In [None]:
df_copy.head()

In [None]:
df_copy.dtypes

In [None]:
# Animated histogram: number of rows per platform, one animation frame per year.
fig = px.histogram(df_copy, x='Platform', animation_frame='Year', range_y=[0, 550])
# Treat the x axis as categorical and list the platforms alphabetically.
fig.update_xaxes(type='category', categoryorder='category ascending')
fig.show()

In [None]:
# Number of rows per (platform, year), restricted to the years 2016-2019.
platform_year_counts = (
    df_copy.groupby(['Platform', 'Year'])
    .agg({total_sales_column: 'count'})
    .reset_index()
    .rename(columns={total_sales_column: 'Count'})
)
top_df_copy = platform_year_counts[platform_year_counts['Year'].isin([2016, 2017, 2018, 2019])]
top_df_copy.Year.unique()

In [None]:
# Number of rows per (platform, year) for the most recent years, keeping only
# the pairs that hold more than 1% of the rows in that window.
recent_years = [2016, 2017, 2018, 2019, 2020]
top_df_copy = df_copy.groupby(['Platform', 'Year']).agg({total_sales_column: 'count'}).reset_index()
top_df_copy.columns = ['Platform', 'Year', 'Count']
top_df_copy = top_df_copy[top_df_copy['Year'].isin(recent_years)]
top_df_copy = top_df_copy[top_df_copy['Count'] > top_df_copy['Count'].sum() * 0.01]
# Year becomes a string so plotly colours it as discrete categories.
# `assign` returns a new frame instead of writing into a filtered slice (which
# raises SettingWithCopyWarning); converting to int first yields labels such
# as "2016" instead of "2016.0" when Year was parsed as float. Rows with a
# missing Year were already removed when `df_copy` was built.
top_df_copy = top_df_copy.assign(Year=top_df_copy['Year'].astype(int).astype(str))

In [None]:
top_df_copy.Year.unique()

In [None]:
# Grouped bar chart: one bar per year inside each platform group.
fig = px.bar(
    top_df_copy,
    x='Platform',
    y='Count',
    color='Year',
    barmode='group',
)
fig.update_layout(title="Total released video-games by platform")
# Categorical x axis, platforms in alphabetical order.
fig.update_xaxes(type='category', categoryorder='category ascending')
fig.show()

<h2>Sales Analysis</h2>

In [None]:
# Summed total sales per (platform, year), ordered chronologically.
sales_df = (
    df_copy.groupby(['Platform', 'Year'])
    .agg({total_sales_column: 'sum'})
    .reset_index()
    .sort_values('Year', ascending=True)
)
sales_df.head()

In [None]:
# A bar chart (not a histogram) because the y values are pre-aggregated sums
# rather than frequencies to be counted.
fig = px.bar(
    sales_df,
    x='Platform',
    y=total_sales_column,
    animation_frame='Year',
    range_y=[0, 150],
)
# Categorical x axis, platforms in alphabetical order.
fig.update_xaxes(type='category', categoryorder='category ascending')
fig.show()

In [None]:
# Sales per platform for 2016-2019, keeping only the (platform, year) pairs
# above 0.5% of the sales in that window.
# The result lives in its own frame so `sales_df` keeps the full history and
# this cell produces the same output when re-run: the previous in-place
# version turned Year into strings, so on a second run the numeric `isin`
# filter matched nothing.
recent_sales_df = sales_df[sales_df['Year'].isin([2016, 2017, 2018, 2019])]
recent_sales_df = recent_sales_df[
    recent_sales_df[total_sales_column] > recent_sales_df[total_sales_column].sum() * 0.005
]
# String years make plotly colour them as discrete categories. int -> str
# yields "2016" rather than "2016.0" when Year was parsed as float, and
# `assign` avoids writing into a filtered slice (SettingWithCopyWarning).
recent_sales_df = recent_sales_df.assign(Year=recent_sales_df['Year'].astype(int).astype(str))

fig = px.bar(
    recent_sales_df,
    x='Platform',
    y=total_sales_column,
    color='Year',
    barmode="group"
)
fig.update_layout(title="Total sales by platforms (Millions)")
fig.update_xaxes(categoryorder='category ascending')
fig.show()

In [None]:
# All-time total sales per platform; keep only platforms that account for
# more than 3% of the overall total.
all_time_df = df_copy.groupby(['Platform']).agg({total_sales_column: 'sum'}).reset_index()
min_platform_sales = all_time_df[total_sales_column].sum() * 0.03
all_time_df = all_time_df[all_time_df[total_sales_column] > min_platform_sales]

In [None]:
all_time_df.head()

In [None]:
# Bar chart of the all-time totals for the platforms kept above.
fig = px.bar(
    all_time_df,
    x='Platform',
    y=total_sales_column,
)
fig.update_layout(title='Total sales of all time for the most relevant platforms (in Millions)')
# Categorical x axis, platforms in alphabetical order.
fig.update_xaxes(type='category', categoryorder='category ascending')
fig.show()

In [None]:
# Aggregation spec for groupby().agg(): sum each regional sales column
# ("<region>_Sales") and the total-sales column.
regions = ['NA', 'JP', 'EU', 'Other']
region_sales_sufix = '_Sales'

regions_agg = {region_code + region_sales_sufix: 'sum' for region_code in regions}
regions_agg[total_sales_column] = 'sum'
regions_agg

In [None]:
# Yearly sums of the NA, JP, EU, Other and total sales columns, in
# chronological order.
df_loc = (
    df_copy.groupby(['Year'])
    .agg(regions_agg)
    .reset_index()
    .sort_values('Year', ascending=True)
)
df_loc.head()

In [None]:
# One line per region showing its yearly summed sales.
fig = go.Figure()
for region in regions:
    region_col = region + region_sales_sufix
    fig.add_trace(
        go.Scatter(x=df_loc['Year'], y=df_loc[region_col], mode='lines', name=region)
    )

fig.update_layout(title="Total sales per year by region (Millions)")
fig.update_xaxes(type='category')
fig.show()

In [None]:
# Keep only the release year and the four regional sales columns
# (still one row per row of `df_copy`).
year_geo_df = df_copy[["Year",'NA_Sales','EU_Sales','JP_Sales','Other_Sales']]
year_geo_df

In [None]:
# Replace the per-row regional sales with the regional totals of that row's year.
# NOTE: the new columns keep their historical `*_mean` names because later
# cells read them, but they hold per-year SUMS (`transform('sum')`), not means.
# The totals are built as a separate frame and joined on the index rather than
# assigned into `year_geo_df`, which is a slice of `df_copy` and would trigger
# SettingWithCopyWarning.
region_sales_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
yearly_totals = year_geo_df.groupby('Year')[region_sales_cols].transform('sum')
yearly_totals.columns = ['NA_mean', 'EU_mean', 'JP_mean', 'Other_mean']
year_geo_df = pd.concat([year_geo_df[['Year']], yearly_totals], axis=1)
year_geo_df

In [None]:
# Collapse the repeated per-year rows into one row each and order by year.
year_geo_df = year_geo_df.drop_duplicates().sort_values("Year")
year_geo_df

In [None]:
# Build one long-format frame per region with the columns Place (region label),
# Year and Sales (the value taken from that region's `*_mean` column).
def _region_sales_frame(place, value_col):
    """Return a Place/Year/Sales frame for one region column of `year_geo_df`."""
    return pd.DataFrame({
        'Place': [place] * year_geo_df.shape[0],
        'Year': year_geo_df['Year'],
        'Sales': year_geo_df[value_col],
    })


temp_df1 = _region_sales_frame('NA_Sales', 'NA_mean')
temp_df2 = _region_sales_frame('EU_Sales', 'EU_mean')
temp_df3 = _region_sales_frame('JP_Sales', 'JP_mean')
temp_df4 = _region_sales_frame('Other_Sales', 'Other_mean')

In [None]:
# Stack the four regional frames vertically and order the rows by year.
region_frames = [temp_df1, temp_df2, temp_df3, temp_df4]
final = pd.concat(region_frames, axis=0).sort_values("Year")
final

In [None]:
# Animated bar chart: one bar per region, one animation frame per year.
fig = px.bar(
    final,
    x='Place',
    y="Sales",
    color="Place",
    hover_name="Place",
    animation_frame="Year",
    animation_group="Place",
    range_y=[0, 200],
)
# Centre the title horizontally.
fig.update_layout(title="Year sales distribution by region", title_x=0.5)
fig.show()

<h2>Sales and genre</h2>

In [None]:
# Regional and total sales summed per genre, best-selling genre first.
df_genre = (
    df_copy.groupby(['Genre'])
    .agg(regions_agg)
    .sort_values(total_sales_column, ascending=False)
)
df_genre.head()

In [None]:
df_genre.T

In [None]:
# Heatmap of the regional sales per genre. The total-sales column is dropped
# so only the regional columns are plotted, and the frame is transposed so
# regions become rows and genres columns (reads better).
# `columns=` is used because passing the axis positionally (`drop(label, 1)`)
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0.
fig = px.imshow(df_genre.drop(columns=total_sales_column).T)
fig.update_layout(title="Sales distribution by genre and region (Millions)")
fig.show()

In [None]:
# Same per-genre aggregation, restricted to rows from 2016-2019.
in_last_years = df_copy['Year'].isin([2016, 2017, 2018, 2019])
df_g4 = (
    df_copy[in_last_years]
    .groupby(['Genre'])
    .agg(regions_agg)
    .sort_values(total_sales_column, ascending=False)
)
df_g4

In [None]:
# Heatmap of regional sales per genre for the 2016-2019 subset (regions as
# rows, genres as columns; the total-sales column is left out).
# `columns=` replaces the positional axis argument of `drop(label, 1)`, which
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0.
fig = px.imshow(df_g4.drop(columns=total_sales_column).T)
fig.update_layout(title="Sales distribution by genre and region over the last 4 years")
fig.show()

In [None]:
df_genre_tots = df_genre.reset_index()
df_genre_tots

In [None]:
# Line-with-markers chart of the total sales of each genre.
genre_line = go.Scatter(
    x=df_genre_tots['Genre'],
    y=df_genre_tots[total_sales_column],
    mode='lines+markers',
)
fig = go.Figure()
fig.add_trace(genre_line)

fig.update_layout(title="Total sales by genre(Millions)")
fig.update_xaxes(type='category')
fig.show()

In [None]:
# Names of the genres whose total sales exceed 3% of the overall total.
genre_sales_floor = df_genre_tots[total_sales_column].sum() * 0.03
is_top_genre = df_genre_tots[total_sales_column] > genre_sales_floor
df_genre_tops = list(df_genre_tots[is_top_genre]['Genre'])
df_genre_tops

In [None]:
# Donut chart of total sales split by genre, limited to the top genres.
# Use the labels argument in pie to change up the names of the labels on the graph.
# The title previously read 'Population of European continent', a leftover
# from a copied example that has nothing to do with this chart.
df_gg = df_copy[df_copy['Genre'].isin(df_genre_tops)]
fig = px.pie(
    df_gg,
    values=total_sales_column,
    names='Genre',
    title='Share of total sales by genre',
    hover_data=['Genre'],
    hole=0.2,
)
fig.show()

In [None]:
# Pie chart of the sales share per genre with a few sectors pulled out.
# The data is aggregated to one row per genre first: `pull` is an array
# matched to the input points by position, so feeding the raw per-game rows
# left the nine pull values attached to the first nine games instead of to
# the genre sectors (and shipped every row to the browser for nothing).
# Genres are ordered by sales so the pull list lines up with the ranking.
genre_totals = (
    df_gg.groupby('Genre', as_index=False)[total_sales_column]
    .sum()
    .sort_values(total_sales_column, ascending=False)
)

fig = go.Figure()
fig.add_trace(go.Pie(
    labels=genre_totals['Genre'],
    values=genre_totals[total_sales_column],
    pull=[0, 0, 0.1, 0.05, 0, 0, 0.05, 0, 0.05],
))
# Write "percent + label" inside each sector.
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(title="Percent of sales by Genre")
fig.show()

In [None]:
df_g4.head()

In [None]:
df_g4_t = df_g4.reset_index().sort_values(total_sales_column, ascending=False)
df_g4_t

In [None]:
# Line-with-markers chart of total sales per genre for the 2016-2019 subset.
recent_genre_line = go.Scatter(
    x=df_g4_t['Genre'],
    y=df_g4_t[total_sales_column],
    mode='lines+markers',
)
fig = go.Figure()
fig.add_trace(recent_genre_line)
fig.update_layout(title="Total sales by genre (Millions)")
fig.update_xaxes(type='category')
# fig.update_xaxes(categoryorder='total descending')
fig.show()

In [None]:
df_copy.columns

In [None]:
# Independent copy of the working frame in which a missing country is
# labelled 'Unknown'.
df_publishers = df_copy.assign(Country=df_copy.Country.fillna(value='Unknown'))

In [None]:
# Per (publisher, country): summed total sales and number of rows, with the
# 'Unknown' publisher removed.
publisher_stats = (
    df_publishers.groupby(['Publisher', 'Country'])
    .agg({total_sales_column: ['sum', 'count']})
    .reset_index()
)
# Flatten the two-level column index produced by the multi-function agg.
publisher_stats.columns = ['Publisher', 'Country', 'Sales_rev', 'Sales_count']
df_publishers = publisher_stats[publisher_stats.Publisher != 'Unknown']
df_publishers.head()

In [None]:
# Keep publishers holding more than 1% of the summed sales OR more than 1% of
# the row count.
sales_rev_floor = df_publishers['Sales_rev'].sum() * 0.01
sales_count_floor = df_publishers['Sales_count'].sum() * 0.01
above_rev_floor = df_publishers['Sales_rev'] > sales_rev_floor
above_count_floor = df_publishers['Sales_count'] > sales_count_floor
df_publishers = df_publishers[above_rev_floor | above_count_floor]

In [None]:
# Publishers that passed the relevance filter, ordered by summed sales
# (highest first).
# Later cells slice this list (`[:10]`, `[:5]`) to pick a handful of
# publishers; without the explicit sort the list was in the alphabetical
# order left over from the groupby, so those slices picked arbitrary names
# rather than the best-selling ones.
df_listing_publishers = list(
    df_publishers.sort_values('Sales_rev', ascending=False).Publisher.unique()
)

In [None]:
df_copy.head()

In [None]:
df_publishers_sales = df_copy.loc[:, ['Year', 'Publisher', total_sales_column]]
df_publishers_sales.head()

In [None]:
# Summed total sales per (publisher, year), one row per pair.
# (groupby(...)[col].transform('sum') would instead return one value per
# original row, which is the form to use when storing the result as a new
# column of the same frame.)
df_publishers_sales = (
    df_publishers_sales.groupby(['Publisher', 'Year'])
    .agg({total_sales_column: 'sum'})
    .reset_index()
)

In [None]:
df_publishers_sales = df_publishers_sales.sort_values("Year")

In [None]:
# Restrict the publisher/year sales to 2006 onwards and to a hand-picked list
# of publishers.
top_publishers = [
    'Nintendo',
    'Sony Computer Entertainment',
    'Microsoft Game Studios',
    'Konami Digital Entertainment',
    'Electronic Arts',
]
from_2006 = df_publishers_sales['Year'] >= 2006
is_top_publisher = df_publishers_sales['Publisher'].isin(top_publishers)
df_publishers_sales = df_publishers_sales.loc[from_2006 & is_top_publisher]

In [None]:
df_publishers_sales

In [None]:
# Animated bar chart: one bar per selected publisher, one frame per year.
fig = px.bar(
    df_publishers_sales,
    x='Publisher',
    y=total_sales_column,
    color="Publisher",
    hover_name="Publisher",
    animation_frame="Year",
    animation_group="Publisher",
    range_y=[0, 200],
)
fig.update_layout(
    title_text="Top Publisher Game Sale by Year",
    xaxis_domain=[0.05, 1.0],
)
fig.show()

In [None]:
# # ad-hoc adjustment for Microsoft and Bandai
# microsoft_row = pub_tdf[pub_tdf['Publisher'].str.startswith('Microsoft')].sum()
# microsoft_row['Publisher'] = 'Microsoft'
# microsoft_row['Country'] = 'United States'
# bandai_row = pub_tdf[pub_tdf['Publisher'].str.endswith('Bandai')].sum()
# bandai_row['Publisher'] = 'Namco Bandai'
# bandai_row['Country'] = 'Japan'

# # Drop old columns
# pub_tdf = pub_tdf[~pub_tdf['Publisher'].str.startswith('Microsoft')]
# pub_tdf = pub_tdf[~pub_tdf['Publisher'].str.endswith('Bandai')]

# # Append new rows to the DF
# pub_tdf = pub_tdf.append(microsoft_row, ignore_index=True)
# pub_tdf = pub_tdf.append(bandai_row, ignore_index=True)

In [None]:
df_publishers[df_publishers['Publisher'].str.startswith('Microsoft')]

In [None]:
df_publishers[df_publishers['Publisher'].str.endswith('Bandai')]

In [None]:
df_publishers = df_publishers.sort_values('Sales_rev', ascending=False)
df_publishers.head()

In [None]:
# Bubble chart: summed sales per publisher, bubble size = number of rows,
# colour = country.
fig = px.scatter(
    df_publishers,
    x='Publisher',
    y='Sales_rev',
    color='Country',
    size='Sales_count',
)
fig.update_layout(title="Sales by publisher and region (Millions)")
# Order publishers on the x axis by their total y value, largest first.
fig.update_xaxes(categoryorder='total descending')
fig.show()

In [None]:
df_copy.head()

Let’s compare the syntax of `groupby()` and `pivot_table()`. In `groupby()`, we pass the column we want to group by in the parentheses, while in `pivot_table()` the equivalent parameter is `index`/`columns`. In `groupby()`, to choose the column to aggregate, we use subsetting with brackets, while in `pivot_table()` we pass it to `values`. Finally, to choose the aggregating function, we use method chaining in `groupby()`, whereas `pivot_table()` provides the `aggfunc` argument.

In [None]:
EU = df_copy.pivot_table('EU_Sales', columns='Publisher', aggfunc='sum')
EU

In [None]:
def _top_publishers_by(sales_col, n=5):
    """Return the `n` publishers with the highest summed `sales_col`, best first.

    The result is a one-column frame indexed by publisher.
    """
    per_publisher = df_copy.pivot_table(sales_col, columns='Publisher', aggfunc='sum').T
    return per_publisher.sort_values(by=sales_col, ascending=False).iloc[0:n]


# Top-5 publishers for each regional sales column and for the global figure,
# plus the publisher names (the index) used as x values in the chart below.
EU = _top_publishers_by('EU_Sales')
EU_publishers = EU.index

JP = _top_publishers_by('JP_Sales')
JP_publishers = JP.index

NA = _top_publishers_by('NA_Sales')
NA_publishers = NA.index

Other = _top_publishers_by('Other_Sales')
Other_publishers = Other.index

Global = _top_publishers_by('Global_Sales')
Global_publishers = Global.index

In [None]:
# Bar chart of the top-5 publishers with one trace per region. The buttons
# make exactly one trace visible and retitle the figure accordingly.
fig = go.Figure()

# (trace name, top-5 frame, sales column, publisher names) per region, in the
# same order as the buttons defined below.
region_traces = [
    ("North America", NA, 'NA_Sales', NA_publishers),
    ("Europe", EU, 'EU_Sales', EU_publishers),
    ("Japan", JP, 'JP_Sales', JP_publishers),
    ("Others", Other, 'Other_Sales', Other_publishers),
    ("Global", Global, 'Global_Sales', Global_publishers),
]

for position, (trace_name, top_frame, sales_col, publishers) in enumerate(region_traces):
    bar_kwargs = dict(
        y=top_frame[sales_col],
        x=publishers,
        name=trace_name,
        # In marker, `color` decides each bar's position on the colorscale.
        marker={'color': top_frame[sales_col], 'colorscale': 'tealgrn'},
    )
    if position > 0:
        # Only the first (North America) trace is shown initially.
        bar_kwargs['visible'] = False
    fig.add_trace(go.Bar(**bar_kwargs))

countries = ['North America', 'Europe', 'Japan', 'Others', 'Global']
buttons = [
    dict(
        label=country,
        method="update",
        args=[
            # Visibility mask: True only at this button's trace index.
            {"visible": [trace_idx == i for trace_idx in range(len(countries))]},
            {"title": f"Top 5 Publishers for {country}"},
        ],
    )
    for i, country in enumerate(countries)
]

fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="right",
            active=0,
            x=1.17,
            y=1.2,
            buttons=buttons,
        )
    ],
)

# Initial title, shown until a button is pressed.
fig.update_layout(title_text="Top 5 Publishers per region", xaxis_domain=[0.05, 1.0])

fig.show()

In [None]:
# Regional and total sales per (publisher, genre), limited to the relevant
# publishers and the top genres.
pub_genre_df = df_copy.groupby(['Publisher', 'Genre']).agg(regions_agg).reset_index()
keep_rows = (
    pub_genre_df['Publisher'].isin(df_listing_publishers)
    & pub_genre_df['Genre'].isin(df_genre_tops)
)
pub_genre_df = pub_genre_df[keep_rows]
pub_genre_df.head()

In [None]:
# Scatter of total sales per publisher, one point per genre (coloured by genre).
fig = px.scatter(pub_genre_df, x='Publisher', y=total_sales_column, color='Genre')
fig.update_layout(title="Sales by publisher and genre (Millions)")
# Order publishers on the x axis by their total y value, largest first.
fig.update_xaxes(categoryorder='total descending')
fig.show()

In [None]:
# Same (publisher, genre) aggregation, limited to the first ten entries of
# the publisher list and to the top genres.
pub_genre_df = df_copy.groupby(['Publisher', 'Genre']).agg(regions_agg).reset_index()
first_ten_publishers = df_listing_publishers[:10]
keep_rows = (
    pub_genre_df['Publisher'].isin(first_ten_publishers)
    & pub_genre_df['Genre'].isin(df_genre_tops)
)
pub_genre_df = pub_genre_df[keep_rows]
pub_genre_df.head()

In [None]:
pub_genre_pivot_df = pub_genre_df.pivot(
    index='Publisher', columns='Genre', values=total_sales_column)
pub_genre_pivot_df

In [None]:
# Split the publisher x genre pivot into the pieces the heatmap needs:
# z = the cell values as a 2-D array, x = genre labels (pivot columns),
# y = publisher labels (pivot index).
z = pub_genre_pivot_df.values
x = pub_genre_pivot_df.columns.tolist()
y = pub_genre_pivot_df.index.tolist()
# Print all three for a quick visual check.
print(z)
print(x)
print(y)

In [None]:
# Evenly round to the given number of decimals.
z_text = np.around(z)
print(z_text)

In [None]:
# Annotated heatmap of the publisher x genre sales matrix; each cell is
# labelled with the corresponding entry of `z_text` (the rounded values).
fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text, colorscale='viridis')
# Request x categories ordered by their total value, largest first.
fig.update_xaxes(categoryorder='total descending')
fig.update_layout(title="Sales by publisher and genre (Millions)")
fig.show()

In [None]:
# Box plot of per-row total sales for each top genre; hovering a point shows
# the game's name and year.
top_genre_rows = df_copy[df_copy['Genre'].isin(df_genre_tops)]
fig = px.box(
    top_genre_rows,
    y=total_sales_column,
    color='Genre',
    hover_data=['Name', 'Year'],
)
fig.update_layout(title="Games sells boxplot by genre")
fig.show()

In [None]:
# Hand-picked platforms used to filter the sunburst charts below.
platform_tops = ['PS4', 'PSV', 'XOne', 'PC']

In [None]:
# Rows whose genre is among the first four top genres and whose platform is
# among the first four selected platforms.
in_top_genres = df_copy['Genre'].isin(df_genre_tops[:4])
on_top_platforms = df_copy['Platform'].isin(platform_tops[:4])
plat_genre_df = df_copy[in_top_genres & on_top_platforms]
plat_genre_df.head()

In [None]:
fig = px.sunburst(plat_genre_df, path=['Genre', 'Platform'], values=total_sales_column)
fig.show()

In [None]:
# Sunburst of total sales: platform in the inner ring, publisher in the outer
# ring, limited to the selected platforms and the first five listed publishers.
on_top_platforms = df_copy['Platform'].isin(platform_tops[:4])
by_listed_publishers = df_copy['Publisher'].isin(df_listing_publishers[:5])
plat_pub_df = df_copy[on_top_platforms & by_listed_publishers]

fig = px.sunburst(
    plat_pub_df,
    path=['Platform', 'Publisher'],
    values=total_sales_column,
)
fig.show()

In [None]:
# Three-level sunburst of total sales (genre > platform > publisher), limited
# to the first four top genres, the first five listed publishers and the
# selected platforms.
in_top_genres = df_copy['Genre'].isin(df_genre_tops[:4])
by_listed_publishers = df_copy['Publisher'].isin(df_listing_publishers[:5])
on_top_platforms = df_copy['Platform'].isin(platform_tops[:4])
genre_pub_genre_df = df_copy[in_top_genres & by_listed_publishers & on_top_platforms]

fig = px.sunburst(
    genre_pub_genre_df,
    path=['Genre', 'Platform', 'Publisher'],
    values=total_sales_column,
)
fig.show()