In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file that Kaggle has mounted under the input directory
for folder, _, entries in os.walk('/kaggle/input'):
    for entry in entries:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

In [None]:
# Relative path: resolved against the notebook's working directory
file_path = '../input/videogamesales/vgsales.csv'
df = pd.read_csv(file_path)
# Preview the first rows to check that the file was parsed as expected
df.head()

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
# Percentage of rows that lack a Year or a Publisher value
missing_mask = df['Year'].isnull() | df['Publisher'].isnull()
len(df[missing_mask]) / len(df) * 100

In [None]:
# Drops every row with a missing value in ANY column and mutates df in place,
# so every later cell works on the reduced frame
df.dropna(inplace=True)

In [None]:
df.info()

**`Year` has dtype float, so we will convert it to int**

In [None]:
# The integer cast requires that Year contains no NaN (rows with missing values were dropped earlier)
df['Year'] = df['Year'].astype(int)

In [None]:
df.columns



---



## Variable identification

**Categorical variables**
  * Platform
  * Year
  * Genre
  * Publisher

**Numerical variables**
  * NA_Sales
  * EU_Sales
  * JP_Sales
  * Other_Sales
  * Global_Sales


### Platform

In [None]:
# Distinct platform names, ordered from the platform with the most titles to the fewest
df['Platform'].value_counts().to_dict().keys()

**There are a total of 31 platforms.**
**Let's take a look at the top five platforms (the five with the most titles in the dataset).**

In [None]:
# The five platforms with the most titles (value_counts sorts in descending order)
platforms = df['Platform'].value_counts().index[:5].tolist()
sales_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']

# Filter the frame once per platform and reuse that subset for every sales column
platform_frames = [df[df['Platform'] == name] for name in platforms]

# Total sales per platform, each list in the same order as `platforms`
global_sales = [frame['Global_Sales'].sum() for frame in platform_frames]
na_sales = [frame['NA_Sales'].sum() for frame in platform_frames]
eu_sales = [frame['EU_Sales'].sum() for frame in platform_frames]
jp_sales = [frame['JP_Sales'].sum() for frame in platform_frames]
other_sales = [frame['Other_Sales'].sum() for frame in platform_frames]
  

**Let's plot some graphs to see the sales figures for the top five platforms**

In [None]:
# Module-level styling lists, kept under their original names.
explode = [0.1,0.1,0.1,0.1,0.1]
colors = ['lightseagreen', 'mediumpurple', 'aquamarine', 'lawngreen', 'mediumorchid']
# Snapshot of the platform colours: rebinding `colors` in a later cell cannot change
# the defaults used by platform_pie().
PLATFORM_PIE_COLORS = tuple(colors)

def platform_pie(title, sales, labels=None, colors=PLATFORM_PIE_COLORS, explode=None):
  """Draw a donut chart of `sales` on the current matplotlib axes.

  Parameters
  ----------
  title : str
      Text shown above the chart.
  sales : sequence of numbers
      One value per wedge.
  labels : sequence of str, optional
      Wedge labels; defaults to the module-level `platforms` list.
  colors : sequence of colour names, optional
      Wedge colours; defaults to the platform palette captured when this
      function was defined.
  explode : sequence of float, optional
      Radial offset per wedge; defaults to 0.1 for every wedge, sized to
      `sales`, so the chart works for any number of wedges.
  """
  if labels is None:
    labels = platforms
  if explode is None:
    # matplotlib requires exactly one offset per wedge
    explode = [0.1] * len(sales)
  plt.title(title)
  plt.pie(sales,
          labels = labels,
          wedgeprops=dict(width=0.2),  # ring width < 1 turns the pie into a donut
          shadow = True,
          colors = colors,
          explode = explode,
          startangle = 90,
          autopct='%1.1f%%')

In [None]:
# 2x2 grid of donut charts: one panel per sales region
plt.figure(figsize=(10, 10), dpi=100)

regional_panels = [
    ('Sales in North America', na_sales),
    ('Sales in Europe', eu_sales),
    ('Sales in Japan', jp_sales),
    ('Sales in Other countries', other_sales),
]
for position, (panel_title, panel_sales) in enumerate(regional_panels, start=1):
  plt.subplot(2, 2, position)
  platform_pie(panel_title.upper(), panel_sales)

In [None]:
plt.figure(figsize= (5,5), dpi = 100)
# Same donut chart as the regional panels, fed with the worldwide totals per platform
platform_pie('Global Sales'.upper(), global_sales)

* NES and GB are the two dominant platforms overall
* Of these two, NES has more sales than GB in North America and Japan, whereas GB has more sales in Europe



---



### Year

In [None]:
plt.figure(figsize= (7,5), dpi= 100)
plt.xticks(rotation = 90)
# countplot shows the number of rows (titles) per Year, not the sales volume
sns.countplot(x = 'Year', data = df);

In [None]:
def Year_plots(title, sales, color):
  """Plot one sales column of the module-level `df` against Year on a new figure.

  seaborn's lineplot aggregates all rows that share a Year (mean by default) and
  shades a confidence band, so the line shows the average sales per title in
  each year rather than the yearly total.

  Parameters
  ----------
  title : str
      Figure title.
  sales : str
      Name of the `df` column plotted on the y axis.
  color : str
      Line colour.
  """
  line_options = dict(x='Year', y=sales, color=color, data=df)
  plt.figure(figsize=(7, 5), dpi=100)
  plt.title(title)
  plt.xticks(rotation=90)
  sns.lineplot(**line_options)

**Let's see in which year most sales were made**

In [None]:
Year_plots('Year v/s JP Sales', 'JP_Sales', 'red')

In [None]:
Year_plots('Year v/s EU Sales', 'EU_Sales', 'blue')

In [None]:
Year_plots('Year v/s NA Sales', 'NA_Sales', 'green')

In [None]:
Year_plots('Year v/s Other Sales', 'Other_Sales', 'black')

In [None]:
Year_plots('Year v/s Global Sales', 'Global_Sales', 'purple')

**Average sales per game in all the categories were highest in the years 1984, 1985, 1988, 1989, 1990 and 1992 (the line plots show the yearly mean per title, not the yearly total)**



---



### Genre


In [None]:
plt.figure(figsize=(7,5), dpi = 100)
plt.xticks(rotation = 90)
# Number of rows (titles) per genre
sns.countplot(x = 'Genre', data = df);

**The genres 'Action' and 'Sports' have the largest number of games in our dataset**

In [None]:
def genre_plots(title, sales, palette):
  """Draw a bar chart of one sales column per Genre on a new figure.

  Reads the module-level `df`. seaborn's barplot aggregates the rows of each
  genre (mean by default) and adds an error bar, so bar heights are average
  sales per title, not genre totals.

  NOTE(review): recent seaborn releases emit a warning when `palette` is passed
  without `hue`; confirm against the installed version before changing the call.

  Parameters
  ----------
  title : str
      Figure title.
  sales : str
      Name of the `df` column plotted on the y axis.
  palette : str
      seaborn palette name used to colour the bars.
  """
  bar_options = dict(x='Genre', y=sales, data=df, palette=palette)
  plt.figure(figsize=(7, 5), dpi=100)
  plt.title(title)
  plt.xticks(rotation=90)
  sns.barplot(**bar_options)

In [None]:
genre_plots('Sales in Japan as per Genres', 'JP_Sales', 'flare')

**Video games of the genre 'Role-Playing' have made the most sales in Japan**

In [None]:
genre_plots('Sales in North America as per Different Genres', 'NA_Sales', 'husl')

**In the case of North America, the genres 'Platform' and 'Shooter' have made more sales than other genres**

In [None]:
genre_plots('Sales in Europe as per different genres', 'EU_Sales', 'pastel')

**The sales in Europe are similar to the sales in North America: the 'Platform' and 'Shooter' genres have made more sales than other genres**

In [None]:
genre_plots('Sales in other regions as per genres', 'Other_Sales', 'Set2')

**In other regions, genres 'Shooter' and 'Racing' have more sales**



---



### Publisher

In [None]:
df['Publisher'].value_counts()[:10]

In [None]:
len(df['Publisher'].unique())

**There are 576 publishers in total**

**Let's take a look at the total and average global sales figures of the top 10 publishers**

In [None]:
# Total and average global sales of the 10 publishers with the most titles
publishers = df['Publisher'].value_counts().index[:10].tolist()

# One filter per publisher; the same subset feeds both the sum and the mean
sales_by_publisher = [df[df['Publisher'] == name]['Global_Sales'] for name in publishers]
total_sales = [sales.sum() for sales in sales_by_publisher]
average_sales = [sales.mean() for sales in sales_by_publisher]

my_dict = dict(
    publishers=publishers,
    total_sales=total_sales,
    average_sales=average_sales,
)

publisher_df = pd.DataFrame(my_dict)
publisher_df

In [None]:
plt.figure(figsize= (7,5), dpi = 100)

plt.title('Top 10 publishers and their total sales')
plt.xticks(rotation = 90)
# publisher_df has one row per publisher, so each bar is exactly that publisher's total
sns.barplot(x = 'publishers',
            y = 'total_sales', 
            data = publisher_df,
            palette = 'flare');

**Nintendo, Electronic Arts and Activision are the top three companies with highest total sales**

In [None]:
plt.figure(figsize= (7,5), dpi = 100)
plt.title('Top 10 publishers and their average sales')
plt.xticks(rotation = 90)
# average_sales is the mean Global_Sales per title of each publisher
sns.barplot(x = 'publishers',
            y = 'average_sales', 
            data = publisher_df,
            palette = 'husl');

**Nintendo, Take-Two Interactive and Sony Computer Entertainment are the top three companies with highest average sales**



---



## Numerical Variables

**Let's take a look at which region has the highest total sales**

In [None]:
sales_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
# Total and per-title mean of every regional sales column, in sales_cols order
total = [df[region_col].sum() for region_col in sales_cols]
average = [df[region_col].mean() for region_col in sales_cols]

In [None]:
# Dedicated names, so this cell does not overwrite the 5-element `explode` / `colors`
# lists that were defined for the platform charts.
region_explode = [0.1, 0.0, 0, 0]  # pull out only the first wedge (NA_SALES)
region_colors = ['darkturquoise', 'darkviolet', 'navy', 'orange']

plt.figure(figsize=(7,5), dpi = 100)
plt.title('Total Sales per region'.upper())
# `total` holds one summed value per entry of sales_cols, in the same order
plt.pie(total,
        labels = [i.upper() for i in sales_cols],
        explode = region_explode,
        shadow = True,
        colors = region_colors,
        wedgeprops=dict(width=0.2),
        autopct='%1.1f%%');

**The North America region has the highest total sales and the highest average sales as well**



---

---





# Asking and Answering questions

**Which are the top-selling games in each genre in each region and globally?**

In [None]:
def highest_genre(df,region_sales):
  """Return the best-selling game of every genre for one sales column.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns 'Name', 'Genre' and `region_sales`.
  region_sales : str
      Name of the sales column to rank by (e.g. 'NA_Sales').

  Returns
  -------
  pandas.DataFrame
      Columns 'Name', 'Genre' and 'Sales', one row per genre, indexed from 1.
      Genres are ordered from most to fewest rows in `df`. If several titles
      tie for the maximum, which one is reported depends on the sort.
  """
  genre_list = list(df['Genre'].value_counts().index)
  name_list = []
  sale_list = []

  for genre in genre_list:
    # Sort once per genre. NaN sales are placed first so that a missing value
    # can never be reported as the maximum (sort_values puts NaN last by default).
    ranked = df.loc[df['Genre'] == genre, ['Name', region_sales]].sort_values(
        region_sales, ascending = True, na_position = 'first')
    name_list.append(ranked['Name'].iloc[-1])
    sale_list.append(ranked[region_sales].iloc[-1])

  gs_dict = {
      'Name': name_list,
      'Genre': genre_list,
      'Sales': sale_list
  }

  return pd.DataFrame(gs_dict, index= list(range(1, len(genre_list) + 1)))

In [None]:
print('The highest saling video games in each genre for every region are: \n'.upper())
regions = ['North America', 'Europe', 'Japan', 'Other regions']
# regions and sales_cols are listed in the same order, so they can be paired directly
for region_name, col in zip(regions, sales_cols):
  print(region_name)
  print(highest_genre(df, col))
  print('\n')

**Which platforms have sold video games of all the genre types?**

In [None]:
# Summed sales per platform (rows) and genre (columns). A platform/genre pair with
# no rows in df has nothing to aggregate and shows up as NaN in the table.
genre_table = pd.pivot_table(df,
                             index = 'Platform',
                             columns = ['Genre'],
                             values= ['NA_Sales','JP_Sales', 'EU_Sales', 'Other_Sales', 'Global_Sales'],
                             aggfunc= 'sum')  # the string form replaces the deprecated np.sum callable
# Keep only the platforms without any NaN cell, i.e. with titles in every genre
genre_table = genre_table.dropna()
# Display the surviving platforms
list(genre_table.index)

**The platforms which have sold the video games of all the genres are:**
* 3DS
* DS
* GBA
* GC
* N64
* PC
* PS
* PS2
* PS3
* PS4
* PSP
* PSV
* SAT
* SNES
* Wii
* WiiU
* X360
* XB

**The platforms which made highest sales for each genre in each region and how much**

In [None]:
def highest_platform(df, region_sales):
  """Return, per genre, the platform of the single best-selling title.

  NOTE(review): the ranking is per title row, not per platform total. The
  result is the platform of the top-selling game of each genre; sales are not
  summed by platform. Confirm that this is the intended question.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns 'Platform', 'Genre' and `region_sales`.
  region_sales : str
      Name of the sales column to rank by (e.g. 'EU_Sales').

  Returns
  -------
  pandas.DataFrame
      Columns 'Platform', 'Genre' and `region_sales`, one row per genre,
      indexed from 1. Genres are ordered from most to fewest rows in `df`.
  """
  genre_list = list(df['Genre'].value_counts().index)
  name_list = []
  sale_list = []

  for genre in genre_list:
    # Sort once per genre. NaN sales are placed first so that a missing value
    # can never be reported as the maximum (sort_values puts NaN last by default).
    ranked = df.loc[df['Genre'] == genre, ['Platform', region_sales]].sort_values(
        region_sales, ascending = True, na_position = 'first')
    name_list.append(ranked['Platform'].iloc[-1])
    sale_list.append(ranked[region_sales].iloc[-1])

  gs_dict = {
          'Platform': name_list,
          'Genre': genre_list,
          region_sales: sale_list
      }

  return pd.DataFrame(gs_dict, index= list(range(1, len(genre_list) + 1)))

In [None]:
print('The platforms which made highest sales for each genre in each region: \n'.upper())
# `regions` (defined in an earlier cell) and sales_cols share the same order
for region_name, col in zip(regions, sales_cols):
  print(region_name)
  print(highest_platform(df,col))
  print('\n')

**The largest number of games in the dataset was released in the year 2009. Let's explore that year**

In [None]:
# Rows whose Year equals 2009
year_2009_df = df[df['Year'] == 2009] 
year_2009_df

In [None]:
plt.figure(figsize= (7,5), dpi = 100)
plt.title('Global sales made per genre in the year 2009')
plt.xticks(rotation = 60)
# barplot aggregates the rows of each genre with the mean by default, so the bars
# show average Global_Sales per title rather than genre totals
sns.barplot(x = 'Genre', 
            y = 'Global_Sales', 
            data = year_2009_df);

**Electronic Arts is the publisher with the largest number of games in the dataset.**
**Let's explore this company's sales**

In [None]:
df['Publisher'].value_counts()[:10]

In [None]:
# All rows published by Electronic Arts
ea_df = df[df['Publisher'] == 'Electronic Arts']
ea_df

In [None]:
# Regional totals and per-title means for Electronic Arts, in sales_cols order.
# NOTE: this rebinds total_sales / average_sales, which held the per-publisher figures before.
total_sales = [ea_df[col].sum() for col in sales_cols]
average_sales = [ea_df[col].mean() for col in sales_cols]
print(total_sales)
print(average_sales)

In [None]:
plt.figure(figsize= (7,5), dpi = 100)
plt.title("Total Sales made by Electronic Arts in each region")
# Dedicated name, so this cell does not overwrite the `colors` list defined for the platform charts
ea_colors = ['mediumblue', 'darkmagenta', 'aquamarine', 'red', 'lime']
# total_sales holds one Electronic Arts total per entry of sales_cols, in the same order
plt.pie(total_sales,
        labels = sales_cols,
        wedgeprops=dict(width=0.2),
        shadow = True,
        colors = ea_colors,
        startangle = 90,
        autopct='%1.1f%%');

In [None]:
# Average sales per title for each Electronic Arts genre, per region and globally
ea_df.groupby('Genre')[sales_cols + ['Global_Sales']].mean()

In [None]:
# Total sales for each Electronic Arts genre, per region and globally
ea_df.groupby('Genre')[sales_cols + ['Global_Sales']].sum()

* **The genre 'Shooter' has the highest average sales in each region and globally as well.**
* **The genre 'Sports' has the highest total sales in every region and globally as well.**
* **Electronic Arts sold no games of the genre 'Puzzle' in Japan.**
* **In North America, the genre 'Adventure' made the lowest total sales and the genre 'Platform' made the lowest average sales.**
* **In Europe, the genre 'Strategy' made the lowest sales.**

## References
* https://pandas.pydata.org/docs/
* https://numpy.org/doc/
* https://matplotlib.org/stable/contents.html
* https://seaborn.pydata.org/
