In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

First, we import the data into a pandas DataFrame.

In [3]:
# Load the raw CSV into a DataFrame; the file is resolved relative to the
# notebook's working directory.
vg = pd.read_csv(filepath_or_buffer='vg_data.csv')

We check that the data loaded correctly by looking at the first few rows with `.head()`.

In [4]:
# Preview the first five rows to confirm the file parsed as expected.
vg.head(n=5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


Then we use `.info()` to take a closer look at the data and see whether there are any issues.

In [5]:
# Print each column's dtype and non-null count; columns whose count is below
# the total number of entries contain missing values.
vg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
Rank            16598 non-null int64
Name            16598 non-null object
Platform        16598 non-null object
Year            16327 non-null float64
Genre           16598 non-null object
Publisher       16540 non-null object
NA_Sales        16598 non-null float64
EU_Sales        16598 non-null float64
JP_Sales        16598 non-null float64
Other_Sales     16598 non-null float64
Global_Sales    16598 non-null float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


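Since the non-null counts are inconsistent across columns (Year has 16,327 and Publisher has 16,540 non-null values out of 16,598 entries), we know some values are missing, so first we drop every row that contains a missing value (`None` or `NaN`).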

In [6]:
# Keep only the rows that have no missing value in any column. The result is
# bound to a new name, so the raw frame `vg` is left untouched.
games = vg.dropna(axis=0, how='any')

Then we check `.info()` again to confirm that the non-null counts are now consistent across all columns.

In [7]:
# Re-inspect dtypes and non-null counts after dropping incomplete rows; every
# column should now report the same count.
games.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16291 entries, 0 to 16597
Data columns (total 11 columns):
Rank            16291 non-null int64
Name            16291 non-null object
Platform        16291 non-null object
Year            16291 non-null float64
Genre           16291 non-null object
Publisher       16291 non-null object
NA_Sales        16291 non-null float64
EU_Sales        16291 non-null float64
JP_Sales        16291 non-null float64
Other_Sales     16291 non-null float64
Global_Sales    16291 non-null float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.5+ MB


Now that we have 16,291 entries for each attribute, we can use `.describe()` to look at the summary statistics of the numeric attributes and see if there are any glaring mistakes.

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the quartiles listed here are the pandas defaults, spelled out.
games.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,16291.0,16291.0,16291.0,16291.0,16291.0,16291.0,16291.0
mean,8290.190228,2006.405561,0.265647,0.147731,0.078833,0.048426,0.54091
std,4792.65445,5.832412,0.822432,0.509303,0.311879,0.190083,1.567345
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4132.5,2003.0,0.0,0.0,0.0,0.0,0.06
50%,8292.0,2007.0,0.08,0.02,0.0,0.01,0.17
75%,12439.5,2010.0,0.24,0.11,0.04,0.04,0.48
max,16600.0,2020.0,41.49,29.02,10.22,10.57,82.74


Since it is currently 2017 and the data set is from 2016, we know that the maximum Year of 2020 is an error. It is either because the game's title contains "2020" or because of a data-entry error. We will remove every row whose Year is greater than 2016.

In [16]:
# The data set is from 2016, so any later release year is treated as invalid.
LAST_VALID_YEAR = 2016

# Keep rows whose Year is at most the cutoff, i.e. drop everything greater
# than 2016 exactly as stated in the narrative. Year is stored as float64, so
# comparing with `<=` against the cutoff (rather than `< 2017`) also rejects
# any fractional value above it.
# .copy() makes cleaned_games an independent frame, so assigning to it later
# cannot raise SettingWithCopyWarning or be confused with a slice of `games`.
cleaned_games = games.loc[games['Year'] <= LAST_VALID_YEAR].copy()

In [17]:
# Confirm how many rows remain after the Year filter and that the non-null
# counts are still equal across all columns.
cleaned_games.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16287 entries, 0 to 16597
Data columns (total 11 columns):
Rank            16287 non-null int64
Name            16287 non-null object
Platform        16287 non-null object
Year            16287 non-null float64
Genre           16287 non-null object
Publisher       16287 non-null object
NA_Sales        16287 non-null float64
EU_Sales        16287 non-null float64
JP_Sales        16287 non-null float64
Other_Sales     16287 non-null float64
Global_Sales    16287 non-null float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.5+ MB


In [18]:
# Recompute the summary statistics on the filtered frame to verify that the
# maximum Year is now within range; the quartiles listed are the pandas defaults.
cleaned_games.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,16287.0,16287.0,16287.0,16287.0,16287.0,16287.0,16287.0
mean,8288.969853,2006.402775,0.265695,0.147768,0.078849,0.048437,0.541022
std,4792.138597,5.830382,0.822525,0.50936,0.311916,0.190105,1.56752
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4131.5,2003.0,0.0,0.0,0.0,0.0,0.06
50%,8291.0,2007.0,0.08,0.02,0.0,0.01,0.17
75%,12437.5,2010.0,0.24,0.11,0.04,0.04,0.48
max,16600.0,2016.0,41.49,29.02,10.22,10.57,82.74
