In [1]:
# Import Dependencies
import pandas as pd

In [2]:
# Import Data
path = "resources/vgsales.csv"
vg_sales_df = pd.read_csv(path)
vg_sales_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [3]:
# Drop columns
vg_sales_df.drop(["Year", "Other_Sales", "Global_Sales"], axis=1, inplace=True)
vg_sales_df.set_index("Rank", inplace=True)
vg_sales_df.head()

Unnamed: 0_level_0,Name,Platform,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Wii Sports,Wii,Sports,Nintendo,41.49,29.02,3.77
2,Super Mario Bros.,NES,Platform,Nintendo,29.08,3.58,6.81
3,Mario Kart Wii,Wii,Racing,Nintendo,15.85,12.88,3.79
4,Wii Sports Resort,Wii,Sports,Nintendo,15.75,11.01,3.28
5,Pokemon Red/Pokemon Blue,GB,Role-Playing,Nintendo,11.27,8.89,10.22


In [4]:
# Reduce number of rows
# Heroku database has a row limit of 10,000 rows
# Exploratory analysis of the dataset reveals dropping tail end of data will not skew results
# Due to database constraints, drop the last 6,798 rows to leave 9,800 rows
n = 6798
vg_sales_df.drop(vg_sales_df.tail(n).index, inplace=True)
vg_sales_df.count()

Name         9800
Platform     9800
Genre        9800
Publisher    9770
NA_Sales     9800
EU_Sales     9800
JP_Sales     9800
dtype: int64

In [5]:
# Get counts of unique values from dataframe
# Returns total of 373 values across Platform, Genre, and Publisher
# Will add an additonal "N/A" value for null values in Publisher
print(vg_sales_df.nunique())

Name         6720
Platform       28
Genre          12
Publisher     333
NA_Sales      409
EU_Sales      305
JP_Sales      244
dtype: int64


### Game Platform

In [6]:
# Get unique Platform values
vg_sales_df.head()

Unnamed: 0_level_0,Name,Platform,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Wii Sports,Wii,Sports,Nintendo,41.49,29.02,3.77
2,Super Mario Bros.,NES,Platform,Nintendo,29.08,3.58,6.81
3,Mario Kart Wii,Wii,Racing,Nintendo,15.85,12.88,3.79
4,Wii Sports Resort,Wii,Sports,Nintendo,15.75,11.01,3.28
5,Pokemon Red/Pokemon Blue,GB,Role-Playing,Nintendo,11.27,8.89,10.22


In [7]:
# Check for null values
vg_sales_df["Platform"].isnull().sum()

0

In [8]:
# Create csv from unique values for database storage


### Genre

In [9]:
# Get unique Genre values
vg_sales_df["Genre"].unique()

array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure',
       'Strategy'], dtype=object)

In [10]:
# Check for null values
vg_sales_df["Genre"].isnull().sum()

0

In [11]:
# Create csv from unique values for database storage


### Publisher

In [12]:
# Get unique Publisher values
vg_sales_df["Publisher"].unique()

array(['Nintendo', 'Microsoft Game Studios', 'Take-Two Interactive',
       'Sony Computer Entertainment', 'Activision', 'Ubisoft',
       'Bethesda Softworks', 'Electronic Arts', 'Sega', 'SquareSoft',
       'Atari', '505 Games', 'Capcom', 'GT Interactive',
       'Konami Digital Entertainment',
       'Sony Computer Entertainment Europe', 'Square Enix', 'LucasArts',
       'Virgin Interactive', 'Warner Bros. Interactive Entertainment',
       'Universal Interactive', 'Eidos Interactive', 'RedOctane',
       'Vivendi Games', 'Enix Corporation', 'Namco Bandai Games',
       'Palcom', 'Hasbro Interactive', 'THQ', 'Fox Interactive',
       'Acclaim Entertainment', 'MTV Games', 'Disney Interactive Studios',
       nan, 'Majesco Entertainment', 'Codemasters', 'Red Orb', 'Level 5',
       'Arena Entertainment', 'Midway Games', 'JVC', 'Deep Silver',
       '989 Studios', 'NCSoft', 'UEP Systems', 'Parker Bros.', 'Maxis',
       'Imagic', 'Tecmo Koei', 'Valve Software', 'ASCII Entertainment',


In [13]:
# Check for null values
vg_sales_df["Publisher"].isnull().sum()

30

In [14]:
# Create csv from unique values for database storage

### Regional Sales


In [15]:
# Create a reduced dataframe for regional sales data
regional_sales_df = vg_sales_df
regional_sales_df.drop(["Name", "Platform", "Genre", "Publisher"],axis=1, inplace=True)
regional_sales_df.head()

Unnamed: 0_level_0,NA_Sales,EU_Sales,JP_Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,41.49,29.02,3.77
2,29.08,3.58,6.81
3,15.85,12.88,3.79
4,15.75,11.01,3.28
5,11.27,8.89,10.22


In [16]:
# Check number of rows
regional_sales_df.count()

NA_Sales    9800
EU_Sales    9800
JP_Sales    9800
dtype: int64

In [17]:
# Check for null values
regional_sales_df.isnull().sum()

NA_Sales    0
EU_Sales    0
JP_Sales    0
dtype: int64

In [18]:
# Create csv from reduced dataframe for database storage
