In [1]:
# Import Dependencies
import pandas as pd

In [2]:
# Load the raw video game sales CSV into a dataframe and preview the first five rows
path = "resources/vgsales.csv"
vg_sales_df = pd.read_csv(filepath_or_buffer=path)
vg_sales_df.head(n=5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [3]:
# Discard the columns that are not needed downstream, then index the rows by Rank
unused_columns = ["Year", "Other_Sales", "Global_Sales"]
vg_sales_df = vg_sales_df.drop(columns=unused_columns).set_index("Rank")
vg_sales_df.head(n=5)

Unnamed: 0_level_0,Name,Platform,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Wii Sports,Wii,Sports,Nintendo,41.49,29.02,3.77
2,Super Mario Bros.,NES,Platform,Nintendo,29.08,3.58,6.81
3,Mario Kart Wii,Wii,Racing,Nintendo,15.85,12.88,3.79
4,Wii Sports Resort,Wii,Sports,Nintendo,15.75,11.01,3.28
5,Pokemon Red/Pokemon Blue,GB,Role-Playing,Nintendo,11.27,8.89,10.22


In [4]:
# Reduce number of rows
# Heroku database has a row limit of 10,000 rows
# Exploratory analysis of the dataset reveals dropping tail end of data will not skew results
# Due to database constraints, keep only the first 9,800 rows
# (for the full dataset this drops the last 6,798 rows)
# Publisher will be dropped since we are not using that information and there are null values
MAX_ROWS = 9800

# Derive the number of trailing rows being removed from the frame itself rather
# than hard-coding it, so the cell still leaves at most MAX_ROWS rows if the
# source file changes size or if this cell is executed more than once.
n = max(len(vg_sales_df) - MAX_ROWS, 0)

# Select by position instead of dropping by index label, so the result does not
# depend on the Rank labels being unique. .copy() yields an independent frame so
# later in-place edits do not operate on a slice of the original.
vg_sales_df = vg_sales_df.head(MAX_ROWS).copy()
vg_sales_df.count()

Name         9800
Platform     9800
Genre        9800
Publisher    9770
NA_Sales     9800
EU_Sales     9800
JP_Sales     9800
dtype: int64

In [5]:
# Remove the Publisher column from the dataframe and preview the result
vg_sales_df = vg_sales_df.drop(columns="Publisher")
vg_sales_df.head(n=5)

Unnamed: 0_level_0,Name,Platform,Genre,NA_Sales,EU_Sales,JP_Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Wii Sports,Wii,Sports,41.49,29.02,3.77
2,Super Mario Bros.,NES,Platform,29.08,3.58,6.81
3,Mario Kart Wii,Wii,Racing,15.85,12.88,3.79
4,Wii Sports Resort,Wii,Sports,15.75,11.01,3.28
5,Pokemon Red/Pokemon Blue,GB,Role-Playing,11.27,8.89,10.22


In [6]:
# Get counts of unique values for each remaining column of the dataframe
# Platform and Genre are the categorical columns exported as lookup CSVs below
# NOTE(review): this comment previously cited a total of 373 unique values across
# Platform, Genre, and Publisher, plus an additional "N/A" value for null
# Publishers. Publisher has already been dropped from the dataframe at this
# point, so it is not part of the counts shown here.
vg_sales_df.nunique()

Name        6720
Platform      28
Genre         12
NA_Sales     409
EU_Sales     305
JP_Sales     244
dtype: int64


### Game Platform

In [7]:
# Count missing entries in the Platform column
vg_sales_df["Platform"].isna().sum()

0

In [8]:
# Collect the distinct Platform values, in order of first appearance, as a list
platforms = vg_sales_df["Platform"].drop_duplicates().tolist()
print(platforms)

['Wii', 'NES', 'GB', 'DS', 'X360', 'PS3', 'PS2', 'SNES', 'GBA', '3DS', 'PS4', 'N64', 'PS', 'XB', 'PC', '2600', 'PSP', 'XOne', 'GC', 'WiiU', 'GEN', 'DC', 'PSV', 'SAT', 'SCD', 'WS', 'NG', 'TG16']


In [9]:
# Wrap the platforms list in a pandas Series (default integer index)
platform_series = pd.Series(data=platforms)
platform_series

0      Wii
1      NES
2       GB
3       DS
4     X360
5      PS3
6      PS2
7     SNES
8      GBA
9      3DS
10     PS4
11     N64
12      PS
13      XB
14      PC
15    2600
16     PSP
17    XOne
18      GC
19    WiiU
20     GEN
21      DC
22     PSV
23     SAT
24     SCD
25      WS
26      NG
27    TG16
dtype: object

In [10]:
# Write platform_series out to a CSV file
platforms_csv_path = 'resources/vg_platforms.csv'
platform_series.to_csv(platforms_csv_path)

### Genre

In [11]:
# Count missing entries in the Genre column
vg_sales_df["Genre"].isna().sum()

0

In [12]:
# Collect the distinct Genre values, in order of first appearance, as a list
genres = vg_sales_df["Genre"].drop_duplicates().tolist()
print(genres)

['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc', 'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure', 'Strategy']


In [13]:
# Wrap the genres list in a pandas Series (default integer index)
genres_series = pd.Series(data=genres)
genres_series

0           Sports
1         Platform
2           Racing
3     Role-Playing
4           Puzzle
5             Misc
6          Shooter
7       Simulation
8           Action
9         Fighting
10       Adventure
11        Strategy
dtype: object

In [14]:
# Write genres_series out to a CSV file
genres_csv_path = 'resources/vg_genres.csv'
genres_series.to_csv(genres_csv_path)

### Games

In [15]:
# Build the games dataframe by removing the Platform and Genre columns
lookup_columns = ["Platform", "Genre"]
vg_sales_df = vg_sales_df.drop(columns=lookup_columns)
vg_sales_df

Unnamed: 0_level_0,Name,NA_Sales,EU_Sales,JP_Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Wii Sports,41.49,29.02,3.77
2,Super Mario Bros.,29.08,3.58,6.81
3,Mario Kart Wii,15.85,12.88,3.79
4,Wii Sports Resort,15.75,11.01,3.28
5,Pokemon Red/Pokemon Blue,11.27,8.89,10.22
...,...,...,...,...
9797,Rush,0.11,0.00,0.00
9798,Ecco the Dolphin,0.00,0.00,0.12
9799,The Legend of Heroes: Trails of Cold Steel II,0.01,0.00,0.11
9800,Looney Tunes: Sheep Raider,0.07,0.05,0.00


In [17]:
# Save the vg_sales dataframe to a CSV file
games_csv_path = 'resources/vg_games.csv'
vg_sales_df.to_csv(games_csv_path)