## Setup and Dependencies

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np

In [2]:
# Read the two raw CSV inputs (game sales and console prices) as UTF-8 text
games_data = pd.read_csv(filepath_or_buffer="../data/vgsales.csv", encoding="utf-8")
price_data = pd.read_csv(filepath_or_buffer="../data/all_console_prices.csv", encoding="utf-8")

# Preview the first rows of the sales table
games_data.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


# Games Sales Data
<hr>

### Remove Extra Platforms

In [3]:
# Keep only the rows whose Platform is one of the eight consoles under study,
# then renumber the surviving rows from zero.
games_clean = games_data.loc[
    games_data['Platform'].isin(['2600', 'NES', 'GEN', 'SNES', 'N64', 'SCD', 'SAT', 'PS'])
].reset_index(drop=True)

# Non-null count per column; shows which columns still have gaps
games_clean.count()

Rank            2191
Name            2191
Platform        2191
Year            2164
Genre           2191
Publisher       2188
NA_Sales        2191
EU_Sales        2191
JP_Sales        2191
Other_Sales     2191
Global_Sales    2191
dtype: int64

### Remove Rank Column & Drop Blank Year

In [4]:
# Build the cleaned sales table in one pass: keep the analysis columns (Rank is
# left out), order rows by platform then title, drop incomplete rows, and store
# Year as an integer.
games_clean_df = (
    games_clean[['Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
    .sort_values(by=['Platform', 'Name'])
    .reset_index(drop=True)
    # Turn blank-string years into NaN so dropna() below removes them as well.
    # The result is kept here; a bare .replace() call returns a new Series and
    # changes nothing unless it is assigned.
    .assign(Year=lambda frame: frame['Year'].replace('', np.nan))
    # dropna() removes a row when ANY column is missing, not only Year.
    .dropna()
    # The integer cast is only possible once no NaN is left in Year.
    .astype({'Year': int})
)


# Non-null count per column after cleaning
games_clean_df.count()

Name            2164
Platform        2164
Year            2164
Genre           2164
Publisher       2164
NA_Sales        2164
EU_Sales        2164
JP_Sales        2164
Other_Sales     2164
Global_Sales    2164
dtype: int64

In [5]:
# Show the first five cleaned rows
games_clean_df.head(n=5)

Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,Action Force,2600,1982,Action,CPG Products,0.5,0.03,0.0,0.01,0.54
2,Adventures of Tron,2600,1981,Action,Mattel Interactive,0.63,0.03,0.0,0.01,0.67
3,Air Raid,2600,1981,Action,Men-A-Vision,0.72,0.04,0.0,0.01,0.77
5,Airlock,2600,1981,Action,Data Age,0.36,0.02,0.0,0.0,0.39
6,Alien,2600,1981,Action,20th Century Fox Video Games,0.74,0.04,0.0,0.01,0.79


### Convert Sales from Millions to Whole Numbers

In [6]:
# Scale every sales column by 1,000,000 and store it as whole numbers, with
# zero replaced by NaN so that "no sales" is treated as missing.
#
# - round() runs before the integer cast because astype(int) truncates: a
#   floating-point product that lands just below a whole number would
#   otherwise lose one unit.
# - np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
# - A column that receives NaN becomes float; a column with no zeros stays int.
#
# NOTE: this cell overwrites the columns in place, so it should be run once per
# fresh games_clean_df; re-running it would rescale already-converted values.
for sales_col in ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']:
    games_clean_df[sales_col] = (
        games_clean_df[sales_col]
        .multiply(1000000)
        .round()
        .astype(int)
        .replace(0, np.nan)
    )

games_clean_df.head()

Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,Action Force,2600,1982,Action,CPG Products,500000.0,30000.0,,10000.0,540000
2,Adventures of Tron,2600,1981,Action,Mattel Interactive,630000.0,30000.0,,10000.0,670000
3,Air Raid,2600,1981,Action,Men-A-Vision,720000.0,40000.0,,10000.0,770000
5,Airlock,2600,1981,Action,Data Age,360000.0,20000.0,,,390000
6,Alien,2600,1981,Action,20th Century Fox Video Games,740000.0,40000.0,,10000.0,790000


### Make Game Names Upper Case & Remove Punctuation

In [7]:
# Normalise game names so they can be used as a join key: upper-case them and
# strip every run of characters that is neither a word character nor whitespace.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the punctuation in place.
games_clean_df['Name'] = (
    games_clean_df['Name']
    .str.upper()
    .str.replace(r'[^\w\s]+', '', regex=True)
)

games_clean_df.head()

Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,ACTION FORCE,2600,1982,Action,CPG Products,500000.0,30000.0,,10000.0,540000
2,ADVENTURES OF TRON,2600,1981,Action,Mattel Interactive,630000.0,30000.0,,10000.0,670000
3,AIR RAID,2600,1981,Action,Men-A-Vision,720000.0,40000.0,,10000.0,770000
5,AIRLOCK,2600,1981,Action,Data Age,360000.0,20000.0,,,390000
6,ALIEN,2600,1981,Action,20th Century Fox Video Games,740000.0,40000.0,,10000.0,790000


### Find Average NA_Sales Price and Add Column

In [8]:
# Column-wise average of the regional and global sales figures
# (NaN entries are skipped by mean() by default)
games_clean_df.loc[:, ["NA_Sales", "EU_Sales", "JP_Sales", "Global_Sales"]].mean()


NA_Sales        503339.930033
EU_Sales        205306.395960
JP_Sales        412167.150632
Global_Sales    714930.681608
dtype: float64

In [9]:
# NOTE(review): disabled draft of per-region averages; nothing in this cell runs.
# The draft read "NA_Sales" on every line regardless of the variable name; each
# line below now names the column that matches its variable - confirm before enabling.
# NA_Average = games_clean_df[["NA_Sales"]].mean().astype(float)
# EU_Average = games_clean_df[["EU_Sales"]].mean().astype(float)
# JP_Average = games_clean_df[["JP_Sales"]].mean().astype(float)
# Other_Average = games_clean_df[["Other_Sales"]].mean().astype(float)
# Global_Average = games_clean_df[["Global_Sales"]].mean().astype(float)


# Price Data
<hr>

### Display Data Frame

In [10]:
# Narrow the price table to console, title and price, ordered by console then
# title, with a fresh zero-based index.
price_data_df = (
    price_data
    .loc[:, ['Console', 'Game Title', 'Price']]
    .sort_values(by=['Console', 'Game Title'])
    .reset_index(drop=True)
)

price_data_df.head()

Unnamed: 0,Console,Game Title,Price
0,2600,3-D Tic-Tac-Toe [Atari],23.31
1,2600,3-D Tic-Tac-Toe [Sears],83.88
2,2600,A Game of Concentration,28.36
3,2600,Adventure [Atari],79.72
4,2600,Adventure [Sears],111.35


### Make Game Title Upper Case & Remove Punctuation

In [11]:
# Normalise price-list titles so they can be used as a join key: upper-case
# them and strip every run of characters that is neither a word character nor
# whitespace.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the punctuation in place.
price_data_df['Game Title'] = (
    price_data_df['Game Title']
    .str.upper()
    .str.replace(r'[^\w\s]+', '', regex=True)
)
price_data_df.head()

Unnamed: 0,Console,Game Title,Price
0,2600,3D TICTACTOE ATARI,23.31
1,2600,3D TICTACTOE SEARS,83.88
2,2600,A GAME OF CONCENTRATION,28.36
3,2600,ADVENTURE ATARI,79.72
4,2600,ADVENTURE SEARS,111.35


### Find Average Price and Add Column

In [50]:
# Mean and median of Price, each kept as a one-element Series labelled "Price"
# because the column is selected with a list.
mean, median = price_data_df[["Price"]].mean(), price_data_df[["Price"]].median()

# One Series per line, mean first
print(mean, median, sep="\n")


Price    79.94837
dtype: float64
Price    30.57
dtype: float64


In [51]:
# Flag each title whose price is strictly above the overall mean / median.
# `mean` and `median` are the one-element Series computed in the previous cell,
# so the scalar is read out by its "Price" label.
# NOTE(review): the saved output also shows a "Threshold" column that no visible
# cell creates - presumably left over from a deleted cell; confirm with a
# Restart & Run All.
price_data_df['Mean'] = price_data_df['Price'].gt(mean['Price'])
price_data_df['Median'] = price_data_df['Price'].gt(median['Price'])
price_data_df.head(n=20)

Unnamed: 0,Console,Game Title,Price,Threshold,Mean,Median
0,2600,3D TICTACTOE ATARI,23.31,False,False,False
1,2600,3D TICTACTOE SEARS,83.88,True,True,True
2,2600,A GAME OF CONCENTRATION,28.36,False,False,False
3,2600,ADVENTURE ATARI,79.72,True,False,True
4,2600,ADVENTURE SEARS,111.35,True,True,True
5,2600,ADVENTURES OF TRON,16.1,False,False,False
6,2600,AIR RAID,0.0,False,False,False
7,2600,AIR RAIDERS,15.02,False,False,False
8,2600,AIRSEA BATTLE,26.15,False,False,False
9,2600,AIRLOCK,17.0,False,False,False


## Merge Data
<hr>

In [52]:
# Inner-join sales and prices on (title, console): only games present in both
# tables, with matching normalised name and platform, are kept.
merged_df = games_clean_df.merge(
    price_data_df,
    how='inner',
    left_on=['Name', 'Platform'],
    right_on=['Game Title', 'Console'],
)

merged_df.head()

Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Console,Game Title,Price,Threshold,Mean,Median
0,ADVENTURES OF TRON,2600,1981,Action,Mattel Interactive,630000.0,30000.0,,10000.0,670000,2600,ADVENTURES OF TRON,16.1,False,False,False
1,AIR RAID,2600,1981,Action,Men-A-Vision,720000.0,40000.0,,10000.0,770000,2600,AIR RAID,0.0,False,False,False
2,AIRLOCK,2600,1981,Action,Data Age,360000.0,20000.0,,,390000,2600,AIRLOCK,17.0,False,False,False
3,ALIEN,2600,1981,Action,20th Century Fox Video Games,740000.0,40000.0,,10000.0,790000,2600,ALIEN,61.24,True,False,True
4,ARMOR AMBUSH,2600,1981,Action,Mattel Interactive,150000.0,10000.0,,,170000,2600,ARMOR AMBUSH,9.92,False,False,False


In [53]:
# Number of non-null entries in each column of the merged table
merged_df.notna().sum()

Name            1402
Platform        1402
Year            1402
Genre           1402
Publisher       1402
NA_Sales        1286
EU_Sales        1263
JP_Sales         449
Other_Sales      928
Global_Sales    1402
Console         1402
Game Title      1402
Price           1402
Threshold       1402
Mean            1402
Median          1402
dtype: int64

In [54]:
# Export the merged table as CSV next to the input files, keeping the row index
# as the first column. The path uses forward slashes like the read_csv calls at
# the top of the notebook; a backslash path only resolves to ../data on Windows
# and elsewhere becomes a literal file name in the working directory.
merged_df.to_csv('../data/merged_games_df.csv', encoding='utf-8', index=True)