# ETL Project
## Extract, Transform and Load videogames data

In [1]:
## Dependencies
import pandas as pd
import datetime as dt
import pymongo

## Console Videogames (data.world)

Dataset retrieved from: https://data.world/sumitrock/videogames

### Import data

In [2]:
## Import CSV into DataFrame
console_file = "input/Video_Games.csv"
console_df = pd.read_csv(console_file, encoding='UTF-8')
console_df.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,


In [3]:
## Select Columns for Analysis
console_columns = ['Name', 'Platform', 'Year_of_Release', 'Genre', 'Developer', 'Publisher', 'User_Score']
console_compact = console_df[console_columns]
console_compact.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Developer,Publisher,User_Score
0,Wii Sports,Wii,2006.0,Sports,Nintendo,Nintendo,8.0
1,Super Mario Bros.,NES,1985.0,Platform,,Nintendo,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,Nintendo,8.3
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,Nintendo,8.0
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,,Nintendo,


### User_Score as float

In [4]:
condition = (console_compact['User_Score'].notnull()) & (console_compact['User_Score'] != 'tbd')
console_compact['User_Score'].loc[condition] = console_compact['User_Score'].loc[condition].values.astype(float)
console_compact.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Developer,Publisher,User_Score
0,Wii Sports,Wii,2006.0,Sports,Nintendo,Nintendo,8.0
1,Super Mario Bros.,NES,1985.0,Platform,,Nintendo,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,Nintendo,8.3
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,Nintendo,8.0
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,,Nintendo,


### Year_of_Release as int

In [5]:
console_compact.insert(len(console_compact.columns), "Year_Integer", console_compact['Year_of_Release'].values.astype(int))
console_compact.drop("Year_of_Release", axis=1, inplace=True)
console_compact.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,Name,Platform,Genre,Developer,Publisher,User_Score,Year_Integer
0,Wii Sports,Wii,Sports,Nintendo,Nintendo,8.0,2006
1,Super Mario Bros.,NES,Platform,,Nintendo,,1985
2,Mario Kart Wii,Wii,Racing,Nintendo,Nintendo,8.3,2008
3,Wii Sports Resort,Wii,Sports,Nintendo,Nintendo,8.0,2009
4,Pokemon Red/Pokemon Blue,GB,Role-Playing,,Nintendo,,1996


### Clean DataFrame

In [6]:
columns = {
    'Name':'name',
    'Platform':'platform',
    'Genre':'genre',
    'Developer':'developer',
    'Publisher':'publisher',
    'Year_Integer':'year_of_release',
    'User_Score':'user_score'}

console_clean = console_compact.rename(columns = columns)

console_clean.head()

Unnamed: 0,name,platform,genre,developer,publisher,user_score,year_of_release
0,Wii Sports,Wii,Sports,Nintendo,Nintendo,8.0,2006
1,Super Mario Bros.,NES,Platform,,Nintendo,,1985
2,Mario Kart Wii,Wii,Racing,Nintendo,Nintendo,8.3,2008
3,Wii Sports Resort,Wii,Sports,Nintendo,Nintendo,8.0,2009
4,Pokemon Red/Pokemon Blue,GB,Role-Playing,,Nintendo,,1996


## Steam Videogames (kaggle.com)
Dataset retrieved from: https://www.kaggle.com/nikdavis/steam-store-games

### Import data

In [8]:
## Import CSV into DataFrame
steam_file = "input/steam.csv"
steam_df = pd.read_csv(steam_file, encoding='UTF-8')
steam_df.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [9]:
## Select Columns for Analysis
steam_columns = ["name", "platforms", "release_date", "genres", "developer", "publisher", "positive_ratings", "negative_ratings"]
steam_compact = steam_df[steam_columns]
steam_compact.head()

Unnamed: 0,name,platforms,release_date,genres,developer,publisher,positive_ratings,negative_ratings
0,Counter-Strike,windows;mac;linux,2000-11-01,Action,Valve,Valve,124534,3339
1,Team Fortress Classic,windows;mac;linux,1999-04-01,Action,Valve,Valve,3318,633
2,Day of Defeat,windows;mac;linux,2003-05-01,Action,Valve,Valve,3416,398
3,Deathmatch Classic,windows;mac;linux,2001-06-01,Action,Valve,Valve,1273,267
4,Half-Life: Opposing Force,windows;mac;linux,1999-11-01,Action,Gearbox Software,Valve,5250,288


### Get Main Genre from Steam DataFrame

In [10]:
## Get main genre from genres
main_genre = []
for index, row in steam_compact.iterrows():
    main_genre.append(row['genres'].split(';')[0])
steam_compact['main_genre'] = main_genre
steam_compact.drop('genres', axis=1, inplace=True)

steam_compact.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,name,platforms,release_date,developer,publisher,positive_ratings,negative_ratings,main_genre
0,Counter-Strike,windows;mac;linux,2000-11-01,Valve,Valve,124534,3339,Action
1,Team Fortress Classic,windows;mac;linux,1999-04-01,Valve,Valve,3318,633,Action
2,Day of Defeat,windows;mac;linux,2003-05-01,Valve,Valve,3416,398,Action
3,Deathmatch Classic,windows;mac;linux,2001-06-01,Valve,Valve,1273,267,Action
4,Half-Life: Opposing Force,windows;mac;linux,1999-11-01,Gearbox Software,Valve,5250,288,Action


### Generate "Steam" Platform column

In [11]:
steam_labels = ["Steam" for i in range(len(steam_compact))]
steam_compact.insert(len(steam_compact.columns), "platforms_steam", pd.DataFrame(steam_labels))
steam_compact.drop("platforms", axis=1, inplace=True)

steam_compact.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,name,release_date,developer,publisher,positive_ratings,negative_ratings,main_genre,platforms_steam
0,Counter-Strike,2000-11-01,Valve,Valve,124534,3339,Action,Steam
1,Team Fortress Classic,1999-04-01,Valve,Valve,3318,633,Action,Steam
2,Day of Defeat,2003-05-01,Valve,Valve,3416,398,Action,Steam
3,Deathmatch Classic,2001-06-01,Valve,Valve,1273,267,Action,Steam
4,Half-Life: Opposing Force,1999-11-01,Gearbox Software,Valve,5250,288,Action,Steam


### Get Year of Release from Release Date

In [12]:
#Steam Year of Release
datetime_list = []

for x in steam_compact["release_date"]:
    x = dt.datetime.strptime(x, '%Y-%m-%d')
    datetime_list.append(x)
    
steam_compact['release_date'] = datetime_list

steam_compact['release_year'] = (steam_compact['release_date'].dt.year).astype(int)
steam_compact.drop("release_date", axis=1, inplace=True)

steam_compact.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,name,developer,publisher,positive_ratings,negative_ratings,main_genre,platforms_steam,release_year
0,Counter-Strike,Valve,Valve,124534,3339,Action,Steam,2000
1,Team Fortress Classic,Valve,Valve,3318,633,Action,Steam,1999
2,Day of Defeat,Valve,Valve,3416,398,Action,Steam,2003
3,Deathmatch Classic,Valve,Valve,1273,267,Action,Steam,2001
4,Half-Life: Opposing Force,Gearbox Software,Valve,5250,288,Action,Steam,1999


### Compute User Score

In [13]:
steam_compact["User_Score"] = steam_compact["positive_ratings"]/(steam_compact["positive_ratings"]+steam_compact["negative_ratings"])*10
steam_compact = steam_compact[["name","platforms_steam","release_year","main_genre","developer","publisher","User_Score"]]
steam_compact.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,name,platforms_steam,release_year,main_genre,developer,publisher,User_Score
0,Counter-Strike,Steam,2000,Action,Valve,Valve,9.738882
1,Team Fortress Classic,Steam,1999,Action,Valve,Valve,8.397874
2,Day of Defeat,Steam,2003,Action,Valve,Valve,8.956476
3,Deathmatch Classic,Steam,2001,Action,Valve,Valve,8.266234
4,Half-Life: Opposing Force,Steam,1999,Action,Gearbox Software,Valve,9.479957


### Clean DataFrame

In [14]:
columns = {
    'platforms_steam':'platform',
    'main_genre':'genre',
    'developer':'developer',
    'publisher':'publisher',
    'release_year':'year_of_release',
    'User_Score':'user_score'}
steam_clean = steam_compact.rename(columns = columns)
steam_clean.head()

Unnamed: 0,name,platform,year_of_release,genre,developer,publisher,user_score
0,Counter-Strike,Steam,2000,Action,Valve,Valve,9.738882
1,Team Fortress Classic,Steam,1999,Action,Valve,Valve,8.397874
2,Day of Defeat,Steam,2003,Action,Valve,Valve,8.956476
3,Deathmatch Classic,Steam,2001,Action,Valve,Valve,8.266234
4,Half-Life: Opposing Force,Steam,1999,Action,Gearbox Software,Valve,9.479957


## Store DataFrames in MongoDB

In [15]:
## Setup Database connection (MongoDB)
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

## Define database in Mongo
db = client.VideoGamesDB

In [16]:
## Add Console DataFrame
#console_dict = console_clean.to_dict("records")
# db.console.insert_many(console_dict)

In [17]:
## Add Steam DataFrame
# steam_dict = steam_clean.to_dict("records")
# db.steam.insert_many(steam_dict)

In [18]:
## Add Console & Steam Dataframes to MongoDB
console_dict = console_clean.to_dict("records")
steam_dict = steam_clean.to_dict("records")
db.united.insert_many(console_dict)
db.united.insert_many(steam_dict)

<pymongo.results.InsertManyResult at 0x1c1fe3ee508>