# Data Preparation

This Jupyter Notebook cleans & prepares the raw Steam Games dataset for further analysis.

In [28]:
# Import libraries
import numpy as np
import pandas as pd
import re

from dateutil.parser import parse
from sklearn.preprocessing import MultiLabelBinarizer

In [29]:
games = pd.read_csv('datasets/raw_steam_games.csv')
games

Unnamed: 0,url,types,name,desc_snippet,recent_reviews,all_reviews,release_date,developer,publisher,popular_tags,game_details,languages,achievements,genre,game_description,mature_content,minimum_requirements,recommended_requirements,original_price,discount_price
0,https://store.steampowered.com/app/379720/DOOM/,app,DOOM,Now includes all three premium DLC packs (Unto...,"Very Positive,(554),- 89% of the 554 user revi...","Very Positive,(42,550),- 92% of the 42,550 use...","May 12, 2016",id Software,"Bethesda Softworks,Bethesda Softworks","FPS,Gore,Action,Demons,Shooter,First-Person,Gr...","Single-player,Multi-player,Co-op,Steam Achieve...","English,French,Italian,German,Spanish - Spain,...",54.0,Action,"About This Game Developed by id software, the...",,"Minimum:,OS:,Windows 7/8.1/10 (64-bit versions...","Recommended:,OS:,Windows 7/8.1/10 (64-bit vers...",$19.99,$14.99
1,https://store.steampowered.com/app/578080/PLAY...,app,PLAYERUNKNOWN'S BATTLEGROUNDS,PLAYERUNKNOWN'S BATTLEGROUNDS is a battle roya...,"Mixed,(6,214),- 49% of the 6,214 user reviews ...","Mixed,(836,608),- 49% of the 836,608 user revi...","Dec 21, 2017",PUBG Corporation,"PUBG Corporation,PUBG Corporation","Survival,Shooter,Multiplayer,Battle Royale,PvP...","Multi-player,Online Multi-Player,Stats","English,Korean,Simplified Chinese,French,Germa...",37.0,"Action,Adventure,Massively Multiplayer",About This Game PLAYERUNKNOWN'S BATTLEGROUND...,Mature Content Description The developers de...,"Minimum:,Requires a 64-bit processor and opera...","Recommended:,Requires a 64-bit processor and o...",$29.99,
2,https://store.steampowered.com/app/637090/BATT...,app,BATTLETECH,Take command of your own mercenary outfit of '...,"Mixed,(166),- 54% of the 166 user reviews in t...","Mostly Positive,(7,030),- 71% of the 7,030 use...","Apr 24, 2018",Harebrained Schemes,"Paradox Interactive,Paradox Interactive","Mechs,Strategy,Turn-Based,Turn-Based Tactics,S...","Single-player,Multi-player,Online Multi-Player...","English,French,German,Russian",128.0,"Action,Adventure,Strategy",About This Game From original BATTLETECH/Mec...,,"Minimum:,Requires a 64-bit processor and opera...","Recommended:,Requires a 64-bit processor and o...",$39.99,
3,https://store.steampowered.com/app/221100/DayZ/,app,DayZ,The post-soviet country of Chernarus is struck...,"Mixed,(932),- 57% of the 932 user reviews in t...","Mixed,(167,115),- 61% of the 167,115 user revi...","Dec 13, 2018",Bohemia Interactive,"Bohemia Interactive,Bohemia Interactive","Survival,Zombies,Open World,Multiplayer,PvP,Ma...","Multi-player,Online Multi-Player,Steam Worksho...","English,French,Italian,German,Spanish - Spain,...",,"Action,Adventure,Massively Multiplayer",About This Game The post-soviet country of Ch...,,"Minimum:,OS:,Windows 7/8.1 64-bit,Processor:,I...","Recommended:,OS:,Windows 10 64-bit,Processor:,...",$44.99,
4,https://store.steampowered.com/app/8500/EVE_On...,app,EVE Online,EVE Online is a community-driven spaceship MMO...,"Mixed,(287),- 54% of the 287 user reviews in t...","Mostly Positive,(11,481),- 74% of the 11,481 u...","May 6, 2003",CCP,"CCP,CCP","Space,Massively Multiplayer,Sci-fi,Sandbox,MMO...","Multi-player,Online Multi-Player,MMO,Co-op,Onl...","English,German,Russian,French",,"Action,Free to Play,Massively Multiplayer,RPG,...",About This Game,,"Minimum:,OS:,Windows 7,Processor:,Intel Dual C...","Recommended:,OS:,Windows 10,Processor:,Intel i...",Free,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40828,https://store.steampowered.com/app/899836/Rock...,app,Rocksmith® 2014 Edition – Remastered – Sabaton...,,,,"Feb 12, 2019",Ubisoft - San Francisco,,"Casual,Simulation","Single-player,Shared/Split Screen,Downloadable...","English,German,French,Italian,Spanish - Spain,...",,"Casual,Simulation","About This Content Play ""Ghost Division"" by S...",,"Minimum:,OS:,Windows Vista, Windows 7, Windows...","Recommended:,OS:,Windows Vista, Windows 7, Win...",$2.99,
40829,https://store.steampowered.com/app/899832/Rock...,app,Rocksmith® 2014 Edition – Remastered – Stone T...,,,,"Feb 5, 2019",Ubisoft - San Francisco,,"Casual,Simulation","Single-player,Shared/Split Screen,Downloadable...","English,German,French,Italian,Spanish - Spain,...",,"Casual,Simulation","About This Content Play ""Trippin’ on a Hole i...",,"Minimum:,OS:,Windows Vista, Windows 7, Windows...","Recommended:,OS:,Windows Vista, Windows 7, Win...",$2.99,
40830,https://store.steampowered.com/app/906840/Fant...,app,Fantasy Grounds - Quests of Doom 4: A Midnight...,,,,"Jul 31, 2018","SmiteWorks USA, LLC",,"RPG,Indie,Strategy,Software,Turn-Based,Fantasy...","Multi-player,Co-op,Cross-Platform Multiplayer,...",English,,"Indie,RPG,Strategy",About This Content Quests of Doom 4: A Midni...,,"Minimum:,OS:,Windows 7x , 8x or 10x,Processor:...","Recommended:,OS:,Windows 7x , 8x or 10x,Proces...",$7.99,
40831,https://store.steampowered.com/app/906635/Mega...,app,Mega Man X5 Sound Collection,,,,"Jul 24, 2018","CAPCOM CO., LTD","CAPCOM CO., LTD,CAPCOM CO., LTD",Action,"Single-player,Downloadable Content,Steam Achie...","English,French,Italian,German,Spanish - Spain,...",,Action,About This Content Get equipped with the stun...,,"Minimum:,OS:,WINDOWS® 7 (64bit),Processor:,Int...","Recommended:,OS:,WINDOWS®10 (64bit),Processor:...",$9.99,


## Remove malformed rows

In [30]:
games = games[games['types'] == 'app'] # remove non-game rows
games = games.dropna(subset = ['all_reviews', 'release_date', 'languages']) # remove games that do not have `all_review`/`release_date`, i.e. unreleased games
games = games[~games['all_reviews'].str.contains('Need more user reviews to generate a score')] # remove games that do not have enough user reviews yet
games

Unnamed: 0,url,types,name,desc_snippet,recent_reviews,all_reviews,release_date,developer,publisher,popular_tags,game_details,languages,achievements,genre,game_description,mature_content,minimum_requirements,recommended_requirements,original_price,discount_price
0,https://store.steampowered.com/app/379720/DOOM/,app,DOOM,Now includes all three premium DLC packs (Unto...,"Very Positive,(554),- 89% of the 554 user revi...","Very Positive,(42,550),- 92% of the 42,550 use...","May 12, 2016",id Software,"Bethesda Softworks,Bethesda Softworks","FPS,Gore,Action,Demons,Shooter,First-Person,Gr...","Single-player,Multi-player,Co-op,Steam Achieve...","English,French,Italian,German,Spanish - Spain,...",54.0,Action,"About This Game Developed by id software, the...",,"Minimum:,OS:,Windows 7/8.1/10 (64-bit versions...","Recommended:,OS:,Windows 7/8.1/10 (64-bit vers...",$19.99,$14.99
1,https://store.steampowered.com/app/578080/PLAY...,app,PLAYERUNKNOWN'S BATTLEGROUNDS,PLAYERUNKNOWN'S BATTLEGROUNDS is a battle roya...,"Mixed,(6,214),- 49% of the 6,214 user reviews ...","Mixed,(836,608),- 49% of the 836,608 user revi...","Dec 21, 2017",PUBG Corporation,"PUBG Corporation,PUBG Corporation","Survival,Shooter,Multiplayer,Battle Royale,PvP...","Multi-player,Online Multi-Player,Stats","English,Korean,Simplified Chinese,French,Germa...",37.0,"Action,Adventure,Massively Multiplayer",About This Game PLAYERUNKNOWN'S BATTLEGROUND...,Mature Content Description The developers de...,"Minimum:,Requires a 64-bit processor and opera...","Recommended:,Requires a 64-bit processor and o...",$29.99,
2,https://store.steampowered.com/app/637090/BATT...,app,BATTLETECH,Take command of your own mercenary outfit of '...,"Mixed,(166),- 54% of the 166 user reviews in t...","Mostly Positive,(7,030),- 71% of the 7,030 use...","Apr 24, 2018",Harebrained Schemes,"Paradox Interactive,Paradox Interactive","Mechs,Strategy,Turn-Based,Turn-Based Tactics,S...","Single-player,Multi-player,Online Multi-Player...","English,French,German,Russian",128.0,"Action,Adventure,Strategy",About This Game From original BATTLETECH/Mec...,,"Minimum:,Requires a 64-bit processor and opera...","Recommended:,Requires a 64-bit processor and o...",$39.99,
3,https://store.steampowered.com/app/221100/DayZ/,app,DayZ,The post-soviet country of Chernarus is struck...,"Mixed,(932),- 57% of the 932 user reviews in t...","Mixed,(167,115),- 61% of the 167,115 user revi...","Dec 13, 2018",Bohemia Interactive,"Bohemia Interactive,Bohemia Interactive","Survival,Zombies,Open World,Multiplayer,PvP,Ma...","Multi-player,Online Multi-Player,Steam Worksho...","English,French,Italian,German,Spanish - Spain,...",,"Action,Adventure,Massively Multiplayer",About This Game The post-soviet country of Ch...,,"Minimum:,OS:,Windows 7/8.1 64-bit,Processor:,I...","Recommended:,OS:,Windows 10 64-bit,Processor:,...",$44.99,
4,https://store.steampowered.com/app/8500/EVE_On...,app,EVE Online,EVE Online is a community-driven spaceship MMO...,"Mixed,(287),- 54% of the 287 user reviews in t...","Mostly Positive,(11,481),- 74% of the 11,481 u...","May 6, 2003",CCP,"CCP,CCP","Space,Massively Multiplayer,Sci-fi,Sandbox,MMO...","Multi-player,Online Multi-Player,MMO,Co-op,Onl...","English,German,Russian,French",,"Action,Free to Play,Massively Multiplayer,RPG,...",About This Game,,"Minimum:,OS:,Windows 7,Processor:,Intel Dual C...","Recommended:,OS:,Windows 10,Processor:,Intel i...",Free,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40557,https://store.steampowered.com/app/652810/Grab...,app,Grabity,"Dance the tango of death in Grabity, a fast-pa...",,"Positive,(17),- 100% of the 17 user reviews fo...","May 30, 2018",Team Ninja Thumbs,"Team Ninja Thumbs,Team Ninja Thumbs","Indie,Action,4 Player Local,Local Multiplayer,...","Single-player,Multi-player,Online Multi-Player...","English,German,French,Portuguese,Spanish - Spain",15.0,"Action,Indie",About This Game Welcome to Grabity's ballist...,,"Minimum:,Requires a 64-bit processor and opera...","Recommended:,Requires a 64-bit processor and o...",Download Demo,
40598,https://store.steampowered.com/app/848410/Deta...,app,Detached: Non-VR Edition,"Detached, a suspenseful interstellar duel that...",,"Mostly Positive,(14),- 78% of the 14 user revi...","Jul 24, 2018",Anshar Studios,"Anshar Studios,Anshar Studios","Simulation,Indie,Space,Exploration,Atmospheric...","Single-player,Multi-player,Online Multi-Player...","English,French,German,Simplified Chinese,Polish",47.0,"Indie,Simulation",About This Game Enter space in the non-VR ve...,,"Minimum:,Requires a 64-bit processor and opera...","Recommended:,Requires a 64-bit processor and o...",$14.99,
40611,https://store.steampowered.com/app/454330/A_Ro...,app,A Room Beyond,Solve a mysterious criminal case in this dark ...,,"Positive,(11),- 90% of the 11 user reviews for...","Jun 13, 2017",René Bühling,"René Bühling,René Bühling","Adventure,Indie,Point & Click","Single-player,Steam Achievements,Steam Trading...","English,German,French,Italian,Spanish - Spain",44.0,"Adventure,Indie","About This Game A ROOM BEYOND, is fantasy poi...",,,,$7.99,
40728,https://store.steampowered.com/app/763990/Chas...,app,Chasing the Stars,Chasing the Stars is a steampunk-ish multichoi...,,"Mostly Positive,(10),- 70% of the 10 user revi...","Jan 23, 2019",Ertal Games,"Ertal Games,Ertal Games","Nudity,Sexual Content,Indie,Steampunk,Story Ri...","Single-player,Steam Achievements,Profile Featu...",English,12.0,Indie,About This Game After building some satellite...,Mature Content Description The developers de...,,,$9.99,


## Remove unused columns

In [31]:
games = games.drop(['url', 'types', 'desc_snippet', 'recent_reviews', 'game_description', 'game_details', 'minimum_requirements', 'recommended_requirements', 'achievements'], axis = 1) # All these fields are either not useful or too difficult to extract useful information from.
games


Unnamed: 0,name,all_reviews,release_date,developer,publisher,popular_tags,languages,genre,mature_content,original_price,discount_price
0,DOOM,"Very Positive,(42,550),- 92% of the 42,550 use...","May 12, 2016",id Software,"Bethesda Softworks,Bethesda Softworks","FPS,Gore,Action,Demons,Shooter,First-Person,Gr...","English,French,Italian,German,Spanish - Spain,...",Action,,$19.99,$14.99
1,PLAYERUNKNOWN'S BATTLEGROUNDS,"Mixed,(836,608),- 49% of the 836,608 user revi...","Dec 21, 2017",PUBG Corporation,"PUBG Corporation,PUBG Corporation","Survival,Shooter,Multiplayer,Battle Royale,PvP...","English,Korean,Simplified Chinese,French,Germa...","Action,Adventure,Massively Multiplayer",Mature Content Description The developers de...,$29.99,
2,BATTLETECH,"Mostly Positive,(7,030),- 71% of the 7,030 use...","Apr 24, 2018",Harebrained Schemes,"Paradox Interactive,Paradox Interactive","Mechs,Strategy,Turn-Based,Turn-Based Tactics,S...","English,French,German,Russian","Action,Adventure,Strategy",,$39.99,
3,DayZ,"Mixed,(167,115),- 61% of the 167,115 user revi...","Dec 13, 2018",Bohemia Interactive,"Bohemia Interactive,Bohemia Interactive","Survival,Zombies,Open World,Multiplayer,PvP,Ma...","English,French,Italian,German,Spanish - Spain,...","Action,Adventure,Massively Multiplayer",,$44.99,
4,EVE Online,"Mostly Positive,(11,481),- 74% of the 11,481 u...","May 6, 2003",CCP,"CCP,CCP","Space,Massively Multiplayer,Sci-fi,Sandbox,MMO...","English,German,Russian,French","Action,Free to Play,Massively Multiplayer,RPG,...",,Free,
...,...,...,...,...,...,...,...,...,...,...,...
40557,Grabity,"Positive,(17),- 100% of the 17 user reviews fo...","May 30, 2018",Team Ninja Thumbs,"Team Ninja Thumbs,Team Ninja Thumbs","Indie,Action,4 Player Local,Local Multiplayer,...","English,German,French,Portuguese,Spanish - Spain","Action,Indie",,Download Demo,
40598,Detached: Non-VR Edition,"Mostly Positive,(14),- 78% of the 14 user revi...","Jul 24, 2018",Anshar Studios,"Anshar Studios,Anshar Studios","Simulation,Indie,Space,Exploration,Atmospheric...","English,French,German,Simplified Chinese,Polish","Indie,Simulation",,$14.99,
40611,A Room Beyond,"Positive,(11),- 90% of the 11 user reviews for...","Jun 13, 2017",René Bühling,"René Bühling,René Bühling","Adventure,Indie,Point & Click","English,German,French,Italian,Spanish - Spain","Adventure,Indie",,$7.99,
40728,Chasing the Stars,"Mostly Positive,(10),- 70% of the 10 user revi...","Jan 23, 2019",Ertal Games,"Ertal Games,Ertal Games","Nudity,Sexual Content,Indie,Steampunk,Story Ri...",English,Indie,Mature Content Description The developers de...,$9.99,


## Clean prices

In [32]:
pattern = re.compile('demo|prologue|prototype|try|trial|试玩|Guest', re.IGNORECASE) # remove all trial versions

def map_original_price(x):
    if pd.isna(x):
        return 0.0
    elif re.match(r'(\$\d+(\.\d{2})?)', x):
        return float(x[1:])
    else:
        return 0.0

mask = games['original_price'].str.contains(pattern, na=False)
games = games.loc[~mask]

games = games[~games['original_price'].isin(['1.020', '650560'])]  # remove rows with malformed prices
games['original_price'] = games['original_price'].apply(map_original_price)


def map_discount_price(x):
    if pd.isna(x):
        return x
    elif re.match(r'(\$\d+(\.\d{2})?)', x):
        return float(x[1:])
    else:
        return 0.0

games['discount_price'] = games['discount_price'].apply(map_discount_price)
games['discount_price'] = games['discount_price'].fillna(games['original_price'])
games = games[(games['original_price'] >= games['discount_price'])]
games['discount_percentage'] = (games['original_price'] - games['discount_price']) / games['original_price'] * 100
games['discount_percentage'] = games['discount_percentage'].fillna(0)


## Clean release dates

In [34]:
from datetime import datetime

unix_epoch = datetime(1970, 1, 1)

games['release_date'] = pd.to_datetime(games['release_date'], format='mixed')
games.insert(games.columns.get_loc('release_date') + 1, 'release_date_epoch_days', games['release_date'].apply(lambda x: (x - unix_epoch).days))
games


ValueError: time data 'May 12, 2016' does not match format 'mixed' (match)

## Clean genres

In [35]:
ignored = {
    # Non-game genres
    'Accounting', 'Animation & Modeling', 'Audio Production', 'Game Development', 'Movie', 'Photo Editing', 'Software Training', 'Utilities', 'Video Production', 'Web Publishing',
    'Design & Illustration', 'Education', 'Valve',
    # Already represented in original_price column
    'Free to Play',
    # These categories are not really "genre"s
    'Early Access', 'Indie',
}

def one_hot_encode_genres(frame):
    frame['genre'] = frame['genre'].apply(lambda x: [] if pd.isna(x) else [genre for genre in x.split(',') if genre not in ignored])
    frame = frame[frame['genre'].apply(len) > 0]

    binarizer = MultiLabelBinarizer(sparse_output = True)
    genres = pd.DataFrame.sparse.from_spmatrix(binarizer.fit_transform(frame['genre']), index=frame.index, columns=binarizer.classes_)
    genres = genres.add_prefix(f'genre_')

    frame = pd.concat([frame, genres], axis=1)
    return frame

games = one_hot_encode_genres(games)
games


Unnamed: 0,name,all_reviews,release_date,developer,publisher,popular_tags,languages,genre,mature_content,original_price,...,discount_percentage,genre_Action,genre_Adventure,genre_Casual,genre_Massively Multiplayer,genre_RPG,genre_Racing,genre_Simulation,genre_Sports,genre_Strategy
0,DOOM,"Very Positive,(42,550),- 92% of the 42,550 use...","May 12, 2016",id Software,"Bethesda Softworks,Bethesda Softworks","FPS,Gore,Action,Demons,Shooter,First-Person,Gr...","English,French,Italian,German,Spanish - Spain,...",[Action],,19.99,...,25.012506,1,0,0,0,0,0,0,0,0
1,PLAYERUNKNOWN'S BATTLEGROUNDS,"Mixed,(836,608),- 49% of the 836,608 user revi...","Dec 21, 2017",PUBG Corporation,"PUBG Corporation,PUBG Corporation","Survival,Shooter,Multiplayer,Battle Royale,PvP...","English,Korean,Simplified Chinese,French,Germa...","[Action, Adventure, Massively Multiplayer]",Mature Content Description The developers de...,29.99,...,0.000000,1,1,0,1,0,0,0,0,0
2,BATTLETECH,"Mostly Positive,(7,030),- 71% of the 7,030 use...","Apr 24, 2018",Harebrained Schemes,"Paradox Interactive,Paradox Interactive","Mechs,Strategy,Turn-Based,Turn-Based Tactics,S...","English,French,German,Russian","[Action, Adventure, Strategy]",,39.99,...,0.000000,1,1,0,0,0,0,0,0,1
3,DayZ,"Mixed,(167,115),- 61% of the 167,115 user revi...","Dec 13, 2018",Bohemia Interactive,"Bohemia Interactive,Bohemia Interactive","Survival,Zombies,Open World,Multiplayer,PvP,Ma...","English,French,Italian,German,Spanish - Spain,...","[Action, Adventure, Massively Multiplayer]",,44.99,...,0.000000,1,1,0,1,0,0,0,0,0
4,EVE Online,"Mostly Positive,(11,481),- 74% of the 11,481 u...","May 6, 2003",CCP,"CCP,CCP","Space,Massively Multiplayer,Sci-fi,Sandbox,MMO...","English,German,Russian,French","[Action, Massively Multiplayer, RPG, Strategy]",,0.00,...,0.000000,1,0,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40481,雨上がりのハナビィ Ameagari no Hanaby,"Positive,(30),- 100% of the 30 user reviews fo...","Sep 22, 2018",Enigmatic Network,"Enigmatic Network,Enigmatic Network","Indie,Adventure,Action,Casual,Sexual Content,Cute","English,Japanese","[Action, Adventure, Casual]",Mature Content Description The developers de...,13.99,...,0.000000,1,1,1,0,0,0,0,0,0
40524,Lil Big Invasion,"Positive,(13),- 100% of the 13 user reviews fo...","Aug 4, 2016",Andreas Britten,"Andreas Britten,Andreas Britten","Action,Indie,Adventure,Puzzle,Atmospheric,Cute...",English,"[Action, Adventure]",,2.99,...,0.000000,1,1,0,0,0,0,0,0,0
40598,Detached: Non-VR Edition,"Mostly Positive,(14),- 78% of the 14 user revi...","Jul 24, 2018",Anshar Studios,"Anshar Studios,Anshar Studios","Simulation,Indie,Space,Exploration,Atmospheric...","English,French,German,Simplified Chinese,Polish",[Simulation],,14.99,...,0.000000,0,0,0,0,0,0,1,0,0
40611,A Room Beyond,"Positive,(11),- 90% of the 11 user reviews for...","Jun 13, 2017",René Bühling,"René Bühling,René Bühling","Adventure,Indie,Point & Click","English,German,French,Italian,Spanish - Spain",[Adventure],,7.99,...,0.000000,0,1,0,0,0,0,0,0,0


## Clean developers & publishers

In [40]:
games.dropna(subset=['developer', 'publisher'], how='all', inplace=True) # remove malformed rows

# Replace NaN values, this is often the case where the developer & publisher are the same
games['developer'] = games['developer'].fillna(games['publisher'])
games['publisher'] = games['publisher'].fillna(games['developer'])
games['publisher'] = games['publisher'].str.split(',').str.get(0)
games

Unnamed: 0,name,all_reviews,recommend,release_date,developer,publisher,popular_tags,languages,genre,mature_content,...,genre_Action,genre_Adventure,genre_Casual,genre_Massively Multiplayer,genre_RPG,genre_Racing,genre_Simulation,genre_Sports,genre_Strategy,Percentage
0,DOOM,92,True,"May 12, 2016",id Software,Bethesda Softworks,"[FPS, Gore, Action, Demons, Shooter, First-Per...","[English, French, Italian, German, Spanish - S...",[Action],False,...,1,0,0,0,0,0,0,0,0,92.0
1,PLAYERUNKNOWN'S BATTLEGROUNDS,49,False,"Dec 21, 2017",PUBG Corporation,PUBG Corporation,"[Survival, Shooter, Multiplayer, Battle Royale...","[English, Korean, Simplified Chinese, French, ...","[Action, Adventure, Massively Multiplayer]",True,...,1,1,0,1,0,0,0,0,0,49.0
2,BATTLETECH,71,True,"Apr 24, 2018",Harebrained Schemes,Paradox Interactive,"[Mechs, Strategy, Turn-Based, Turn-Based Tacti...","[English, French, German, Russian]","[Action, Adventure, Strategy]",False,...,1,1,0,0,0,0,0,0,1,71.0
3,DayZ,61,False,"Dec 13, 2018",Bohemia Interactive,Bohemia Interactive,"[Survival, Zombies, Open World, Multiplayer, P...","[English, French, Italian, German, Spanish - S...","[Action, Adventure, Massively Multiplayer]",False,...,1,1,0,1,0,0,0,0,0,61.0
4,EVE Online,74,True,"May 6, 2003",CCP,CCP,"[Space, Massively Multiplayer, Sci-fi, Sandbox...","[English, German, Russian, French]","[Action, Massively Multiplayer, RPG, Strategy]",False,...,1,0,0,1,1,0,0,0,1,74.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40481,雨上がりのハナビィ Ameagari no Hanaby,100,True,"Sep 22, 2018",Enigmatic Network,Enigmatic Network,"[Indie, Adventure, Action, Casual, Sexual Cont...","[English, Japanese]","[Action, Adventure, Casual]",True,...,1,1,1,0,0,0,0,0,0,100.0
40524,Lil Big Invasion,100,True,"Aug 4, 2016",Andreas Britten,Andreas Britten,"[Action, Indie, Adventure, Puzzle, Atmospheric...",[English],"[Action, Adventure]",False,...,1,1,0,0,0,0,0,0,0,100.0
40598,Detached: Non-VR Edition,78,True,"Jul 24, 2018",Anshar Studios,Anshar Studios,"[Simulation, Indie, Space, Exploration, Atmosp...","[English, French, German, Simplified Chinese, ...",[Simulation],False,...,0,0,0,0,0,0,1,0,0,78.0
40611,A Room Beyond,90,True,"Jun 13, 2017",René Bühling,René Bühling,"[Adventure, Indie, Point & Click]","[English, German, French, Italian, Spanish - S...",[Adventure],False,...,0,1,0,0,0,0,0,0,0,90.0


## Finding the bar to set recommended games base on all_reviews score

In [37]:


pattern = r'(\d+)%'


# extract percentage from Sentiment column using regular expressions
games['Percentage'] = games['all_reviews'].str.extract(pattern)


# convert percentage column to float
games['Percentage'] = games['Percentage'].astype(float)


# use str.contains to count number of rows with sentiment of Very Positive or Mostly Positive, and percentage above 70%
count = len(games[(games['all_reviews'].str.contains('Positive')) & (games['Percentage'] >= 70)])
print("Number of rows with sentiment of Positive, and percentage above 70%:", count)

count = len(games[(games['all_reviews'].str.contains('Positive')) & (games['Percentage'] < 70)])
print("Number of rows with sentiment of Positive, and percentage below 70%:", count)

count = len(games[((games['all_reviews'].str.contains('Mixed')) |(games['all_reviews'].str.contains('Negative')))& (games['Percentage'] >= 70)])
print("Number of rows with sentiment of Mixed and Negative, and percentage above 70%:", count)

count = len(games[((games['all_reviews'].str.contains('Mixed')) |(games['all_reviews'].str.contains('Negative')))& (games['Percentage'] < 70)])
print("Number of rows with sentiment of Mixed and Negative, and percentage below 70%:", count)

Number of rows with sentiment of Positive, and percentage above 70%: 7550
Number of rows with sentiment of Positive, and percentage below 70%: 0
Number of rows with sentiment of Mixed and Negative, and percentage above 70%: 0
Number of rows with sentiment of Mixed and Negative, and percentage below 70%: 4070


We can see that all games are accounted for 7550+4070=11620 which the amount of rows of games from the cleaning done above.
Showing that the bar for games with positive connotatin are indeed set at 70% of positive reviews.

## Clean other data

In [38]:
def map_all_reviews(x):
    match = re.search(r"\d{1,3}%", x)
    if match:
        return int(match.group(0)[:-1])

games['all_reviews'] = games['all_reviews'].apply(map_all_reviews)
games.insert(games.columns.get_loc('all_reviews') + 1, 'recommend', games['all_reviews'].apply(lambda x: x >= 70))

games['languages'] = games['languages'].apply(lambda x: [] if pd.isna(x) else list(x.split(',')))
games['popular_tags'] = games['popular_tags'].apply(lambda x: [] if pd.isna(x) else list(x.split(',')))
games['mature_content'] = games['mature_content'].apply(lambda x: not pd.isna(x))
games['publisher'] = games['publisher'].str.split(',').str.get(0)
games

Unnamed: 0,name,all_reviews,recommend,release_date,developer,publisher,popular_tags,languages,genre,mature_content,...,genre_Action,genre_Adventure,genre_Casual,genre_Massively Multiplayer,genre_RPG,genre_Racing,genre_Simulation,genre_Sports,genre_Strategy,Percentage
0,DOOM,92,True,"May 12, 2016",id Software,Bethesda Softworks,"[FPS, Gore, Action, Demons, Shooter, First-Per...","[English, French, Italian, German, Spanish - S...",[Action],False,...,1,0,0,0,0,0,0,0,0,92.0
1,PLAYERUNKNOWN'S BATTLEGROUNDS,49,False,"Dec 21, 2017",PUBG Corporation,PUBG Corporation,"[Survival, Shooter, Multiplayer, Battle Royale...","[English, Korean, Simplified Chinese, French, ...","[Action, Adventure, Massively Multiplayer]",True,...,1,1,0,1,0,0,0,0,0,49.0
2,BATTLETECH,71,True,"Apr 24, 2018",Harebrained Schemes,Paradox Interactive,"[Mechs, Strategy, Turn-Based, Turn-Based Tacti...","[English, French, German, Russian]","[Action, Adventure, Strategy]",False,...,1,1,0,0,0,0,0,0,1,71.0
3,DayZ,61,False,"Dec 13, 2018",Bohemia Interactive,Bohemia Interactive,"[Survival, Zombies, Open World, Multiplayer, P...","[English, French, Italian, German, Spanish - S...","[Action, Adventure, Massively Multiplayer]",False,...,1,1,0,1,0,0,0,0,0,61.0
4,EVE Online,74,True,"May 6, 2003",CCP,CCP,"[Space, Massively Multiplayer, Sci-fi, Sandbox...","[English, German, Russian, French]","[Action, Massively Multiplayer, RPG, Strategy]",False,...,1,0,0,1,1,0,0,0,1,74.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40481,雨上がりのハナビィ Ameagari no Hanaby,100,True,"Sep 22, 2018",Enigmatic Network,Enigmatic Network,"[Indie, Adventure, Action, Casual, Sexual Cont...","[English, Japanese]","[Action, Adventure, Casual]",True,...,1,1,1,0,0,0,0,0,0,100.0
40524,Lil Big Invasion,100,True,"Aug 4, 2016",Andreas Britten,Andreas Britten,"[Action, Indie, Adventure, Puzzle, Atmospheric...",[English],"[Action, Adventure]",False,...,1,1,0,0,0,0,0,0,0,100.0
40598,Detached: Non-VR Edition,78,True,"Jul 24, 2018",Anshar Studios,Anshar Studios,"[Simulation, Indie, Space, Exploration, Atmosp...","[English, French, German, Simplified Chinese, ...",[Simulation],False,...,0,0,0,0,0,0,1,0,0,78.0
40611,A Room Beyond,90,True,"Jun 13, 2017",René Bühling,René Bühling,"[Adventure, Indie, Point & Click]","[English, German, French, Italian, Spanish - S...",[Adventure],False,...,0,1,0,0,0,0,0,0,0,90.0


In [39]:
# We store the results as Pickle instead of CSV since it allows us to preserve data structures, i.e. sparse lists.
games.to_pickle('datasets/cleaned_steam_games.pkl')

# To read the file, `df = pd.read_pickle('datasets/cleaned_steam_games.pkl')`