In [36]:
import pandas as pd
import numpy as np
import re

In [37]:
# Load the raw games dataset (downloaded from Kaggle) from the local Data/ folder.
inputdata = pd.read_csv('Data/games.csv')

In [38]:
# Turn the index produced by read_csv into an ordinary column (named 'index').
# reset_index() returns a new frame, so `inputdata` itself is left untouched.
indata = inputdata.reset_index()
indata.columns

Index(['index', 'AppID', 'Name', 'Release date', 'Estimated owners',
       'Peak CCU', 'Required age', 'Price', 'DiscountDLC count',
       'About the game', 'Supported languages', 'Full audio languages',
       'Reviews', 'Header image', 'Website', 'Support url', 'Support email',
       'Windows', 'Mac', 'Linux', 'Metacritic score', 'Metacritic url',
       'User score', 'Positive', 'Negative', 'Score rank', 'Achievements',
       'Recommendations', 'Notes', 'Average playtime forever',
       'Average playtime two weeks', 'Median playtime forever',
       'Median playtime two weeks', 'Developers', 'Publishers', 'Categories',
       'Genres', 'Tags', 'Screenshots', 'Movies'],
      dtype='object')

In [39]:
# NOTE(review): the header of games.csv appears to be shifted one position
# relative to the data (the first field ended up in the index, and two header
# names look fused into 'DiscountDLC count') - confirm against the raw file.
#
# Realign the names: drop the column currently labelled 'Price', then apply the
# header names shifted left by one. The former index column becomes 'AppID',
# each column up to 'Required age' takes the name of its right-hand neighbour,
# and the columns after the dropped one keep their names.
column_names = indata.columns[1:]
data = indata.drop(columns=['Price'])
data.columns = column_names

In [40]:

# Persist the realigned table; index=False keeps the row index out of the CSV.
data.to_csv('Data/Full_games.csv', index=False)

In [41]:
# Display the column names of the realigned table as a sanity check.
data.columns

Index(['AppID', 'Name', 'Release date', 'Estimated owners', 'Peak CCU',
       'Required age', 'Price', 'DiscountDLC count', 'About the game',
       'Supported languages', 'Full audio languages', 'Reviews',
       'Header image', 'Website', 'Support url', 'Support email', 'Windows',
       'Mac', 'Linux', 'Metacritic score', 'Metacritic url', 'User score',
       'Positive', 'Negative', 'Score rank', 'Achievements', 'Recommendations',
       'Notes', 'Average playtime forever', 'Average playtime two weeks',
       'Median playtime forever', 'Median playtime two weeks', 'Developers',
       'Publishers', 'Categories', 'Genres', 'Tags', 'Screenshots', 'Movies'],
      dtype='object')

In [42]:
# Split the full table into themed views; every view carries AppID and Name.

# Descriptive text fields of the games - important for NLP.
description_columns = [
    'AppID', 'Name', 'Price', 'About the game', 'Reviews',
    'Categories', 'Genres', 'Tags',
]
game_descriptions = data[description_columns]

# Game aspects - important for the recommendation system.
aspect_columns = [
    'AppID', 'Name', 'Developers', 'Publishers', 'Release date',
    'Achievements', 'Price', 'Categories', 'Genres', 'Tags',
    'Positive', 'Negative', 'Header image',
]
game_aspects = data[aspect_columns]

# Game visuals.
visual_columns = ['AppID', 'Name', 'Header image', 'Screenshots', 'Movies', 'Website']
game_visuals = data[visual_columns]

# Game analytics.
analytics_columns = [
    'AppID', 'Name', 'Release date',
    'Average playtime forever', 'Average playtime two weeks',
    'Median playtime forever', 'Median playtime two weeks',
    'Estimated owners', 'Peak CCU', 'Metacritic score', 'User score',
    'Score rank', 'Recommendations', 'Positive', 'Negative',
]
game_analytics = data[analytics_columns]


In [43]:
# Recommendation system plan:
#   - it is built from the game_aspects dataframe
#   - main features: tags, genres and categories
#   - secondary features, weighted less than the main ones:
#     Price, Positive, Negative and release date

# Preview the first rows, then the (rows, columns) shape on the following line.
print(game_aspects.head(), game_aspects.shape, sep='\n')

     AppID                   Name             Developers  \
0    20200       Galactic Bowling  Perpetual FX Creative   
1   655370           Train Bandit           Rusty Moyher   
2  1732930           Jolt Project          Campião Games   
3  1355720               Henosis™      Odd Critter Games   
4  1139950  Two Weeks in Painland          Unusual Games   

              Publishers  Release date  Achievements  Price  \
0  Perpetual FX Creative  Oct 21, 2008            30  19.99   
1           Wild Rooster  Oct 12, 2017            12   0.99   
2          Campião Games  Nov 17, 2021             0   4.99   
3      Odd Critter Games  Jul 23, 2020             0   5.99   
4          Unusual Games   Feb 3, 2020            17   0.00   

                                          Categories  \
0  Single-player,Multi-player,Steam Achievements,...   
1  Single-player,Steam Achievements,Full controll...   
2                                      Single-player   
3              Single-player,Full co

In [44]:
# Data cleaning.
# Planned steps: possibly bucket Price into ranges,
# and turn the release date into a numeric value.

# Work on a copy so that game_aspects itself is not modified by the cleaning steps.
rec_data = game_aspects.copy()


In [45]:
# Collapse the Positive and Negative review counts into a single value.
# The Wilson score interval is used to penalize games with a low total number of reviews.

# Placeholder column; it is overwritten when clean_rates is applied to every row
# later in this cell.
rec_data['wilson_score'] = 0

def clean_rates(row, z=1.96):
    """Return the lower bound of the Wilson score interval for a game's reviews.

    Args:
        row: Mapping-like object (for example a DataFrame row) holding the
            numeric review counts under the keys 'Positive' and 'Negative'.
        z: Normal quantile for the desired confidence level. The default of
            1.96 corresponds to a 95% interval.

    Returns:
        0.0 when the game has no positive reviews (which includes having no
        reviews at all), otherwise the Wilson lower bound of the fraction of
        positive reviews.
    """
    positive = row['Positive']
    # The earlier guard, `Positive == 0 & Negative == 0`, was parsed as the
    # chained comparison `Positive == (0 & Negative) == 0` because `&` binds
    # tighter than `==`, so it only ever tested Positive. That test is kept but
    # spelled out: with no positive reviews the lower bound is exactly 0 in
    # theory, and returning it directly avoids a division by zero when there
    # are no reviews at all as well as floating-point residue that would slip
    # past an exact `== 0` comparison.
    if positive == 0:
        return 0.0

    n = positive + row['Negative']
    phat = positive / n
    centre = phat + z * z / (2 * n)
    margin = z * np.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)
    return (centre - margin) / (1 + z * z / n)



# Score every game, one row at a time.
rec_data['wilson_score'] = rec_data.apply(clean_rates, axis=1)

# Keep only the games that have a non-zero Wilson score and a Tags entry.
# (An alternative cut-off, wilson_score > 0.15, is not applied.)
has_score = rec_data['wilson_score'] != 0
has_tags = rec_data['Tags'].notna()
rec_data = rec_data[has_score & has_tags]
rec_data


Unnamed: 0,AppID,Name,Developers,Publishers,Release date,Achievements,Price,Categories,Genres,Tags,Positive,Negative,Header image,wilson_score
0,20200,Galactic Bowling,Perpetual FX Creative,Perpetual FX Creative,"Oct 21, 2008",30,19.99,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",6,11,https://cdn.akamai.steamstatic.com/steam/apps/...,0.173095
1,655370,Train Bandit,Rusty Moyher,Wild Rooster,"Oct 12, 2017",12,0.99,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",53,5,https://cdn.akamai.steamstatic.com/steam/apps/...,0.813561
3,1355720,Henosis™,Odd Critter Games,Odd Critter Games,"Jul 23, 2020",0,5.99,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",3,0,https://cdn.akamai.steamstatic.com/steam/apps/...,0.438494
4,1139950,Two Weeks in Painland,Unusual Games,Unusual Games,"Feb 3, 2020",17,0.00,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",50,8,https://cdn.akamai.steamstatic.com/steam/apps/...,0.750739
5,1469160,Wartune Reborn,7Road,7Road,"Feb 26, 2021",0,0.00,"Single-player,Multi-player,MMO,PvP,Online PvP,...","Adventure,Casual,Free to Play,Massively Multip...","Turn-Based Combat,Massively Multiplayer,Multip...",87,49,https://cdn.akamai.steamstatic.com/steam/apps/...,0.556204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97393,3133480,Climbing with Silly Cat,Xeroyd,Xeroyd,"Aug 23, 2024",11,1.99,"Single-player,Steam Achievements,Family Sharing","Casual,Indie","Cats,Casual,2D Platformer,Pixel Graphics,Platf...",4,0,https://shared.akamai.steamstatic.com/store_it...,0.510100
97396,2960850,INCEL SIMULATOR,Freak defense,Freak defense,"Jul 30, 2024",0,0.99,"Single-player,Family Sharing","Casual,Indie,Simulation","Idler,Life Sim,Dark Humor,Story Rich,Choices M...",14,1,https://shared.akamai.steamstatic.com/store_it...,0.701829
97401,2435240,Sulphur Memories: Alchemist,Spellweaver,Spellweaver,"Aug 9, 2024",0,11.99,"Single-player,Family Sharing","Indie,RPG,Simulation,Early Access","Traditional Roguelike,Exploration,Crafting,Imm...",10,1,https://shared.akamai.steamstatic.com/store_it...,0.622635
97402,2214970,Get To The Gate,Maximan,Maximan,"Aug 19, 2024",37,19.99,"Single-player,Steam Achievements,Full controll...","Action,Adventure,RPG","Dungeon Crawler,RPG,Fantasy,Grid-Based Movemen...",3,0,https://shared.akamai.steamstatic.com/store_it...,0.438494


In [46]:
# Build one comma-separated feature string per game from Categories, Genres and Tags.
# Missing values contribute an empty string, so the separators around them remain.
info_columns = ['Categories', 'Genres', 'Tags']
joined_info = rec_data[info_columns].fillna('').agg(','.join, axis=1)
rec_data['combined_info'] = joined_info.str.strip()

# Drop every game whose Categories, Genres and Tags are all missing.
all_missing = rec_data[info_columns].isna().all(axis=1)
id_to_remove = rec_data.loc[all_missing, 'AppID']
rec_data = rec_data[~rec_data['AppID'].isin(id_to_remove)]


#remove extra commas at the end of combined info for cleaning sake
def clean_row(row):
    """Return the row's 'combined_info' string without empty entries.

    'combined_info' is a comma-separated string in which a missing source
    field shows up as an empty entry (a leading, trailing or doubled comma).
    Only three patterns used to be handled here (missing Tags, missing Tags
    and Genres, missing Categories); combinations such as a missing Genres
    alone ('A,,C') or missing Categories and Genres (',,C') kept a stray
    comma. Dropping every empty entry covers all combinations.

    Args:
        row: Mapping-like object (for example a DataFrame row) with a string
            under the key 'combined_info'.

    Returns:
        The comma-separated string with all empty entries removed.
    """
    entries = row['combined_info'].split(',')
    return ','.join(entry for entry in entries if entry)


# Replace combined_info with the cleaned string computed row by row, then display the frame.
rec_data['combined_info'] = rec_data.apply(clean_row, axis=1)
rec_data


Unnamed: 0,AppID,Name,Developers,Publishers,Release date,Achievements,Price,Categories,Genres,Tags,Positive,Negative,Header image,wilson_score,combined_info
0,20200,Galactic Bowling,Perpetual FX Creative,Perpetual FX Creative,"Oct 21, 2008",30,19.99,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",6,11,https://cdn.akamai.steamstatic.com/steam/apps/...,0.173095,"Single-player,Multi-player,Steam Achievements,..."
1,655370,Train Bandit,Rusty Moyher,Wild Rooster,"Oct 12, 2017",12,0.99,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",53,5,https://cdn.akamai.steamstatic.com/steam/apps/...,0.813561,"Single-player,Steam Achievements,Full controll..."
3,1355720,Henosis™,Odd Critter Games,Odd Critter Games,"Jul 23, 2020",0,5.99,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",3,0,https://cdn.akamai.steamstatic.com/steam/apps/...,0.438494,"Single-player,Full controller support,Adventur..."
4,1139950,Two Weeks in Painland,Unusual Games,Unusual Games,"Feb 3, 2020",17,0.00,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",50,8,https://cdn.akamai.steamstatic.com/steam/apps/...,0.750739,"Single-player,Steam Achievements,Adventure,Ind..."
5,1469160,Wartune Reborn,7Road,7Road,"Feb 26, 2021",0,0.00,"Single-player,Multi-player,MMO,PvP,Online PvP,...","Adventure,Casual,Free to Play,Massively Multip...","Turn-Based Combat,Massively Multiplayer,Multip...",87,49,https://cdn.akamai.steamstatic.com/steam/apps/...,0.556204,"Single-player,Multi-player,MMO,PvP,Online PvP,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97393,3133480,Climbing with Silly Cat,Xeroyd,Xeroyd,"Aug 23, 2024",11,1.99,"Single-player,Steam Achievements,Family Sharing","Casual,Indie","Cats,Casual,2D Platformer,Pixel Graphics,Platf...",4,0,https://shared.akamai.steamstatic.com/store_it...,0.510100,"Single-player,Steam Achievements,Family Sharin..."
97396,2960850,INCEL SIMULATOR,Freak defense,Freak defense,"Jul 30, 2024",0,0.99,"Single-player,Family Sharing","Casual,Indie,Simulation","Idler,Life Sim,Dark Humor,Story Rich,Choices M...",14,1,https://shared.akamai.steamstatic.com/store_it...,0.701829,"Single-player,Family Sharing,Casual,Indie,Simu..."
97401,2435240,Sulphur Memories: Alchemist,Spellweaver,Spellweaver,"Aug 9, 2024",0,11.99,"Single-player,Family Sharing","Indie,RPG,Simulation,Early Access","Traditional Roguelike,Exploration,Crafting,Imm...",10,1,https://shared.akamai.steamstatic.com/store_it...,0.622635,"Single-player,Family Sharing,Indie,RPG,Simulat..."
97402,2214970,Get To The Gate,Maximan,Maximan,"Aug 19, 2024",37,19.99,"Single-player,Steam Achievements,Full controll...","Action,Adventure,RPG","Dungeon Crawler,RPG,Fantasy,Grid-Based Movemen...",3,0,https://shared.akamai.steamstatic.com/store_it...,0.438494,"Single-player,Steam Achievements,Full controll..."


In [47]:
# Replace hyphens with spaces in combined_info (e.g. 'Single-player' becomes 'Single player').
rec_data['combined_info'] = rec_data['combined_info'].str.replace('-', ' ')

In [48]:
# Bucket Price into labelled ranges.
# pd.cut intervals are right-inclusive by default, so 0.99 falls in
# 'Free to Under $1' and 5.00 falls in '$1 to $5'. The lowest edge sits just
# below 0 so that free games (0.00) are included.
# The top bin is open-ended: the former cap of 1000 would have turned any
# higher price into NaN instead of '$70+'.
bin_edges = [-0.01, 0.99, 5, 10, 20, 40, 60, 70, np.inf]
bin_labels = ['Free to Under $1', '$1 to $5', '$5 to $10', '$10 to $20', '$20 to $40', '$40 to $60', '$60 to $70', '$70+']

rec_data['Price_Range'] = pd.cut(rec_data['Price'], bins=bin_edges, labels=bin_labels)
rec_data

Unnamed: 0,AppID,Name,Developers,Publishers,Release date,Achievements,Price,Categories,Genres,Tags,Positive,Negative,Header image,wilson_score,combined_info,Price_Range
0,20200,Galactic Bowling,Perpetual FX Creative,Perpetual FX Creative,"Oct 21, 2008",30,19.99,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",6,11,https://cdn.akamai.steamstatic.com/steam/apps/...,0.173095,"Single player,Multi player,Steam Achievements,...",$10 to $20
1,655370,Train Bandit,Rusty Moyher,Wild Rooster,"Oct 12, 2017",12,0.99,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",53,5,https://cdn.akamai.steamstatic.com/steam/apps/...,0.813561,"Single player,Steam Achievements,Full controll...",Free to Under $1
3,1355720,Henosis™,Odd Critter Games,Odd Critter Games,"Jul 23, 2020",0,5.99,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",3,0,https://cdn.akamai.steamstatic.com/steam/apps/...,0.438494,"Single player,Full controller support,Adventur...",$5 to $10
4,1139950,Two Weeks in Painland,Unusual Games,Unusual Games,"Feb 3, 2020",17,0.00,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",50,8,https://cdn.akamai.steamstatic.com/steam/apps/...,0.750739,"Single player,Steam Achievements,Adventure,Ind...",Free to Under $1
5,1469160,Wartune Reborn,7Road,7Road,"Feb 26, 2021",0,0.00,"Single-player,Multi-player,MMO,PvP,Online PvP,...","Adventure,Casual,Free to Play,Massively Multip...","Turn-Based Combat,Massively Multiplayer,Multip...",87,49,https://cdn.akamai.steamstatic.com/steam/apps/...,0.556204,"Single player,Multi player,MMO,PvP,Online PvP,...",Free to Under $1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97393,3133480,Climbing with Silly Cat,Xeroyd,Xeroyd,"Aug 23, 2024",11,1.99,"Single-player,Steam Achievements,Family Sharing","Casual,Indie","Cats,Casual,2D Platformer,Pixel Graphics,Platf...",4,0,https://shared.akamai.steamstatic.com/store_it...,0.510100,"Single player,Steam Achievements,Family Sharin...",$1 to $5
97396,2960850,INCEL SIMULATOR,Freak defense,Freak defense,"Jul 30, 2024",0,0.99,"Single-player,Family Sharing","Casual,Indie,Simulation","Idler,Life Sim,Dark Humor,Story Rich,Choices M...",14,1,https://shared.akamai.steamstatic.com/store_it...,0.701829,"Single player,Family Sharing,Casual,Indie,Simu...",Free to Under $1
97401,2435240,Sulphur Memories: Alchemist,Spellweaver,Spellweaver,"Aug 9, 2024",0,11.99,"Single-player,Family Sharing","Indie,RPG,Simulation,Early Access","Traditional Roguelike,Exploration,Crafting,Imm...",10,1,https://shared.akamai.steamstatic.com/store_it...,0.622635,"Single player,Family Sharing,Indie,RPG,Simulat...",$10 to $20
97402,2214970,Get To The Gate,Maximan,Maximan,"Aug 19, 2024",37,19.99,"Single-player,Steam Achievements,Full controll...","Action,Adventure,RPG","Dungeon Crawler,RPG,Fantasy,Grid-Based Movemen...",3,0,https://shared.akamai.steamstatic.com/store_it...,0.438494,"Single player,Steam Achievements,Full controll...",$10 to $20


In [49]:
# Last feature: the release date, expressed as a number of days.
# format='mixed' lets pandas infer the format of each date string individually.
rec_data['Release date'] = pd.to_datetime(rec_data['Release date'], format='mixed')

# Fixed reference date; presumably the oldest release date in the data - TODO confirm.
reference_date = pd.to_datetime('1997-06-29')
days_elapsed = rec_data['Release date'] - reference_date
rec_data['dayssincereference'] = days_elapsed.dt.days



In [50]:
# Remove duplicates: rows sharing the same Name, Developers and Publishers are
# collapsed to their first occurrence.
# NOTE(review): the next cell selects the max-Achievements row per the same key,
# but after this step each key has only one row left - confirm the intended order.
rec_data = rec_data.drop_duplicates(subset=['Name','Developers','Publishers'], keep='first')

In [51]:
import re
# For every (Name, Developers, Publishers) combination, keep the row with the most achievements.
# NOTE(review): groupby's default dropna=True leaves out rows whose key contains
# a missing value, so such games do not reach rec_data_test - confirm this is intended.
dedupe_keys = ['Name', 'Developers', 'Publishers']
best_row_labels = rec_data.groupby(dedupe_keys)['Achievements'].idxmax()
rec_data_test = rec_data.loc[best_row_labels]


# Characters treated as emoji: every code point from U+10000 to U+10FFFF, the
# ranges U+2600-U+26FF, U+2700-U+27BF and U+2300-U+23FF, and the single code
# points U+2B50 and U+20E3.
emoji_pattern = re.compile('[\U00010000-\U0010ffff]|[\u2600-\u26FF]|[\u2700-\u27BF]|[\u2300-\u23FF]|[\u2B50]|[\u20E3]')
def remove_emojis(text):
    """Return `text` with every character matched by `emoji_pattern` removed.

    Args:
        text: The string to clean. Non-string values (for example None or a
            NaN standing in for a missing name) are returned unchanged instead
            of raising a TypeError.

    Returns:
        The cleaned string, or the original value when it is not a string.
    """
    if not isinstance(text, str):
        return text
    return emoji_pattern.sub('', text)

# Strip emoji characters from every game name, then display the result.
rec_data_test['Name'] = rec_data_test['Name'].apply(remove_emojis)
rec_data_test



Unnamed: 0,AppID,Name,Developers,Publishers,Release date,Achievements,Price,Categories,Genres,Tags,Positive,Negative,Header image,wilson_score,combined_info,Price_Range,dayssincereference
81427,2556940,! Shakabula *,Skermunkel,Skermunkel,2023-10-13,0,14.99,"Single-player,Full controller support,Steam Cloud","Action,Indie,RPG,Early Access","Early Access,Action,RPG,JRPG,Shooter,Bullet He...",1,1,https://cdn.akamai.steamstatic.com/steam/apps/...,0.094529,"Single player,Full controller support,Steam Cl...",$10 to $20,9602
31138,449940,! That Bastard Is Trying To Steal Our Gold !,WTFOMGames,WTFOMGames,2016-03-03,0,2.99,"Single-player,Steam Trading Cards,Partial Cont...","Action,Adventure,Casual,Indie","Puzzle-Platformer,Dark Humor,Dungeon Crawler,M...",26,62,https://cdn.akamai.steamstatic.com/steam/apps/...,0.210307,"Single player,Steam Trading Cards,Partial Cont...",$1 to $5,6822
35088,1287250,! Wild Russia !,Andreev Worlds,Andreev Worlds,2020-04-28,9,19.99,"Single-player,Steam Achievements,Partial Contr...","Action,Adventure,Casual","Adventure,Action,Casual,Horror,Post-apocalypti...",41,22,https://cdn.akamai.steamstatic.com/steam/apps/...,0.527512,"Single player,Steam Achievements,Partial Contr...",$10 to $20,8339
18961,866510,!AnyWay!,EYEFRONT,EYEFRONT,2018-06-06,4997,1.79,"Single-player,Multi-player,Steam Achievements,...","Adventure,Casual,Indie","Side Scroller,Precision Platformer,2D Platform...",246,113,https://cdn.akamai.steamstatic.com/steam/apps/...,0.635448,"Single player,Multi player,Steam Achievements,...",$1 to $5,7647
51433,870990,!LABrpgUP!,UPandQ,UPandQ,2018-06-13,2021,0.99,Single-player,"Adventure,Casual,Indie,RPG","Adventure,Indie,Casual,RPG",21,27,https://cdn.akamai.steamstatic.com/steam/apps/...,0.307011,"Single player,Adventure,Casual,Indie,RPG,Adven...",Free to Under $1,7654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34879,1071920,Foreign Frugglers,Ultimo Games,Ultimo Games,2019-06-26,16,3.99,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Arcade,Retro,Shoot 'Em Up,Pixel G...",19,0,https://cdn.akamai.steamstatic.com/steam/apps/...,0.831816,"Single player,Steam Achievements,Full controll...",$1 to $5,8032
33527,460250,Circles,Jeroen Wimmers,Jeroen Wimmers,2017-02-17,8,2.99,"Single-player,Steam Achievements,Steam Cloud","Casual,Indie","Experimental,Minimalist,Relaxing,Abstract,Puzz...",83,8,https://cdn.akamai.steamstatic.com/steam/apps/...,0.836012,"Single player,Steam Achievements,Steam Cloud,C...",$1 to $5,7173
42364,965340,Human Rocket Person,2nd Studio,2nd Studio,2018-11-14,22,1.99,"Single-player,Steam Achievements,Full controll...","Action,Indie,Simulation","Indie,Action,Sexual Content,Nudity,Simulation,...",54,7,https://cdn.akamai.steamstatic.com/steam/apps/...,0.781555,"Single player,Steam Achievements,Full controll...",$1 to $5,7808
52519,806220,Absolute Blue,Intermediaware,Intermediaware,2018-04-21,0,3.99,"Single-player,Partial Controller Support","Action,Indie","Action,Indie,Shoot 'Em Up,Bullet Hell,Arcade,S...",17,13,https://cdn.akamai.steamstatic.com/steam/apps/...,0.391970,"Single player,Partial Controller Support,Actio...",$1 to $5,7601


In [54]:
# Columns exported for the recommender; this frame covers every remaining game.
export_columns = [
    'AppID', 'Name', 'Achievements', 'combined_info', 'Price',
    'Price_Range', 'dayssincereference', 'wilson_score', 'Header image',
]
recommendation_data = rec_data_test[export_columns]
recommendation_data.to_csv('Data/rec_allgames.csv', index=False)

# Games eligible to be recommended: at least one achievement and a Wilson
# score above 0.2, ordered by AppID.
has_achievements = recommendation_data['Achievements'] > 0
well_rated = recommendation_data['wilson_score'] > 0.2
Games_to_recommend = recommendation_data[has_achievements & well_rated].sort_values(by='AppID')
Games_to_recommend.to_csv('Data/rec_data.csv', index=False)



In [55]:
# Report how many rows each exported dataframe holds.
print(len(recommendation_data), 'Games in total')
print(len(Games_to_recommend), 'Games in recommendation data')

64824 Games in total
34333 Games in recommendation data


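Only these two outputs should be used for the recommendation system: `Data/rec_allgames.csv` (the `recommendation_data` dataframe) and `Data/rec_data.csv` (the `Games_to_recommend` dataframe). Do not use `Full_games.csv` or `games.csv`.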
