In [None]:
import numpy as np
import pandas as pd

We create our first function, which will unnest any json text into plain text

In [None]:
def unnest(path, dicc):
    """
    This function allows us to read json text as plain.
    
    Parameters
    ----------
    path : str
        path where the json file is located.
    dicc : dict
        dictionary with key-value pairs.

    returns
    -------
    DataFrame
        result of converting said dictionary to a DataFrame.

    Examples
    --------
    >>> unnest('./folder/file.csv', dictionary)
    {
    "names": ["Michael", "Jim", "Dwight"], 
    "Position": ["Manager", "Salesman", "Salesman"],
    "Age": [42, 29, 27]
    }
    """

    df = []
    with open(path, encoding = 'UTF-8-SIG') as f:
        for line in f:
            df.append(eval(line))
    
    for i in df:
        for clave, valor in i.items():
            dicc[clave].append(valor)
    dicc = pd.DataFrame(dicc)
    return dicc

In [None]:
# We make our dictionary with key names same as the column names and an empty list.
diccionarioReviews = {'user_id' : [], 'user_url': [], 'reviews': []}

reviews = unnest('./DataSets/australian_user_reviews.json', diccionarioReviews)

In [None]:
reviews.head()

In [None]:
reviews.info()

We drop any duplicate value our dataset may have

In [None]:
reviews[reviews['user_id'].duplicated(keep = False)].sort_values(by = 'user_id')

In [None]:
reviews.drop_duplicates(subset = 'user_id', keep = False, inplace = True)

In [None]:
reviews.head().sort_values(by = 'user_id')

We repeat this process with our items DataSet.

In [None]:
items = {'user_id' : [], 'items_count': [], 'steam_id': [], 'user_url': [], 'items': []}
items = unnest('./DataSets/australian_users_items.json', items)


In [None]:
items.head()

In [None]:
items.info()

We check for any duplicate value.

In [None]:
items[items['user_id'].duplicated(keep = False)].sort_values(by = 'user_id')

We drop said values

In [None]:
items.drop_duplicates(subset = 'user_id', keep = 'first', inplace = True)

We make our Steam DataSet

In [None]:
df = []

with open('./DataSets/output_steam_games.json', encoding = 'UTF-8-SIG') as f:
    for line in f:
        if '"id": NaN'in line:
            continue
        line = line.replace("true", "True")
        line = line.replace("false", "False")
        line = line.replace("NaN", "None")
        df.append(eval(line))

steam = pd.DataFrame(df)

In [None]:
steam[steam['title'].duplicated(keep = False)].sort_values(by = 'title')

In [None]:
steam.drop_duplicates(subset = 'id', keep = 'first', inplace = True)

We transform the 'release_date' values from str type to a Date type

In [None]:
# Passing errors = 'coerce' will force an out-of-bounds date to NaT, in addition to forcing 
# non-dates (or non-parseable dates) to NaT.
steam['release_date'] = pd.to_datetime(steam['release_date'], format = '%Y-%m-%d', errors = 'coerce')

In [None]:
steam['price']

We see that the price column has mixed type values, we normalise these values changing them to numbers

In [None]:
# We do a function just in case we have multiple columns/DataSets to change.
def toZero(var):
    """
    Take parameter and change it to zero, regardless of its type.

    parameters
    ----------
    n : str
        Non numeric value.

    examples
    --------
    >>> toZero('Hello')
    0
    >>> toZero(5)
    5.0
    """
    
    try:
        return float(var)
    except:
        return 0


In [None]:
steam['price'] = steam['price'].apply(toZero)

In [None]:
steam.head(1)

In [None]:
# Get rid of the unnecesary columns. This columns will not be used in our API.
steamColumns = ['publisher', 'app_name', 'url', 'reviews_url', 'early_access', 'specs', 'url', 'reviews_url']
steam.drop(steamColumns, axis = 1, inplace = True)

steam.head()

We procede to unnest the remaining list-columns into several columns.

In [None]:
# Check the key names.
reviews['reviews'][0]

In [6]:
reviewDF ={'user_id': [], 'funny': [],'posted': [], 'last_edited': [], 'item_id': [], 'helpful':[], 'recommend': [], 'review': [] }

for index, review in enumerate(reviews['reviews']):
    for r in list(review):
        for key, value in r.items():
            reviewDF[key].append(value)
        reviewDF['user_id'].append(reviews.iloc[index]['user_id'])

reviewDF = pd.DataFrame(reviewDF)
reviewDF.head()

NameError: name 'reviews' is not defined

Transform the text in the `posted` column into a Date value.

In [5]:
from datetime import datetime
reviewDF['posted'] = reviewDF['posted'].str.replace('Posted ', "")
reviewDF['posted'] = pd.to_datetime(reviewDF['posted'], format = '%B%d%Y.', errors = 'ignore')

NameError: name 'reviewDF' is not defined

In [None]:
reviewDF.drop(columns= ['funny', 'last_edited', 'helpful'], inplace = True)
reviewDF

Unnest the items column in the items DataFrame

In [None]:
itemsDF = {'user_id': [], 'item_id': [], 'item_name': [], 'playtime_forever': [], 'playtime_2weeks': []}

for index, item_list in enumerate(items['items']):
    for item in item_list:
        itemsDF['user_id'].append(items.iloc[index]['user_id'])
        for key in itemsDF.keys():
            if key != 'user_id':
                itemsDF[key].append(item.get(key, None))  # Use get() to handle missing keys

itemsDF = pd.DataFrame(itemsDF)
itemsDF.head()


Apply sentiment analysis to the review column.

In [None]:
from textblob import TextBlob

def sentiment_analysis(column):
    '''
    Read text from a DataFrame, create a numeric column corresponding to previous one.
    This new column will have one for a neutral review, zero for bad review and two for a good review.

    Parameters
    ----------
    column : pandas.core.series.Series
        column name in which the analysis will be applied

    Returns
    -------
    DataFrame
        A DataFrame with a numeric column corresponding to review

    Examples
    --------
    >>> sentiment_analysis(positiveText)
    2
    >>> sentiment_analysis(negativeText)
    0
    >>> sentiment_analysis(neutralText)
    1
    '''
    analysis = TextBlob(column)
    if analysis.sentiment.polarity < 0:
        return 0
    elif analysis.sentiment.polarity == 0:
        return 1
    else:
        return 2

Apply the function to the DataFrame

In [None]:
reviewDF['sentiment_analysis'] = reviewDF['review'].apply(sentiment_analysis)
reviewDF['sentiment_analysis'].fillna(1, inplace = True)

Drop the review column, which will no longer be needed.

In [None]:
reviewDF.drop('review', axis = 1, inplace = True)

In [7]:
review

Unnamed: 0,user_id,posted,item_id,recommend,sentiment_analysis
0,76561197970982479,"November 5, 2011.",1250,True,2
1,76561197970982479,"July 15, 2011.",22200,True,2
2,76561197970982479,"April 21, 2011.",43110,True,2
3,js41637,"June 24, 2014.",251610,True,2
4,js41637,"September 8, 2013.",227300,True,0
...,...,...,...,...,...
57563,76561198312638244,July 10.,70,True,2
57564,76561198312638244,July 8.,362890,True,2
57565,LydiaMorley,July 3.,273110,True,2
57566,LydiaMorley,July 20.,730,True,2


In [None]:
itemsDF

We add a 'year' column to the steam DataSet, taken from the 'release_date' column.

In [9]:
steam['year'] = pd.to_datetime(steam['release_date']).dt.year
steam

Unnamed: 0,genres,title,release_date,tags,price,id,developer,year
0,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",Lost Summoner Kitty,2018-01-04,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...",4.99,761140,Kotoshiro,2018
1,"['Free to Play', 'Indie', 'RPG', 'Strategy']",Ironbound,2018-01-04,"['Free to Play', 'Strategy', 'Indie', 'RPG', '...",0.00,643980,Secret Level SRL,2018
2,"['Casual', 'Free to Play', 'Indie', 'Simulatio...",Real Pool 3D - Poolians,2017-07-24,"['Free to Play', 'Simulation', 'Sports', 'Casu...",0.00,670290,Poolians.com,2017
3,"['Action', 'Adventure', 'Casual']",弹炸人2222,2017-12-07,"['Action', 'Adventure', 'Casual']",0.99,767400,彼岸领域,2017
4,"['Action', 'Adventure', 'Simulation']",Battle Royale Trainer,2018-01-04,"['Action', 'Adventure', 'Simulation', 'FPS', '...",3.99,772540,Trickjump Games Ltd,2018
...,...,...,...,...,...,...,...,...
28348,"['Action', 'Adventure', 'Casual', 'Indie']",Kebab it Up!,2018-01-04,"['Action', 'Indie', 'Casual', 'Violent', 'Adve...",1.99,745400,Bidoniera Games,2018
28349,"['Casual', 'Indie', 'Simulation', 'Strategy']",Colony On Mars,2018-01-04,"['Strategy', 'Indie', 'Casual', 'Simulation']",1.99,773640,"Nikita ""Ghost_RUS""",2018
28350,"['Casual', 'Indie', 'Strategy']",LOGistICAL: South Africa,2018-01-04,"['Strategy', 'Indie', 'Casual']",4.99,733530,Sacada,2018
28351,"['Indie', 'Racing', 'Simulation']",Russian Roads,2018-01-04,"['Indie', 'Simulation', 'Racing']",1.99,610660,Laush Dmitriy Sergeevich,2018


### We drop unnecesary columns.

In [10]:
steam.drop(columns = ['release_date', 'tags', 'price', 'developer'], inplace= True)

In [11]:
steam.head()

Unnamed: 0,genres,title,id,year
0,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",Lost Summoner Kitty,761140,2018
1,"['Free to Play', 'Indie', 'RPG', 'Strategy']",Ironbound,643980,2018
2,"['Casual', 'Free to Play', 'Indie', 'Simulatio...",Real Pool 3D - Poolians,670290,2017
3,"['Action', 'Adventure', 'Casual']",弹炸人2222,767400,2017
4,"['Action', 'Adventure', 'Simulation']",Battle Royale Trainer,772540,2018


### Now we get rid of any null values that take space in our DataFrames

In [3]:
steam.isnull().sum()

genres          0
title           0
release_date    0
tags            0
price           0
id              0
developer       0
dtype: int64

We repeat this process for the remaining DataSets.

In [None]:
itemsDF.isnull().sum()

In [None]:
reviewDF.isnull().sum()

Save the DataFrames.

In [None]:
import parquet
import pyarrow
import fastparquet
reviewDF.to_parquet('./data/review.parquet', index = False)
itemsDF.to_parquet('./data/items.parquet', index = False)
steam.to_csv('./data/steam.csv', index = False, errors = 'replace')

In [78]:
steam = pd.read_csv('./data/steam.csv', encoding = 'UTF-8')
review = pd.read_parquet('./data/review.parquet')
items = pd.read_parquet('./data/items.parquet')

## Now we need to prepare the files we are going to use for each function. We do so by merging the DataSets and keeping the columns we need.

In [117]:
steam['id'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 28353 entries, 0 to 28352
Series name: id
Non-Null Count  Dtype
--------------  -----
28353 non-null  int64
dtypes: int64(1)
memory usage: 221.6 KB


In [80]:
steam.head(2)

Unnamed: 0,genres,title,release_date,tags,price,id,developer
0,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",Lost Summoner Kitty,2018-01-04,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...",4.99,761140,Kotoshiro
1,"['Free to Play', 'Indie', 'RPG', 'Strategy']",Ironbound,2018-01-04,"['Free to Play', 'Strategy', 'Indie', 'RPG', '...",0.0,643980,Secret Level SRL


In [81]:
from ast import literal_eval
steam['genres'] = steam['genres'].apply(literal_eval)
df = steam.explode('genres')

In [82]:
df

Unnamed: 0,genres,title,release_date,tags,price,id,developer
0,Action,Lost Summoner Kitty,2018-01-04,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...",4.99,761140,Kotoshiro
0,Casual,Lost Summoner Kitty,2018-01-04,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...",4.99,761140,Kotoshiro
0,Indie,Lost Summoner Kitty,2018-01-04,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...",4.99,761140,Kotoshiro
0,Simulation,Lost Summoner Kitty,2018-01-04,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...",4.99,761140,Kotoshiro
0,Strategy,Lost Summoner Kitty,2018-01-04,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...",4.99,761140,Kotoshiro
...,...,...,...,...,...,...,...
28351,Indie,Russian Roads,2018-01-04,"['Indie', 'Simulation', 'Racing']",1.99,610660,Laush Dmitriy Sergeevich
28351,Racing,Russian Roads,2018-01-04,"['Indie', 'Simulation', 'Racing']",1.99,610660,Laush Dmitriy Sergeevich
28351,Simulation,Russian Roads,2018-01-04,"['Indie', 'Simulation', 'Racing']",1.99,610660,Laush Dmitriy Sergeevich
28352,Casual,EXIT 2 - Directions,2017-09-02,"['Indie', 'Casual', 'Puzzle', 'Singleplayer', ...",4.99,658870,"xropi,stev3ns"


We change the 'item_id' column type to int64 so that we can merge both datasets on a same data type

In [118]:
items['item_id'] = items['item_id'].astype('int64')

In [119]:
merged = pd.merge(steam, items, left_on = 'id', right_on = 'item_id', how = 'inner')

In [120]:
merged.head()

Unnamed: 0,genres,title,release_date,tags,price,id,developer,user_id,item_id,item_name,playtime_forever,playtime_2weeks
0,"[Action, Indie, Racing]",Carmageddon Max Pack,1997-06-30,"['Racing', 'Action', 'Classic', 'Indie', 'Gore...",9.99,282010,Stainless Games Ltd,UTNerd24,282010,Carmageddon Max Pack,5,0
1,"[Action, Indie, Racing]",Carmageddon Max Pack,1997-06-30,"['Racing', 'Action', 'Classic', 'Indie', 'Gore...",9.99,282010,Stainless Games Ltd,I_DID_911_JUST_SAYING,282010,Carmageddon Max Pack,0,0
2,"[Action, Indie, Racing]",Carmageddon Max Pack,1997-06-30,"['Racing', 'Action', 'Classic', 'Indie', 'Gore...",9.99,282010,Stainless Games Ltd,76561197962104795,282010,Carmageddon Max Pack,0,0
3,"[Action, Indie, Racing]",Carmageddon Max Pack,1997-06-30,"['Racing', 'Action', 'Classic', 'Indie', 'Gore...",9.99,282010,Stainless Games Ltd,r3ap3r78,282010,Carmageddon Max Pack,0,0
4,"[Action, Indie, Racing]",Carmageddon Max Pack,1997-06-30,"['Racing', 'Action', 'Classic', 'Indie', 'Gore...",9.99,282010,Stainless Games Ltd,saint556,282010,Carmageddon Max Pack,13,0


In [121]:
merged['year'] = pd.to_datetime(merged['release_date']).dt.year
merged['year'] = merged['year'].astype('int64')

In [122]:
merged.head()

Unnamed: 0,genres,title,release_date,tags,price,id,developer,user_id,item_id,item_name,playtime_forever,playtime_2weeks,year
0,"[Action, Indie, Racing]",Carmageddon Max Pack,1997-06-30,"['Racing', 'Action', 'Classic', 'Indie', 'Gore...",9.99,282010,Stainless Games Ltd,UTNerd24,282010,Carmageddon Max Pack,5,0,1997
1,"[Action, Indie, Racing]",Carmageddon Max Pack,1997-06-30,"['Racing', 'Action', 'Classic', 'Indie', 'Gore...",9.99,282010,Stainless Games Ltd,I_DID_911_JUST_SAYING,282010,Carmageddon Max Pack,0,0,1997
2,"[Action, Indie, Racing]",Carmageddon Max Pack,1997-06-30,"['Racing', 'Action', 'Classic', 'Indie', 'Gore...",9.99,282010,Stainless Games Ltd,76561197962104795,282010,Carmageddon Max Pack,0,0,1997
3,"[Action, Indie, Racing]",Carmageddon Max Pack,1997-06-30,"['Racing', 'Action', 'Classic', 'Indie', 'Gore...",9.99,282010,Stainless Games Ltd,r3ap3r78,282010,Carmageddon Max Pack,0,0,1997
4,"[Action, Indie, Racing]",Carmageddon Max Pack,1997-06-30,"['Racing', 'Action', 'Classic', 'Indie', 'Gore...",9.99,282010,Stainless Games Ltd,saint556,282010,Carmageddon Max Pack,13,0,1997


Delete the unnecesary columns

In [123]:
merged.drop(['genres', 'playtime_2weeks', 'developer', 'release_date', 'price', 'item_id', 'tags', 'item_name'], axis = 1, inplace = True)

In [124]:
merged.head()

Unnamed: 0,title,id,user_id,playtime_forever,year
0,Carmageddon Max Pack,282010,UTNerd24,5,1997
1,Carmageddon Max Pack,282010,I_DID_911_JUST_SAYING,0,1997
2,Carmageddon Max Pack,282010,76561197962104795,0,1997
3,Carmageddon Max Pack,282010,r3ap3r78,0,1997
4,Carmageddon Max Pack,282010,saint556,13,1997


Now we can create a smaller file which will be more efficient in memory terms.

In [33]:
df2.to_parquet('./data/genre_functions.parquet', index = False)

In [39]:
df = pd.read_parquet('./data/genre_functions.parquet')

### Now we proceed creating a DataSet for our user system recommendation.

In [125]:
merged.head(1)

Unnamed: 0,title,id,user_id,playtime_forever,year
0,Carmageddon Max Pack,282010,UTNerd24,5,1997


In [126]:
review.head(1)

Unnamed: 0,user_id,posted,item_id,recommend,sentiment_analysis
0,76561197970982479,"November 5, 2011.",1250,True,2


In [127]:
df3 = pd.merge(merged, review, on = 'user_id', how = 'inner')

In [128]:
df3.head(2)

Unnamed: 0,title,id,user_id,playtime_forever,year,posted,item_id,recommend,sentiment_analysis
0,Carmageddon Max Pack,282010,UTNerd24,5,1997,"May 6, 2014.",244210,True,0
1,Carmageddon Max Pack,282010,UTNerd24,5,1997,"December 2, 2015.",440,True,0


Once our DataSet has all the columns we need, we proceed to save it.

In [130]:
df3.to_parquet('./data/recommendations.parquet')