## Import the data from files as dataframes
The data is saved in JSON format:
* steam_games_list.json

```json
{
    "applist": {
        "apps": [
            {
                "appid": app_id,
                "name": "game_name"
            },
            ...
        ]
    }
}

```

* steam_games_descriptions.json

```json
{
    "game_id": "game_description",
    ...
}
```

* steam_games_tags_and_genres.json

```json
{
    "game_id": {
        "tags": {"tag1": tag1_id, "tag2": tag2_id, ...},
        "genres": "genre1, genre2, ..."
    },
    ...
}
```

* steam_games_dictionary.json

```json
{
    "word": word_cnt,
}
```

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [27]:

class Data:
    """Load the Steam games dataset and merge it into a single table.

    After construction the instance exposes two attributes:

    * ``games`` - one row per game with the columns ``game_id``, ``name``,
      ``description`` and ``tags``. Only games that appear in all three
      source files with non-empty values survive (inner joins).
    * ``dict`` - a one-column (``word``) frame holding the vocabulary in the
      order it is stored in ``steam_games_dictionary.json``.

    The intermediate ``descriptions`` and ``tags`` frames are deleted once
    they have been merged into ``games``.
    """

    def __init__(self):
        self.data_root_path = '../data'

        # load data
        self.load_games(f'{self.data_root_path}/steam_games_list.json')
        self.load_descriptions(f'{self.data_root_path}/steam_games_descriptions.json')
        self.load_tags(f'{self.data_root_path}/steam_games_tags_and_genres.json')

        # if the games dictionary does not exist, generate it with
        # ./data/calculate_word_frequency.py
        dictionary_path = f'{self.data_root_path}/steam_games_dictionary.json'
        if not os.path.exists(dictionary_path):
            # NOTE(review): the script path is relative to the current working
            # directory and does not use data_root_path - confirm it is right.
            os.system('python3 ./data/calculate_word_frequency.py')
            if not os.path.exists(dictionary_path):
                # Fail with a clear message instead of a confusing read error.
                raise FileNotFoundError(
                    f'{dictionary_path} is missing and could not be generated '
                    'by ./data/calculate_word_frequency.py'
                )
        self.load_dict(dictionary_path)

        # clean the data from empty values
        self.ensure_data()

        # merge all the data into games
        self.games = self.games.merge(self.descriptions, on='game_id', how='inner')
        self.games = self.games.merge(self.tags, on='game_id', how='inner')
        self.games = self.games.drop(columns=['tags_and_genres'])

        # delete the tables that are now part of games
        del self.descriptions
        del self.tags

    @staticmethod
    def _is_present(value):
        """Return True when ``value`` is neither missing nor empty.

        ``None``, NaN/NA and empty sized objects (``''``, ``{}``, ``[]``)
        count as missing; every other value counts as present.
        """
        if value is None:
            return False
        if hasattr(value, '__len__'):
            return len(value) > 0
        return not pd.isna(value)

    def load_games(self, path):
        """Read the app list at ``path`` into ``self.games`` (game_id, name)."""
        raw = pd.read_json(path)
        apps = pd.json_normalize(raw['applist']['apps'])
        # Rename by key instead of by position so the result does not depend
        # on the order of the keys inside each app record.
        apps = apps.rename(columns={'appid': 'game_id'})
        self.games = apps[['game_id', 'name']]

    def load_descriptions(self, path):
        """Read the ``game_id -> description`` mapping into ``self.descriptions``."""
        self.descriptions = pd.read_json(path, typ='series').reset_index()
        self.descriptions.columns = ['game_id', 'description']

    def load_tags(self, path):
        """Read tags and genres into ``self.tags`` and extract the ``tags`` column.

        The raw per-game record is kept in ``tags_and_genres``; a record
        without a ``tags`` key yields an empty dict.
        """
        self.tags = pd.read_json(path, typ='series').reset_index()
        self.tags.columns = ['game_id', 'tags_and_genres']
        self.tags['tags'] = self.tags['tags_and_genres'].apply(lambda x: x.get('tags', {}))

    def load_dict(self, path):
        """Read the word dictionary into ``self.dict``, keeping only the words.

        All rows are kept and the ``word_cnt`` column is dropped; consumers
        pick the first N words themselves.
        """
        self.dict = pd.read_json(path, typ='series').reset_index()
        self.dict.columns = ['word', 'word_cnt']
        self.dict = self.dict.drop(columns=['word_cnt'])

    def ensure_data(self):
        """Drop rows whose name, description or tags are missing or empty.

        Each of ``games``, ``descriptions`` and ``tags`` is filtered
        independently and re-indexed from 0.
        """
        self.games = self.games[self.games['name'].apply(self._is_present)]
        self.games = self.games.reset_index(drop=True)

        self.descriptions = self.descriptions[
            self.descriptions['description'].apply(self._is_present)
        ]
        self.descriptions = self.descriptions.reset_index(drop=True)

        # The None check must come before len(); _is_present guarantees that.
        self.tags = self.tags[self.tags['tags'].apply(self._is_present)]
        self.tags = self.tags.reset_index(drop=True)



# debug
# data = Data()
# print(data.games.head())
# print(data.dict.head())

   game_id                           name  \
0  1418990          Unicorns on Unicycles   
1  1419040     Road Maintenance Simulator   
2  1419060                        Retchid   
3  1419070                  Mython Island   
4  1419100  The Unexpected Quest Prologue   

                                         description  \
0  turn your horns into swords in this wacky and ...   
1  experience the everyday life in a german stree...   
2  roadmapabout the gameretchid is an immersive a...   
3  mython island is a monster catching rpg featur...   
4  get the full game hereabout the gamean adventu...   

                                                tags  
0  {'Local Multiplayer': 267, 'Physics': 253, 'Ex...  
1  {'Simulation': 97, 'Casual': 91, 'Indie': 88, ...  
2  {'Exploration': 195, 'FPS': 190, 'Shoot 'Em Up...  
3  {'Creature Collector': 144, 'RPG': 139, 'Turn-...  
4  {'Free to Play': 139, 'Simulation': 121, 'City...  
  word
0  the
1  and
2   to
3   of
4    a


# Preprocessing
## Bag of words
Take the words from all descriptions and count how often each word occurs. The dictionary consists of the N most frequent words. To encode the description of a game, we either mark the presence of each dictionary word in the description (binary encoding) or count how many times each dictionary word occurs in it (count encoding). Words that are not in the dictionary are ignored.

## TF-IDF
Term Frequency - Inverse Document Frequency is a measure of how important a word is in a document relative to a collection of documents. We calculate it by multiplying TF and IDF. Term Frequency is the frequency of a word in a given document, and Inverse Document Frequency is the logarithm of the ratio of the total number of documents to the number of documents containing the word: $\mathrm{tfidf}(w, d) = \mathrm{tf}(w, d) \cdot \log\frac{|D|}{|\{d' \in D : w \in d'\}|}$.


In [None]:
class InputPreprocessors:
    """Namespace for encoders that turn a game description into a vector.

    Every encoder takes the loaded data object and a vocabulary size ``N``
    and uses the first ``N`` words of ``data.dict`` as its vocabulary.
    A ``point`` passed to an encode method may be an iterable of words or a
    plain string, which is split on whitespace.
    """

    class _VocabularyEncoder:
        """Shared vocabulary handling for the concrete encoders."""

        def __init__(self, data, N):
            self.dict = data.dict.head(N)
            # word -> position lookup. ``word in DataFrame`` tests column
            # labels and ``DataFrame.index`` is not callable, so the frame
            # cannot be used for lookups directly.
            self.word_index = {}
            for position, word in enumerate(self.dict['word']):
                self.word_index.setdefault(word, position)

        @staticmethod
        def _tokens(point):
            """Return the words of ``point``; strings are split on whitespace."""
            return point.split() if isinstance(point, str) else point

        def _count(self, point):
            """Return, per vocabulary word, how often it occurs in ``point``."""
            counts = [0] * len(self.dict)
            for word in self._tokens(point):
                position = self.word_index.get(word)
                if position is not None:
                    counts[position] += 1
            return counts

    class BagOfWords(_VocabularyEncoder):
        """Bag-of-words encoder over the first ``N`` dictionary words."""

        def encode_cnt(self, point):
            """Return a list with the occurrence count of every vocabulary word."""
            return self._count(point)

        def encode_bin(self, point):
            """Return a 0/1 list marking which vocabulary words occur in ``point``."""
            return [1 if count else 0 for count in self._count(point)]

    class TFIDF(_VocabularyEncoder):
        """TF-IDF encoder over the first ``N`` dictionary words."""

        def __init__(self, data, N):
            super().__init__(data, N)

            self.calc_idf(data)

        @staticmethod
        def _descriptions(data):
            """Return the description column used as the document collection.

            ``data.descriptions`` is used when it exists; otherwise the
            ``description`` column of ``data.games`` is used, because the
            data object may have merged and deleted its descriptions table.
            """
            source = getattr(data, 'descriptions', None)
            if source is None:
                source = data.games
            return source['description']

        def calc_idf(self, data):
            """Compute ``self.idf`` as log(n_documents / document_frequency).

            A word is counted once per document that contains it. Vocabulary
            words that occur in no document get an IDF of 0 so that encoded
            vectors stay finite.
            """
            descriptions = self._descriptions(data)
            doc_freq = np.zeros(len(self.dict))

            for desc in descriptions:
                # set(): each document contributes at most once per word
                for word in set(desc.split()):
                    position = self.word_index.get(word)
                    if position is not None:
                        doc_freq[position] += 1

            idf = np.zeros(len(self.dict))
            seen = doc_freq > 0
            idf[seen] = np.log(len(descriptions) / doc_freq[seen])
            self.idf = idf

        def encode(self, point):
            """Return the TF-IDF vector of ``point`` as a numpy array.

            TF is the raw occurrence count of each vocabulary word.
            """
            return np.array(self._count(point)) * np.array(self.idf)