# Import Libraries

In [130]:
from sklearn.feature_selection import mutual_info_regression
from category_encoders import OrdinalEncoder, OneHotEncoder
import pandas as pd
import numpy as np
import ast
import re

# Load Dataset

In [22]:
games = pd.read_csv('../data/games_clean.csv')

In [99]:
games_ax = pd.read_csv('../data/games.csv')

In [23]:
games['Release date'] = pd.to_datetime(games['Release date'])

# Feature Engineering

## Functions

In [139]:
def make_mi_scores(X, y, discrete_features):
  mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
  mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
  mi_scores = mi_scores.sort_values(ascending=False)
  return mi_scores

def clean_str(word : str):
  cleaned = re.sub(r"[^'\[\],]+", '', word)
  return cleaned

def count_languages(x):
  try:
    return len(ast.literal_eval(x))
  except (ValueError, SyntaxError):
    return 0

## Encode - Feature Categorical

In [50]:
categories = []
for i, row in games.iterrows():
    if type(row['Genres'].split(',')) == list:
        for cat in row['Genres'].split(','): categories.append(cat)
    # break

print(len(set(categories)))

33


In [52]:
one_hot_encode = pd.DataFrame()

for category in set(categories):
    cat = []
    for _, row in games.iterrows():
        cat.append(np.int64(category in row['Genres'].split(',')))
    one_hot_encode[category] = cat

one_hot_encode.head()

Unnamed: 0,Sports,Tutorial,Early Access,Photo Editing,Nudity,Game Development,Adventure,Short,Action,Audio Production,Indie,Free to Play,Education,Web Publishing,Gore,Utilities,Documentary,Strategy,Accounting,RPG,Software Training,Massively Multiplayer,Video Production,Animation & Modeling,Violent,Sexual Content,360 Video,Episodic,Simulation,Casual,Racing,Design & Illustration,Movie
0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [60]:
games = pd.concat((games, one_hot_encode), axis = 1)

In [62]:
games = games.drop(columns= ['Genres'])

In [64]:
ordinal_cols = ['Estimated owners']

In [28]:
mappings = [
    {
        'col' : 'Estimated owners',
        'mapping' : {
            '0 - 0'                 : 0,
            '0 - 20000'             : 1,
            '20000 - 50000'         : 2,
            '50000 - 100000'        : 3,
            '100000 - 200000'       : 4,
            '200000 - 500000'       : 5,
            '500000 - 1000000'      : 6,
            '1000000 - 2000000'     : 7,
            '2000000 - 5000000'     : 8,
            '5000000 - 10000000'    : 9,
            '10000000 - 20000000'   : 10,
            '20000000 - 50000000'   : 11,
            '50000000 - 100000000'  : 12,
            '100000000 - 200000000' : 13
        }
    }
]

In [65]:
ordinal_enc = OrdinalEncoder(cols = ordinal_cols, mapping= mappings)

In [70]:
games = ordinal_enc.fit_transform(games)

## New Features

In [79]:
games['Platform_count'] = games[['Windows', 'Mac', 'Linux']].sum(axis = 1)

In [89]:
games['Achievements_per_hour'] = games.apply(
    lambda x : 
        x['Achievements'] 
        if x['Average playtime forever'] == 0 
        else x['Achievements'] / x['Average playtime forever'],
    axis = 1
)

In [92]:
games['Genre_count'] = games[list(set(categories))].sum(axis = 1)

In [94]:
games['Recent_playtime_ratio'] = games['Average playtime two weeks'] / games['Average playtime forever']

In [96]:
games['Recent_playtime_ratio'] = games.apply(
    lambda x : 
        x['Average playtime two weeks'] 
        if x['Average playtime forever'] == 0 
        else x['Average playtime two weeks'] / x['Average playtime forever'],
    axis = 1
)

In [129]:
type(ast.literal_eval(games_ax['Supported languages'][1])[0])

str

In [135]:
games_ax['Supported languages'] = games_ax['Supported languages'].apply(clean_str)

In [140]:
games_ax['Language_count'] = games_ax['Supported languages'].apply(count_languages)

In [142]:
games_ax['Language_count'].sort_values()

71715    0
21767    0
55194    0
21764    0
21763    0
        ..
27180    1
27181    1
27182    1
27221    1
35857    1
Name: Language_count, Length: 71716, dtype: int64