# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import joblib

import import_ipynb

# Import data

In [2]:
import data_preprocessing

# Load the dataset through the project loader, asking for the genre labels.
data = data_preprocessing.Data(tags_or_genres='genres')

# Preview the first rows of each table the loader exposes.
for preview_frame in (data.games, data.dict):
    print(preview_frame.head())

   game_id                                               name  \
0  1418860  Zaccaria Pinball - House of Diamonds Deluxe Pi...   
1  1418900                Sayonara Golden Days - Golden Souls   
2  1418980          Medieval Dynasty - Digital Supporter Pack   
3  1418990                              Unicorns on Unicycles   
4  1419040                         Road Maintenance Simulator   

                                         description  \
0  purchase this dlc disables score limit on this...   
1  the dlc will does not affect really affect the...   
2  the digital supporter pack of medieval dynasty...   
3  turn your horns into swords in this wacky and ...   
4  experience the everyday life in a german stree...   

                                              genres  
0  [Action,  Casual,  Free to Play,  Indie,  Simu...  
1                                            [Indie]  
2            [Action,  Adventure,  RPG,  Simulation]  
3                          [Action,  Casual,  Indie]

In [3]:
# Sanity check: size of the description column (one entry per row of data.games).
data.games['description'].shape

(50443,)

# Input preprocessing
Convert the complex input (the game description text) into a simple input the model can consume.  
The resulting input variable is named `X`.

We use the [scikit-learn feature extraction module](https://scikit-learn.org/1.5/modules/feature_extraction.html) to perform the feature extraction.

In [4]:
import input_preprocessing

# Turn the description text into the feature matrix `X`.
# Alternative extractors, kept commented out for quick switching:
#   X = input_preprocessing.bag_of_words(data.games['description'], max_features=10000)
#   X = input_preprocessing.tfidf(data.games['description'], max_features=10000)
X = input_preprocessing.hashing(
    data.games['description'],
    n_features=100,
)
X.shape

(50443, 100)

# Output preprocessing
Convert the complex output (game tags or genres; this run uses genres) into a simple output for the model (multi-label classification).  
The resulting output variable is named `y`.

In [5]:
import output_preprocessing

# Encode the `genres` column into the multi-label target `y`
# (presumably one column per distinct genre -- TODO confirm in output_preprocessing).
y = output_preprocessing.multilabel_binarizer(data.games['genres'])
# Show the dimensions of the encoded target.
y.shape

(50443, 66)

# Evaluation methods
We want a function `evaluate(y_pred, y_true)` that measures the model's performance; it is selected from `evaluation_methods` below.

In [6]:
import evaluation_methods

# Select the metric used to score the model; exactly one assignment should be active.
# evaluate = evaluation_methods.recall
evaluate = evaluation_methods.f1score

# Model
Use the preprocessed input `X`, the output `y`, and the evaluation function `evaluate(y_pred, y_true)` to train a chosen model (and save it).  
The resulting model is an instance of one of the classes in `models` (e.g. `models.MLP`), each providing these methods:  
* fit(X, y) - train the model (note: the cells below pass `X` and `y` to the constructor and call `fit()` without arguments)
* predict(X) - predict the output for given input
* save_model(path) - save the model to the given path
* load_model(path) - load the model from the given path
* evaluate() - evaluate the model performance

In [7]:
import models

# Choose the model to train: keep exactly one line uncommented.
# Every constructor is called with the same (X, y, evaluate) arguments.
# model = models.KNN(X, y, evaluate)
# model = models.LogisticRegression(X, y, evaluate)
# model = models.DecisionTree(X, y, evaluate)
# model = models.RandomForest(X, y, evaluate)
# model = models.NaiveBayes(X, y, evaluate)
# model = models.SVM(X, y, evaluate)
model = models.MLP(X, y, evaluate)

# Model training and evaluation

In [8]:
# Train on the data handed to the constructor, then report the selected metric.
model.fit()
score = model.evaluate()
print(score)

0.4280315194032063


