# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import joblib
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier

# import_ipynb has to be imported before data_preprocessing so the notebook module can be resolved
import import_ipynb
import data_preprocessing

   game_id                                               name  \
0  1418860  Zaccaria Pinball - House of Diamonds Deluxe Pi...   
1  1418900                Sayonara Golden Days - Golden Souls   
2  1418980          Medieval Dynasty - Digital Supporter Pack   
3  1418990                              Unicorns on Unicycles   
4  1419040                         Road Maintenance Simulator   

                                         description  \
0  purchase this dlc disables score limit on this...   
1  the dlc will does not affect really affect the...   
2  the digital supporter pack of medieval dynasty...   
3  turn your horns into swords in this wacky and ...   
4  experience the everyday life in a german stree...   

                                              genres  
0  [Action,  Casual,  Free to Play,  Indie,  Simu...  
1                                            [Indie]  
2            [Action,  Adventure,  RPG,  Simulation]  
3                          [Action,  Casual,  Indie]

# Import data

In [None]:
# Build the project's Data object from the data_preprocessing notebook module.
# NOTE(review): tags_or_genres='genres' presumably selects genres (rather than tags)
# as the label source — confirm in data_preprocessing.
data = data_preprocessing.Data(tags_or_genres='genres')
# Preview the first rows of the two attributes used later in the notebook.
print(data.games.head())
print(data.dict.head())

   game_id                                               name  \
0  1418860  Zaccaria Pinball - House of Diamonds Deluxe Pi...   
1  1418900                Sayonara Golden Days - Golden Souls   
2  1418980          Medieval Dynasty - Digital Supporter Pack   
3  1418990                              Unicorns on Unicycles   
4  1419040                         Road Maintenance Simulator   

                                         description  \
0  purchase this dlc disables score limit on this...   
1  the dlc will does not affect really affect the...   
2  the digital supporter pack of medieval dynasty...   
3  turn your horns into swords in this wacky and ...   
4  experience the everyday life in a german stree...   

                                              genres  
0  [Action,  Casual,  Free to Play,  Indie,  Simu...  
1                                            [Indie]  
2            [Action,  Adventure,  RPG,  Simulation]  
3                          [Action,  Casual,  Indie]

# Input preprocessing
Convert the complex input (game descriptions) into a simple numeric input for the model.  
The resulting input variable is named `X`.

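We use the [scikit-learn library](https://scikit-learn.org/1.5/modules/feature_extraction.html) to perform the feature extraction. The subsections below are alternatives: each one overwrites `X`, so the model is trained on the output of whichever cell was run last.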

In [3]:
# Size of the description column, i.e. how many documents will be vectorized below.
data.games['description'].shape

(50443,)

### Bag of Words
https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at 10,000 terms and drop scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
# Learn the vocabulary from the descriptions and encode them as term counts.
X = vectorizer.fit_transform(data.games['description'])

# Cell result: (number of descriptions, number of features).
X.shape


(50443, 10000)

### TF-IDF

In [5]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

# Step 1: raw term counts, using the same settings as the Bag of Words cell.
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
counts = vectorizer.fit_transform(data.games['description'])

# Step 2: re-weight the counts with TF-IDF.
# smooth_idf=False turns off the add-one smoothing of document frequencies.
transformer = TfidfTransformer(smooth_idf=False)
X = transformer.fit_transform(counts)

# Cell result: (number of descriptions, number of features).
X.shape

(50443, 10000)

### Hashing Vectorizer
https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html

In [6]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hash tokens into a fixed 100-column feature space. No vocabulary is learned,
# which is why transform() is called directly without a prior fit().
# NOTE(review): HashingVectorizer's default alternate_sign=True can yield negative
# feature values, which MultinomialNB (used by the Naive Bayes model further down)
# rejects — confirm before pairing the two.
hv = HashingVectorizer(n_features=100)
X = hv.transform(data.games['description'])

# Cell result: (number of descriptions, number of hashed features).
X.shape

(50443, 100)

# Output preprocessing
Convert the complex output (game genres) into a simple output for the model (multi-label classification).  
The resulting output variable is named `y`.

### Multi-label binarizer
https://scikit-learn.org/1.5/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

# Encode each game's collection of genres as a 0/1 indicator row, one column per distinct label.
# NOTE(review): the data preview earlier in the notebook shows two spaces after the commas
# in the genres column, which may mean the labels carry leading whitespace (or that the
# values are strings rather than lists) — inspect binarizer.classes_ to confirm the
# label set is what you expect.
binarizer = MultiLabelBinarizer()
y = binarizer.fit_transform(data.games['genres'])

# Cell result: (number of games, number of distinct labels).
y.shape

(50443, 66)

# Evaluation methods
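We want to create a function `evaluate(y_pred, y_true)` that evaluates the model's performance. The metrics below are alternatives: each cell redefines `evaluate`, so the definition from the cell run last is the one the model uses.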

### Recall
https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.recall_score.html

In [8]:
from sklearn.metrics import recall_score

def evaluate(y_pred, y_true):
    """Score predictions with micro-averaged recall.

    The argument order is predictions first, ground truth second; the two
    are swapped when handed to ``recall_score``, which expects the ground
    truth first.

    Args:
        y_pred: Predicted labels.
        y_true: Ground-truth labels.

    Returns:
        Whatever ``recall_score`` returns for ``average='micro'``.
    """
    micro_recall = recall_score(y_true, y_pred, average='micro')
    return micro_recall

### F1-score
https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.f1_score.html

In [9]:
from sklearn.metrics import f1_score

def evaluate(y_pred, y_true):
    """Score predictions with the micro-averaged F1 measure.

    The argument order is predictions first, ground truth second; the two
    are swapped when handed to ``f1_score``, which expects the ground truth
    first.

    Args:
        y_pred: Predicted labels.
        y_true: Ground-truth labels.

    Returns:
        Whatever ``f1_score`` returns for ``average='micro'``.
    """
    micro_f1 = f1_score(y_true, y_pred, average='micro')
    return micro_f1

# Model
Use the preprocessed input `X`, the output `y`, and the evaluation function `evaluate(y_pred, y_true)` to train a chosen model (and save it).  
The resulting model is a class named `Model`, constructed as `Model(X, y)` (which holds out a test split), with methods:  
* fit() - train the model on the training split
* predict(X) - predict the output for the given input
* save_model(path) - save the model to the given path
* load_model(path) - load the model from the given path
* evaluate() - evaluate the model performance on the held-out test split

### KNN (K Nearest Neighbors)
https://scikit-learn.org/1.5/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import joblib

class Model:
    """K-nearest-neighbours classifier bundled with its own train/test split.

    The constructor splits ``X``/``y`` once; ``fit`` trains on the training
    part and ``evaluate`` scores predictions on the held-out part using the
    notebook-level ``evaluate(y_pred, y_true)`` function.
    """

    def __init__(self, X, y, test_size=0.2, random_state=None):
        """Split the data and create an untrained 10-neighbour classifier.

        Args:
            X: Feature matrix passed to ``train_test_split``.
            y: Targets passed to ``train_test_split`` alongside ``X``.
            test_size: Forwarded to ``train_test_split``. Defaults to 0.2,
                the value that used to be hard-coded.
            random_state: Seed forwarded to ``train_test_split``. ``None``
                (default) keeps the previous unseeded behaviour; pass an int
                to make the split, and therefore the score, reproducible.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        self.knn = KNeighborsClassifier(n_neighbors=10)

    def fit(self):
        """Train the classifier on the training split."""
        self.knn.fit(self.X_train, self.y_train)

    def predict(self, X):
        """Return the classifier's predictions for ``X``."""
        return self.knn.predict(X)

    def evaluate(self):
        """Predict on the test split and score it with ``evaluate``."""
        y_pred = self.predict(self.X_test)
        return evaluate(y_pred, self.y_test)

    def save_model(self, path):
        """Persist the classifier to ``path`` with ``joblib.dump``."""
        joblib.dump(self.knn, path)

    def load_model(self, path):
        """Replace the classifier with the one stored at ``path``.

        ``joblib.load`` unpickles the file, so only load files you trust.
        """
        self.knn = joblib.load(path)

### Logistic Regression

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

class Model:
    """One-vs-rest style logistic regression bundled with its own train/test split.

    A separate unpenalised ``LogisticRegression`` is fitted per output column
    through ``MultiOutputClassifier``. The constructor splits ``X``/``y`` once;
    ``fit`` trains on the training part and ``evaluate`` scores predictions on
    the held-out part using the notebook-level ``evaluate(y_pred, y_true)``.
    """

    def __init__(self, X, y, test_size=0.2, random_state=None):
        """Split the data and create the untrained multi-output classifier.

        Args:
            X: Feature matrix passed to ``train_test_split``.
            y: Targets passed to ``train_test_split`` alongside ``X``.
            test_size: Forwarded to ``train_test_split``. Defaults to 0.2,
                the value that used to be hard-coded.
            random_state: Seed forwarded to ``train_test_split``. ``None``
                (default) keeps the previous unseeded behaviour; pass an int
                to make the split, and therefore the score, reproducible.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        self.lr = MultiOutputClassifier(LogisticRegression(penalty=None))

    def fit(self):
        """Train the classifier on the training split."""
        self.lr.fit(self.X_train, self.y_train)

    def predict(self, X):
        """Return the classifier's predictions for ``X``."""
        return self.lr.predict(X)

    def evaluate(self):
        """Predict on the test split and score it with ``evaluate``."""
        y_pred = self.predict(self.X_test)
        return evaluate(y_pred, self.y_test)

    def save_model(self, filename):
        """Persist the classifier to ``filename`` with ``joblib.dump``."""
        joblib.dump(self.lr, filename)

    def load_model(self, filename):
        """Replace the classifier with the one stored at ``filename``.

        ``joblib.load`` unpickles the file, so only load files you trust.
        """
        self.lr = joblib.load(filename)


### Decision Trees

In [12]:
from sklearn import tree

class Model:
    """Decision-tree classifier bundled with its own train/test split.

    The constructor splits ``X``/``y`` once; ``fit`` trains on the training
    part and ``evaluate`` scores predictions on the held-out part using the
    notebook-level ``evaluate(y_pred, y_true)`` function.
    """

    def __init__(self, X, y, test_size=0.2, random_state=None):
        """Split the data and create an untrained decision tree.

        Args:
            X: Feature matrix passed to ``train_test_split``.
            y: Targets passed to ``train_test_split`` alongside ``X``.
            test_size: Forwarded to ``train_test_split``. Defaults to 0.2,
                the value that used to be hard-coded.
            random_state: Seed forwarded to both ``train_test_split`` and
                ``DecisionTreeClassifier``. ``None`` (default) keeps the
                previous unseeded behaviour; pass an int to make the split,
                the fitted tree and the score reproducible.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        # The tree builder is itself randomised, so it gets the same seed.
        self.tree = tree.DecisionTreeClassifier(random_state=random_state)

    def fit(self):
        """Train the tree on the training split."""
        self.tree.fit(self.X_train, self.y_train)

    def predict(self, X):
        """Return the tree's predictions for ``X``."""
        return self.tree.predict(X)

    def evaluate(self):
        """Predict on the test split and score it with ``evaluate``."""
        y_pred = self.predict(self.X_test)
        return evaluate(y_pred, self.y_test)

    def save_model(self, filename):
        """Persist the tree to ``filename`` with ``joblib.dump``."""
        joblib.dump(self.tree, filename)

    def load_model(self, filename):
        """Replace the tree with the one stored at ``filename``.

        ``joblib.load`` unpickles the file, so only load files you trust.
        """
        self.tree = joblib.load(filename)

### Random Forest

https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [29]:
from sklearn.ensemble import RandomForestClassifier

class Model:
    """Random-forest classifier bundled with its own train/test split.

    The constructor splits ``X``/``y`` once; ``fit`` trains on the training
    part and ``evaluate`` scores predictions on the held-out part using the
    notebook-level ``evaluate(y_pred, y_true)`` function.
    """

    def __init__(self, X, y, test_size=0.2, random_state=None):
        """Split the data and create an untrained random forest.

        Args:
            X: Feature matrix passed to ``train_test_split``.
            y: Targets passed to ``train_test_split`` alongside ``X``.
            test_size: Forwarded to ``train_test_split``. Defaults to 0.2,
                the value that used to be hard-coded.
            random_state: Seed forwarded to both ``train_test_split`` and
                ``RandomForestClassifier``. ``None`` (default) keeps the
                previous unseeded behaviour; pass an int to make the split,
                the fitted forest and the score reproducible.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        # The forest is itself randomised (bootstrapping, feature sampling), so it gets the same seed.
        self.rfc = RandomForestClassifier(random_state=random_state)

    def fit(self):
        """Train the forest on the training split."""
        self.rfc.fit(self.X_train, self.y_train)

    def predict(self, X):
        """Return the forest's predictions for ``X``."""
        return self.rfc.predict(X)

    def evaluate(self):
        """Predict on the test split and score it with ``evaluate``."""
        y_pred = self.predict(self.X_test)
        return evaluate(y_pred, self.y_test)

    def save_model(self, filename):
        """Persist the forest to ``filename`` with ``joblib.dump``."""
        joblib.dump(self.rfc, filename)

    def load_model(self, filename):
        """Replace the forest with the one stored at ``filename``.

        ``joblib.load`` unpickles the file, so only load files you trust.
        """
        self.rfc = joblib.load(filename)

### Naive Bayes

In [14]:
from sklearn.naive_bayes import MultinomialNB

class Model:
    """Multinomial naive Bayes bundled with its own train/test split.

    A separate ``MultinomialNB`` is fitted per output column through
    ``MultiOutputClassifier``. The constructor splits ``X``/``y`` once;
    ``fit`` trains on the training part and ``evaluate`` scores predictions
    on the held-out part using the notebook-level ``evaluate(y_pred, y_true)``.
    """

    def __init__(self, X, y, test_size=0.2, random_state=None):
        """Split the data and create the untrained multi-output classifier.

        Args:
            X: Feature matrix passed to ``train_test_split``.
            y: Targets passed to ``train_test_split`` alongside ``X``.
            test_size: Forwarded to ``train_test_split``. Defaults to 0.2,
                the value that used to be hard-coded.
            random_state: Seed forwarded to ``train_test_split``. ``None``
                (default) keeps the previous unseeded behaviour; pass an int
                to make the split, and therefore the score, reproducible.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        self.nb = MultiOutputClassifier(MultinomialNB())

    def fit(self):
        """Train the classifier on the training split."""
        self.nb.fit(self.X_train, self.y_train)

    def predict(self, X):
        """Return the classifier's predictions for ``X``."""
        return self.nb.predict(X)

    def evaluate(self):
        """Predict on the test split and score it with ``evaluate``."""
        y_pred = self.predict(self.X_test)
        return evaluate(y_pred, self.y_test)

    def save_model(self, filename):
        """Persist the classifier to ``filename`` with ``joblib.dump``."""
        joblib.dump(self.nb, filename)

    def load_model(self, filename):
        """Replace the classifier with the one stored at ``filename``.

        ``joblib.load`` unpickles the file, so only load files you trust.
        """
        self.nb = joblib.load(filename)

# Model training and evaluation

In [17]:
# Uses whichever `Model` class, `X`, `y` and `evaluate` were defined by the most
# recently executed cells above — several alternatives share each of those names.
model = Model(X, y)
model.fit()
# Score on the held-out test split created inside Model.__init__.
print(model.evaluate())

0.5334152166496077


In [31]:
# Every Model.save_model in this notebook writes with joblib.dump, so despite the
# .h5 suffix this file is a joblib dump and must be read back with joblib.load.
# NOTE(review): the file name says random_forest — make sure the Random Forest
# `Model` cell was the one executed before training.
model.save_model('../models/random_forest.h5')