# all_models.ipynb 

Here we will run our data through all models to see which one performs best

In [1]:
import pandas as pd
import os
import math
import argparse
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris, load_wine, load_digits, load_breast_cancer, load_diabetes
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")

folder = "data/CombinedPricePlayerCountHistory"
game_dfs = []

for filename in os.listdir(folder):
    if filename == "readme.md":
        continue

    # Construct filenames
    fullpath = os.path.join(folder, filename)
    appId, garbage_collector = os.path.splitext(filename)

    # Create data frame for each csv
    df = pd.read_csv(fullpath)
    df['appid'] = appId
    # Put appid first for visual pleasure
    cols = ['appid'] + [col for col in df if col != 'appid']
    df = df[cols]
    game_dfs.append(df)

# Combine them into one csv we can use to train
combined_df = pd.concat(game_dfs, ignore_index=True)

#pull in the rest of our merged info
merged_info_df = pd.read_csv("merged_info.csv")

#format combined_df
combined_df['appid'] = combined_df['appid'].astype('int64')
combined_df['Date'] = pd.to_datetime(combined_df['Date'])

#merge combined_df and merged_info_df
full_df = combined_df.merge(merged_info_df,on='appid')

#drop redundant columns
full_df = full_df.drop('type', axis=1)
full_df = full_df.drop('freetoplay', axis=1)

#format dates, drop ones that do not follow format
full_df['releasedate'] = pd.to_datetime(full_df['releasedate'], format='%d-%b-%y',errors='coerce')
full_df = full_df.dropna(subset=['releasedate'])

#numerically encode string columns (name, dev, pub)
label_encoder = LabelEncoder()
full_df['name'] = label_encoder.fit_transform(full_df['name'])
full_df['developer'] = label_encoder.fit_transform(full_df['developer'])
full_df['publisher'] = label_encoder.fit_transform(full_df['publisher'])

#to check
print(full_df)

         appid       Date  Playercount  Initialprice  Finalprice  Discount  \
0       614570 2019-04-07          103         29.99       29.99         0   
1       614570 2019-04-08           71         29.99       29.99         0   
2       614570 2019-04-09           67         29.99       29.99         0   
3       614570 2019-04-10           58         29.99       29.99         0   
4       614570 2019-04-11           55         29.99       29.99         0   
...        ...        ...          ...           ...         ...       ...   
597477  228280 2020-08-08          622         19.99       19.99         0   
597478  228280 2020-08-09          688         19.99       19.99         0   
597479  228280 2020-08-10          558         19.99       19.99         0   
597480  228280 2020-08-11          529         19.99       19.99         0   
597481  228280 2020-08-12          520         19.99       19.99         0   

        name releasedate  developer  publisher  
0        273  

In [2]:
#read in csv, convert date column to datetime object
full_df['Date'] = pd.to_datetime(full_df['Date'])

#2/3 samples train/test split
samples = len(full_df)
split = math.floor(samples * (2/3))

#split at date 2/3 of the way through entry by using "split" to index into the df, get the date, and use that
split_date = full_df.iloc[split]['Date']
split_date = pd.to_datetime(split_date)
train_data = full_df[full_df['Date'] < split_date]
test_data = full_df[full_df['Date'] >= split_date]

#Considering 'Finalprice' spoils whether or not there is a discount/it's amount so we omit that feature
#Set up training data
X_train = train_data[['Date', 'Playercount', 'Initialprice']]
y_train = train_data['Discount']

#Set up testing data
X_test = test_data[['Date', 'Playercount', 'Initialprice']]
y_test = test_data['Discount']

print(X_test)
print(y_test)

#convert dates into something more compatible with our model (dates are not ML friendly)
X_train['DayOfWeek'] = X_train['Date'].dt.dayofweek
X_train['Month'] = X_train['Date'].dt.month
X_train['Year'] = X_train['Date'].dt.year
X_test['DayOfWeek'] = X_test['Date'].dt.dayofweek
X_test['Month'] = X_test['Date'].dt.month
X_test['Year'] = X_test['Date'].dt.year

#drop the old date
X_train = X_train.drop(['Date'], axis=1)
X_test = X_test.drop(['Date'], axis=1)

             Date  Playercount  Initialprice
183    2019-10-08           88         29.99
184    2019-10-09           87         29.99
185    2019-10-10           75         29.99
186    2019-10-11           89         29.99
187    2019-10-12          117         29.99
...           ...          ...           ...
597477 2020-08-08          622         19.99
597478 2020-08-09          688         19.99
597479 2020-08-10          558         19.99
597480 2020-08-11          529         19.99
597481 2020-08-12          520         19.99

[371315 rows x 3 columns]
183       0
184       0
185       0
186       0
187       0
         ..
597477    0
597478    0
597479    0
597480    0
597481    0
Name: Discount, Length: 371315, dtype: int64


Setup the models

In [3]:
names = [
    "Nearest Neighbors",
    # "Linear SVM",
    # "RBF SVM",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
]

classifiers = [
    KNeighborsClassifier(3),
    # SVC(kernel="linear", C=0.025),
    # SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
]

for i in range(len(classifiers)):
    cn = names[i]
    clf = classifiers[i]
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_train)
    train_acc = accuracy_score(y_train, y_predict)

    y_predict = clf.predict(X_test)
    test_acc = accuracy_score(y_test, y_predict)

    print("Name:", cn)
    print("Training:", train_acc)
    print("Testing:", test_acc)
    print("\n")   


Name: Nearest Neighbors
Training: 0.8757922096418145
Testing: 0.8103119992459232


Name: Decision Tree
Training: 0.8720091860713522
Testing: 0.8222641153737392


Name: Random Forest
Training: 0.8718267108167771
Testing: 0.8226573125244065


Name: Neural Net
Training: 0.8718267108167771
Testing: 0.8226573125244065


Name: AdaBoost
Training: 0.8264571316670227
Testing: 0.7832433378667708


Name: Naive Bayes
Training: 0.6260770490635904
Testing: 0.6491334850463892




We saw that a decision tree, random forest, and neural net all had pretty good accuracies. First we will hyper parameters on decision trees to see if we can get better results.

In [10]:
depths = [1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50, 100]
scores = {}
dt_scores = {}

print("Decision Trees")
for depth in depths:
    clf = DecisionTreeClassifier(max_depth=depth)
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_train)
    train_acc = accuracy_score(y_train, y_predict)

    y_predict = clf.predict(X_test)
    test_acc = accuracy_score(y_test, y_predict)

    dt_scores[depth] = test_acc

    print("Depth:", depth)
    print("Training:", train_acc)
    print("Testing:", test_acc)
    print("\n")   

print("Best Depth:", max(dt_scores, key=lambda k: dt_scores[k]))
    

Decision Trees
Depth: 1
Training: 0.8718267108167771
Testing: 0.8226573125244065


Depth: 2
Training: 0.8718267108167771
Testing: 0.8226573125244065


Depth: 3
Training: 0.8718267108167771
Testing: 0.8226573125244065


Depth: 4
Training: 0.8719468774478388
Testing: 0.8226734713114202


Depth: 5
Training: 0.8720091860713522
Testing: 0.8222641153737392


Depth: 10
Training: 0.873090685750908
Testing: 0.8210495132165412


Depth: 15
Training: 0.8780397707042654
Testing: 0.8168455354618047


Depth: 20
Training: 0.89140497044791
Testing: 0.8036357270780874


Depth: 30
Training: 0.9231022573524176
Testing: 0.7765428275184143


Depth: 40
Training: 0.927067756177455
Testing: 0.7745633761092334


Depth: 50
Training: 0.9271701203446557
Testing: 0.7744394920754616


Depth: 100
Training: 0.9271701203446557
Testing: 0.7744825821741648


Best Depth: 4
