In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import classifier
import data
import utils

# Location of the hiscore CSV export this notebook reads. Supply it through the
# PATH_TO_CSV environment variable (an empty value counts as unset) so the
# notebook survives "Restart & Run All" without editing a machine-specific
# absolute path into the source.
PATH_TO_CSV = os.environ.get('PATH_TO_CSV') or None
if PATH_TO_CSV is None:
    # Fail fast, with the instructions carried by the exception itself.
    raise ValueError('set PATH_TO_CSV=/path/to/2022-10-26_hiscore_data.csv and run all')

df = pd.read_csv(PATH_TO_CSV)

In [9]:
# Roughly mirrors what app.py's train function does to build model inputs.
hiscoredata = data.hiscoreData(df, False)
features = hiscoredata.features()
# No playerData()-related merge is needed: it's already part of features.
features_labeled = features
binary_classifier = classifier.classifier("binaryClassifier")

# Intended to match app.py line 179 (model training).
x = features_labeled.copy()
# Binary target: 0 for 'Real_Player', 1 for every other label.
y = x['label'].ne('Real_Player').astype('int64')

# TODO: one-hot encode account status
print('unique account_status values:', x['account_status'].unique())
# TODO: using created_at, updated_at epoch times slightly improve binary classifier
# x['created_at'] = pd.to_datetime(x['created_at'], format="%Y-%m-%d %H:%M:%S").apply(lambda x: (x - pd.Timestamp("1970-01-01")) / pd.Timedelta("1s"))
# x['updated_at'] = pd.to_datetime(x['updated_at'], format="%Y-%m-%d %H:%M:%S").apply(lambda x: (x - pd.Timestamp("1970-01-01")) / pd.Timedelta("1s"))
# print(x['created_at'].head())

# Columns that are labels, identifiers or raw timestamps rather than model features.
# NOTE(review): 'possible_ban' and 'confirmed_ban' stay in x. If they are derived
# from the same ban decision as 'label', this is target leakage and could explain
# the near-perfect scores further down -- TODO confirm against app.py.
non_feature_columns = ['label', 'label_id', 'name', 'created_at', 'updated_at', 'account_status']
x = x.drop(columns=non_feature_columns)

print('x columns:\n', '\n\t'.join(x.columns))
print('x head:\n', x.head())
print('y head:\n', y.head())


unique account_status values: ['not banned' 'banned']
x columns:
 possible_ban
	confirmed_ban
	total
	attack
	defence
	strength
	hitpoints
	ranged
	prayer
	magic
	cooking
	woodcutting
	fletching
	fishing
	firemaking
	crafting
	smithing
	mining
	herblore
	agility
	thieving
	slayer
	farming
	runecraft
	hunter
	construction
	league
	bounty_hunter_hunter
	bounty_hunter_rogue
	cs_all
	cs_beginner
	cs_easy
	cs_medium
	cs_hard
	cs_elite
	cs_master
	lms_rank
	soul_wars_zeal
	abyssal_sire
	alchemical_hydra
	barrows_chests
	bryophyta
	callisto
	cerberus
	chambers_of_xeric
	chambers_of_xeric_challenge_mode
	chaos_elemental
	chaos_fanatic
	commander_zilyana
	corporeal_beast
	crazy_archaeologist
	dagannoth_prime
	dagannoth_rex
	dagannoth_supreme
	deranged_archaeologist
	general_graardor
	giant_mole
	grotesque_guardians
	hespori
	kalphite_queen
	king_black_dragon
	kraken
	kreearra
	kril_tsutsaroth
	mimic
	nex
	nightmare
	phosanis_nightmare
	obor
	sarachnis
	scorpia
	skotizo
	tempoross
	the_gauntlet


In [None]:
# Roughly mirrors app.py's train function: stratified hold-out split, then fit.
TEST_FRACTION = 0.2  # share of rows held out for evaluation
SPLIT_SEED = 42      # fixed seed so the split is reproducible
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=y,
)
binary_classifier.fit(train_x, train_y)


In [3]:
# NOTE(review): labels are passed before features here, the reverse of the usual
# scikit-learn estimator score(X, y) order -- presumably this matches the
# project's classifier wrapper signature; TODO confirm.
holdout_scores = binary_classifier.score(test_y, test_x)
print(holdout_scores)
# OUTPUT: (0.9992126580557206, 0.9992126580557205)

(0.9992126580557206, 0.9992126580557205)


In [4]:
# Per-class precision / recall / F1 on the held-out split.
predicted_y = binary_classifier.predict(test_x)
print(classification_report(y_true=test_y, y_pred=predicted_y))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     26765
           1       1.00      1.00      1.00     30482

    accuracy                           1.00     57247
   macro avg       1.00      1.00      1.00     57247
weighted avg       1.00      1.00      1.00     57247



In [5]:
# pd.to_datetime("2010/11/12", format="%Y/%m/%d")
# Out[52]: Timestamp('2010-11-12 00:00:00')

# pd.to_datetime("12-11-2010 00:00", format="%d-%m-%Y %H:%M")
# Out[53]: Timestamp('2010-11-12 00:00:00')

# TODO: play around with updated_at/created_at and other feature extraction
# # print(features_labeled['created_at'].dtypes)
# # created_at_ts = pd.to_datetime(features_labeled['created_at'], format="%Y-%m-%d %H:%M:%S")
# # updated_at_ts = pd.to_datetime(features_labeled['updated_at'], format="%Y-%m-%d %H:%M:%S")
# # acct_lifespan_ts = updated_at_ts - created_at_ts
# # print(created_at_ts.head())
# # print(updated_at_ts.head())
# # print(acct_lifespan_ts)

# created_at_ts.apply(lambda x: (x - pd.Timestamp("1970-01-01")) / pd.Timedelta("1s")).head()
# (stamps - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
# pd.to_datetime(features_labeled['created_at'], format="%Y-%m-%d %H:%M:%S").dtypes

In [6]:
# pca code loosely based on:
# https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_iris.html
def do_pca(X):
    """Project X onto its first two principal components.

    Fits a 2-component PCA on X, prints the explained variance ratio of the
    two components, and returns X transformed by that fitted PCA.
    """
    reducer = PCA(n_components=2)
    reducer.fit(X)
    projected = reducer.transform(X)
    print("explained variance ratio: %s" % str(reducer.explained_variance_ratio_))
    return projected
    
def plot_2d_pca(X, y, omit_real=True, make_bad_plot=False, real_label='Real_Player'):
    """Run a 2-component PCA on X and optionally scatter-plot it by label.

    Args:
        X: feature matrix handed to do_pca.
        y: labels for the rows of X, in the same order; after conversion to a
            DataFrame they must be available as a column named 'label'.
        omit_real: when True, rows whose label equals real_label are removed
            after the projection (the PCA itself is still fit on all rows).
        make_bad_plot: when True, draw the experimental annotated log-log
            scatter; when False nothing is plotted.
        real_label: the value in y that marks a real player. Defaults to the
            raw string label; pass the encoded value (e.g. 0) when y has been
            binarised, otherwise omit_real silently filters nothing.

    Returns:
        None. The head of the labels and the explained variance ratio are
        printed as a side effect.
    """
    y = pd.DataFrame(y.copy())
    print(y.head())
    # Reuse the shared helper instead of repeating the PCA fit/transform here.
    X_r = do_pca(X)

    if omit_real:
        # Build the mask once, as a plain array, so the projected rows and the
        # label frame are filtered identically.
        keep = (y['label'] != real_label).to_numpy()
        X_r = X_r[keep]
        y = y[keep]

    if make_bad_plot:
        fig, ax = plt.subplots(figsize=(1,1), dpi=80)
        lw = 1
        label_to_id = {label: label_id for label_id, label in enumerate(y['label'].unique())}
        # NOTE(review): label ids start at 0, which a log colour norm cannot
        # represent, and `label=` receives the whole label column rather than a
        # legend string -- kept as-is because this branch is explicitly the
        # "bad plot" experiment; TODO confirm intent before relying on it.
        ax.scatter(X_r[:,0], X_r[:,1], c=y['label'].map(label_to_id), alpha=0.3, norm='log', lw=lw, label=y['label'])
        # One text annotation per point.
        for i, label in enumerate(y['label'].values):
            ax.annotate(label, (X_r[i,0], X_r[i,1]), fontsize=4)
        ax.set_xscale('log')
        ax.set_yscale('log')
        ax.legend(loc="best", shadow=False, scatterpoints=1)


# do_pca(train_x)
# y is the 0/1-encoded target at this point (0 == real player), so the real-player
# marker has to be given explicitly for omit_real to filter anything.
plot_2d_pca(x, y, real_label=0)

           label
Player_id       
1              0
8              0
29             0
39             0
59             0
explained variance ratio: [0.97047863 0.0107855 ]
