In [1]:
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from pathlib import Path
from keras.utils.np_utils import to_categorical
from typing import List
import cv2 as cv
# Using SMOTE for the over sampling portion.
from imblearn.over_sampling import SMOTE
import time
%matplotlib inline

In [2]:
model_address = os.path.join(Path(os.getcwd()).parent,'Modeling\\Existing_Models\\xgboost_dino_tuned.pkl')

In [3]:
model = pickle.load(open(model_address, 'rb'))  # horrible.
print(model.get_xgb_params())

{'objective': 'multi:softprob', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 0.7, 'gamma': 0, 'gpu_id': -1, 'interaction_constraints': '', 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 7, 'min_child_weight': 3, 'monotone_constraints': '()', 'n_jobs': 16, 'num_parallel_tree': 1, 'predictor': 'auto', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': None, 'subsample': 0.8, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None, 'eval_metric': 'mlogloss'}


In [4]:
target_address = os.path.join(Path(os.getcwd()).parent,'Window_capture\\Data\\command_keys.npy')
screenshot_address = os.path.join(Path(os.getcwd()).parent,'Window_capture\\Data\\screenshots.npy')

labels = np.load(target_address)
images = np.load(screenshot_address, allow_pickle = True)


print("Length Command Keys Shape: ",labels.shape)
print("Length Screenshot Shape: ",images.shape)
print("Screenshot Shape: ",images[0].shape)
print(np.unique(labels, return_counts = True))

Length Command Keys Shape:  (15673,)
Length Screenshot Shape:  (15673, 129600)
Screenshot Shape:  (129600,)
(array([-1, 38, 40]), array([12109,  1523,  2041], dtype=int64))


In [5]:
# res_list = [i for i, value in enumerate(labels) if value == -1] # Let's get rid of some -1 values.
# idx = np.random.choice(res_list, 9000, replace=False) # Randomly choose X number of entries to be deleted specified as -1
# images = pd.DataFrame(images) # flatten images then converted to dataframe for easier removal of idx
# images = np.array(images.drop(images.index[idx])) # flatten images then converted to dataframe for easier removal of idx
# labels = np.delete(labels, idx)
# print(images.shape, labels.shape)
# print(np.unique(labels, return_counts = True))

In [6]:
# Undersample using Tomek Links
# from imblearn.under_sampling import TomekLinks
# tl = TomekLinks()
# images, labels = tl.fit_resample(images, labels)

In [7]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state = 101)
images, labels = smote.fit_resample(images, labels)

In [8]:
np.unique(labels, return_counts = True)

(array([-1, 38, 40]), array([12109, 12109, 12109], dtype=int64))

In [9]:
# Cast -1 to 0, 38 to 1 and 40 to 2
labels[labels == -1] = 0
labels[labels == 38] = 1
labels[labels == 40] = 2

In [10]:
X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size = 0.25)

In [11]:
np.unique(y_train, return_counts = True)

(array([0, 1, 2]), array([9069, 9085, 9091], dtype=int64))

In [12]:
# from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [13]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test, label=y_test)

In [15]:
# Convert the data to DMatrix for xgboost

# Loop through multiple thread numbers for xgboost
start_time = time.time()
n_estimators = 100
param = {
          'max_depth' : 7,
                'eta' : 0.1,
    'min_child_weight': 3,
    'colsample_bytree': 0.7,
            'subsample': 0.8,
           'objective':'multi:softmax',
           'num_class': 3,
        }

bst = xgb.train(param,
                dtrain,
                n_estimators,
                [(dtest, 'eval'), (dtrain, 'train')],
               early_stopping_rounds = 50)
print("XGBoost (no wrapper) Time: {}s".format(time.time() - start_time))

[0]	eval-mlogloss:0.99900	train-mlogloss:0.99819
[1]	eval-mlogloss:0.91509	train-mlogloss:0.91352
[2]	eval-mlogloss:0.84313	train-mlogloss:0.84081
[3]	eval-mlogloss:0.78056	train-mlogloss:0.77753
[4]	eval-mlogloss:0.72615	train-mlogloss:0.72258
[5]	eval-mlogloss:0.67665	train-mlogloss:0.67254
[6]	eval-mlogloss:0.63298	train-mlogloss:0.62804
[7]	eval-mlogloss:0.59374	train-mlogloss:0.58788
[8]	eval-mlogloss:0.55829	train-mlogloss:0.55175
[9]	eval-mlogloss:0.52669	train-mlogloss:0.51942
[10]	eval-mlogloss:0.49880	train-mlogloss:0.49068
[11]	eval-mlogloss:0.47292	train-mlogloss:0.46453
[12]	eval-mlogloss:0.44991	train-mlogloss:0.44098
[13]	eval-mlogloss:0.42849	train-mlogloss:0.41880
[14]	eval-mlogloss:0.40880	train-mlogloss:0.39892
[15]	eval-mlogloss:0.39009	train-mlogloss:0.37992
[16]	eval-mlogloss:0.37342	train-mlogloss:0.36263
[17]	eval-mlogloss:0.35764	train-mlogloss:0.34635
[18]	eval-mlogloss:0.34336	train-mlogloss:0.33186
[19]	eval-mlogloss:0.33048	train-mlogloss:0.31842
[20]	eval-

In [16]:
preds = np.round(bst.predict(dtest) )
acc = 1. - (np.abs(preds - y_test).sum() / y_test.shape[0])
print("Acc: {}".format(acc))
print("Prediction time --- %s seconds ---" % (time.time() - start_time))

Acc: 0.9627835278573001
Prediction time --- 7208.550240516663 seconds ---


In [17]:
# make predictions for test data
y_hat = bst.predict(dtest)
print(f'LogReg accuracy on held-out frames = {round(accuracy_score(y_test, y_hat),4)}')

LogReg accuracy on held-out frames = 0.9676


In [18]:
confusion_matrix(y_test, y_hat, labels=[0, 1, 2])
target_names = ['nothing', 'up', 'down']
print(classification_report(y_test, y_hat, target_names=target_names))

              precision    recall  f1-score   support

     nothing       0.95      0.95      0.95      3040
          up       0.96      0.95      0.96      3024
        down       0.99      1.00      0.99      3018

    accuracy                           0.97      9082
   macro avg       0.97      0.97      0.97      9082
weighted avg       0.97      0.97      0.97      9082



In [19]:
pickle.dump(bst, open('Existing_Models/xgboost_dino_tuned_2.pkl', 'wb'))