In [1]:
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from pathlib import Path
from keras.utils.np_utils import to_categorical
from typing import List
import cv2 as cv
# Using SMOTE for the over sampling portion.
from imblearn.over_sampling import SMOTE
import time
%matplotlib inline

In [2]:
target_address = os.path.join(Path(os.getcwd()).parent,'Window_capture\\Data\\command_keys.npy')
# screenshot_address = os.path.join(Path(os.getcwd()).parent,'Window_capture\\Data\\screenshots.npy')
screenshot_address = os.path.join(Path(os.getcwd()).parent,'Window_capture\\Data\\screenshots.npy')

labels = np.load(target_address)
images = np.load(screenshot_address, allow_pickle = True)


print("Length Command Keys Shape: ",labels.shape)
print("Length Screenshot Shape: ",images.shape)
print("Screenshot Shape: ",images[0].shape)
print(np.unique(labels, return_counts = True))

Length Command Keys Shape:  (15673,)
Length Screenshot Shape:  (15673, 129600)
Screenshot Shape:  (129600,)
(array([-1, 38, 40]), array([12109,  1523,  2041], dtype=int64))


In [None]:
# res_list = [i for i, value in enumerate(labels) if value == -1] # Let's get rid of some -1 values.
# idx = np.random.choice(res_list, 9000, replace=False) # Randomly choose X number of entries to be deleted specified as -1
# images = pd.DataFrame(images) # flatten images then converted to dataframe for easier removal of idx
# images = np.array(images.drop(images.index[idx])) # flatten images then converted to dataframe for easier removal of idx
# labels = np.delete(labels, idx)
# print(images.shape, labels.shape)
# print(np.unique(labels, return_counts = True))

In [None]:
# Undersample using Tomek Links
# from imblearn.under_sampling import TomekLinks
# tl = TomekLinks()
# images, labels = tl.fit_resample(images, labels)

In [3]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state = 101)
images, labels = smote.fit_resample(images, labels)

In [4]:
np.unique(labels, return_counts = True)

(array([-1, 38, 40]), array([12109, 12109, 12109], dtype=int64))

In [5]:
# Cast -1 to 0, 38 to 1 and 40 to 2
labels[labels == -1] = 0
labels[labels == 38] = 1
labels[labels == 40] = 2

In [6]:
X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size = 0.25)

In [7]:
np.unique(y_train, return_counts = True)

(array([0, 1, 2]), array([9115, 9076, 9054], dtype=int64))

In [8]:
# from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [9]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test, label=y_test)

In [10]:
# Convert the data to DMatrix for xgboost

# Loop through multiple thread numbers for xgboost
start_time = time.time()
n_estimators = 50
param = {
          'max_depth' : 6,
                'eta' : 0.3,
           'objective':'multi:softmax',
           'num_class': 3,
        }

bst = xgb.train(param,
                dtrain,
                n_estimators,
                [(dtest, 'eval'), (dtrain, 'train')] )
print("XGBoost (no wrapper) Time: {}s".format(time.time() - start_time))

[0]	eval-mlogloss:0.83101	train-mlogloss:0.83198
[1]	eval-mlogloss:0.67115	train-mlogloss:0.67138
[2]	eval-mlogloss:0.55730	train-mlogloss:0.55737
[3]	eval-mlogloss:0.47466	train-mlogloss:0.47356
[4]	eval-mlogloss:0.41648	train-mlogloss:0.41445
[5]	eval-mlogloss:0.36969	train-mlogloss:0.36649
[6]	eval-mlogloss:0.33054	train-mlogloss:0.32680
[7]	eval-mlogloss:0.30137	train-mlogloss:0.29608
[8]	eval-mlogloss:0.27882	train-mlogloss:0.27214
[9]	eval-mlogloss:0.25657	train-mlogloss:0.24836
[10]	eval-mlogloss:0.23672	train-mlogloss:0.22716
[11]	eval-mlogloss:0.22169	train-mlogloss:0.21121
[12]	eval-mlogloss:0.20816	train-mlogloss:0.19602
[13]	eval-mlogloss:0.19635	train-mlogloss:0.18336
[14]	eval-mlogloss:0.18536	train-mlogloss:0.17058
[15]	eval-mlogloss:0.17552	train-mlogloss:0.16018
[16]	eval-mlogloss:0.16732	train-mlogloss:0.15020
[17]	eval-mlogloss:0.15968	train-mlogloss:0.14130
[18]	eval-mlogloss:0.15297	train-mlogloss:0.13372
[19]	eval-mlogloss:0.14682	train-mlogloss:0.12674
[20]	eval-

In [11]:
preds = np.round(bst.predict(dtest) )
acc = 1. - (np.abs(preds - y_test).sum() / y_test.shape[0])
print("Acc: {}".format(acc))
print("Prediction time --- %s seconds ---" % (time.time() - start_time))

Acc: 0.964325038537767
Prediction time --- 3808.9451112747192 seconds ---


In [12]:
# make predictions for test data
y_hat = bst.predict(dtest)
print(f'LogReg accuracy on held-out frames = {round(accuracy_score(y_test, y_hat),4)}')

LogReg accuracy on held-out frames = 0.9683


In [13]:
confusion_matrix(y_test, y_hat, labels=[0, 1, 2])
target_names = ['nothing', 'up', 'down']
print(classification_report(y_test, y_hat, target_names=target_names))

              precision    recall  f1-score   support

     nothing       0.95      0.96      0.95      2994
          up       0.97      0.95      0.96      3033
        down       0.99      1.00      0.99      3055

    accuracy                           0.97      9082
   macro avg       0.97      0.97      0.97      9082
weighted avg       0.97      0.97      0.97      9082



In [14]:
pickle.dump(bst, open('Existing_Models/xgboost_dino_SMOTE.pkl', 'wb'))