# Production Style Testing

In [1]:
import os
import json
import cv2
import string
import random
import albumentations as A
import copy
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, confusion_matrix, accuracy_score
from sklearn.naive_bayes import BernoulliNB 
import pickle
import warnings
warnings.filterwarnings('ignore') 

### Build Dataset

In [2]:
def generate_random_field(length = 0):
    length = length if length else random.randint(2, 20)
    field = ''.join(random.choices(string.ascii_letters + string.digits + string.punctuation, k=length))
    return field

transform = A.Compose([
        # A.RandomRotate90(),
        # A.Flip(),
        # A.Transpose(),
        A.ImageCompression(quality_lower=5, p=0.1),
        A.OneOf([
            # A.IAAAdditiveGaussianNoise(),
            A.GaussNoise(p=0.8,var_limit=(0,25)),
            # A.ISONoise(p=0.2,),
        ], p=0.2),
        A.OneOf([
            A.MotionBlur(p=.2),
            A.MedianBlur(blur_limit=1, p=0.1),
            A.Blur(blur_limit=1, p=0.1),
        ], p=0.25),
        A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.25, rotate_limit=15, p=0.25),
        # A.OneOf([
        #     A.OpticalDistortion(p=0.5),
        #     A.GridDistortion(p=.5),
        #     A.PiecewiseAffine(p=0.5),
        # ], p=0.5),
        A.OneOf([
            A.RandomFog(),
            A.RandomRain(),
            A.RandomSnow(),
            A.RandomSunFlare(),            
        ], p=0.1),
        # A.HueSaturationValue(p=0.01),
        # A.Rotate(66,p=0.3)
        # A.ToGray(always_apply=True)
    ])
# random.seed(42) 

fonts = [
    cv2.FONT_HERSHEY_SIMPLEX,
    cv2.FONT_HERSHEY_COMPLEX,
    cv2.FONT_HERSHEY_PLAIN,
    cv2.FONT_HERSHEY_DUPLEX,
    cv2.FONT_HERSHEY_TRIPLEX,
    cv2.FONT_HERSHEY_COMPLEX_SMALL,
    cv2.FONT_HERSHEY_SCRIPT_COMPLEX,
    cv2.FONT_HERSHEY_SCRIPT_COMPLEX,
    cv2.FONT_ITALIC]

def generate_target_dictionary():
    with open('data_dictionary.json') as data_dict:
        categories = json.load(data_dict)['target_data']
    return categories

template_directory = 'templates_img'
text_locations = json.load(open('text_locations.json', 'r'))
image_dir =  "data/"
backgrounds_dir = 'image_backgrounds'

categories = generate_target_dictionary()

def load_backgrounds():
    for filename in os.listdir(backgrounds_dir):
        backgrounds = []
        img = cv2.imread(backgrounds_dir+ '/'+filename, 1)
        backgrounds.append(img)
    return backgrounds



def agument_image(image, backgrounds, doc_info):
    background_img = backgrounds[random.randint(0, len(backgrounds))-1]
    img = image
    for loc in doc_info:
        font = random.choice(fonts)
        cv2.putText(img, generate_random_field(),
                    (int(loc['x']),int(loc['y'])), font,
                    1, (0, 0, 0), 1)

    x_size = random.randint(-150,400)
    y_size = random.randint(-150,400)
    x_size = x_size if x_size > 150 else 0
    y_size = y_size if y_size > 150 else 0
    x_offset = int(x_size/1.5)
    y_offset = int(y_size/1.5)
    background_img = cv2.resize(background_img, (850+abs(x_size), 1100+abs(y_size))) 

    background_img[y_offset:y_offset+img.shape[0], x_offset:x_offset+img.shape[1]] = img
    transformed = transform(image=cv2.resize(background_img,(460,720)))
    img = transformed['image']

    img_280 =cv2.resize(img,(280,360))
    img_200 =cv2.resize(img,(200,200))
    img_42 = cv2.resize(img,(42,65))
    
    # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img_280 = cv2.adaptiveThreshold(cv2.cvtColor(img_280, cv2.COLOR_BGR2GRAY),255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,cv2.THRESH_BINARY,11,2)
    img_200 = cv2.adaptiveThreshold(cv2.cvtColor(img_200, cv2.COLOR_BGR2GRAY),255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,cv2.THRESH_BINARY,11,2)
    img_42 = cv2.adaptiveThreshold(cv2.cvtColor(img_42, cv2.COLOR_BGR2GRAY),255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,cv2.THRESH_BINARY,11,2)
    return img_280/255, img_200/255, img_42/42

dim_size = (280*360)
dim_shape = (280,360)

def generate_dataset(batch_size):
    backgrounds = load_backgrounds()
    chunk = batch_size//6

    x_280 = np.zeros(shape=(batch_size,(280*360)))
    x_200 = np.zeros(shape=(batch_size,(200*200)))
    x_42 = np.zeros(shape=(batch_size,(42*65)))

    y = np.zeros((batch_size,))
    index = 0
    #generate our data
    for filename in text_locations:
        
        if text_locations[filename] != {}:
            image = cv2.imread(template_directory+ '/'+filename, 1)
            image = cv2.resize(image, (850, 1100)) 

            doc_info = text_locations[filename]
            for row in range(chunk):
                print(index,end='\r', flush=True)
                img_280, img_200, img_42 = agument_image(image, backgrounds, doc_info)
                x_280[index] =  np.reshape(img_280, (280*360))
                x_200[index] =  np.reshape(img_200, (200*200))
                x_42[index] =  np.reshape(img_42, (42*65))
                y[index] = categories[filename[:4]]
                index = index + 1

    return x_280, x_200, x_42, y


### Define our testing functions


In [3]:
def test_on_batch(x_280, x_200, x_42, y):
    # Models
    #nb_full_280
    # models_280
    # nb_full_200
    # models_200
    # nb_full_42
    # models_42

    def write_results(y, y_pred, heading):
        file.writelines(f'\n\n{heading}')
        file.writelines(f'\n\nData Size:\t{len(y_pred)}/{len(y_pred)}')
        file.writelines(f'\n\tAccuracy:\t{accuracy_score(y, y_pred)}')
        file.writelines(f'\n\tPrecision:\t{precision_score(y, y_pred, average="macro")}')
        file.writelines(f'\n\tRecall:\t{recall_score(y, y_pred, average="macro")}')
        file.writelines(f'\n\tF1:\t{f1_score(y, y_pred, average="macro")}\n')
        file.writelines(str(confusion_matrix(y, y_pred)))
        return True

    OVR_results = np.zeros((3,y.shape[0],6))
    Multi_results = np.zeros((3, y.shape[0],6))

    Multi_results[0] = nb_full_280.predict_proba(x_280)[:]
    for key in categories.keys():
        OVR_results[0,:,categories[key]] = models_280[key].predict_proba(x_280)[:,0]

    Multi_results[1] = nb_full_200.predict_proba(x_200)[:]
    for key in categories.keys():
        OVR_results[1,:,categories[key]] = models_200[key].predict_proba(x_200)[:,0]

    Multi_results[2] = nb_full_42.predict_proba(x_42)[:]
    for key in categories.keys():
        OVR_results[2,:,categories[key]] = models_200[key].predict_proba(x_200)[:,0]

    test = np.copy(y)
    test2 = np.copy(y)
    test3 = np.copy(y)

    nb_full_280_choice = np.copy(y)
    models_280_choice = np.copy(y)
    nb_full_200_choice = np.copy(y)
    models_200_choice = np.copy(y)
    nb_full_42_choice = np.copy(y)
    models_42_choice = np.copy(y)
    voting_sum = np.array((y.shape[0],6))
    voting_results = np.copy(y)

    for  i in range(y.shape[0]):
        # print(i)
        # added_results[i,:] = results[i,:] + (np.absolute(nb_full_results[i,:]-1)/1.0e+200)
        nb_full_280_choice[i] = np.where(Multi_results[0, i,:] == np.amax(Multi_results[0,i,:].reshape(6)))[0]
        models_280_choice_ = np.where(OVR_results[0, i,:] == np.amin(OVR_results[0,i,:].reshape(6)))[0]
        nb_full_200_choice[i] = np.where(Multi_results[1, i,:] == np.amax(Multi_results[1,i,:].reshape(6)))[0]
        models_200_choice_ = np.where(OVR_results[1, i,:] == np.amin(OVR_results[1,i,:].reshape(6)))[0]
        nb_full_42_choice[i] = np.where(Multi_results[2, i,:] == np.amax(Multi_results[2,i,:].reshape(6)))[0]
        models_42_choice_ = np.where(OVR_results[2, i,:] == np.amin(OVR_results[2,i,:].reshape(6)))[0]

        # print(Multi_results[:,i,:])
        # print(np.sum(Multi_results[:,i,:], axis=1))
        # print(np.sum(Multi_results[:,i,:], axis=0))
        Multi_results[0,i,:] *= 8
        Multi_results[1,i,:] *= 6
        Multi_results[2,i,:] *= 4
        OVR_results[0,i,:] = np.abs((OVR_results[0,i,:] -1)*6)
        OVR_results[1,i,:] = np.abs((OVR_results[1,i,:] -1)*1.5)
        OVR_results[2,i,:] = np.abs((OVR_results[2,i,:] -1)*1)
        voting_sum =  np.sum(Multi_results[:,i,:], axis=0)+ np.sum((OVR_results[:,i,:]), axis=0)
        # print(voting_sum)
        # print(voting_results[i])
        vote_max = np.where(voting_sum == np.amax(voting_sum.reshape(6)))[0]
        if len(vote_max)>1:
            vote_max = 6
        else:
            vote_max = vote_max[0]

        voting_results[i] = vote_max


        # max = np.where(results[i,:] == np.amin(results[i,:].reshape(6)))[0]
        # max2 = np.where(added_results[i,:] == np.amin(added_results[i,:].reshape(6)))[0]
        # max3 = np.where(nb_full_results[i,:]== np.amax(nb_full_results[i,:].reshape(6)))[0]
        choices = [models_280_choice_, models_200_choice_, models_42_choice_]
        for x in range(len(choices)):
            if len(choices[x])>1:
                choices[x] = 6
            else:
                choices[x] = choices[x][0]
            # print(choices)
        # print(models_280_choice_)
        models_280_choice[i] = choices[0]
        models_200_choice[i] = choices[1]
        models_42_choice[i] = choices[2]

    with open("test_log.txt", "a") as file:
        
        file.writelines('\n\n\n')
        file.writelines('*|'*50)
        file.writelines('\n')
        file.writelines('*|'*50)
        file.writelines('\n')
        file.writelines('*|'*50)
        file.writelines('\n\n')

        y_pred = models_42_choice
        mask = y_pred != 6
        write_results(y, y_pred, heading='Base OVR 42 Features')
        write_results(y[mask], y_pred[mask], heading='Filtered OVR 42 Features')
        write_results(y, nb_full_42_choice, heading='Multiclass 42 Features')

        file.writelines('\n\n')
        file.writelines("==="*20)

        y_pred = models_200_choice
        mask = y_pred != 6
        write_results(y, y_pred, heading='Base OVR 200 Features')
        write_results(y[mask], y_pred[mask], heading='Filtered OVR 200 Features')
        write_results(y, nb_full_200_choice, heading='Multiclass 200 Features')

        file.writelines('\n\n')
        file.writelines("==="*20)

        y_pred = models_280_choice
        mask = y_pred != 6
        write_results(y, y_pred, heading='Base OVR 280 Features')
        write_results(y[mask], y_pred[mask], heading='Filtered OVR 280 Features')
        write_results(y, nb_full_280_choice, heading='Multiclass 280 Features')

        file.writelines('\n\n')
        file.writelines("==="*20)

        write_results(y, voting_results, heading='Voting Ensemble')

        

### Load our Models

In [4]:
nb_full_280 =  pickle.load(open('models/MulticlassModel_280x360', 'rb'))
models_280 =  pickle.load(open('models/EnsembleModels_280x360', 'rb'))
nb_full_200 =  pickle.load(open('models/MulticlassModel_200x200', 'rb'))
models_200 =  pickle.load(open('models/EnsembleModels_200x200', 'rb'))
nb_full_42 =  pickle.load(open('models/MulticlassModel_42x65', 'rb'))
models_42 =  pickle.load(open('models/EnsembleModels_42x65', 'rb'))

In [5]:
x_280, x_200, x_42, y = generate_dataset(2400)

2399

In [6]:
test_on_batch(x_280, x_200, x_42, y)

ValueError: X has 40000 features, but BernoulliNB is expecting 9350 features as input.