In [1]:
from IPython.display import HTML

# Inject a small script plus a "Show Code." button into the notebook page.
# The script defines code_toggle(), which hides or shows every `div.input`
# element (the code cells) and flips the `code_show` flag; it is invoked once
# when the document is ready and again on every button press.
# The script uses the page's global `$` (jQuery), so it only works on front
# ends that expose jQuery.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Show Code."></form>''')

In [2]:
import dill
# Restore the interpreter state written by the dump_session call in the last
# cell of this notebook (same file name).
# NOTE(review): any name restored here can mask a missing definition further
# down, so the notebook may not survive a fresh "Restart & Run All" without
# the .db file -- confirm that is intended.
# NOTE(review): dill sessions are pickle-based; only load files you created.
dill.load_session("Pixel_Value_Raw_Distance.db")

In [3]:
import numpy as np 
from PIL import Image
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [4]:
# Image size constants (pixels per side and per flattened image).
IMAGE_WIDTH = 32
IMAGE_HEIGHT = 32
TOTAL_PIXELS = IMAGE_HEIGHT * IMAGE_WIDTH

# Dataset constants: name of the folder holding the images.
DATASET = "MPEG7"

# Testing metric constants: the k values to evaluate, both ends inclusive.
K_START_RANGE = 1
K_END_RANGE = 20
K_INCREMENT = 1

In [5]:
def load_image_from_folder(folder, size=(32, 32)):
    """Load every ``.gif`` in ``folder``, resize it and derive an integer class label.

    Files are visited in sorted filename order. The class of a file is the part
    of its name before the first ``-``; every time that prefix changes the label
    counter is incremented, so the first class gets label 1.

    Args:
        folder: Directory to scan (not recursive).
        size: ``(width, height)`` every image is resized to. Defaults to 32x32.

    Returns:
        ``(images, filenames, y)``: three parallel lists holding the resized
        images, their file names and their integer class labels.
    """
    images = []
    filenames = []
    y = []
    category, idx = None, 0
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".gif"):
            continue

        # The resized copy is independent of the source, so the file handle
        # can be released immediately instead of staying open per image.
        with Image.open(os.path.join(folder, filename)) as source:
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
            image = source.resize(size, Image.LANCZOS)

        # Compare the whole prefix: a startswith() check would treat
        # "carriage-1.gif" as a member of the preceding "car" class.
        prefix = filename.split('-')[0]
        if prefix != category:
            category = prefix
            idx += 1

        images.append(image)
        filenames.append(filename)
        y.append(idx)
    return images, filenames, y

In [6]:
def load_images(dataset):
    """Load the images of ``dataset`` from a folder of that name in the working directory.

    Args:
        dataset: Name of the sub-folder of the current working directory.

    Returns:
        The ``(images, filenames, y)`` triple produced by
        ``load_image_from_folder`` for that folder.
    """
    # os.path.join picks the platform's separator; the previous hard-coded
    # backslash only produced a valid path on Windows.
    full_path = os.path.join(os.getcwd(), dataset)
    images, filenames, y = load_image_from_folder(full_path)

    return images, filenames, y

In [7]:
from IPython.display import display

def display_images(start, stop):
    """Display the notebook-level ``images`` entries at positions ``start`` .. ``stop - 1``.

    Note: a later cell defines another ``display_images`` with a different
    signature, which replaces this one once that cell has run.
    """
    position = start
    while position < stop:
        picture = images[position]
        display(picture)
        position += 1

In [8]:
def convert_images_to_2Darr(images, y):
    """Flatten every image into one row of a 2-D array and convert the labels.

    Args:
        images: Iterable of images (anything ``np.asarray`` accepts). All
            images are expected to hold the same number of values.
        y: Sequence of labels, parallel to ``images``.

    Returns:
        ``(imgs_array, y)``: an array with one flattened image per row and the
        labels as a NumPy array.
    """
    # reshape(-1) flattens whatever size the images have instead of assuming
    # exactly 1024 (32 * 32) values per image.
    flattened = [np.asarray(image).reshape(-1) for image in images]
    imgs_array = np.array(flattened)
    y = np.array(y)
    return imgs_array, y

In [9]:
def get_label_names(filenames):
    """Return the class name of each file: the text before the first ``-``.

    Args:
        filenames: Iterable of file names.

    Returns:
        NumPy array of class names, in the same order as ``filenames``.
    """
    prefixes = [filename.split('-')[0] for filename in filenames]
    return np.array(prefixes)
    

In [10]:
from IPython.display import display
def display_images(k, query,indexes, data, query_text="", label_name="", result_text="", display_query=True):
    """Show a query image followed by a horizontal strip of its first ``k`` results.

    Args:
        k: Number of result images to show.
        query: Flattened query image; reshaped to 32x32 for display.
        indexes: Row positions into ``data``, best match first.
        data: Array of flattened images the results are taken from.
        query_text: Heading printed (bold, underlined) above the query image.
        label_name: Text printed after the heading, e.g. the query's class.
        result_text: Heading printed (bold) above the result strip; skipped
            when empty.
        display_query: When false, the query heading and image are omitted.
    """
    if display_query:
        print(color.BOLD + color.UNDERLINE + query_text + color.END + " " + label_name)
        # Pixel values are scaled by 255 before rendering
        # (presumably they are 0/1 -- TODO confirm).
        query_pixels = query.reshape((32, 32)) * 255
        display(Image.fromarray(query_pixels))

    if result_text:
        # Reset the style afterwards; without END every later print stays bold.
        print(color.BOLD + result_text + color.END)

    tiles = [data[ind].reshape((32, 32)) for ind in indexes[:k]]
    strip = np.array(tiles) * 255
    # (k, 32, 32) -> (32, k, 32) -> (32, 32 * k): the tiles end up side by side.
    strip = strip.transpose(1, 0, 2).reshape((32, -1))
    display(Image.fromarray(strip))
    
class color:
    """ANSI escape sequences used to style printed text."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every colour and attribute
    END = '\033[0m'

In [11]:
# Data preparation: each step feeds the next, so the order matters.
#Load Images
# Reads the DATASET folder; `y` holds the integer class labels.
images, filenames, y = load_images(DATASET)
#Convert to Numpy Array
# One flattened image per row of imgs_array; `y` becomes a NumPy array.
imgs_array, y = convert_images_to_2Darr(images, y)
#Convert Label Names
# Class name of every file (the text before the first '-').
label_names = get_label_names(filenames)
#Split Training Data
# 80/20 split; pixels, integer labels and class names are split together so
# the rows stay aligned. random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test, train_names, test_names = train_test_split(imgs_array, y, label_names, test_size=0.2, random_state=42)


In [12]:
import pandas as pd
def calculate_precision_and_recall(index, idx, train_names, test_names, label_amounts, k):
    """Compute average precision, recall@k and precision@k for a single query.

    The ranking is walked once, from the best match downwards. A result is
    relevant when its training label equals the query's label.

    Args:
        index: Training-set positions ordered from best to worst match.
        idx: Position of the query inside ``test_names``.
        train_names: Class name of every training item, indexed by the
            entries of ``index``.
        test_names: Class name of every query.
        label_amounts: Mapping from class name to the number of training items
            of that class (the total number of relevant results).
        k: Cut-off rank for precision@k and recall@k; must be at least 1.

    Returns:
        ``(average_precision, recall, precision_at_k)`` where
        ``average_precision`` is the mean of the precision values measured at
        the rank of each relevant result (0 when none is found), and the other
        two values are measured over the first ``k`` results.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    correct_label = test_names[idx]
    total_relevant = label_amounts[correct_label]

    precisions = []  # precision at the rank of every relevant result
    hits = 0
    hits_at_k = None

    for rank, train_idx in enumerate(index, start=1):
        if train_names[train_idx] == correct_label:
            hits += 1
            precisions.append(hits / rank)

        if rank == k:
            hits_at_k = hits

        # Nothing can change once the cut-off has been passed and every
        # relevant result has been seen.
        if hits_at_k is not None and hits >= total_relevant:
            break

    # The ranking was shorter than k: everything found counts towards the
    # cut-off metrics (previously this case raised IndexError).
    if hits_at_k is None:
        hits_at_k = hits

    precision_at_k = hits_at_k / k
    recall = hits_at_k / total_relevant

    # If empty add a zero so the average is defined
    if not precisions:
        precisions.append(0)

    average_precision = np.average(precisions)

    return average_precision, recall, precision_at_k

In [13]:
from sklearn.metrics.pairwise import euclidean_distances

def calculate_average_precisions(train_data, test_data, train_names, test_names, k=10,image_display_num=0):
    """Rank the training set for every query and collect the retrieval metrics.

    Each row of ``test_data`` is used as a query; the training rows are sorted
    by Euclidean distance to it and scored with
    ``calculate_precision_and_recall``.

    Args:
        train_data: 2-D array with one flattened training image per row.
        test_data: 2-D array with one flattened query image per row.
        train_names: Class name of every training row.
        test_names: Class name of every query row.
        k: Cut-off rank for precision@k and recall@k.
        image_display_num: Number of leading queries for which the images and
            metrics are also displayed.

    Returns:
        ``(precisions, recalls, precisions_at_k)``: three lists with one entry
        per query (average precision, recall@k, precision@k).
    """
    images_displayed = 0
    # Number of training items per class. pd.value_counts() is deprecated, so
    # the counts are taken through a Series.
    label_amounts = pd.Series(train_names).value_counts()

    precisions = []
    recalls = []
    precisions_at_k = []

    for idx, query in enumerate(test_data):
        # Rank all training images by distance to the query, closest first
        query = query.reshape((1, -1))
        D = euclidean_distances(train_data, query).squeeze()
        index = np.argsort(D)

        # Calculate metrics for the current query
        average_precision, recall, precision_at_k = calculate_precision_and_recall(index, idx, train_names, test_names, label_amounts, k)

        # Append results
        precisions.append(average_precision)
        recalls.append(recall)
        precisions_at_k.append(precision_at_k)

        if (images_displayed < image_display_num):
            # The results index into the data that was ranked (train_data),
            # not into a notebook-level global.
            display_images(k, query, index, train_data, "Query:", test_names[idx], "Results:", True)
            print("Label Amount ", label_amounts[test_names[idx]])
            print("Precision: {0:.2f}".format(average_precision))
            print("Recall@K {0:.2f}".format(recall))
            print("Precision@K {0:.2f}".format(precision_at_k))
            images_displayed += 1

    return precisions, recalls, precisions_at_k

In [None]:
# Evaluate every k from K_START_RANGE to K_END_RANGE (inclusive) and keep the
# mean of each per-query metric for that k.
all_mAPs, all_recalls_at_k, all_precisions_at_k, k_labels = [], [], [], []
for k_val in range(K_START_RANGE,K_END_RANGE+1):
    # One full pass over the test set for this k; no images are displayed.
    average_precisions, recalls_at_k, precisions_at_k = calculate_average_precisions(X_train, X_test, train_names, test_names, k=k_val, image_display_num=0)
    all_mAPs.append(np.average(average_precisions))
    all_recalls_at_k.append(np.average(recalls_at_k))
    all_precisions_at_k.append(np.average(precisions_at_k))
    k_labels.append(k_val)
    
# Overall figures: each metric averaged across all evaluated k values.
overall_mAP = np.average(all_mAPs)
overall_precision_at_k = np.average(all_precisions_at_k)
overall_recall_at_k = np.average(all_recalls_at_k)

In [None]:
from IPython.display import Markdown, display
# Headline for the report: the evaluated k range and the overall mAP.
# The <u> tag is closed inside the heading; it used to be left open, with a
# second stray <u> after </h1>.
display(Markdown("<h1><left><u>Pixel Value with Raw Distance: (k={0} to {1})</u></left></h1>".format(K_START_RANGE, K_END_RANGE)))
display(Markdown("<h2><left>Pixel Value with Raw Distance mAP: {0:.2f}</left></h2>".format(overall_mAP)))

In [None]:
import plotly.graph_objects as go

# Table of precision@k and recall@k per k, with the averages over all k in a
# final bold "Overall" row.
col1 = k_labels + ['<b>Overall</b>']  # closing tag fixed (was '<b>Overall<b>')
col2 = all_precisions_at_k + ['<b>'+str(overall_precision_at_k)+'</b>']
col3 = all_recalls_at_k + ['<b>'+str(overall_recall_at_k)+'</b>']
# NOTE(review): columnwidth lists two widths for three columns -- confirm the
# intended proportions.
fig = go.Figure(data=[go.Table(
    header=dict(values=['<b>k</b>','<b>Precision@k</b>', '<b>Recall@k</b>'],
                line_color='darkslategray',
                fill_color='#8aa1b4',
                align='left'),
    cells=dict(values=[col1, col2,col3], 
               line_color='darkslategray',
               fill_color=['#c4cfd9', '#ffffff'],
               align='left'), columnwidth=[150,800])
])

fig.update_layout(width=900, height=700, title_text="<b>Precision and Recall for Different Values of k</b>", title_x=0.5,title_y=0.91)
fig.update_layout(title_font=dict(size=20))
fig.show()

In [None]:
#Change size of cell output to avoid scrolling
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>div.output_scroll { height: 70em; }</style>"))

In [None]:
# Precision@k against recall@k, one point per evaluated k.
# Figure size and font sizes are read from rcParams when the figure, title and
# axis labels are created, so they are set before anything is drawn. Setting
# them afterwards only took effect when the cell was run a second time.
plt.rcParams["figure.figsize"] = (15,15)
plt.rcParams['axes.titlesize'] = 25
plt.rc('axes', labelsize=25)    # fontsize of the x and y labels

precision_recall_curve = plt.plot(all_recalls_at_k, all_precisions_at_k)
plt.title("Pixel Value Raw Distance (k={0}-{1})".format(K_START_RANGE, K_END_RANGE),fontweight='bold')
plt.xlabel("Recall@k",fontweight='bold')
plt.ylabel("Precision@k",fontweight='bold')
plt.xticks([0.2,0.4,0.6,0.8,1.0])
plt.yticks([0.2,0.4,0.6,0.8,1.0])
plt.tick_params(axis='both', which='major', labelsize=20)
# The target folder must already exist; savefig does not create it.
plt.savefig('../Graphs/Pixel_Value_Raw_Distance.png', facecolor='white')

plt.show()
plt.close()

In [None]:
# Rename and Store Variables to be Presented in Different Notebook
# %store persists the three names so another notebook can read them back.
raw_distance_overall_mAP = overall_mAP
raw_distance_mAPs = all_mAPs
# NOTE(review): `all_average_total_recalls` is not assigned in any cell of this
# notebook; the only recall series computed here is `all_recalls_at_k`. Unless
# the name is restored by the dill session loaded at the top, this line raises
# NameError -- confirm which variable is meant.
raw_distance_total_recalls = all_average_total_recalls

%store raw_distance_overall_mAP
%store raw_distance_mAPs
%store raw_distance_total_recalls

In [None]:
#Example with output
# NOTE(review): image_display_num=0 means no query images or per-query metrics
# are shown, so this call produces no visible output -- pass a positive number
# if an illustrated example is intended.
calculate_average_precisions(X_train, X_test, train_names, test_names, k=2, image_display_num=0)
# The trailing print() keeps the returned lists from being echoed as the cell result.
print()

In [None]:
# Save the whole interpreter session to the file that the load_session call at
# the top of the notebook reads back on the next run.
dill.dump_session("Pixel_Value_Raw_Distance.db")