In [1]:
import setGPU

import glob
import os
import zipfile
import json
import shutil
import re
import cv2
import numpy as np
import scipy.io as sio
from scipy import spatial
import msgpack

from collections import namedtuple

#from tqdm.auto import tqdm

from efficientnet.tfkeras import EfficientNetB5
from efficientnet.tfkeras import center_crop_and_resize, preprocess_input

import tensorflow as tf

import tensorflow_hub as hub

setGPU: Setting GPU to: 0


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
TESTCASES = '/mnt/data/testCompressed'

In [3]:
def parse_result(res):
    """parses a standard timestamp result
    
    Acceptable input formats are (as strings):
        <title_id>_<segment>_<timestamp>
        <title_id>_<timestamp>
        
    Output (as ints):
        (title_id, timestmp)
    """
    if len(res) != 1:
        return None, None
    parts = res[0].split('_')
    
    if len(parts) == 3:
        title_id, _, timestamp = res[0].split('_')
        return int(title_id), int(timestamp)
    elif len(parts) == 2:
        title_id, timestamp = res[0].split('_')
        return int(title_id), int(timestamp)
    return None, None

In [4]:
def sorted_alphanumeric(data):
    """Sort a list of strings alphanumerically so that '9' comes before '10', etc..."""
    convert = lambda text: int(text) if text.isdigit() else text.lower()
    alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] 
    return sorted(data, key=alphanum_key)

In [5]:
def load_images(img_files, resize=None, debug=False):
    """loads one or more images into a 4D numpy array
        
    Parameters:
    
        img_files - a list of paths to images
        resize - optional tuple describing the output size of each image
        debug - optional flag to display images inline in the notebook
        
    Returns:
    
        a 4D numpy array of RGB images with pixels normalized between 0.0 and 1.0 
    """
    images = []
    
    for img_file in img_files:
        image = cv2.imread(img_file)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        """
        if resize != None:
            image = cv2.resize(image, (resize[0], resize[1]))
        
        if debug:
            plt.figure()
            plt.title(img_file)
            plt.imshow(image / 255.0)
            plt.show()
            
        image = image / 255.0"""
            
        images.append(image)
        
    return images

In [6]:
# loading pretrained model
model = EfficientNetB5(weights='imagenet')
interModel = tf.keras.Model(inputs=model.input, outputs=model.get_layer('avg_pool').output)

Instructions for updating:
`normal` is a deprecated alias for `truncated_normal`
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [7]:
def extract_fv_tvj(image):
    image_size = model.input_shape[1]
    x = center_crop_and_resize(image, image_size=image_size)
    x = preprocess_input(x)
    x = np.expand_dims(x, 0)
    fvs = interModel.predict(x)
    
    return fvs

In [6]:
IMAGE_SHAPE = (456, 456)
feature_extractor = tf.keras.Sequential([
    hub.KerasLayer('https://tfhub.dev/google/efficientnet/b5/feature-vector/1', input_shape=IMAGE_SHAPE+(3,))
])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor








In [7]:
def extract_fv(image):
    x = cv2.resize(image, IMAGE_SHAPE)
    x = x/255.0
    x = np.expand_dims(x, 0)
    
    result = feature_extractor.predict(x)
    
    return result

In [8]:
def signedmag(a):
    pos = a[np.where(a>0)]
    neg = a[np.where(a<0)]
    return np.linalg.norm(pos) - np.linalg.norm(neg)

def combine_vectors(a):
    return np.apply_along_axis(signedmag, 0, a)

In [9]:
def load_testcases():
    """Loads all test cases and generates combined feature vectors for each one.
    
    Parameters:
    
        None
        
    Returns:
    
        a list of tuples containing (title id, timestamp, combined feature vector)
    
    """
    test_zips = glob.glob(os.path.join(TESTCASES, '*.zip'))
    tmp_dir = os.path.join(TESTCASES, 'tmp-zip')

    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        
    all_tests = []
    
    for test_zip in (test_zips):
        
        for d in glob.glob(os.path.join(tmp_dir, '*')):
            shutil.rmtree(d)
            
        zip_ref = zipfile.ZipFile(test_zip, 'r')
        zip_ref.extractall(tmp_dir)
        zip_ref.close()
        
        retDataFile = glob.glob(os.path.join(tmp_dir, '*/retrievaldata.json'))[0]
        with open(retDataFile) as f:
            retData = json.loads(f.read())
            
        title_id, timestamp = parse_result(retData['response']['Results'])
        
        fvs = []
        
        imgFiles = sorted_alphanumeric(glob.glob(os.path.join(tmp_dir, '*/*.jpg')))
        images = load_images(imgFiles)
        
        for image in images:
            fvs.append(extract_fv(image))
            
        combined_fv = combine_vectors(np.array(fvs))
        
        all_tests.append((title_id, timestamp, combined_fv))
        
    shutil.rmtree(tmp_dir, ignore_errors=True)
    
    return all_tests

In [10]:
all_tests = load_testcases()

In [11]:
#def read_all_combined_fvs(identifier, files=[]):
def read_all_combined_fvs(files=[]):
    """Reads all combined feature vectors for a particular model
    
    Parameters:
    
        identifier - string used to identify the model
    
    Returns:
    
        a dictionary where the keys are the title IDs and the values are lists of tuples containing
        the timestamp and the feature vectors for the timestamp
        
        i.e.
        
        title_fvs[title_id] = [(timestamp0, [fv0, fv1, fv2]), (timstamp1, [fv3])]
    
    """

    if files:
        all_packs = files
    #else:
    #    all_packs = glob.glob(os.path.join(FEATURES, '*/*{}*.pack'.format(identifier)))
    
    title_fvs = {}

    for i, pack in enumerate(all_packs):
        
        ext = os.path.splitext(pack)
        print(ext)
        if ext[1] == '.pack':
            # Original functionality
            with open(pack, mode='rb') as f:
                data = msgpack.load(f)
            
            ts_fv = [(data[b'Timestamp'], np.array(data[b'CombinedFV']))]
    
            titleID = data[b'TitleID']

        elif ext[1] == '.data':
            # Legacy title data passed in
            try:
                titleID = int(os.path.splitext(os.path.split(pack)[-1])[0].split('_')[-1])
            except Exception as e:
                print('Could not determine titleID for {} -- Skipping'.format(pack))
                print(e)
                continue

            with open(pack, mode='rb') as f:
                data = msgpack.load(f)
            
            ts_fv = [(d[b'Timestamp'], combine_vectors(d[b'FeatureVectors'])) for d in data]
    
        elif ext[1] == '.zip':
            # Zipped title data passed in
            try:
                titleID = int(os.path.splitext(os.path.split(pack)[-1])[0].split('_')[-1])
            except Exception as e:
                print('Could not determine titleID for {} -- Skipping'.format(pack))
                print(e)
                continue

            ts_fv = []
                
            archive = zipfile.ZipFile(pack, 'r')
            for fn in archive.namelist():
                b = archive.read(fn)
                data = msgpack.unpackb(b)
                ts_fv.append((data[b'Timestamp'], combine_vectors(data[b'FeatureVectors'])))
            
        else:
            print('Skipping unknown file type: {}'.format(pack))
            continue
            
    
        fvs = title_fvs.get(titleID, [])
        fvs.extend(ts_fv)
    
        title_fvs[titleID] = fvs
        
    return title_fvs

In [12]:
title_files = glob.glob(os.path.join('/mnt/data/features', '*.data')) + glob.glob(os.path.join('/mnt/data/features', '*.zip'))
#title_index = read_all_combined_fvs(MODEL_IDENTIFIER, title_files)
title_index = read_all_combined_fvs(title_files)

('/mnt/data/features/EfficientNet-B5_112206', '.data')
('/mnt/data/features/EfficientNet-B5_28359', '.data')
('/mnt/data/features/EfficientNet-B5_28374', '.data')
('/mnt/data/features/EfficientNet-B5_112203', '.data')
('/mnt/data/features/EfficientNet-B5_28377', '.data')
('/mnt/data/features/EfficientNet-B5_81676', '.data')
('/mnt/data/features/EfficientNet-B5_112200', '.data')
('/mnt/data/features/EfficientNet-B5_28356', '.data')
('/mnt/data/features/EfficientNet-B5_81677', '.data')
('/mnt/data/features/EfficientNet-B5_28367', '.data')
('/mnt/data/features/EfficientNet-B5_112207', '.data')
('/mnt/data/features/EfficientNet-B5_28358', '.data')
('/mnt/data/features/EfficientNet-B5_81672', '.data')
('/mnt/data/features/EfficientNet-B5_28373', '.data')
('/mnt/data/features/EfficientNet-B5_28378', '.data')
('/mnt/data/features/EfficientNet-B5_112220', '.data')
('/mnt/data/features/EfficientNet-B5_81675', '.data')
('/mnt/data/features/EfficientNet-B5_28368', '.data')
('/mnt/data/features/Ef

In [34]:
def read_title_fvs(titlesPath):
    title_fvs = {}
    
    for title_name in os.listdir(titlesPath):
        titlePath=os.path.join(titlesPath, title_name)
        matFile=title_name+'AggregateENB5.mat'
        matFilePath=os.path.join(titlePath, matFile)
        
        data = sio.loadmat(matFilePath)
        fvs = data['aggregateFvs']
        
        ts_fv = []
        for rowInd in range(fvs.shape[0]):
            row = fvs[rowInd,:]
            ts_fv.append((rowInd, row))
        
        title_fvs[int(title_name)] = ts_fv
        
    return title_fvs

In [35]:
titlesPath='../../../data/movie_titles/all_titles_uncompressed/'
title_index = read_title_fvs(titlesPath)

In [13]:
Test = namedtuple('Test', 'title timestamp')
Result = namedtuple('Result', 'title timestamp distance')

In [14]:
def run_brute_force_regression_suite(index, tests):
    """Compares each test case against every feature vector in the index to determine the top 5 best matches
    
    Parameters:
    
        index - dictionary of feature vectors by title, as returned by `read_all_combined_fvs`
        tests - list of test cases, as returned by `load_testcases`
        
    Returns:
    
        a dictionary where the keys is a named tuple (title, timestamp) for the test case and the value is the top 5 matches from the index
    """    
    # make sure we only run tests for titles that are indexed
    filtered_tests = [t for t in all_tests if t[0] in index]
    
    results = {}
    
    for i, test in enumerate(filtered_tests):
        all_results = [Result(titleid, ts, spatial.distance.cosine(test[2], fv)) for titleid in index for (ts, fv) in index[titleid]]

        results[Test(test[0], test[1])] = sorted(all_results, key=lambda x: x[2])[:5]
        
    return results

In [15]:
results = run_brute_force_regression_suite(title_index, all_tests)

In [16]:
def summarize_results(results):
    
    top1_title_match = 0
    top1_exact = 0
    top1_within_01 = 0
    top1_within_02 = 0
    top1_within_05 = 0
    top1_within_10 = 0
    top1_within_30 = 0
    top1_within_60 = 0
    top1_exact_match = 0

    top5_title_match = 0
    top5_exact = 0
    top5_within_01 = 0
    top5_within_02 = 0
    top5_within_05 = 0
    top5_within_10 = 0
    top5_within_30 = 0
    top5_within_60 = 0
    
    for test, matches in results.items():
        e_title = test.title
        if matches[0].title != test.title:
            continue
        
        top1_title_match += 1
    
        diff = abs(matches[0].timestamp - test.timestamp)
    
        if diff <= 60:
            top1_within_60 += 1
        
        if diff <= 30:
            top1_within_30 += 1
        
        if diff <= 10:
            top1_within_10 += 1
        
        if diff <= 5:
            top1_within_05 += 1
        
        if diff <= 2:
            top1_within_02 += 1
        

        if diff <= 1:
            top1_within_01 += 1
        

        if diff == 0:
            top1_exact_match += 1
        
        top5 = sorted([abs(m.timestamp - test.timestamp) for m in matches if m.title == test.title])
    
        if len(top5) == 0:
            continue
        
        top5_title_match += 1
    
        best = top5[0]
    
        if best <= 60:
            top5_within_60 += 1

        if best <= 30:
            top5_within_30 += 1

        if best <= 10:
            top5_within_10 += 1
        
        if best <= 5:
            top5_within_05 += 1
        
        if best <= 2:
            top5_within_02 += 1  
        
        if best <= 1:
            top5_within_01 += 1  

        if best == 0:
            top5_exact += 1
        
    print('Top1 Title Match: {}%'.format(100 * top1_title_match / len(results)))
    print('Top1 Exact Match: {}%'.format(100 * top1_exact / len(results)))
    print('Top1 Within 01:   {}%'.format(100 * top1_within_01 / len(results)))
    print('Top1 Within 02:   {}%'.format(100 * top1_within_02 / len(results)))
    print('Top1 Within 05:   {}%'.format(100 * top1_within_05 / len(results)))
    print('Top1 Within 10:   {}%'.format(100 * top1_within_10 / len(results)))
    print('Top1 Within 30:   {}%'.format(100 * top1_within_30 / len(results)))
    print('Top1 Within 60:   {}%'.format(100 * top1_within_60 / len(results)))
    print('----')
    print('Top5 Title Match: {}%'.format(100 * top5_title_match / len(results)))
    print('Top5 Exact Match: {}%'.format(100 * top5_exact / len(results)))
    print('Top5 Within 01:   {}%'.format(100 * top5_within_01 / len(results)))
    print('Top5 Within 02:   {}%'.format(100 * top5_within_02 / len(results)))
    print('Top5 Within 05:   {}%'.format(100 * top5_within_05 / len(results)))
    print('Top5 Within 10:   {}%'.format(100 * top5_within_10 / len(results)))
    print('Top5 Within 30:   {}%'.format(100 * top5_within_30 / len(results)))
    print('Top5 Within 60:   {}%'.format(100 * top5_within_60 / len(results)))

In [46]:
summarize_results(results)

Top1 Title Match: 96.15384615384616%
Top1 Exact Match: 0.0%
Top1 Within 01:   75.64102564102564%
Top1 Within 02:   78.2051282051282%
Top1 Within 05:   84.61538461538461%
Top1 Within 10:   89.74358974358974%
Top1 Within 30:   93.58974358974359%
Top1 Within 60:   94.87179487179488%
----
Top5 Title Match: 96.15384615384616%
Top5 Exact Match: 82.05128205128206%
Top5 Within 01:   88.46153846153847%
Top5 Within 02:   91.02564102564102%
Top5 Within 05:   92.3076923076923%
Top5 Within 10:   93.58974358974359%
Top5 Within 30:   94.87179487179488%
Top5 Within 60:   96.15384615384616%
