In [1]:
#Task 4 - Task 2 sub portion

import os
import json
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

In [3]:
# Column names for one STIP row: seven leading fields followed by the
# 72 HoG and 90 HoF descriptor components.
column_labels = ["point-type", "x", "y", "t", "sigma2", "tau2", "detector-confidence"]
column_labels += [f"dscr-hog ({n})" for n in range(1, 73)]
column_labels += [f"dscr-hof ({n})" for n in range(1, 91)]

# Candidate tau2 / sigma2 values; every (tau2, sigma2) combination is visited.
tau_values = [2, 4]
sigma_values = [4, 8, 16, 32, 64, 128]

# Load the cluster representatives calculated during Task 2
# (one JSON file for HoG, one for HoF).
with open('HoG_cluster_representatives.json', 'r') as fp:
    nt_centers_HOG = json.loads(fp.read())

with open('HoF_cluster_representatives.json', 'r') as fp:
    nt_centers_HOF = json.loads(fp.read())

In [5]:
def find_closest_clusters(x, y):
    """Return, for each row of ``x``, the index of its nearest row in ``y``.

    Distances are Euclidean: ``x`` holds the STIP descriptors and ``y`` the
    cluster representatives, so the result has one cluster index per STIP.
    """
    return euclidean_distances(x, y).argmin(axis=1)

In [9]:
def get_bof(folder_path, num_clusters=40):
    """Build and export bag-of-features histograms for every STIP file under ``folder_path``.

    For each file, the 400 rows with the highest detector confidence are kept
    and split by (tau2, sigma2) pair. Within each pair, every HoG / HoF
    descriptor is assigned to its closest cluster representative and the
    assignments are counted into a ``num_clusters``-bin histogram. A pair with
    no rows contributes an all-zero histogram so that every video yields a
    vector of the same length, ``len(tau_values) * len(sigma_values) * num_clusters``.

    The concatenated histograms are written to ``Task-4/<relative path>.json``
    with the keys ``"hog"`` and ``"hof"``.

    Parameters
    ----------
    folder_path : str
        Root folder that is walked recursively for STIP files.
    num_clusters : int, optional
        Number of histogram bins per (tau2, sigma2) pair. Defaults to 40.

    Returns
    -------
    None
        Results are written to disk; files without data are reported and skipped.
    """
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            video_file = os.path.join(root, file)
            video_name = os.path.relpath(video_file, folder_path)

            # STIP files are tab separated, with comments and a header that is
            # not formatted for a Pandas DataFrame.
            try:
                df = pd.read_csv(video_file, sep="\t", comment='#', header=None)
            except pd.errors.EmptyDataError:
                print("no data in ", video_name)
                continue

            # Select the 400 highest samples sorted by column index 6 (detector-confidence).
            df = df.nlargest(400, 6)

            # Drop the last column as it is an empty column (due to how the STIP data is formatted).
            df = df.dropna(axis=1)
            df.columns = column_labels

            hog_histograms = []
            hof_histograms = []
            # num_clusters + 1 edges -> num_clusters bins, one per cluster index.
            bin_edges = np.arange(num_clusters + 1)

            for t in tau_values:
                for s in sigma_values:
                    filtered_df = df[(df["tau2"] == t) & (df["sigma2"] == s)]

                    if filtered_df.shape[0] != 0:
                        key = str((t, s))
                        hog_cluster_Id = find_closest_clusters(
                            filtered_df.loc[:, "dscr-hog (1)":"dscr-hog (72)"],
                            nt_centers_HOG[key],
                        )
                        hof_cluster_Id = find_closest_clusters(
                            filtered_df.loc[:, "dscr-hof (1)":"dscr-hof (90)"],
                            nt_centers_HOF[key],
                        )
                        hog_histogram, _ = np.histogram(hog_cluster_Id, bins=bin_edges)
                        hof_histogram, _ = np.histogram(hof_cluster_Id, bins=bin_edges)
                    else:
                        # A STIP file may not have any rows for a given
                        # (tau2, sigma2) pair: pad with zeros so the pair still
                        # occupies its slot. Integer dtype keeps the
                        # concatenated result integral.
                        hog_histogram = np.zeros(num_clusters, dtype=np.intp)
                        hof_histogram = np.zeros(num_clusters, dtype=np.intp)

                    hog_histograms.append(hog_histogram)
                    hof_histograms.append(hof_histogram)

            bof = {
                "hog": np.concatenate(hog_histograms).tolist(),
                "hof": np.concatenate(hof_histograms).tolist(),
            }

            # Export the HoG / HoF bag-of-features of each video as a separate
            # JSON file to compare with the sample query in Task 5.
            # video_name may contain sub-folders, so create them as needed.
            output_path = os.path.join("Task-4", video_name + ".json")
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            with open(output_path, "w") as outfile:
                json.dump(bof, outfile, indent=4)

# Run the bag-of-features export over every STIP file found under the
# target-videos folder (one JSON file is written per STIP file).
STIP_target_path = "STIP/target_videos"
get_bof(STIP_target_path)