In [1]:
# Task 2

import os
import json
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

In [3]:
# Per-video accumulators: every entry is one DataFrame holding the (at most)
# 400 highest-confidence STIPs of a single video. Plain lists are used because
# growing a DataFrame one video at a time is not ideal.
target_data, non_target_data = [], []

def process_STIPs_in_folder(folder_path):
    """Collect the strongest STIPs of every file found under ``folder_path``.

    Walks the directory tree, parses each file as a tab-separated STIP dump
    (no header row, lines starting with ``#`` are comments) and keeps the 400
    rows with the largest value in column 6 (detector-confidence).

    The selected rows are appended to the module-level ``target_data`` list
    when ``folder_path`` equals ``STIP_target_path`` and to
    ``non_target_data`` when it equals ``STIP_non_target_path``; for any other
    folder they are discarded.

    Files for which pandas raises ``EmptyDataError`` are reported on stdout
    and skipped.
    """
    for dirpath, _subdirs, filenames in os.walk(folder_path):
        for filename in filenames:
            # File location relative to the folder being processed
            relative_name = os.path.relpath(os.path.join(dirpath, filename), folder_path)
            stip_path = folder_path + "/" + relative_name

            try:
                stips = pd.read_csv(stip_path, sep="\t", comment='#', header=None)
            except pd.errors.EmptyDataError:
                # Empty STIP file: nothing to keep for this video
                print("no data in ", stip_path)
                continue

            # 400 rows with the highest detector-confidence (column index 6)
            strongest = stips.nlargest(400, 6)
            if folder_path == STIP_target_path:
                target_data.append(strongest)
            elif folder_path == STIP_non_target_path:
                non_target_data.append(strongest)

# Root folders holding the STIP files of the target / non-target videos
STIP_target_path = "STIP/target_videos"
STIP_non_target_path = "STIP/non_target_videos"

# Target videos are processed first, then the non-target ones
for stip_root in (STIP_target_path, STIP_non_target_path):
    process_STIPs_in_folder(stip_root)

no data in  STIP/non_target_videos/smile\show_your_smile_-)_smile_h_nm_np1_fr_med_0.avi.txt
no data in  STIP/non_target_videos/somersault\LONGESTYARD_somersault_f_cm_np1_le_bad_27.avi.txt
no data in  STIP/non_target_videos/stand\IndianaJonesandTheTempleofDoom_stand_f_nm_np1_ri_med_3.avi.txt
no data in  STIP/non_target_videos/walk\TrumanShow_walk_f_nm_np1_fr_med_23.avi.txt


In [5]:
# Stack the per-video frames into one DataFrame per class; rows are renumbered
# from 0 instead of keeping each video's own row index
target_df = pd.concat(objs=target_data, axis=0, ignore_index=True)
non_target_df = pd.concat(objs=non_target_data, axis=0, ignore_index=True)

In [24]:
# The provided STIP format yields a trailing column that is entirely empty.
# Drop only columns in which every value is missing: a plain dropna(axis=1)
# would also discard any descriptor column containing a single NaN, and the
# column labels assigned afterwards would then no longer line up.
target_df = target_df.dropna(axis=1, how="all")
non_target_df = non_target_df.dropna(axis=1, how="all")

In [3]:
# 7 point-description fields, followed by the 72 HoG and the 90 HoF descriptor bins
column_labels = ["point-type", "x", "y", "t", "sigma2", "tau2", "detector-confidence"]
column_labels += ["dscr-hog ({})".format(bin_no) for bin_no in range(1, 73)]
column_labels += ["dscr-hof ({})".format(bin_no) for bin_no in range(1, 91)]

In [30]:
# Give both frames the same column labels (target first, then non-target)
target_df.columns = column_labels
non_target_df.columns = column_labels

In [5]:
# Cache both frames on disk (without the row index) so that the cells above do
# not have to be run again every time
target_df.to_csv(path_or_buf='STIPs_target.csv', index=False)
non_target_df.to_csv(path_or_buf='STIPs_non_target.csv', index=False)

In [5]:
# Reload the cached frames from disk instead of running the cells above again
target_df = pd.read_csv(filepath_or_buffer="STIPs_target.csv")
non_target_df = pd.read_csv(filepath_or_buffer="STIPs_non_target.csv")

In [19]:
# Task 2a

# Cluster representatives learned from the non-target videos, one entry per
# (tau, sigma) pair; the key is the string form of that pair
nt_centers_HOG, nt_centers_HOF = {}, {}

# tau2 / sigma2 values for which cluster representatives are computed
tau_values = [2, 4]
sigma_values = [2 ** exponent for exponent in range(2, 8)]  # 4, 8, 16, 32, 64, 128

In [21]:
# Fixed seed: without it both the sampled STIPs and the k-means initialisation
# change on every run, so the exported cluster representatives would not be
# reproducible.
RANDOM_SEED = 42
STIPS_PER_SCALE = 10000  # STIPs sampled for each (tau2, sigma2) pair
N_CLUSTERS = 40          # cluster representatives per descriptor and pair

for t in tau_values:
    for s in sigma_values:
        # All non-target STIPs detected at this (tau2, sigma2) scale
        at_scale = non_target_df[(non_target_df["tau2"] == t) & (non_target_df["sigma2"] == s)]

        # Sample without replacement; cap the size at the number of available
        # rows because sample() raises ValueError when asked for more.
        filtered_df = at_scale.sample(
            n=min(STIPS_PER_SCALE, len(at_scale)),
            random_state=RANDOM_SEED,
        )

        # Take the HoG and HoF features
        X_HOG = filtered_df.loc[:, "dscr-hog (1)":"dscr-hog (72)"]
        X_HOF = filtered_df.loc[:, "dscr-hof (1)":"dscr-hof (90)"]

        kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_SEED)

        # The same estimator is refitted for each descriptor; the centres are
        # copied out with tolist() before the next fit replaces them.
        kmeans.fit(X_HOG)
        nt_centers_HOG[str((t, s))] = kmeans.cluster_centers_.tolist()

        kmeans.fit(X_HOF)
        nt_centers_HOF[str((t, s))] = kmeans.cluster_centers_.tolist()

In [43]:
# Persist both sets of cluster representatives as JSON for Task 5
with open("HoG_cluster_representatives.json", "w") as hog_file:
    hog_file.write(json.dumps(nt_centers_HOG))
with open("HoF_cluster_representatives.json", "w") as hof_file:
    hof_file.write(json.dumps(nt_centers_HOF))

In [25]:
# Task 2b, 2c

# Path of the STIP file to describe, entered interactively. A prompt is shown
# so the cell does not look hung, and surrounding whitespace (e.g. from a
# copy-pasted path) is removed so that it cannot make the file lookup fail.
video_name = input("Path to the STIP file of the query video: ").strip()
# video_name = "STIP/non_target_videos/catch/Ball_hochwerfen_-_Rolle_-_Ball_fangen_(Timo_3)_catch_f_cm_np1_ri_med_1.avi.txt"

In [27]:
# Parse the tab-separated STIP file (no header row, '#' lines are comments) and
# keep only the 400 rows with the largest detector-confidence (column index 6)
df = pd.read_csv(video_name, sep="\t", comment='#', header=None).nlargest(400, 6)

In [29]:
# The provided STIP format yields a trailing column that is entirely empty.
# Drop only columns in which every value is missing: a plain dropna(axis=1)
# would also discard any descriptor column containing a single NaN, and the
# label assignment below would then fail or mislabel the remaining columns.
df = df.dropna(axis=1, how="all")
df.columns = column_labels

In [31]:
def find_closest_clusters(x, y):
    """Assign every row of ``x`` to its nearest row of ``y``.

    Parameters
    ----------
    x : array-like
        One descriptor per row.
    y : array-like
        One cluster representative per row.

    Returns
    -------
    numpy.ndarray
        For each row of ``x``, the index of the row of ``y`` at the smallest
        Euclidean distance. Empty when ``x`` has no rows.
    """
    # A video may have no STIP at a given scale; the pairwise distance routine
    # rejects zero-row input, so answer with "no assignments" instead.
    if len(x) == 0:
        return np.empty(0, dtype=np.intp)

    distances = euclidean_distances(x, y)
    return np.argmin(distances, axis=1)

In [35]:
# One histogram per (tau, sigma) pair is collected in each list
hog_histograms, hof_histograms = [], []

In [37]:
# Start from empty lists so that re-running this cell rebuilds the histograms
# instead of appending a second copy after those of the previous run.
hog_histograms = []
hof_histograms = []

for t in tau_values:
    for s in sigma_values:
        # Cluster representatives learned for this (tau, sigma) pair
        scale_key = str((t, s))
        hog_centers = nt_centers_HOG[scale_key]
        hof_centers = nt_centers_HOF[scale_key]

        # STIPs of the query video detected at this scale
        filtered_df = df[(df["tau2"] == t) & (df["sigma2"] == s)]

        if filtered_df.empty:
            # The video has no STIP at this scale among the ones kept. The
            # distance computation rejects empty input, and the correct
            # contribution is an all-zero histogram.
            hog_cluster_Id = np.empty(0, dtype=np.intp)
            hof_cluster_Id = np.empty(0, dtype=np.intp)
        else:
            hog_cluster_Id = find_closest_clusters(filtered_df.loc[:, "dscr-hog (1)":"dscr-hog (72)"], hog_centers)
            hof_cluster_Id = find_closest_clusters(filtered_df.loc[:, "dscr-hof (1)":"dscr-hof (90)"], hof_centers)

        # One unit-wide bin per cluster id: edges 0, 1, ..., number of clusters
        hog_histogram, bin_edges = np.histogram(hog_cluster_Id, bins=np.arange(len(hog_centers) + 1))
        hof_histogram, bin_edges = np.histogram(hof_cluster_Id, bins=np.arange(len(hof_centers) + 1))

        hog_histograms.append(hog_histogram)
        hof_histograms.append(hof_histogram)

In [39]:
# Result of Task 2b and 2c: the per-scale histograms joined end to end into one
# Bag of Features vector per descriptor (12 (tau, sigma) pairs x 40 bins = 480
# dimensions)
bof_HOG_descriptor = np.hstack(hog_histograms)
bof_HOF_descriptor = np.hstack(hof_histograms)