In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Directory that is scanned for cluster files and that receives the generated
# CSVs. Set the BM_GPU_CLUSTER_PATH environment variable to use a different
# location; otherwise the original hard-coded path is used.
CLUSTER_PATH = os.environ.get(
    "BM_GPU_CLUSTER_PATH",
    "/Users/lukefoster/Documents/CompEtho/BM_GPU/results/cluster",
)

In [3]:
# Full path of every entry in CLUSTER_PATH. os.listdir returns names in
# arbitrary order, so they are sorted to give a reproducible file order
# (and therefore a reproducible row order in the CSVs built from this list).
all_clusters = [f"{CLUSTER_PATH}/{name}" for name in sorted(os.listdir(CLUSTER_PATH))]

In [4]:
# largest cluster number present anywhere in the cluster data
def find_max(data):
    """Return the largest label found across all entries of ``data`` as an int.

    ``data`` maps a name to an iterable of cluster labels. The maximum of each
    entry is taken first, then the maximum of those, truncated with ``int``.
    """
    per_entry_peaks = [max(labels) for labels in data.values()]
    return int(max(per_entry_peaks))

In [5]:
# create a csv of normalized cluster data
def create_csv(cluster_data, title):
    """Write per-video cluster proportions, plus mean and std rows, to a CSV.

    For every entry of ``cluster_data`` the non-negative labels are counted per
    cluster and divided by the number of non-negative labels in that entry.
    Two summary rows ("mean" and "standard deviation") are appended and the
    table is saved as ``{CLUSTER_PATH}/{title}.csv``.

    Parameters
    ----------
    cluster_data : dict
        Maps a video name to a 1-D sequence of cluster labels. Negative labels
        are left out of the counts.
    title : str
        File name (without extension) of the CSV to write.

    Notes
    -----
    Entries without any non-negative label are reported on stdout, get blank
    proportion cells in the CSV, and are skipped by the mean and standard
    deviation rows.
    """
    # cluster labels run from 0 up to the largest label seen in any entry
    n_clusters = max(find_max(cluster_data) + 1, 0)
    cluster_columns = [f"Cluster {c}" for c in range(n_clusters)]

    # one row per video; NaN means "no proportion available" and is written
    # to the CSV as an empty cell
    proportions = np.full((len(cluster_data), n_clusters), np.nan)

    for row, (name, labels) in enumerate(cluster_data.items()):
        labels = np.asarray(labels)

        # good frames are the ones with a non-negative label; truncate to int
        # the same way find_max truncates the overall maximum
        good = labels[labels >= 0].astype(np.int64)

        if good.size == 0:
            # nothing to normalize by: leave the row as NaN
            print(f"{name} has 0 good clusters")
            continue

        # a single counting pass instead of one list.count() call per cluster
        counts = np.bincount(good, minlength=n_clusters)[:n_clusters]
        proportions[row] = counts / good.size

    stats = pd.DataFrame(proportions, columns=cluster_columns)

    videos = stats.copy()
    videos.insert(0, "Video File", list(cluster_data))

    # column-wise mean and sample standard deviation; pandas skips NaN rows
    summary = pd.DataFrame(
        np.vstack([stats.mean().to_numpy(), stats.std().to_numpy()]),
        columns=cluster_columns,
    )
    summary.insert(0, "Video File", ["mean", "standard deviation"])

    # DataFrame.append was removed in pandas 2.0; pd.concat works on both
    # older and current pandas releases
    table = pd.concat([videos, summary], ignore_index=True)

    # save the dataframe as a csv
    table.to_csv(f"{CLUSTER_PATH}/{title}.csv")

In [6]:
# Every file in the directory, keyed by the text between "cluster_" and ".npy".
all_cluster_data = {}

for file in all_clusters:
    video_id = file.rsplit("cluster_", 1)[-1].partition(".npy")[0]
    try:
        labels = np.load(file)
    except ValueError:
        # files for which np.load raises ValueError are reported and skipped
        print("Non-.npy file in directory")
    else:
        all_cluster_data[video_id] = labels

create_csv(all_cluster_data, title="all_clusters")

Non-.npy file in directory
07YTminfd_antmovie200928211544 has 0 good clusters
newant_antmovie200924212211 has 0 good clusters
12RWminfd_antmovie200927224447 has 0 good clusters
10BYmidfd_antmovie200928222401 has 0 good clusters
11RBminfd_antmovie200927222415 has 0 good clusters
00AAmidfr_antmovie200926192629 has 0 good clusters
01SSmidfr_antmovie201010190036 has 0 good clusters
02BBmidfd_antmovie200926202619 has 0 good clusters


In [7]:
# Only files whose name (last path component) contains "min".
min_cluster_data = {}

for file in all_clusters:
    if "min" not in file.rsplit("/", 1)[-1]:
        continue

    video_id = file.rsplit("cluster_", 1)[-1].partition(".npy")[0]
    try:
        labels = np.load(file)
    except ValueError:
        # files for which np.load raises ValueError are reported and skipped
        print("Non-.npy file in directory")
    else:
        min_cluster_data[video_id] = labels

create_csv(min_cluster_data, title="min_clusters")

07YTminfd_antmovie200928211544 has 0 good clusters
12RWminfd_antmovie200927224447 has 0 good clusters
11RBminfd_antmovie200927222415 has 0 good clusters


In [8]:
# Only files whose name (last path component) contains "mid".
mid_cluster_data = {}

for file in all_clusters:
    if "mid" not in file.rsplit("/", 1)[-1]:
        continue

    video_id = file.rsplit("cluster_", 1)[-1].partition(".npy")[0]
    try:
        labels = np.load(file)
    except ValueError:
        # files for which np.load raises ValueError are reported and skipped
        print("Non-.npy file in directory")
    else:
        mid_cluster_data[video_id] = labels

create_csv(mid_cluster_data, title="mid_clusters")

10BYmidfd_antmovie200928222401 has 0 good clusters
00AAmidfr_antmovie200926192629 has 0 good clusters
01SSmidfr_antmovie201010190036 has 0 good clusters
02BBmidfd_antmovie200926202619 has 0 good clusters


In [9]:
# Only files whose name (last path component) contains "maj".
maj_cluster_data = {}

for file in all_clusters:
    if "maj" not in file.rsplit("/", 1)[-1]:
        continue

    video_id = file.rsplit("cluster_", 1)[-1].partition(".npy")[0]
    try:
        labels = np.load(file)
    except ValueError:
        # files for which np.load raises ValueError are reported and skipped
        print("Non-.npy file in directory")
    else:
        maj_cluster_data[video_id] = labels

create_csv(maj_cluster_data, title="maj_clusters")

In [10]:
# Files whose name (last path component) contains none of "min", "mid", "maj".
other_cluster_data = {}

for file in all_clusters:
    basename = file.rsplit("/", 1)[-1]
    if any(tag in basename for tag in ("min", "mid", "maj")):
        continue

    video_id = file.rsplit("cluster_", 1)[-1].partition(".npy")[0]
    try:
        labels = np.load(file)
    except ValueError:
        # files for which np.load raises ValueError are reported and skipped
        print("Non-.npy file in directory")
    else:
        other_cluster_data[video_id] = labels

create_csv(other_cluster_data, title="other_clusters")

Non-.npy file in directory
newant_antmovie200924212211 has 0 good clusters
