In [None]:
#Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
import random
from umap import UMAP
from sklearn.preprocessing import normalize
from sklearn.cluster import DBSCAN
from collections import Counter
import plotly.express as px
import re
from scipy import stats


# Resolve the working directory once and derive its parent from it.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

#### Cleaning Functions

In [None]:
def remove_intro_outro_frames(df, max_empty_fields=2):
    """
    Removes frames that are likely part of the introduction or outro based on the number of empty fields.

    A field is considered empty when it is a sized value of length 0 (empty list,
    string, dict, array, ...) or a missing marker (None / NaN). Unsized scalars
    such as numbers never count as empty.

    Parameters:
    df (pd.DataFrame): The input dataframe containing frame data.
    max_empty_fields (int): Largest number of empty fields a frame may have and
        still be kept. Defaults to 2.

    Returns:
    tuple: A tuple containing the cleaned dataframe and the removed frames.
    """
    def _is_empty(value):
        if value is None:
            return True
        try:
            return len(value) == 0
        except TypeError:
            # Unsized scalar: only a missing marker (NaN, NaT, pd.NA) is "empty".
            return bool(pd.isna(value))

    aux_df = df.copy()

    # Number of empty fields per row, in row order.
    empty_counts = np.array(
        [
            sum(_is_empty(value) for value in row)
            for row in aux_df.itertuples(index=False, name=None)
        ],
        dtype=int,
    )

    too_empty = empty_counts > max_empty_fields
    removed_frames = aux_df[too_empty]
    cleaned_df = aux_df[~too_empty]

    return cleaned_df, removed_frames

def clean_detections(df):
    """
    Filters the 'detections' column down to confident, large, well-placed persons.

    A detection is kept when its class (index 4) is 'person', its confidence
    (index 5) is above 0.5, the centre of its box lies inside the fixed
    on-screen limits and the box area (w * h) exceeds 150000.

    Parameters:
    df (pd.DataFrame): The input dataframe; must contain 'filename' and 'detections'.

    Returns:
    pd.Series: One list of kept detections per frame, named 'persons_in_limits'.
    """
    def _is_valid_person(obj):
        if obj[4] != 'person' or not obj[5] > 0.5:
            return False
        width, height = obj[2], obj[3]
        centre_x = obj[0] + width / 2
        centre_y = obj[1] + height / 2
        return centre_y < 525 and 125 < centre_x < 1125 and width * height > 150000

    detections_df = df[['filename', 'detections']].copy()
    detections_df['persons_in_limits'] = [
        [obj for obj in frame_detections if _is_valid_person(obj)]
        for frame_detections in detections_df['detections']
    ]
    return detections_df['persons_in_limits']

def clean_fer(df):
    """
    Filters the 'fer' column down to faces that are well-placed and large enough.

    Each face is a mapping with a 'location' entry read as (x1, x2, y1, y2).
    A face is kept when the centre of that box lies inside the fixed on-screen
    limits and its area is above the area threshold.

    Parameters:
    df (pd.DataFrame): The input dataframe; must contain 'filename' and 'fer'.

    Returns:
    pd.Series: One list of kept faces per frame, named 'faces_in_limits_above_area'.
    """
    area_threshold = 25000

    def _is_valid_face(face):
        loc = face['location']
        left, right, top, bottom = loc[0], loc[1], loc[2], loc[3]
        centre_x = (left + right) / 2
        centre_y = (top + bottom) / 2
        if not (centre_y < 470 and 125 < centre_x < 1125):
            return False
        return (right - left) * (bottom - top) > area_threshold

    fer_df = df[['filename', 'fer']].copy()
    fer_df['faces_in_limits_above_area'] = [
        [face for face in frame_faces if _is_valid_face(face)]
        for frame_faces in fer_df['fer']
    ]
    return fer_df['faces_in_limits_above_area']

def clean_df(df):
    """
    Cleans the input dataframe by removing intro/outro frames and cleaning specific columns.

    Parameters:
    df (pd.DataFrame): The input dataframe to be cleaned.

    Returns:
    tuple: (cleaned_df, removed_frames) where cleaned_df holds the kept frames
        with filtered 'detections' and 'fer' columns, and removed_frames holds
        the frames discarded as intro/outro.
    """
    cleaned_df, removed_frames = remove_intro_outro_frames(df.copy())

    # Take an explicit copy so the column assignments below do not write into
    # a slice of another frame (avoids SettingWithCopyWarning).
    cleaned_df = cleaned_df.copy()

    # Only the kept frames need their detections/faces filtered.
    cleaned_df['detections'] = clean_detections(cleaned_df)
    cleaned_df['fer'] = clean_fer(cleaned_df)

    return cleaned_df, removed_frames

#### Clustering Functions

In [None]:
def cluster_video(cleaned_df, video, print_results=True):
    """
    Clusters the video frames into specific categories based on the number of persons and faces detected.

    The frames are first split into 3 clusters using the per-frame person and
    face counts; the cluster named "1-Person-View" is then split again using
    the frame embeddings (5 sub-clusters for the 'ad-ps' video, 3 otherwise).

    Parameters:
    cleaned_df (pd.DataFrame): The cleaned dataframe containing frame data
        ('detections', 'fer' and 'embedding' columns are read).
    video (str): The video filename or identifier.
    print_results (bool): Whether to print the clustering results. Defaults to True.

    Returns:
    dict: Consecutive integer keys mapped to (cluster_name, cluster_dataframe)
        tuples: the first-round clusters except "1-Person-View", followed by
        the second-round person clusters.

    Raises:
    ValueError: If no first-round cluster is named "1-Person-View".
    """
    df = cleaned_df.copy()

    # Per-frame counts used as features for the first clustering
    df['num_persons'] = df['detections'].apply(len)
    df['num_faces'] = df['fer'].apply(len)

    # First Clustering -> Divide the frames into 3 clusters (Split-view, 1-Person-View, Others)
    count_features = df[['num_persons', 'num_faces']].values
    std_counts = StandardScaler().fit_transform(count_features)
    if video == 'ad-ps':
        reducer = UMAP(n_neighbors=25, min_dist=0.1, n_components=2, metric='euclidean')
    else:
        reducer = PCA()
    reduced_counts = reducer.fit_transform(std_counts)

    n_first_clusters = 3
    first_hc = AgglomerativeClustering(n_clusters=n_first_clusters, metric='euclidean', linkage='ward')
    first_labels = first_hc.fit_predict(reduced_counts)

    first_cluster_data = assign_first_cluster_names_and_store_df(df, first_labels, n_first_clusters)
    if print_results:
        print("First Clustering")
        for label, (cluster_name, cluster_df) in first_cluster_data.items():
            print(f"Cluster {label} is named: {cluster_name} and has {len(cluster_df)} frames.")

    # Second Clustering -> Cluster the 1-Person-View frames by embedding
    one_person_label = next(
        (label for label, (name, _) in first_cluster_data.items() if name == "1-Person-View"),
        None,
    )
    if one_person_label is None:
        # Without this guard the code below would fail with an opaque UnboundLocalError.
        raise ValueError('First clustering produced no "1-Person-View" cluster; cannot run the second clustering.')
    one_person_view = first_cluster_data[one_person_label][1]

    embeddings = np.array(one_person_view['embedding'].values.tolist())
    std_embeddings = StandardScaler().fit_transform(embeddings)

    umap = UMAP(n_neighbors=15, min_dist=0.1, n_components=3, metric='euclidean')
    reduced_embeddings = umap.fit_transform(std_embeddings)

    n_second_clusters = 5 if video == 'ad-ps' else 3
    second_hc = AgglomerativeClustering(n_clusters=n_second_clusters, metric='euclidean', linkage='ward')
    second_labels = second_hc.fit_predict(reduced_embeddings)

    second_cluster_data = assign_second_cluster_names_and_store_df(one_person_view, second_labels, n_second_clusters)
    if print_results:
        print("Second Clustering")
        for label, (cluster_name, cluster_df) in second_cluster_data.items():
            print(f"Cluster {label} is named: {cluster_name} and has {len(cluster_df)} frames.")

    # Combine: first-round clusters (minus the one that was subdivided), then
    # the person clusters, re-keyed with consecutive integers.
    combined = [
        entry for label, entry in first_cluster_data.items() if label != one_person_label
    ]
    combined.extend(second_cluster_data[i] for i in range(n_second_clusters))

    return dict(enumerate(combined))

def assign_first_cluster_names_and_store_df(cleaned_df, labels, n_clusters):
    """
    Names each first-round cluster from its mean face count and returns the clusters keyed by label.

    A cluster whose mean 'num_faces' rounds to 2 is "Split-view", to 1 is
    "1-Person-View", anything else is "Others". The (mean persons, mean faces)
    pair of every cluster is printed.

    Parameters:
    cleaned_df (DataFrame): The dataframe containing 'num_persons' and 'num_faces'.
    labels (array-like): The cluster label of each row of cleaned_df.
    n_clusters (int): The number of clusters.

    Returns:
    dict: A dictionary mapping cluster labels to tuples containing the cluster name and DataFrame.
    """
    names_by_rounded_faces = {2: "Split-view", 1: "1-Person-View"}

    # One sub-dataframe per cluster label
    frames = [cleaned_df[labels == k] for k in range(n_clusters)]

    # Mean number of persons and faces for each cluster
    means = [(frame['num_persons'].mean(), frame['num_faces'].mean()) for frame in frames]

    # Print the means to verify the output
    for pair in means:
        print(pair)

    cluster_data = {}
    for k, (_, mean_faces) in enumerate(means):
        cluster_name = names_by_rounded_faces.get(round(mean_faces), "Others")
        cluster_data[k] = (cluster_name, frames[k])

    return cluster_data

def assign_second_cluster_names_and_store_df(one_person_view, labels, n_clusters):
    """
    Ranks the second-round clusters by size and names them "Person 1", "Person 2", ...

    The largest cluster becomes "Person 1"; every cluster ranked fifth or
    later is named "Person 5".

    Parameters:
    one_person_view (pd.DataFrame): The dataframe containing the "1-Person-View" cluster data.
    labels (array-like): The cluster labels for each data point in the dataframe.
    n_clusters (int): The number of clusters.

    Returns:
    dict: Rank (0 = largest) mapped to a (cluster name, DataFrame) tuple.
    """
    # Split by label, then order from most to fewest frames (stable for ties)
    groups = [one_person_view[labels == k] for k in range(n_clusters)]
    ranked = sorted(groups, key=len, reverse=True)

    return {
        rank: (f"Person {min(rank, 4) + 1}", frame)
        for rank, frame in enumerate(ranked)
    }

#### Labeling Functions


In [None]:
def label_df(all_clusters_dict, removed_frames):
    """
    Labels the dataframe by assigning cluster IDs and labels to each frame, including the removed frames.

    The inputs are not modified: labels are written onto copies of the cluster
    dataframes and of removed_frames.

    Parameters:
    all_clusters_dict (dict): Cluster id mapped to a (cluster_name, DataFrame) tuple.
    removed_frames (pd.DataFrame): DataFrame of frames that were removed during cleaning.

    Returns:
    pd.DataFrame: All frames sorted by 'filename' with a fresh index. Columns are
        'filename', 'cluster_id', 'cluster_label' followed by the remaining
        columns, with 'num_persons' and 'num_faces' left out. Removed frames
        get cluster_id -1 and cluster_label 'Removed Frame'.
    """
    # Work on copies so the caller's frames (often slices) are left untouched.
    removed = removed_frames.copy()
    removed['cluster_id'] = -1
    removed['cluster_label'] = 'Removed Frame'

    labeled_parts = []
    for cluster_id, (cluster_name, cluster_df) in all_clusters_dict.items():
        part = cluster_df.copy()
        part['cluster_id'] = cluster_id
        part['cluster_label'] = cluster_name
        labeled_parts.append(part)

    # Clusters first, removed frames last, then order everything by filename
    labeled_df = pd.concat(labeled_parts + [removed])
    labeled_df = labeled_df.sort_values('filename').reset_index(drop=True)

    leading = ['filename', 'cluster_id', 'cluster_label']
    excluded = set(leading) | {'num_persons', 'num_faces'}
    trailing = [col for col in labeled_df.columns if col not in excluded]

    return labeled_df[leading + trailing]

def label_moderator(labeled_df, video):
    """
    Renames the "Person N" cluster labels to politician / moderator roles.

    The two person clusters with the most frames become 'Politic 1' and
    'Politic 2' (ties keep the "Person N" order). For the 'ad-ps' video the
    three remaining clusters become 'Moderator1'..'Moderator3' in random order;
    for any other video the single remaining cluster becomes 'Moderator'.

    Parameters:
    labeled_df (pd.DataFrame): Dataframe with a 'cluster_label' column; it is
        relabelled in place.
    video (str): The video identifier ('ad-ps' has five person clusters, any
        other value three).

    Returns:
    pd.DataFrame: The same dataframe object with updated 'cluster_label' values.
    """
    is_ad_ps = video == 'ad-ps'
    person_labels = [f"Person {k}" for k in range(1, 6 if is_ad_ps else 4)]

    # (label, number of frames), largest first; the sort is stable so ties
    # keep the Person 1, Person 2, ... order.
    counts = [
        (label, int((labeled_df['cluster_label'] == label).sum()))
        for label in person_labels
    ]
    ranked = sorted(counts, key=lambda item: item[1], reverse=True)

    new_names = {ranked[0][0]: 'Politic 1', ranked[1][0]: 'Politic 2'}

    # Everything ranked below the two politicians is a moderator. Taking the
    # remainder of the same ranking guarantees a moderator is never also a
    # politician, even when frame counts are tied.
    remaining = [label for label, _ in ranked[2:]]
    if is_ad_ps:
        # Shuffle the labels to randomize their assignment
        random.shuffle(remaining)
        for position, label in enumerate(remaining, start=1):
            new_names[label] = f'Moderator{position}'
    else:
        new_names[remaining[0]] = 'Moderator'

    # Old labels are all "Person N" and new ones never are, so the
    # replacements cannot interfere with each other.
    for old_label, new_label in new_names.items():
        labeled_df.loc[labeled_df['cluster_label'] == old_label, 'cluster_label'] = new_label

    return labeled_df

#### Video Segmenting Functions

In [None]:
def smooth_video(labeled_df, window_size=21):
    """
    Builds a timeline of contiguous segments after smoothing the per-frame cluster ids.

    Each frame's cluster id is replaced by the mode of a centred rolling window
    of `window_size` frames; consecutive frames sharing the same smoothed label
    are then merged into one segment.

    Parameters:
    labeled_df (pd.DataFrame): Labeled frames. Must contain 'filename',
        'cluster_id', 'cluster_label', 'fer' and the columns dropped below
        ('detections', 'poses', 'faces', 'text', 'embedding').
    window_size (int): Size of the rolling window used for smoothing. Defaults to 21.

    Returns:
    pd.DataFrame: One row per segment with columns 'start(hour)', 'end(hour)',
        'duration', 'major_emotion', 'cluster_id' and 'cluster_label',
        sorted by 'start(hour)'.
    """
    # Format the dataframe
    df = labeled_df.copy()
    df = df.drop(['detections', 'poses', 'faces', 'text', 'embedding'], axis=1)
    # Zero-based frame number taken from the first run of digits in the filename
    df['file_number'] = df['filename'].apply(extract_number) - 1
    # cluster_id -> cluster_label lookup, used to label the smoothed ids
    cluster_dict = get_video_labels(df)

    # Apply the sliding window approach to smooth the cluster labels to get a more accurate video segmenting process
    df['smoothed_cluster'] = df['cluster_id'].rolling(window_size, center=True).apply(lambda x: stats.mode(x)[0])
    df['smoothed_cluster_label'] = df['smoothed_cluster'].map(cluster_dict)
    # Rows with no rolling result (NaN, typically where the centred window is
    # incomplete) fall back to their original label and id
    df['smoothed_cluster_label'] = df['smoothed_cluster_label'].fillna(df['cluster_label'])
    df['smoothed_cluster'] = df['smoothed_cluster'].fillna(df['cluster_id'])
    
    # Get the time segments for each of the cluster labels:
    # the running count of label changes gives every contiguous run its own segment id
    df['cluster_change'] = (df['smoothed_cluster_label'] != df['smoothed_cluster_label'].shift()).cumsum()
    # Emotion of the first face in the frame, or None when 'fer' is not a non-empty list
    df['emotion'] = df['fer'].apply(lambda x: x[0]['emotion'] if isinstance(x, list) and x else None)

    # Group by cluster change and extract emotions for each segment
    file_intervals = df.groupby('cluster_change').agg({'file_number': ['min', 'max'], 'emotion': lambda x: x.tolist()})
    file_intervals.columns = ['min', 'max', 'emotions']

    # Create a new dataframe with timeline exclusive data
    # NOTE(review): frame numbers are used directly as seconds, which presumes
    # one frame per second - confirm against the frame extraction step.
    smoothed_timeline_df = pd.DataFrame()
    smoothed_timeline_df['start(s)'] = file_intervals['min'].astype(int)
    smoothed_timeline_df['end(s)'] = file_intervals['max'].astype(int)
    # Render the counts as HH:MM:SS and parse them into datetimes for plotting
    smoothed_timeline_df['start(hour)'] = pd.to_datetime(smoothed_timeline_df['start(s)'].apply(lambda x: '{:02d}:{:02d}:{:02d}'.format(x // 3600, x % 3600 // 60, x % 3600 % 60)), format='%H:%M:%S')
    smoothed_timeline_df['end(hour)'] = pd.to_datetime(smoothed_timeline_df['end(s)'].apply(lambda x: '{:02d}:{:02d}:{:02d}'.format(x // 3600, x % 3600 // 60, x % 3600 % 60)), format='%H:%M:%S')
    # Inclusive of both the first and last frame of the segment
    smoothed_timeline_df['duration'] = smoothed_timeline_df['end(s)'] - smoothed_timeline_df['start(s)'] + 1
    smoothed_timeline_df['emotions'] = smoothed_timeline_df.index.map(lambda x: file_intervals.loc[x, 'emotions'])
    # Most frequent per-frame emotion within the segment
    smoothed_timeline_df['major_emotion'] = smoothed_timeline_df['emotions'].apply(major_emotion)
    
    smoothed_timeline_df = smoothed_timeline_df.drop('emotions', axis=1)
    # Assign the correct cluster ids and labels for each of the time segments
    # (taken from the first frame of each segment)
    cluster_mapping = df.drop_duplicates('cluster_change').set_index('cluster_change')['smoothed_cluster_label'].to_dict()
    cluster_id_mapping = df.drop_duplicates('cluster_change').set_index('cluster_change')['smoothed_cluster'].to_dict()
    smoothed_timeline_df['cluster_id'] = smoothed_timeline_df.index.map(cluster_id_mapping)
    smoothed_timeline_df['cluster_label'] = smoothed_timeline_df.index.map(cluster_mapping)
    # Format the dataframe
    smoothed_timeline_df = smoothed_timeline_df.drop(["start(s)", "end(s)"], axis=1)
    smoothed_timeline_df = smoothed_timeline_df.reset_index(drop=True)
    smoothed_timeline_df = smoothed_timeline_df.sort_values(by='start(hour)', ascending=True)

    return smoothed_timeline_df

def non_smooth_video(labeled_df):
    """
    Builds a timeline of contiguous segments from the raw (unsmoothed) cluster labels.

    Consecutive frames sharing the same 'cluster_label' are merged into one segment.

    Parameters:
    labeled_df (pd.DataFrame): Labeled frames. Must contain 'filename',
        'cluster_id', 'cluster_label', 'fer' and the columns dropped below
        ('detections', 'poses', 'faces', 'text', 'embedding').

    Returns:
    pd.DataFrame: One row per segment with columns 'start(hour)', 'end(hour)',
        'duration', 'major_emotion', 'cluster_id' and 'cluster_label',
        sorted by 'start(hour)'.
    """
    # Format the dataframe
    df = labeled_df.copy()
    df = df.drop(['detections', 'poses', 'faces', 'text', 'embedding'], axis=1)
    # Zero-based frame number taken from the first run of digits in the filename
    df['file_number'] = df['filename'].apply(extract_number) - 1

    # Get the time segments for each of the cluster labels:
    # the running count of label changes gives every contiguous run its own segment id
    df['cluster_change'] = (df['cluster_label'] != df['cluster_label'].shift()).cumsum()
    # Emotion of the first face in the frame, or None when 'fer' is not a non-empty list
    df['emotion'] = df['fer'].apply(lambda x: x[0]['emotion'] if isinstance(x, list) and x else None)

    # Group by cluster change and extract emotions for each segment
    file_intervals = df.groupby('cluster_change').agg({'file_number': ['min', 'max'], 'emotion': lambda x: x.tolist()})
    file_intervals.columns = ['min', 'max', 'emotions']

    # Create a new dataframe with timeline exclusive data
    # NOTE(review): frame numbers are used directly as seconds, which presumes
    # one frame per second - confirm against the frame extraction step.
    timeline_df = pd.DataFrame()
    timeline_df['start(s)'] = file_intervals['min'].astype(int)
    timeline_df['end(s)'] = file_intervals['max'].astype(int)
    # Render the counts as HH:MM:SS and parse them into datetimes for plotting
    timeline_df['start(hour)'] = pd.to_datetime(timeline_df['start(s)'].apply(lambda x: '{:02d}:{:02d}:{:02d}'.format(x // 3600, x % 3600 // 60, x % 3600 % 60)), format='%H:%M:%S')
    timeline_df['end(hour)'] = pd.to_datetime(timeline_df['end(s)'].apply(lambda x: '{:02d}:{:02d}:{:02d}'.format(x // 3600, x % 3600 // 60, x % 3600 % 60)), format='%H:%M:%S')
    # Inclusive of both the first and last frame of the segment
    timeline_df['duration'] = timeline_df['end(s)'] - timeline_df['start(s)'] + 1
    timeline_df['emotions'] = timeline_df.index.map(lambda x: file_intervals.loc[x, 'emotions'])
    # Most frequent per-frame emotion within the segment
    timeline_df['major_emotion'] = timeline_df['emotions'].apply(major_emotion)
    timeline_df = timeline_df.drop('emotions', axis=1)
    
    # Assign the correct cluster ids and labels for each of the time segments
    # (taken from the first frame of each segment)
    cluster_mapping = df.drop_duplicates('cluster_change').set_index('cluster_change')['cluster_label'].to_dict()
    cluster_id_mapping = df.drop_duplicates('cluster_change').set_index('cluster_change')['cluster_id'].to_dict()
    timeline_df['cluster_id'] = timeline_df.index.map(cluster_id_mapping)
    timeline_df['cluster_label'] = timeline_df.index.map(cluster_mapping)
    
    # Format the dataframe
    timeline_df = timeline_df.drop(["start(s)", "end(s)"], axis=1)
    timeline_df = timeline_df.reset_index(drop=True)
    timeline_df = timeline_df.sort_values(by='start(hour)', ascending=True)
    
    return timeline_df

def segment_video_timeline_emotion(labeled_df):
    """
    Builds both the smoothed and the raw segment timelines for a labeled video.

    Parameters:
    labeled_df (pd.DataFrame): The labeled frames, as accepted by
        smooth_video and non_smooth_video.

    Returns:
    tuple: (smoothed_timeline_df, timeline_df); the smoothed timeline uses a
        rolling window of 31 frames.
    """
    return smooth_video(labeled_df, window_size=31), non_smooth_video(labeled_df)

def major_emotion(emotions_list):
    """
    Returns the most frequent value in a list of emotions.

    Parameters:
    emotions_list (list): Per-frame emotions (hashable values; None entries
        are counted like any other value).

    Returns:
    The value with the highest count; on a tie, the one seen first.
    None when emotions_list is empty.
    """
    # Count the occurrences of each emotion
    emotion_counts = Counter(emotions_list)

    # max() would raise on an empty Counter; an empty segment has no emotion.
    if not emotion_counts:
        return None

    # Find the emotion with the highest count
    return max(emotion_counts, key=emotion_counts.get)

def extract_number(s):
    """
        Description: This function extracts the first run of digits from a string

        input: s -> string
        output: int -> number extracted from the string
        raises: ValueError -> if the string contains no digits
    """
    match = re.search(r'\d+', s)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"no digits found in {s!r}")
    return int(match.group())
  
def get_video_labels(df):
    """
        Description: This function builds the cluster id -> cluster label lookup of a video

        input: df -> dataframe with the following columns: cluster_id, cluster_label
        output: cluster_dict -> dictionary with the cluster_id as key and the cluster_label as value, ordered by cluster_id
    """
    pairs = df.drop_duplicates(subset=['cluster_id', 'cluster_label'])
    lookup = dict(zip(pairs['cluster_id'], pairs['cluster_label']))

    return {cluster_id: lookup[cluster_id] for cluster_id in sorted(lookup)}

def get_total_screen_time(timeline_df):
    """
        Description: This function sums the segment durations of each cluster label

        input: timeline_df -> dataframe with (at least) the columns: duration, cluster_label
        output: total_screen_time_df -> dataframe with the following columns: cluster_label, duration, duration(hour)
                (duration(hour) is the summed duration rendered as HH:MM:SS)
    """
    def _as_clock(seconds):
        hours, remainder = divmod(seconds, 3600)
        minutes, secs = divmod(remainder, 60)
        return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, secs)

    per_label = timeline_df.groupby('cluster_label')['duration'].sum()
    total_screen_time_df = per_label.reset_index()
    total_screen_time_df['duration(hour)'] = total_screen_time_df['duration'].apply(_as_clock)
    return total_screen_time_df

def get_total_time_by_emotion(timeline_df):
    """
        Description: This function sums the segment durations per cluster label and major emotion

        input: timeline_df -> dataframe with (at least) the columns: cluster_label, major_emotion, duration
        output: combined_df -> one row per cluster label with total_duration(hour) and one
                "<emotion>(hour)" column per emotion, all rendered as HH:MM:SS
    """
    def _as_clock(seconds):
        hours, remainder = divmod(seconds, 3600)
        minutes, secs = divmod(remainder, 60)
        return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, secs)

    def _emotion_clock(value):
        return '00:00:00' if pd.isnull(value) else _as_clock(int(value))

    # Total duration per (cluster, emotion), reshaped to one column per emotion
    per_emotion_long = timeline_df.groupby(['cluster_label', 'major_emotion'])['duration'].sum().reset_index()
    per_emotion = per_emotion_long.pivot(index='cluster_label', columns='major_emotion', values='duration').fillna(0)

    # Total duration per cluster, kept only in its HH:MM:SS form
    per_cluster = timeline_df.groupby('cluster_label')['duration'].sum().reset_index()
    per_cluster['total_duration(hour)'] = per_cluster['duration'].apply(_as_clock)
    per_cluster = per_cluster.drop('duration', axis=1)

    combined_df = pd.merge(per_emotion, per_cluster, on='cluster_label', how='left')

    # Swap every numeric emotion column for its HH:MM:SS counterpart
    for emotion in per_emotion.columns:
        combined_df[f"{emotion}(hour)"] = combined_df[emotion].apply(_emotion_clock)
        combined_df = combined_df.drop(emotion, axis=1)
    return combined_df

#### Plot Functions

In [None]:
# Hex colour used for each emotion name; passed as `color_discrete_map`
# by the timeline plotting functions below. Note that the 'None' key is the
# string 'None', not the None object.
emotion_colors = {
    'Neutral': '#D3D3D3',   # Light grey
    'Happiness': '#FFD700',  # Gold
    'Sadness': '#87CEEB',    # Sky blue
    'Surprise': '#FFA500',   # Orange
    'Fear': '#BA55D3',       # Medium purple
    'Disgust': '#98FB98',    # Pale green
    'Anger': '#FF6347',      # Tomato
    'Contempt': '#BC8F8F',    # Rosy brown
    'None': '#000000'        # Black
}

def plot_cluster_data(reduced_data, label_vector, n_clusters, components=(0, 1, 2)):
    """
    Shows a 2D scatter plot of the clustered data and, when possible, a 3D one.

    The 3D plot is only drawn when the data has at least three columns and
    at least three component indices are given. Marker size reflects the
    size of the cluster a point belongs to.

    Parameters:
    reduced_data (np.array): The reduced data, one column per component.
    label_vector (np.array): The cluster labels for each data point.
    n_clusters (int): The number of clusters (not used by the plots).
    components (tuple): The indices of the components to plot. Defaults to (0, 1, 2).
    """
    n_dims = reduced_data.shape[1]
    column_names = [f'Principal Component {i+1}' for i in range(n_dims)]

    # Plotly Express works from a DataFrame with named columns
    df = pd.DataFrame(reduced_data, columns=column_names)
    df['Cluster'] = label_vector
    df['Cluster Size'] = df.groupby('Cluster')['Cluster'].transform('size')

    x_col = f'Principal Component {components[0]+1}'
    y_col = f'Principal Component {components[1]+1}'

    # 2D Scatter Plot
    axis_font = dict(size=16)
    fig = px.scatter(df, x=x_col, y=y_col, color='Cluster', color_continuous_scale='Viridis', size='Cluster Size')
    fig.update_layout(
        title='PCA 1st Round clustering',
        width=700,
        height=500,
        xaxis=dict(title=dict(text='Number of Persons', font=axis_font)),
        yaxis=dict(title=dict(text='Number of Faces', font=dict(size=16))),
    )
    fig.show()

    # 3D Scatter Plot
    if n_dims < 3 or len(components) < 3:
        return
    z_col = f'Principal Component {components[2]+1}'
    fig = px.scatter_3d(df, x=x_col, y=y_col, z=z_col, color='Cluster', color_continuous_scale='Viridis', size='Cluster Size')
    fig.update_layout(title='Clusters of customers (3D)')
    fig.show()

def visualize_labeled_df(labeled_df, video, images_per_cluster=10, images_per_row=10):
    """
    Visualizes the labeled dataframe by displaying sample images from each cluster.

    Images are read from <current_dir>/processed/<video>/<filename>.

    Parameters:
    labeled_df (pd.DataFrame): The labeled dataframe containing cluster information
        ('cluster_id', 'cluster_label' and 'filename' columns are read).
    video (str): Name of the sub-folder of 'processed' holding the video's frames.
    images_per_cluster (int): Number of images to display per cluster. Defaults to 10.
    images_per_row (int): Number of images to display per row. Defaults to 10.

    Returns:
    None
    """
    for cluster in labeled_df['cluster_id'].sort_values().unique():
        cluster_rows = labeled_df[labeled_df['cluster_id'] == cluster]

        # Print the cluster details
        print(f"Cluster {cluster} ({cluster_rows['cluster_label'].iloc[0]}) -> size: {len(cluster_rows)} frames")

        # Get the filenames of images for the current cluster
        cluster_data = cluster_rows['filename'].values[:images_per_cluster]
        num_images = len(cluster_data)
        if num_images == 0:
            # Nothing to draw (e.g. images_per_cluster=0); a 0-row grid is invalid.
            continue

        # Calculate the number of rows required for displaying images
        num_rows = (num_images + images_per_row - 1) // images_per_row
        # squeeze=False always yields a 2D array of axes, so ravel() also
        # works for a 1x1 grid (where a bare Axes would otherwise be returned).
        fig, axes = plt.subplots(num_rows, images_per_row, figsize=(20, 2 * num_rows), squeeze=False)
        axes = axes.ravel()

        # Display each image in the subplot
        for ax, img_path in zip(axes, cluster_data):
            # Close the file handle once the pixels have been handed to matplotlib
            with Image.open(os.path.join(current_dir, 'processed', video, img_path)) as img:
                ax.imshow(img)
            ax.axis('off')
            ax.set_title(img_path)

        # Hide any remaining subplot axes
        for ax in axes[num_images:]:
            ax.axis('off')

        # Adjust the layout and display the plot
        plt.tight_layout()
        plt.show()

def plot_total_time_by_emotion(total_time_by_emotion_df):
    """
    Plots one horizontal bar per cluster label showing its total time, colored by major emotion.

    Parameters:
    total_time_by_emotion_df (pd.DataFrame): Dataframe with the columns 'cluster_label', 'major_emotion'
        and 'total_duration(hour)'. The latter is used as the bar end, so it is presumably a datetime
        measured from 00:00:00 -- TODO confirm against the caller. The dataframe is not modified.

    Returns:
    None
    """
    # px.timeline needs a start and an end for every bar; all bars start at 00:00:00.
    # assign() returns a new dataframe, so the caller's dataframe does not gain a 'start' column.
    plot_df = (
        total_time_by_emotion_df
        .assign(start=pd.to_datetime('00:00:00', format='%H:%M:%S'))
        .sort_values(by='cluster_label')
    )

    fig = px.timeline(plot_df, x_start='start', x_end='total_duration(hour)', y='cluster_label',
                      color='major_emotion', color_discrete_map=emotion_colors,
                      title='Total Time by Emotion per Cluster')

    fig.update_layout(yaxis={'categoryorder': 'total ascending'})
    fig.update_yaxes(title_text='Cluster Label')
    fig.update_xaxes(title_text='Total Duration (hour)')

    fig.show()

def plot_timeline_w_major_emotion(timeline_df, video):
    """
    Plots the video timeline with one row per cluster label, colored by major emotion.

    Parameters:
    timeline_df (pd.DataFrame): Dataframe with the columns 'start(hour)', 'end(hour)', 'cluster_label' and 'major_emotion'.
    video (str): Video name, appended to the plot title.

    Returns:
    None
    """
    # Sort into a new dataframe; the caller's dataframe keeps its order
    rows_by_label = timeline_df.sort_values(by='cluster_label')

    fig = px.timeline(
        rows_by_label,
        x_start='start(hour)',
        x_end='end(hour)',
        y='cluster_label',
        color='major_emotion',
        color_discrete_map=emotion_colors,
        title='Video Timeline - ' + video,
    )
    fig.show()

def plot_timeline(timeline_df):
    """
    Plots the video timeline with one row and one color per cluster label.

    Parameters:
    timeline_df (pd.DataFrame): Dataframe with the columns 'start(hour)', 'end(hour)' and 'cluster_label'.

    Returns:
    None
    """
    # Sort into a new dataframe; the caller's dataframe keeps its order
    rows_by_label = timeline_df.sort_values(by='cluster_label')

    # NOTE(review): the color map passed here is emotion_colors although the color column is
    # 'cluster_label' -- confirm this is intended.
    fig = px.timeline(
        rows_by_label,
        x_start='start(hour)',
        x_end='end(hour)',
        y='cluster_label',
        color='cluster_label',
        color_discrete_map=emotion_colors,
        title='Video Timeline',
    )
    fig.show()
    
def plot_total_screen_time(timeline_df,video):
    """
    Plots the total screen time for each cluster as one horizontal bar per cluster label.

    Parameters:
    timeline_df (pd.DataFrame): Dataframe with the columns 'cluster_label' and 'duration'.
        'duration' is interpreted as a number of seconds.
    video (str): Video name, appended to the plot title.

    Returns:
    None
    """
    # Sum the segment durations of each cluster label
    total_screen_time_df = timeline_df.groupby('cluster_label')['duration'].sum().reset_index()

    # px.timeline needs datetimes, so every bar runs from 00:00:00 to 00:00:00 + total duration.
    # Adding a timedelta (instead of formatting an HH:MM:SS string and parsing it back) also
    # handles non-integer durations and totals of 24 hours or more.
    origin = pd.to_datetime('00:00:00', format='%H:%M:%S')
    total_screen_time_df['duration(hour)'] = origin + pd.to_timedelta(total_screen_time_df['duration'], unit='s')
    total_screen_time_df['start'] = origin
    total_screen_time_df.sort_values(by='cluster_label', inplace=True)

    fig = px.timeline(total_screen_time_df, x_start ='start', x_end='duration(hour)', y='cluster_label',
                      color='cluster_label', color_discrete_map=None,
                      title='Screen Time per Cluster - '+ video)

    fig.update_traces(marker=dict(color='#1a7277'))  # Set color for all bars

    fig.show()

### Single Video Analysis

In [None]:
def read_video_pickel(video, current_dir):
    """
    Loads the processed data of one video, ordered by filename.

    Parameters:
    video (str): Video name; the file read is <current_dir>/processed/<video>.pkl.
    current_dir (str): Directory that contains the 'processed' folder.

    Returns:
    pd.DataFrame: The unpickled data sorted by 'filename', with the index reset to 0..n-1.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only load files you produced yourself
    pickle_path = os.path.join(current_dir, 'processed', video + '.pkl')
    frames = pd.read_pickle(pickle_path)
    return frames.sort_values('filename').reset_index(drop=True)

# Single-video run: load one processed video, cluster and label its frames,
# then plot the raw and smoothed timelines and show sample images per cluster.
video = 'chega-ps'
# Read the video pickle file (sorted by filename, index reset)
original_df = read_video_pickel(video, current_dir)

# Clean the original dataframe using the clean_df function; the frames it drops are kept
# in removed_frames and handed to label_df below
cleaned_df, removed_frames = clean_df(original_df)
# Get dictionary of 5 clusters with the corresponding dataframes (Person 1, Person 2, Person 3, Split-view, Others)
all_clusters_dict = cluster_video(cleaned_df, video, print_results=False) 

# Put all the dataframes in a single dataframe with cluster IDs and labels
labeled_df = label_df(all_clusters_dict, removed_frames)
# NOTE(review): presumably relabels the cluster(s) that show the moderator -- confirm in label_moderator
labeled_df = label_moderator(labeled_df, video)

# Build the timeline twice: a smoothed version and the raw one
smoothed_timeline_df, timeline_df = segment_video_timeline_emotion(labeled_df)

# Plot the raw timeline first, then the smoothed one, so the two can be compared
plot_timeline_w_major_emotion(timeline_df,video)
plot_total_screen_time(timeline_df,video)
plot_timeline_w_major_emotion(smoothed_timeline_df,video)
plot_total_screen_time(smoothed_timeline_df,video)

# Visualize the results of the clustering
visualize_labeled_df(labeled_df, video, images_per_cluster=10, images_per_row=10)

# Expected cluster labels for this video; 'ad-ps' gets three separate moderator labels.
# clusters_order is only consumed by the commented-out frame-count report below.
if video == 'ad-ps':
    clusters_order = ['Removed Frame', 'Split-view', 'Politic 1', 'Politic 2', 'Moderator1', 'Moderator2', 'Moderator3', 'Others']
else:
    clusters_order = ['Removed Frame', 'Split-view', 'Politic 1', 'Politic 2', 'Moderator', 'Others']
#for cluster in clusters_order:
#    print(f"Cluster: {cluster} -> {len(labeled_df[labeled_df['cluster_label'] == cluster])} frames")
#print('*'*10)
# Save the labeled dataframe to a pickle file
#labeled_df.to_pickle(os.path.join(current_dir, 'labeled', video + '_labeled.pkl'))

In [None]:
def read_video_pickel(video, current_dir):
    """
    Loads the processed data of one video, ordered by filename.

    Parameters:
    video (str): Video name; the file read is <current_dir>/processed/<video>.pkl.
    current_dir (str): Directory that contains the 'processed' folder.

    Returns:
    pd.DataFrame: The unpickled data sorted by 'filename', with the index reset to 0..n-1.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only load files you produced yourself
    pickle_path = os.path.join(current_dir, 'processed', video + '.pkl')
    frames = pd.read_pickle(pickle_path)
    return frames.sort_values('filename').reset_index(drop=True)

# Batch run: cluster, label and save every processed video.
# List the same directory that read_video_pickel reads from, instead of a path relative to the cwd
files = os.listdir(os.path.join(current_dir, 'processed'))
# Sorted so the videos are processed in a reproducible order (os.listdir order is arbitrary)
videos = sorted(f for f in files if f.endswith('.pkl'))
# splitext removes only the '.pkl' extension, so names that contain a dot are kept whole
video_titles = [os.path.splitext(f)[0] for f in videos]

# Make sure the output folder exists before the expensive clustering starts
os.makedirs(os.path.join(current_dir, 'labeled'), exist_ok=True)

for video in video_titles:
    print(f"Processing video: {video}")
    # Read the video pickle file
    original_df = read_video_pickel(video, current_dir)

    # Clean the original dataframe using the clean_df function
    cleaned_df, removed_frames = clean_df(original_df)
    # Get dictionary of 5 clusters with the corresponding dataframes (Person 1, Person 2, Person 3, Split-view, Others)
    all_clusters_dict = cluster_video(cleaned_df, video, print_results=False)

    # Put all the dataframes in a single dataframe with cluster IDs and labels
    labeled_df = label_df(all_clusters_dict, removed_frames)
    labeled_df = label_moderator(labeled_df, video)

    #smoothed_timeline_df, timeline_df = segment_video_timeline_emotion(labeled_df)

    #plot_timeline_w_major_emotion(smoothed_timeline_df,video)
    #plot_total_screen_time(timeline_df,video)
    #plot_total_screen_time(smoothed_timeline_df,video)

    # Visualize the results of the clustering
    visualize_labeled_df(labeled_df, video, images_per_cluster=40, images_per_row=10)

    #if video == 'ad-ps':
    #    clusters_order = ['Removed Frame', 'Split-view', 'Politic 1', 'Politic 2', 'Moderator1', 'Moderator2', 'Moderator3', 'Others']
    #else:
    #    clusters_order = ['Removed Frame', 'Split-view', 'Politic 1', 'Politic 2', 'Moderator', 'Others']
    #for cluster in clusters_order:
    #    print(f"Cluster: {cluster} -> {len(labeled_df[labeled_df['cluster_label'] == cluster])} frames")
    #print('*'*10)
    # Save the labeled dataframe to a pickle file
    labeled_df.to_pickle(os.path.join(current_dir, 'labeled', video + '_labeled.pkl'))