# **Sentences for action recognition using labels generated by mmaction2**

In [1]:
import os
import json
import pandas as pd
import cv2
import csv
from datetime import timedelta


# **Datasets**
*  **ActivityNet dataset**
*  **SumMe dataset**
*  **TVSum dataset**


In [2]:
# Load every per-video JSON file of detected actions found in `json_dir` and
# flatten them into one long DataFrame with one row per (frame, ranked action).
json_dir = ''  # path to the actions detected for a dataset by mmaction2, ./data/ActionRecognition/annotations/..
result = {}  # NOTE(review): not referenced again in the visible cells
all_dataframes = []
for json_file in os.listdir(json_dir):
    if json_file.endswith('.json'):
        # The file name without its extension is used as the video identifier.
        video_name = os.path.splitext(json_file)[0]
        with open(os.path.join(json_dir, json_file), 'r') as f:
            data = json.load(f)

        # Keep only frame keys >= 8, i.e. drop entries 0-7 (key 8 itself is
        # kept). The original note says the dropped entries contain no information.
        data = {int(k): v for k, v in data.items() if int(k) >= 8}
        frames = []
        # Each frame maps an action rank to a "<label>: <score>" string
        # (see the printed head of the resulting frame).
        for frame_num, frame_data in data.items():
            for number, action in frame_data.items():
                frames.append({'frame': frame_num, 'action_num': number, 'action': action})

        df = pd.DataFrame(frames)
        df['video_name'] = video_name
        all_dataframes.append(df)

# Stack the per-video frames into a single table with a fresh 0..n-1 index.
final_df = pd.concat(all_dataframes, ignore_index=True)
final_df.head()

Unnamed: 0,frame,action_num,action,video_name
0,8,1,laughing: 0.1,fWutDQy1nnY
1,8,2,singing: 0.07,fWutDQy1nnY
2,8,3,shaking hands: 0.07,fWutDQy1nnY
3,8,4,celebrating: 0.05,fWutDQy1nnY
4,8,5,slapping: 0.04,fWutDQy1nnY


In [3]:
# Each raw entry looks like "<label>: <score>". Split on the LAST ": " only, so
# a label that itself contains ": " cannot produce a third column and break the
# two-column assignment below.
final_df[['action', 'score']] = final_df['action'].str.rsplit(': ', n=1, expand=True)
# The split leaves the score as text; convert it so it can be sorted/compared.
final_df['score'] = pd.to_numeric(final_df['score'])
# Put the identifying columns first.
column_order = ['video_name', 'frame', 'action_num', 'action', 'score']
df = final_df[column_order]
df.head()


Unnamed: 0,video_name,frame,action_num,action,score
0,fWutDQy1nnY,8,1,laughing,0.1
1,fWutDQy1nnY,8,2,singing,0.07
2,fWutDQy1nnY,8,3,shaking hands,0.07
3,fWutDQy1nnY,8,4,celebrating,0.05
4,fWutDQy1nnY,8,5,slapping,0.04


In [4]:
# Rank all candidates by confidence (highest first) and keep the leading row
# of every (video, frame) pair, i.e. the best-scoring action for that frame.
df = (
    df.sort_values(by='score', ascending=False)
      .groupby(['video_name', 'frame'], as_index=False)
      .first()
)
# Discard frames whose best action does not exceed the 0.35 confidence cut-off.
filtered_df = df.loc[df['score'] > 0.35]
filtered_df


Unnamed: 0,video_name,frame,action_num,action,score
0,37rzWOQsNIw,8,1,garbage collecting,0.60
1,37rzWOQsNIw,9,1,garbage collecting,0.60
2,37rzWOQsNIw,10,1,garbage collecting,0.60
3,37rzWOQsNIw,11,1,garbage collecting,0.74
4,37rzWOQsNIw,12,1,garbage collecting,0.74
...,...,...,...,...,...
14945,xxdtq8mxegs,381,1,stretching leg,0.45
14946,xxdtq8mxegs,382,1,stretching leg,0.45
14947,xxdtq8mxegs,383,1,stretching leg,0.39
14948,xxdtq8mxegs,384,1,stretching leg,0.39


In [5]:
def group_activities(df):
    """Merge per-frame detections into contiguous same-action segments.

    Args:
        df: DataFrame with the columns ``video_name``, ``frame`` and
            ``action`` (one row per frame).

    Returns:
        Dict mapping each video name to a list of
        ``(start_frame, end_frame, action)`` tuples, ordered by frame. A new
        segment starts whenever the action changes or the frame number is not
        the direct successor of the previous row's frame.
    """
    segments_by_video = {}

    for video_id, rows in df.groupby('video_name'):
        ordered = rows.sort_values('frame')
        segments = []
        seg_start = last_frame = last_action = None

        for frame, action in zip(ordered['frame'], ordered['action']):
            same_action = last_action == action
            contiguous = last_frame is None or frame == last_frame + 1
            if not (same_action and contiguous):
                # Close the running segment (if any) before opening a new one.
                if last_action is not None:
                    segments.append((seg_start, last_frame, last_action))
                seg_start = frame
            last_frame, last_action = frame, action

        # Flush the segment that was still open when the rows ran out.
        if last_action is not None:
            segments.append((seg_start, last_frame, last_action))

        segments_by_video[video_id] = segments

    return segments_by_video

# Collapse the filtered per-frame detections into contiguous segments per video.
dictionary = group_activities(filtered_df)
# Number of videos that have at least one segment.
len(dictionary)

20

In [6]:
def format_time(seconds):
    """Render a duration in seconds as a "minute(s)/second(s)" phrase.

    The value is truncated to whole seconds first. Minutes are only mentioned
    when at least one full minute has elapsed; hours are not split out.
    """
    whole_minutes, leftover = divmod(int(seconds), 60)
    if whole_minutes <= 0:
        return f"{leftover} second(s)"
    return f"{whole_minutes} minute(s) and {leftover} second(s)"
# Folder with the TVSum video files; machine-specific, adjust to the local dataset path.
video_folder='/home/wikaaxx/Desktop/thesis/datasets/TVSum/videos'
def generate_sentences_and_save(grouped_activities, video_folder, output_csv,
                                frame_step=10, video_ext='.mp4'):
    """Write one natural-language action description per video to a CSV file.

    Args:
        grouped_activities: Mapping of video name to a list of
            ``(start_frame, end_frame, action)`` tuples.
        video_folder: Directory holding the videos, each stored as
            ``<video name><video_ext>``.
        output_csv: Path of the CSV to create (overwritten if it exists). It
            gets the columns ``video`` and ``description``.
        frame_step: Factor applied to a segment's frame index before dividing
            by the video FPS to obtain seconds. Defaults to 10.
            NOTE(review): presumably the detections were produced on every
            10th video frame - confirm against the mmaction2 run.
        video_ext: File extension appended to the video name.

    Raises:
        ValueError: If the frame rate of a video cannot be read (for example
            because the file is missing or cannot be decoded).
    """
    data = []

    for video_name, actions in grouped_activities.items():
        video_path = os.path.join(video_folder, f"{video_name}{video_ext}")
        cap = cv2.VideoCapture(video_path)
        try:
            # Only the frame rate is needed, to turn frame indices into times.
            fps = cap.get(cv2.CAP_PROP_FPS)
        finally:
            # Release the capture even if reading the property fails, and
            # before the FPS check below can raise.
            cap.release()
        if not fps or fps <= 0:
            raise ValueError(f"Could not extract FPS for video: {video_name}")

        sentences = [
            f"Action: {action} was detected from "
            f"{format_time(start * frame_step / fps)} to "
            f"{format_time(end * frame_step / fps)}."
            for start, end, action in actions
        ]

        description = f"In the video {video_name}, " + " ".join(sentences)
        data.append({'video': video_name, 'description': description})

    # newline='' lets the csv module control line endings itself.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['video', 'description'])
        writer.writeheader()
        writer.writerows(data)

# Rebuild the per-video segments from the filtered detections and write the
# generated sentences to a CSV file in the current working directory.
dictionary = group_activities(filtered_df)
output_csv = "TVSum_action_sentences_mmaction2.csv"

generate_sentences_and_save(dictionary, video_folder, output_csv)

* **MSVD dataset**

In [None]:
# Load the MSVD action table and remove the literal ".avi" from the values of
# the 'video' column so they hold bare video names.
# Note: this rebinds `df`, replacing the frame-level table from the cells above.
df=pd.read_csv('MSVD_actions.csv')
df['video'] = df['video'].str.replace('.avi', '', regex=False)
df

In [None]:
# The MSVD videos are short snippets of a single action, so the duration of
# the entire video is used as the end time of the action.
video_folder_path = '/home/wikaaxx/Desktop/thesis/datasets/MSVD/MSVD_videos'  # set this to the folder containing the MSVD videos

def get_video_info(video_folder_path, df):
    """Collect the action label and full-clip time span of every annotated video.

    Args:
        video_folder_path: Directory that is scanned for ``.avi`` files.
        df: DataFrame with a ``video`` column (video name without extension)
            and an ``action`` column.

    Returns:
        Dict mapping each video file name (including ``.avi``) to
        ``{'Action': ..., 'Start': ..., 'End': ...}``, where ``Start`` is the
        string form of a zero ``timedelta`` and ``End`` the string form of the
        clip duration (frame count divided by FPS). Videos without a row in
        ``df`` are skipped.

    Raises:
        ValueError: If the frame rate of an annotated video cannot be read.
    """
    # Build the name -> action lookup once; the first row wins when a video
    # name appears more than once.
    action_by_video = {}
    for name, action in zip(df['video'], df['action']):
        action_by_video.setdefault(name, action)

    video_dict = {}
    for video_file in os.listdir(video_folder_path):
        if not video_file.endswith('.avi'):
            continue

        # Strip only the final extension so names containing dots stay intact.
        video_name = os.path.splitext(video_file)[0]
        if video_name not in action_by_video:
            # No annotation for this file: skip it rather than reuse the label
            # of a previously processed video.
            continue

        video = cv2.VideoCapture(os.path.join(video_folder_path, video_file))
        try:
            fps = video.get(cv2.CAP_PROP_FPS)
            total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        finally:
            # Always free the decoder handle, even if a property read fails.
            video.release()

        if not fps or fps <= 0:
            raise ValueError(f"Could not extract FPS for video: {video_file}")

        video_dict[video_file] = {
            'Action': action_by_video[video_name],
            'Start': str(timedelta(seconds=0)),
            'End': str(timedelta(seconds=total_frames / fps)),
        }

    return video_dict
# Build the per-video annotations (action label plus start/end time) for MSVD.
annotations_dict = get_video_info(video_folder_path, df)
annotations_dict

In [None]:
# creating sentences like in ActivityNet
def convert_time(time_str):
    """Turn an "H:MM:SS[.ffffff]" string into a spoken-style duration.

    The seconds part is truncated to a whole number. Hours and minutes are
    only mentioned when needed: once hours are non-zero the minutes are always
    shown, otherwise minutes appear only when non-zero.

    Raises:
        ValueError: If ``time_str`` does not consist of exactly three
            colon-separated numeric fields.
    """
    hour_part, minute_part, second_part = time_str.split(":")
    hours, minutes = int(hour_part), int(minute_part)
    seconds_text = f"{int(float(second_part))} second(s)"

    if hours > 0:
        return f"{hours} hour(s), {minutes} minute(s) and {seconds_text}"
    if minutes > 0:
        return f"{minutes} minute(s) and {seconds_text}"
    return seconds_text

# Turn every MSVD annotation into a one-sentence description and save the
# result as a CSV with the columns 'video' and 'description'.
video_descriptions = []
for video, annotation in annotations_dict.items():
    label = annotation['Action'].lower()
    # 'Start' / 'End' are "H:MM:SS"-style strings; make them human-readable.
    start_time = convert_time(annotation['Start'])
    end_time = convert_time(annotation['End'])
    time_range = f"from {start_time} to {end_time}"

    sentences = []
    sentences.append(f"In the video {video}: ")
    sentence = f"action {label} was detected {time_range}."
    sentences.append(sentence)

    # NOTE(review): the prefix already ends with a space, so joining with " "
    # puts two spaces after the colon - confirm whether that is intended.
    description = " ".join(sentences)
    video_descriptions.append({"video": video, "description": description})
df_output = pd.DataFrame(video_descriptions)
# index=False keeps the DataFrame's row index out of the file.
df_output.to_csv("MSVD_action_sentences_mmaction2.csv", index=False)