## WORKFLOW EXTRACTION
<!-- Prerequisites -->

Ensure Python and the required libraries are installed. Install dependencies using: `pip install pandas`

In [3]:
import pandas as pd
import os


<!-- Features -->

<!-- Reads categorized and clustered CSV files containing workflow step timestamps.

Computes the earliest start time and latest end time for each workflow step.

Converts timestamps from seconds into hh:mm:ss format.

Saves the final timestamp ranges for each workflow step into a new CSV file. -->

In [None]:

# Folder the input CSVs are read from, and the folder that receives one
# timestamp-range CSV per input file (created on first run if missing).
input_dir = '/Users/nvaishnavi/Documents/Instructional_Video_analysis/5_clustered_categorized_file'
output_dir = '/Users/nvaishnavi/Documents/Instructional_Video_analysis/6_final_timestamp_ranges'
os.makedirs(output_dir, exist_ok=True)

# Workflow step labels keyed by their numeric id (0..6); enumerate() assigns
# the ids from each label's position in the list.
workflow_steps = dict(enumerate([
    "Introduction",
    "Step-by-Step Instruction",
    "Context Setting",
    "Transitions",
    "Recap",
    "Practical Application",
    "Conclusion",
]))


def seconds_to_time(seconds):
    """Format a duration given in seconds as an ``hh:mm:ss`` string.

    Args:
        seconds: Duration in seconds; anything ``float()`` accepts (number or
            numeric string), or ``None``.

    Returns:
        The zero-padded ``hh:mm:ss`` text with fractional seconds truncated.
        ``"00:00:00"`` is returned when *seconds* is ``None`` or cannot be
        converted (``ValueError`` / ``TypeError``).
    """
    fallback = "00:00:00"
    if seconds is None:
        return fallback

    try:
        total = float(seconds)
        whole_minutes, secs = divmod(total, 60)
        hrs, mins = divmod(whole_minutes, 60)
        # int() raises ValueError for NaN, which lands in the fallback below.
        return "{:02}:{:02}:{:02}".format(int(hrs), int(mins), int(secs))
    except (ValueError, TypeError):
        return fallback


def _timestamp_to_seconds(value):
    """Convert an ``hh:mm:ss`` timestamp (seconds may be fractional) to seconds.

    Args:
        value: Timestamp cell read from a CSV row. Non-string values are
            converted with ``str`` first so they fail with ``ValueError``
            instead of ``AttributeError``.

    Returns:
        The timestamp as a float number of seconds.

    Raises:
        ValueError: If *value* has fewer than three ``:``-separated fields or
            a field is not numeric.
    """
    parts = str(value).split(":")
    if len(parts) < 3:
        raise ValueError(f"expected hh:mm:ss, got {value!r}")
    # Only the first three fields are used; any extra fields are ignored.
    return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])


# For every CSV in input_dir, find the earliest start and latest end time of
# each workflow step and write them to output_dir as "timestamps_<filename>".
# sorted() gives a reproducible processing order; os.listdir() guarantees none.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith('.csv'):
        continue

    print(f"Processing file: {filename}")

    df = pd.read_csv(os.path.join(input_dir, filename))

    # One {'start', 'end'} record (in seconds) per known workflow step label.
    timestamp_ranges = {step: {'start': None, 'end': None} for step in workflow_steps.values()}
    skipped_rows = 0

    for _, row in df.iterrows():
        workflow_step = row['Workflow Step']
        start_time = row['Start Time']
        end_time = row['End Time']

        # pd.isnull() is also true for None, so it covers empty cells too.
        if pd.isnull(start_time) or pd.isnull(end_time):
            continue

        # Rows whose label is not a known workflow step (including NaN) are
        # skipped instead of aborting the whole batch with a KeyError.
        bounds = timestamp_ranges.get(workflow_step)
        if bounds is None:
            skipped_rows += 1
            continue

        try:
            start_seconds = _timestamp_to_seconds(start_time)
            end_seconds = _timestamp_to_seconds(end_time)
        except ValueError:
            # Malformed timestamp: skip this row and keep processing.
            skipped_rows += 1
            continue

        if bounds['start'] is None or start_seconds < bounds['start']:
            bounds['start'] = start_seconds
        if bounds['end'] is None or end_seconds > bounds['end']:
            bounds['end'] = end_seconds

    if skipped_rows:
        print(f"Warning: skipped {skipped_rows} row(s) with an unknown workflow step or malformed timestamp in {filename}")

    # Steps that never appeared keep None bounds, which seconds_to_time()
    # renders as "00:00:00".
    timestamp_data = [
        [step, seconds_to_time(times['start']), seconds_to_time(times['end'])]
        for step, times in timestamp_ranges.items()
    ]
    timestamp_df = pd.DataFrame(timestamp_data, columns=["Workflow Step", "Start Time", "End Time"])

    output_file = os.path.join(output_dir, f"timestamps_{filename}")
    timestamp_df.to_csv(output_file, index=False)

    print(f"Timestamp ranges saved to: {output_file}")

print("All files processed.")
