# Matching timestamps
The Teams transcriptions come with accurate timestamps for each individual speaker, since every speaker uses a separate microphone. Unfortunately, Whisper has no built-in knowledge of who is saying what. To make up for this, the speech transcribed by Whisper is matched against the timestamps of the Teams transcription.
<br><br>
This is achieved by taking each sentence from the Whisper transcription and looking at its timestamps (start/end times). The sentence is then assigned the speaker tag of the segment in the Teams transcription whose time span overlaps it the most.

In [None]:
import os
import json
import re
import pandas as pd
import numpy as np
import datetime

# Folder that holds the transcription files read and written by this notebook.
folder_path = os.path.join("")
if not os.path.exists(folder_path):
    # Include the path so the warning says which folder is missing.
    print(f"Folder does not exist: {folder_path!r}")

In [None]:
# Load the Teams transcription; the file is tab-separated, which is the
# default delimiter of pd.read_table.
df_teams = pd.read_table(os.path.join(folder_path, "teams.tsv"))

print("Shape of df:", df_teams.shape)
df_teams.head()

In [None]:
# Load the Whisper transcription; the file is tab-separated, which is the
# default delimiter of pd.read_table.
df_whisper = pd.read_table(os.path.join(folder_path, "whisper_transcription_large-v2.tsv"))

print("Shape of df:", df_whisper.shape)
df_whisper.head()

In [None]:
# Function to measure how much two time intervals overlap
def find_overlap(start1, end1, start2, end2):
    """Return the length of the overlap between two intervals.

    The first interval is ``[start1, end1]`` and the second is
    ``[start2, end2]``. The result is expressed in the same unit as the
    arguments. Intervals that do not intersect yield ``0``.
    """
    latest_start = max(start1, start2)
    earliest_end = min(end1, end2)
    shared = earliest_end - latest_start

    # A negative difference means there is a gap between the intervals.
    return 0 if shared < 0 else shared
    

# Function to render a number of seconds as a clock-style timestamp
def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'):
    """Format ``seconds`` as ``[HH:]MM:SS<decimal_marker>mmm``.

    Args:
        seconds: Non-negative duration in seconds; it is rounded to the
            nearest millisecond before formatting.
        always_include_hours: When true, the hours field is emitted even if
            it is zero. Otherwise it only appears when hours > 0.
        decimal_marker: Separator placed between seconds and milliseconds.

    Returns:
        The formatted timestamp string.

    Raises:
        AssertionError: If ``seconds`` is negative.
    """
    assert seconds >= 0, "non-negative timestamp expected"

    total_ms = round(seconds * 1000.0)

    # Peel off hours, minutes and seconds; what is left are milliseconds.
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, millis = divmod(remainder, 1_000)

    prefix = ""
    if always_include_hours or hours > 0:
        prefix = f"{hours:02d}:"
    return f"{prefix}{minutes:02d}:{whole_seconds:02d}{decimal_marker}{millis:03d}"

In [None]:
# Handle offset/misalignment in speech-to-text:
# tag every whisper segment with the speaker of the teams segment it
# overlaps the most, after shifting the teams timestamps by `offset`.

# Shift added to the teams start/end times before they are compared with
# the whisper start/end times.
offset = 200    # milliseconds

# Alternative: calculate the offset from the timestamp of one moment in the
# teams transcription and the timestamp of that moment in the audio.
hour_teams, minute_teams, second_teams = 0, 2, 46
milli_teams = int(hour_teams * 60*60*1000 + minute_teams * 60*1000 + second_teams * 1000)

hour_audio, minute_audio, second_audio = 0, 2, 50
milli_audio = int(hour_audio * 60*60*1000 + minute_audio * 60*1000 + second_audio * 1000)

# Uncomment to use the calculated offset instead of the manual value above.
#offset = milli_audio - milli_teams
print(offset)

# Apply the offset to the teams segments once, instead of once per
# whisper row inside the loop.
teams_starts = (df_teams["start"] + offset).tolist()
teams_ends = (df_teams["end"] + offset).tolist()
teams_speakers = df_teams["speaker"].tolist()

# Collect the speakers in a list and attach them as a column afterwards.
# Writing non-numeric speaker labels cell by cell into a float placeholder
# column relies on a dtype upcast that recent pandas versions deprecate.
speakers = []

# Loop over all rows in whisper transcription
for start1, end1 in zip(df_whisper["start"], df_whisper["end"]):
    # Overlap of the current whisper segment with every (shifted) teams segment
    amount_of_overlap = [
        find_overlap(start1=start1, end1=end1, start2=start2, end2=end2)
        for start2, end2 in zip(teams_starts, teams_ends)
    ]

    # Position of the teams row with the most overlap (first one on ties).
    # NOTE(review): if nothing overlaps, all values are 0 and the first teams
    # row is picked - confirm that this fallback is intended.
    index_max = int(np.argmax(amount_of_overlap))

    # The position is used positionally here, so the result does not depend
    # on the index labels of either dataframe.
    speaker = teams_speakers[index_max]
    speakers.append(speaker)

df_whisper["speaker"] = speakers

# Change column order and save to file
df_whisper = df_whisper[["start", "end", "speaker", "text"]]
df_whisper.to_csv(os.path.join(folder_path, "text_whisper_offset_adjusted.tsv"), index=False, sep="\t")

df_whisper.head(10)