# **Installations**

In [None]:
# Uncomment to wipe the Kaggle working directory before a fresh run.
# !rm -rf /kaggle/working/*

# Install faster-whisper straight from the SYSTRAN GitHub repository
# (-q: quiet output, -U: upgrade if it is already installed).
# NOTE(review): the install is unpinned, so each run may pull a different revision.
!pip install git+https://github.com/SYSTRAN/faster-whisper.git -q -U

In [None]:
# Load the faster-whisper model once; the `faster_whisper` function defined
# further down reads this module-level `model`.
from faster_whisper import WhisperModel
# Name of the Whisper checkpoint handed to WhisperModel.
model_size = "base"
# NOTE(review): device is hard-coded to "cuda", so this presumably needs a
# GPU-enabled session — confirm before running on CPU-only hardware.
model = WhisperModel(model_size, device="cuda", compute_type="float32")

In [None]:
# Install yt-dlp, the command-line tool the download helpers below invoke to
# fetch video and audio (-q: quiet output, -U: upgrade if already installed).
!pip install yt-dlp -q -U

# **Imports**

In [None]:
import subprocess
import re
import json
import time

# **Functions**

In [None]:
def download_audio(video_link):
  """Download the audio track of a video as an m4a file using yt-dlp.

  The file is written to /kaggle/working/<id>.<ext>; yt-dlp itself expands
  the %(id)s and %(ext)s placeholders of the output template.

  Args:
    video_link: URL of the video whose audio should be downloaded.

  Returns:
    None. The exit status of yt-dlp is not inspected.

  Raises:
    FileNotFoundError: if the yt-dlp executable cannot be found on PATH.
  """
  # Build the command as an argument list and run it without a shell.
  # Video URLs routinely contain '&' and '?', which a shell would interpret
  # (backgrounding the command, globbing, or running injected commands).
  # The "--" marks the end of options so a link starting with '-' is still
  # treated as a URL.
  command = [
      "yt-dlp",
      "--format", "m4a",
      "-o", "/kaggle/working/%(id)s.%(ext)s",
      "--",
      video_link,
  ]

  # Execute the command to download the audio
  subprocess.run(command)


def download_video(video_link):
    """Download a video (with audio) using yt-dlp.

    The format selector prefers an mp4 video stream combined with an m4a
    audio stream and otherwise falls back to the best video plus best audio.
    The file is written to /kaggle/working/<id>.<ext>; yt-dlp itself expands
    the %(id)s and %(ext)s placeholders of the output template.

    Args:
        video_link: URL of the video to download.

    Returns:
        None. The exit status of yt-dlp is not inspected.

    Raises:
        FileNotFoundError: if the yt-dlp executable cannot be found on PATH.
    """
    # Build the command as an argument list and run it without a shell.
    # Video URLs routinely contain '&' and '?', which a shell would interpret
    # (backgrounding the command, globbing, or running injected commands).
    # The "--" marks the end of options so a link starting with '-' is still
    # treated as a URL.
    command = [
        "yt-dlp",
        "-f", "bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio",
        "-o", "/kaggle/working/%(id)s.%(ext)s",
        "--",
        video_link,
    ]

    # Execute the command to download the video
    subprocess.run(command)


def get_video_id(video_link):
  """Pull an 11-character video id out of a link.

  The id is the first run of 11 characters from [a-zA-Z0-9_-] that directly
  follows either "v=" or a "/" in the link, so both query-parameter links
  and path-style links are recognised.

  Args:
    video_link: the link text to scan.

  Returns:
    The 11-character id, or None when the link contains no such run.
  """
  id_regex = re.compile(r'(?:v=|\/)([a-zA-Z0-9_-]{11})')
  found = id_regex.search(video_link)

  # Only the captured id (group 1) is of interest, not the "v=" or "/" prefix.
  return found.group(1) if found else None


def faster_whisper(audio_name):
  """Transcribe an audio file with the module-level faster-whisper `model`.

  Prints the detected language together with its probability.

  Args:
    audio_name: the audio input handed to `model.transcribe`.

  Returns:
    The segments object returned by `model.transcribe`.
    NOTE(review): presumably a lazy iterable that can be consumed only
    once — confirm against the faster-whisper documentation.
  """
  transcription = model.transcribe(audio_name, beam_size=5)
  segments, info = transcription

  language_report = "Detected language '%s' with probability %f" % (
      info.language,
      info.language_probability,
  )
  print(language_report)

  return segments


def _format_timestamp(seconds):
  """Format a duration in seconds as "HH:MM:SS.mmm"."""
  # Round to whole milliseconds BEFORE splitting into fields. Rounding only
  # the seconds field afterwards could yield "60.000" (e.g. for 59.9996 s)
  # instead of carrying over into the minutes.
  total_ms = int(round(seconds * 1000))
  hours, remainder_ms = divmod(total_ms, 3600 * 1000)
  minutes, remainder_ms = divmod(remainder_ms, 60 * 1000)
  whole_seconds, millis = divmod(remainder_ms, 1000)
  return "%02d:%02d:%02d.%03d" % (hours, minutes, whole_seconds, millis)


def save_transcription(segments, transcription_file):
  """Write transcription segments to a JSON file.

  Each segment becomes an object with "start_time" and "end_time"
  (formatted "HH:MM:SS.mmm") and "text". The file holds a JSON list of
  these objects, indented by 4 spaces and encoded as UTF-8.

  Args:
    segments: iterable of objects exposing `start` and `end` (seconds) and
      `text` attributes.
    transcription_file: path of the JSON file to create or overwrite.
  """
  data_for_json = [
      {
          "start_time": _format_timestamp(segment.start),
          "end_time": _format_timestamp(segment.end),
          "text": segment.text,
      }
      for segment in segments
  ]

  with open(transcription_file, "w", encoding='utf-8') as json_file:
    # The file is UTF-8, so keep non-ASCII text readable instead of
    # escaping it to \uXXXX sequences; json.load reads both forms alike.
    json.dump(data_for_json, json_file, indent=4, ensure_ascii=False)


def read_transcription(transcription_file):
  """Load and return the parsed content of a UTF-8 JSON transcription file.

  Args:
    transcription_file: path of the JSON file to read.

  Returns:
    Whatever `json.load` produces for the file content.
  """
  with open(transcription_file, "r", encoding='utf-8') as handle:
    return json.load(handle)


def build_dictionaries(transcription_entries):
  """Build the sentence lookup and the word-level inverted index.

  Args:
    transcription_entries: iterable of dicts with 'text', 'start_time' and
      'end_time' keys.

  Returns:
    A tuple (sentence_dict, inverted_index, building_time):
      sentence_dict maps each sentence text to (start_time, end_time).
        Entries with identical text share one key, so a later entry
        overwrites the timestamps of an earlier one.
      inverted_index maps each lowercased word to the list of sentence
        texts that contain it.
      building_time is the elapsed build time in milliseconds.
  """
  sentence_dict, inverted_index = {}, {}

  started = time.time()

  for entry in transcription_entries:
      text = entry['text']
      sentence_dict[text] = (entry['start_time'], entry['end_time'])

      # Lowercase and collapse every run of non-word characters into a
      # space; the set keeps each word once per sentence.
      normalized = re.sub(r'\W+', ' ', text.lower())
      for token in set(normalized.split()):
          inverted_index.setdefault(token, []).append(text)

  elapsed_ms = (time.time() - started) * 1000

  return sentence_dict, inverted_index, elapsed_ms


def search_by_subset(query, inverted_index, sentence_dict):
    """Find the start timestamps of sentences containing every query word.

    The query is lowercased and split on runs of non-word characters, so
    word order and punctuation in the query do not matter.

    Args:
        query: free-text search query.
        inverted_index: mapping word -> list of sentences containing it.
        sentence_dict: mapping sentence -> (start_time, end_time).

    Returns:
        A sorted list of distinct start timestamps. The list is empty when
        the query has no words or any query word is absent from the index.
    """
    tokens = set(re.sub(r'\W+', ' ', query.lower()).split())

    # No words at all: nothing to look up.
    if not tokens:
        return []

    # A single unknown word means no sentence can contain the whole query.
    if any(token not in inverted_index for token in tokens):
        return []

    # Keep only the sentences that appear in every word's posting list.
    matching = set.intersection(*(set(inverted_index[token]) for token in tokens))

    return sorted({sentence_dict[sentence][0] for sentence in matching})

In [None]:
def ASR(video_link):
  """Download a video's audio, transcribe it, and build the search indexes.

  Steps: extract the video id, download the audio, transcribe it, save the
  transcription to "<video id>.json", read it back, and build the sentence
  dictionary and inverted index from the saved entries.

  Args:
    video_link: URL of the video to process.

  Returns:
    A tuple (transcription_entries, sentence_dict, inverted_index).

  Raises:
    ValueError: if no video id can be extracted from `video_link`.
  """
  # Resolve the id first so an unparseable link fails before any download
  # starts, instead of later looking for a file called "None.m4a".
  video_id = get_video_id(video_link)
  if video_id is None:
      raise ValueError(f"Could not extract a video id from link: {video_link!r}")

  # audio preparation
  download_audio(video_link)

  # identify files name
  # download_audio writes to /kaggle/working/<id>.m4a, so open that exact
  # path rather than relying on the current working directory.
  audio_name = f"/kaggle/working/{video_id}.m4a"
  print("audio saved to: ", audio_name, " file")
  transcription_file_name = r"{}.json".format(video_id)

  # run faster-whisper
  segments = faster_whisper(audio_name)

  # save transcription to JSON file
  save_transcription(segments, transcription_file_name)

  # read transcription from json file
  transcription_entries = read_transcription(transcription_file_name)

  # build dictionaries
  sentence_dict, inverted_index, building_time = build_dictionaries(transcription_entries)
  print("Building Dictionaries Time:", building_time , "milliseconds")

  return transcription_entries, sentence_dict, inverted_index

In [None]:
import json

# Called once for each query.
def search(query, sentence_dict, inverted_index,
           output_path="/kaggle/working/start_timestamps.json"):
  """Run a query, report the matches, and save them to a JSON file.

  Prints the matching start timestamps and the lookup time, then writes
  {"timestamps": [...]} (indented by 4 spaces) to `output_path`.

  Args:
    query: free-text search query.
    sentence_dict: mapping sentence -> (start_time, end_time).
    inverted_index: mapping word -> list of sentences containing it.
    output_path: where the JSON result is written; defaults to the Kaggle
      working directory location used by this notebook.

  Returns:
    The list of start timestamps returned by `search_by_subset`.
  """
  # perf_counter is a monotonic, high-resolution clock intended for measuring
  # short intervals; wall-clock time can jump and is coarser.
  started = time.perf_counter()
  start_timestamps = search_by_subset(query, inverted_index, sentence_dict)
  elapsed_ms = (time.perf_counter() - started) * 1000

  print("Start timestamps for query:", start_timestamps)
  print("Hashing Searching Time:", elapsed_ms, "milliseconds")

  with open(output_path, "w", encoding="utf-8") as outfile:
      json.dump({"timestamps": start_timestamps}, outfile, indent=4)

  return start_timestamps

# **Testing**

In [None]:
import json
# Read the video link and the search query from the Kaggle input dataset.
# The context manager closes the file even when json.load raises.
with open('/kaggle/input/youtube-links/video_link.json', encoding='utf-8') as f:
    data = json.load(f)
video_link = data['link']
query = data['query']

# Download, transcribe, and index the video.
transcription_entries, sentence_dict, inverted_index = ASR(video_link)

In [None]:
# Print every transcription entry: its start time, end time, and text.
for segment in transcription_entries:
    print(f"Start Time: {segment['start_time']}, End Time: {segment['end_time']}, Text: {segment['text']}")

In [None]:
# Run the query loaded from the input JSON against the indexes built above;
# search() also prints the matches and the lookup time.
start_timestamps = search(query, sentence_dict, inverted_index)