In [None]:
import pandas as pd

from pathlib import Path
import os
import json
import re

In [None]:
# Mount Google Drive at /content/drive so the paths under that folder used by this
# notebook resolve. The google.colab module is only available inside Colab.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
class config:
    """Filesystem locations used by this notebook, all rooted in the mounted Drive."""

    # Root folder of the task on the mounted Drive.
    BASE_DIR = Path("/content/drive/MyDrive/DataColab Task")
    # Holds the input CSV, the output CSV and the transcripts folder.
    DATA_DIR = BASE_DIR.joinpath("data")
    MODEL_DIR = BASE_DIR.joinpath("models")
    # One transcript file per source video; the file stem is the video id.
    TRANSCRIPTS_DIR = DATA_DIR.joinpath("transcripts")

In [None]:
# Never truncate cell contents when displaying frames: the phrase columns are long.
pd.options.display.max_colwidth = None
df = pd.read_csv(config.DATA_DIR.joinpath("to_fill.csv"))
df.head(5)

Unnamed: 0,first_words,last_words,source_video_id
0,Well knew. This morning police need your help,"gunpoint, beating him and stealing his cell phone.",18246
1,a call. San Francisco firefighters rescued a man,all the way down to the ocean. ocean.,12387
2,"Paul. Meanwhile, the state set a record in","night through conservation, some 4000 conservation is extraordinary.",16859
3,Emergency crews in Florida continue to search for,"in Florida to more than 850,000 homes.",18246
4,But even though the state never ordered rolling,feel since their power got cut out needlessly.,16859


In [None]:
# Sanity check: (rows, columns) of the frame that was just loaded.
df.shape

(18, 3)

In [None]:
# Inspect the column dtypes inferred by read_csv.
df.dtypes

first_words        object
last_words         object
source_video_id     int64
dtype: object

In [None]:
# The transcripts dictionary is keyed by file stem (a string), so the video ids
# have to be strings as well before they can be used as lookup keys.
df = df.astype({"source_video_id": str})
df.dtypes

first_words        object
last_words         object
source_video_id    object
dtype: object

In [None]:
# Manual corrections: the "last_words" of these two rows are wrong in the input file.
# For example, the sentence "all the way down to the ocean. ocean." does not exist
# in the transcript, so the phrase could never be located there.
df.at[1, "last_words"] = "all the way down to the ocean."
df.at[12, "last_words"] = "name or what led to that fall."
df.head()

Unnamed: 0,first_words,last_words,source_video_id
0,Well knew. This morning police need your help,"gunpoint, beating him and stealing his cell phone.",18246
1,a call. San Francisco firefighters rescued a man,all the way down to the ocean.,12387
2,"Paul. Meanwhile, the state set a record in","night through conservation, some 4000 conservation is extraordinary.",16859
3,Emergency crews in Florida continue to search for,"in Florida to more than 850,000 homes.",18246
4,But even though the state never ordered rolling,feel since their power got cut out needlessly.,16859


In [None]:
# Distinct video ids referenced by the rows.
df["source_video_id"].unique()

array(['18246', '12387', '16859'], dtype=object)

In [None]:
# Load every transcript file into a dictionary keyed by the file stem,
# with the parsed JSON content (itself a dictionary) as the value.
transcripts = {}

# sorted() makes the load order deterministic. Directories and hidden entries
# (e.g. notebook checkpoint folders) are skipped because they are not transcripts
# and would make open()/json.load() fail.
for file_dir in sorted(config.TRANSCRIPTS_DIR.iterdir()):
  if not file_dir.is_file() or file_dir.name.startswith("."):
    continue
  # Read as UTF-8 explicitly instead of relying on the platform default encoding.
  with open(file_dir, "r", encoding="utf-8") as read_file:
    transcripts[file_dir.stem] = json.load(read_file)

In [None]:
# Confirm which transcript ids were loaded.
transcripts.keys()

dict_keys(['18246', '16859', '12387'])

In [None]:
def get_start_and_end(first_word_idx, last_word_idx, ts_id):
  """Return the (start, end) timestamps of a character span of a transcript.

  Args:
    first_word_idx: Character offset in the transcript text where the span begins.
    last_word_idx: Character offset just past the end of the span (exclusive).
    ts_id: Key of the transcript in the module-level ``transcripts`` dictionary.

  Returns:
    A tuple ``(start, end)``: the "start" value of the word holding the first
    non-whitespace character of the span and the "end" value of the word holding
    its last non-whitespace character.

  Note:
    Assumes ``transcripts[ts_id]["words"]`` has exactly one entry per
    space-separated token of ``transcripts[ts_id]["text"]``, in the same order
    -- TODO confirm against the transcript format.
  """
  text = transcripts[ts_id]["text"]
  words = transcripts[ts_id]["words"]

  # Slicing tolerates an end offset past the text, direct indexing below does not.
  last_word_idx = min(last_word_idx, len(text))

  # Trim whitespace at both edges of the span. Without this, a span ending in a
  # space makes split(" ") produce a trailing empty token, so the computed position
  # is the word AFTER the span (wrong "end", or IndexError at the end of the text);
  # a span starting with a space resolves to the word BEFORE it.
  while first_word_idx < last_word_idx and text[first_word_idx].isspace():
    first_word_idx += 1
  while last_word_idx > first_word_idx and text[last_word_idx - 1].isspace():
    last_word_idx -= 1

  # Number of space-separated tokens up to the offset, minus one, is the index of
  # the token that contains (or starts at) that offset.
  first_word_pos = len(text[:first_word_idx].split(" ")) - 1
  last_word_pos = len(text[:last_word_idx].split(" ")) - 1
  return words[first_word_pos]["start"], words[last_word_pos]["end"]

In [None]:
# Collected records, one dictionary per processed row (filled by extract_info).
info_ls = []

def extract_info(first_words, last_words, ts_id):
  """Locate a passage in a transcript and record its body and timestamps.

  The passage is the shortest stretch of the transcript text that starts with
  ``first_words`` (leftmost occurrence) and ends with ``last_words``.

  Args:
    first_words: Literal text the passage starts with.
    last_words: Literal text the passage ends with.
    ts_id: Key of the transcript in the module-level ``transcripts`` dictionary.

  Returns:
    The dictionary ``{"start", "end", "body"}`` describing the passage. The same
    dictionary is also appended to the module-level ``info_ls`` list.

  Raises:
    ValueError: If no such passage exists in the transcript.
  """
  text = transcripts[ts_id]["text"]

  # The phrases are plain text, not patterns: escape them so characters such as
  # ".", "?", "(" or "$" match literally. ".*?" is non-greedy so the passage stops
  # at the first occurrence of last_words instead of running to the last one, and
  # DOTALL lets the passage span line breaks.
  pattern = re.escape(first_words) + ".*?" + re.escape(last_words)
  res = re.search(pattern, text, flags=re.DOTALL)
  if res is None:
    raise ValueError(
      f"No passage in transcript {ts_id!r} starts with {first_words!r} "
      f"and ends with {last_words!r}"
    )

  first_word_idx, last_word_idx = res.span()
  start, end = get_start_and_end(first_word_idx, last_word_idx, ts_id)
  # Key order defines the column order of the frame built from info_ls.
  info = {"start": start, "end": end, "body": text[first_word_idx:last_word_idx]}
  info_ls.append(info)
  return info

In [None]:
# Start from an empty list so re-running this cell does not append a second copy of
# every record, which would misalign the records with the rows of df.
info_ls.clear()
# extract_info is called for its side effect (appending to info_ls), so use a plain
# loop over the rows instead of DataFrame.apply.
for row in df.itertuples(index=False):
  extract_info(row.first_words, row.last_words, row.source_video_id)
info_ls[11]

{'start': 534674,
 'end': 548314,
 'body': 'And National Guard teams are also on the ground in the Carolinas, where the cleanup is just getting started. There. President Biden approved emergency declarations in both north and South Carolina. Fasttracking federal aid. '}

In [None]:
# One row per extracted record; the columns come from the dictionary keys.
info_df = pd.DataFrame(data=info_ls)
info_df.iloc[:3]

Unnamed: 0,start,end,body
0,464928,504300,"Well knew. This morning police need your help finding the group behind a string of violent robberies in the Bronx. Investigators are looking for the five suspects you see in these surveillance images. And they say two of the incidents happened just minutes apart on August 18. The first near Olenville Avenue and Waits Place. That's where five people beat a man before stealing his iPhone. Five minutes later, just before midnight, the same group attacked a 47 year old man on Brocks Park East, stealing his backpack. And police say in a third robbery on August 29 at the same location, the suspects robbed a man at gunpoint, beating him and stealing his cell phone."
1,359020,384950,"a call. San Francisco firefighters rescued a man right near the Cliff house. They say he fell over this ledge here early this morning. If you're not familiar with the area, this is kind of just north end of Ocean Beach. You could see the firefighters. They're able to use their ropes and pull that man up about 20ft from the cement barrier down below. Now they say it could have been much worse. It was just about a foot or two away from going even farther down below, all the way down to the ocean."
2,60704,100410,"Paul. Meanwhile, the state set a record in energy demand yesterday at 52,061 isn't too far behind. We were, however, able to narrowly avoid rolling blackouts. State energy officials and governor newsom are crediting Californians for cutting back just enough yesterday to keep the grid from getting. Overwhelmed over the next two days. We're still going to have to be mindful of work yet to be done. That said, what folks were able to accomplish last night through conservation, some 4000 conservation is extraordinary."


In [None]:
# concat(axis=1) aligns on the index, so a row-count mismatch between the two frames
# would not fail but silently produce NaN-padded rows. Fail loudly instead.
if len(info_df) != len(df):
  raise ValueError(
    f"Row count mismatch: df has {len(df)} rows but info_df has {len(info_df)}"
  )
final_df = pd.concat([df, info_df], axis=1)
final_df.head(3)

Unnamed: 0,first_words,last_words,source_video_id,start,end,body
0,Well knew. This morning police need your help,"gunpoint, beating him and stealing his cell phone.",18246,464928,504300,"Well knew. This morning police need your help finding the group behind a string of violent robberies in the Bronx. Investigators are looking for the five suspects you see in these surveillance images. And they say two of the incidents happened just minutes apart on August 18. The first near Olenville Avenue and Waits Place. That's where five people beat a man before stealing his iPhone. Five minutes later, just before midnight, the same group attacked a 47 year old man on Brocks Park East, stealing his backpack. And police say in a third robbery on August 29 at the same location, the suspects robbed a man at gunpoint, beating him and stealing his cell phone."
1,a call. San Francisco firefighters rescued a man,all the way down to the ocean.,12387,359020,384950,"a call. San Francisco firefighters rescued a man right near the Cliff house. They say he fell over this ledge here early this morning. If you're not familiar with the area, this is kind of just north end of Ocean Beach. You could see the firefighters. They're able to use their ropes and pull that man up about 20ft from the cement barrier down below. Now they say it could have been much worse. It was just about a foot or two away from going even farther down below, all the way down to the ocean."
2,"Paul. Meanwhile, the state set a record in","night through conservation, some 4000 conservation is extraordinary.",16859,60704,100410,"Paul. Meanwhile, the state set a record in energy demand yesterday at 52,061 isn't too far behind. We were, however, able to narrowly avoid rolling blackouts. State energy officials and governor newsom are crediting Californians for cutting back just enough yesterday to keep the grid from getting. Overwhelmed over the next two days. We're still going to have to be mindful of work yet to be done. That said, what folks were able to accomplish last night through conservation, some 4000 conservation is extraordinary."


In [None]:
# index=False: the input CSV carries no index column, so do not add an unnamed one
# to the output (it would come back as "Unnamed: 0" when the file is read again).
final_df.to_csv(config.DATA_DIR / "filled.csv", index=False)