## Transcribing using AWS transcribe 


##### Author: Madhuroopa


#### Steps: 
##### 1. use AWS transcribe and AWS s3 to transcribe and diarize
##### 2. However the results need to be processed to our format 
##### 3. use this notebook to process the .vtt and .json file generated from AWS transcribe



In [1]:
from google.colab import drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd
import json

# Load the JSON file
with open('/content/gdrive/My Drive/Capstone/data/test.json', 'r') as f: ## change the file path to the json file saved path that you got from AWS
    data = json.load(f)

# Extract the relevant information from the JSON file
segments = data['results']['speaker_labels']['segments']
rows = []
for segment in segments:
    start_time = float(segment['start_time'])/60
    end_time = float(segment['end_time'])/60
    speaker_label = segment['speaker_label']
    rows.append([start_time, end_time, speaker_label])

# Create a pandas dataframe from the extracted information
df = pd.DataFrame(rows, columns=['start_time', 'end_time', 'speaker_label'])

df.head(20)

Unnamed: 0,start_time,end_time,speaker_label
0,0.0,0.02765,spk_0
1,0.046817,0.085983,spk_0
2,0.09615,0.144817,spk_0
3,0.1545,0.211333,spk_0
4,0.22065,0.303817,spk_0
5,0.31465,0.329167,spk_0
6,0.349483,0.494667,spk_0
7,0.507983,0.521,spk_0
8,0.5355,0.551333,spk_0
9,0.560167,0.643833,spk_0


In [3]:
df['start_time']=df['start_time'].astype(float)

In [4]:
df['end_time']=df['end_time'].astype(float)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343 entries, 0 to 342
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   start_time     343 non-null    float64
 1   end_time       343 non-null    float64
 2   speaker_label  343 non-null    object 
dtypes: float64(2), object(1)
memory usage: 8.2+ KB


In [6]:
pip install webvtt-py


Collecting webvtt-py
  Downloading webvtt_py-0.4.6-py3-none-any.whl (16 kB)
Collecting docopt (from webvtt-py)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=4723e0bf95acb9386807af765308857797edba7ec6d1b6158ab37321f9d2136b
  Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac
Successfully built docopt
Installing collected packages: docopt, webvtt-py
Successfully installed docopt-0.6.2 webvtt-py-0.4.6


In [7]:

import webvtt

# Load the webvtt file
subtitles = webvtt.read('/content/gdrive/My Drive/Capstone/data/test.vtt') ## change the file path to the .vtt file saved path that you got from AWS

# Initialize an empty list to store the captions
data = []

# Loop through the captions and extract the information

for subtitle in subtitles:
    start_time = subtitle.start.split(':')
    end_time = subtitle.end.split(':')

    # Convert start and end time to minutes
    start_minutes = int(start_time[0])*60 + int(start_time[1]) + float(start_time[2])/60
    end_minutes = int(end_time[0])*60 + int(end_time[1]) + float(end_time[2])/60

    text = subtitle.text.strip()

    # Append the information to the data list
    data.append((start_minutes, end_minutes, text))


# Create a pandas dataframe from the data list
titles = pd.DataFrame(data, columns=['start_time', 'end_time', 'text'])


In [8]:
titles['start_time']=titles['start_time'].astype(float)

In [9]:
titles['end_time']=titles['end_time'].astype(float)

In [10]:
titles.head(90)

Unnamed: 0,start_time,end_time,text
0,0.000150,0.025817,I can record
1,0.046983,0.083983,and we don't have a ton
2,0.096333,0.125650,"of items to get to,"
3,0.132333,0.142000,uh
4,0.154650,0.208983,and I might be able to do one that might be fun
...,...,...,...
85,3.797167,3.840167,"uh thanks for the link too, Sonia on the roles"
86,3.856333,3.865817,ah
87,3.878667,3.901983,product announcements.
88,3.924000,4.012833,So I appreciate you Brian for adding this. I u...


In [11]:
import pandas as pd



# Merge the two tables based on start_time
merged = pd.merge_asof(titles.sort_values('start_time'), df.sort_values('start_time'), on='start_time', direction='backward')

# Drop rows with NaN values in the speaker_label column
merged = merged.dropna(subset=['speaker_label'])

# Rename the columns
merged = merged[['start_time', 'end_time_x', 'speaker_label', 'text']]
merged.columns = ['start_time', 'end_time', 'speaker_label', 'text']

# Reset the index
merged = merged.reset_index(drop=True)

# Print the merged table
print(merged)
merged.head(20)

     start_time   end_time speaker_label  \
0      0.000150   0.025817         spk_0   
1      0.046983   0.083983         spk_0   
2      0.096333   0.125650         spk_0   
3      0.132333   0.142000         spk_0   
4      0.154650   0.208983         spk_0   
..          ...        ...           ...   
963   42.458833  42.533150         spk_0   
964   42.535167  42.548817         spk_0   
965   42.565483  42.573150         spk_0   
966   42.589333  42.602483         spk_0   
967   42.640983  42.670483         spk_3   

                                                  text  
0                                         I can record  
1                              and we don't have a ton  
2                                  of items to get to,  
3                                                   uh  
4      and I might be able to do one that might be fun  
..                                                 ...  
963  Well, uh, this was kind of fun to do a little ...  
964            

Unnamed: 0,start_time,end_time,speaker_label,text
0,0.00015,0.025817,spk_0,I can record
1,0.046983,0.083983,spk_0,and we don't have a ton
2,0.096333,0.12565,spk_0,"of items to get to,"
3,0.132333,0.142,spk_0,uh
4,0.15465,0.208983,spk_0,and I might be able to do one that might be fun
5,0.220833,0.248317,spk_0,if we have a little bit of time.
6,0.255317,0.268167,spk_0,Um
7,0.272667,0.301833,spk_0,"So corporate events,"
8,0.314817,0.325817,spk_0,uh
9,0.34965,0.369667,spk_0,"they,"


In [15]:
merged.to_csv('/content/gdrive/My Drive/Capstone/product_marketing_meeting.csv')