# AWS SETUP

In [1]:
import os
import boto3
import re
import copy
import time
from time import gmtime, strftime
from sagemaker import get_execution_role

role = get_execution_role()
region = boto3.Session().region_name

bucket_name='finaldebatebucket' # Replace with your s3 bucket name
prefix = 'sagemaker/FinalProject' # Used as part of the path in the bucket where you store data
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region,bucket_name) # The URL to access the bucket

s3 = boto3.resource('s3')
try:
    if  region == 'us-east-1':
      s3.create_bucket(Bucket=bucket_name)
    else: 
      s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={ 'LocationConstraint': region })
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

# Start Notebook

## Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import requests

import boto3
import tpclean.tpclean as tp

from os import listdir
from __future__ import print_function
from datetime import datetime

## ETL

The goal here is to:
1. go through the Folder of soundfiles to, 
2. get the files trancriped from audio to text
3. transform the text to meaningful metadata aswell as the content
4. load everything into a database

### Extract

`bucket_path`must be : 'https://s3-us-east-2.amazonaws.com/finaldebatebucket'

In [3]:
#load Bucket Content
def find_audios(bucket_name, dtype = "wav"):
    """Get Audiofiles from an s3 bucket. This is meant to run on a Sagemaker instance
    Params:
    --------
    bucket_name : str
    dtype : str
        fileformat to look for e.g. wav """
    
    #connect to S3
    s3 = boto3.resource('s3')
    my_bucket = s3.Bucket(bucket_name)  
    
    s3files=[]
    for my_bucket_object in my_bucket.objects.all():
        filename = my_bucket_object.key.split("/")[-1]
        #check whether object is a wav file
        if dtype in filename.split(".")[-1]:
            s3files.append(filename)
    return s3files

In [4]:
audio_files = find_audios(bucket_name)
audio_files

['AM_101_affirmativeaction_pro.wav',
 'DJ_1_ban-video-games_pro.wav',
 'EH_1_ban-video-games_pro.wav']

In [5]:
#just for testing here uncomment as soon as not needed
filename = audio_files[1]

In [6]:
#load API 
sub_path = "cache-data"
job_uri = f"{bucket_path}/{sub_path}/{filename}"

def transcribe_wav(job_uri, dtype="wav" , lang = 'en-US' ,enforce = False, **kwargs):
    """Transcribe a wav file using a AWS trancribe web API call
    
    Params:
    --------
    job_uri : str
        path to aufiofile in an s3 bucket
    dtype : str
        file format of the audio file
    lang : str
        language spoken in the audiofile
    enforce : bool
        whether or not to enforce doing the transcription job when filename already found in prior joblist
    
    Returns:
    --------
    trans_json : Json-Object
        return from the API Call
    trans_json_uri : str
        url to the transcriptionjob json"""
      
    #Call API
    transcribe = boto3.client('transcribe')
    
    #create Jobname from Filename
    job_name = job_uri.split("/")[-1]
    
    #Check whether file is already transcribed
    jobs = transcribe.list_transcription_jobs()['TranscriptionJobSummaries']
    job_names = [job['TranscriptionJobName'] for job in jobs]
    
    if job_name in job_names:
        print("File already transcribed")
        go_on = enforce
    else:
        go_on = True
    
    #Call for Transcription Job
    if go_on:

        transcribe.start_transcription_job(
            TranscriptionJobName=job_name,
            Media={'MediaFileUri': job_uri},
            MediaFormat= dtype,
            LanguageCode= lang, 
            **kwargs)
    
        #print status update
        while True:
            status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
            if status['TranscriptionJob']['TranscriptionJobStatus'] in ['COMPLETED', 'FAILED']:
                break
            print("Not ready yet...", end="\r")
            time.sleep(5)
        print(status)
    
    #cache outputs
    trans_json = transcribe.get_transcription_job(TranscriptionJobName=job_name)
    trans_json_uri = trans_json["TranscriptionJob"]["Transcript"]["TranscriptFileUri"]
    
    #Insert JSON to DataBase here!
    
    
    
    
    print("Output succesfull")
    return trans_json , trans_json_uri

In [7]:
trans_json_uri = transcribe_wav(job_uri)[1]

File already transcribed
Output succesfull


In [8]:
#load json from URL
r = requests.get(trans_json_uri)
explore = r.json()

### Transform

Now that the desired files are laoded, they can be transformed to get meaningful metadata and the content of the speech

In [9]:
#store full text
fulltext = explore["results"]["transcripts"][0]["transcript"]

In [10]:
df = pd.DataFrame(explore["results"]["items"])
df.head()

Unnamed: 0,alternatives,end_time,start_time,type
0,"[{'confidence': '1.0000', 'content': 'We'}]",1.45,1.14,pronunciation
1,"[{'confidence': '1.0000', 'content': 'should'}]",1.69,1.45,pronunciation
2,"[{'confidence': '0.9717', 'content': 'ban'}]",2.16,1.69,pronunciation
3,"[{'confidence': '1.0000', 'content': 'the'}]",2.32,2.17,pronunciation
4,"[{'confidence': '1.0000', 'content': 'sale'}]",2.77,2.32,pronunciation


In [11]:
#unnest the data using tpclean
df = tp.unnest_df_list(df,["alternatives"])
df = tp.unnest_df_dict(df,["alternatives_1"])
df.rename({"alternatives_1_confidence":"confidence", 
           "alternatives_1_content": "content"}, 
          axis = "columns", inplace = True)

In [12]:
df.head()

Unnamed: 0,end_time,start_time,type,confidence,content
0,1.45,1.14,pronunciation,1.0,We
1,1.69,1.45,pronunciation,1.0,should
2,2.16,1.69,pronunciation,0.9717,ban
3,2.32,2.17,pronunciation,1.0,the
4,2.77,2.32,pronunciation,1.0,sale


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 577 entries, 0 to 576
Data columns (total 5 columns):
end_time      526 non-null object
start_time    526 non-null object
type          577 non-null object
confidence    577 non-null object
content       577 non-null object
dtypes: object(5)
memory usage: 22.6+ KB


In [14]:
#convert columns containing numbers into float datatype
for col in df.columns:
    try:
        df[col] = df[col].astype("float")
    except:
        continue

In [15]:
def get_pause(df,start_time,end_time):
    """Converts the end time of a word and the start time of the next word into the pause between these words
    Params:
    --------
    df : pandas DataFrame
        Dataframe containing the timestamps
    start_time : str
        Columnn name ot the start_time stamps
    end_time : str
        Columnn name ot the end_time stamps
    
    Returns:
    --------
    df : pandas DataFrame
        updated Dataframe containing a "pause_after" column
    """
    pause_after = []
    
    for i in range(len(df)-1):
        j=1
        
        #if next item i nan keep looking forward until it isn't
        while (np.isnan(df[start_time][i+j])) and (i+j <len(df)-1):
            j +=1
        pause_after.append(df[start_time][i+j]-df[end_time][i])
    
    #add zero to the end and push to dataframe
    df["pause_after"] = pd.Series(pause_after).append(pd.Series({len(pause_after):0}))
    return df

In [22]:
#engineer length of word and pauses between words
df["length"] = df.end_time-df.start_time
get_pause(df,"start_time","end_time");

#append filename
df["origin"] = filename

#append default speaker for now
df["speaker"] = "speaker_default"

#append word 
df.reset_index().rename({"index":"pos_in_conv"},axis = "columns");

In [17]:
df.head()

Unnamed: 0,end_time,start_time,type,confidence,content,length,pause_after,origin,speaker
0,1.45,1.14,pronunciation,1.0,We,0.31,0.0,DJ_1_ban-video-games_pro.wav,speaker_default
1,1.69,1.45,pronunciation,1.0,should,0.24,0.0,DJ_1_ban-video-games_pro.wav,speaker_default
2,2.16,1.69,pronunciation,0.9717,ban,0.47,0.01,DJ_1_ban-video-games_pro.wav,speaker_default
3,2.32,2.17,pronunciation,1.0,the,0.15,0.0,DJ_1_ban-video-games_pro.wav,speaker_default
4,2.77,2.32,pronunciation,1.0,sale,0.45,0.0,DJ_1_ban-video-games_pro.wav,speaker_default


In [18]:
import mysql.connector

In [19]:
!ls

Cache_Data  Private  README.md	Testbook.ipynb


In [20]:
from Private.private import user
from Private.private import password

In [24]:
conn = mysql.connector.Connect(
    host='debaterdb.c7oenlqovcjd.us-east-2.rds.amazonaws.com',
    user="piet",
    password="philipsWe200")

InterfaceError: 2003: Can't connect to MySQL server on 'debaterdb.c7oenlqovcjd.us-east-2.rds.amazonaws.com:3306' (110 Connection timed out)