In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from English_to_IPA import conversion
import re
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from num2words import num2words

In [2]:
# Eight category labels; presumably they name the columns of the feature
# table loaded below, in order -- TODO confirm against the table file.
categories = [
    'Oral Cavity',
    'Mouth Openness',
    'Obstruent',
    'Tongue Positioning',
    'Resonance',
    'Vowel',
    'Round',
    'Tense',
]

# Build a lookup from the first tab-separated field of every row to a numpy
# array made from the remaining fields of that row.
eSPEDict = {}
# NOTE(review): no explicit encoding is given, so the platform default is used.
with open('eSPEPhonologicalTableV2') as table_file:
    for raw_row in table_file:
        fields = raw_row.strip().split('\t')
        # NOTE(review): eval() executes whatever expression the file contains;
        # only use this with a trusted table file.
        eSPEDict[fields[0]] = np.array([eval(cell) for cell in fields[1:]])

In [3]:
def split_text(text):
    """Split ``text`` into a list of lowercase words.

    Punctuation is first turned into spaces by ``clean_text``. The cleaned
    text is then split on any run of whitespace, so words separated by
    newlines or tabs end up as separate tokens instead of staying glued
    together around the whitespace character.

    Args:
        text: the raw string to tokenize.

    Returns:
        A list of non-empty, lowercased word strings (empty for blank input).
    """
    # str.split() without arguments splits on every whitespace run and never
    # yields empty strings, so no extra filtering or regex collapsing is needed.
    return [word.lower() for word in clean_text(text).split()]

def clean_text(text):
    """Return ``text`` with each listed punctuation character swapped for a space.

    The straight apostrophe is not part of the list, so it is left in place.
    """
    punct_str = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~«»“…‘”'
    # Translation table: every punctuation code point maps to a single space.
    to_space = dict.fromkeys(map(ord, punct_str), ' ')
    return text.translate(to_space)
    
def is_number(word):
    """Report whether ``word`` can be parsed by ``int()``.

    Returns True when ``int(word)`` succeeds and False when it raises
    ``ValueError``; any other exception propagates to the caller.
    """
    try:
        int(word)
    except ValueError:
        return False
    return True

def change_number_to_string(number):
    """Convert ``number`` to its spelled-out form.

    The argument is first passed through ``int()`` and the resulting integer
    is handed to ``num2words``, whose return value is returned unchanged.
    """
    as_int = int(number)
    return num2words(as_int)

def is_empty(sequence):
    """Tell whether ``sequence`` has a length of zero."""
    return len(sequence) == 0

In [4]:
# Load the dev-split metadata; the first CSV column becomes the row index.
valid_dev = pd.read_csv('common-voice/cv-valid-dev.csv',index_col=0)
# Preview the first rows as the cell output.
valid_dev.head()

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration,energy_score,text_len,ambiguous,emotion
0,cv-valid-dev/sample-000000.mp3,be careful with your prognostications said the...,1,0,,,,,[ 49 24 117 30 12 14 7 16 8],8,yes,"['anger', 'joy']"
1,cv-valid-dev/sample-000001.mp3,then why should they be surprised when they se...,2,0,,,,,[36 19 85 17 9 11 5 10 10],10,no,"['disgust', 'anger']"
2,cv-valid-dev/sample-000002.mp3,a young arab also loaded down with baggage ent...,2,0,,,,,[ 55 39 124 28 22 21 8 13 13],13,no,"['anticipation', 'disgust']"
3,cv-valid-dev/sample-000003.mp3,i thought that everything i owned would be des...,3,0,,,,,[36 17 89 21 5 12 8 12 9],9,no,"['sadness', 'surprise']"
4,cv-valid-dev/sample-000004.mp3,he moved about invisible but everyone could he...,1,0,fourties,female,england,,[ 40 18 106 14 11 15 9 8 9],9,no,"['fear', 'sadness']"


In [5]:
# Load the train-split metadata; the first CSV column becomes the row index.
# 'ambiguous' and 'emotion' are read with explicit str/object dtypes
# (presumably because the columns are only partly filled in -- TODO confirm).
valid_train = pd.read_csv('common-voice/cv-valid-train.csv',dtype={'ambiguous': str,'emotion':object},index_col=0)
# Preview the first rows as the cell output.
valid_train.head()

Unnamed: 0,Unnamed: 0.1,filename,text,up_votes,down_votes,age,gender,accent,duration,energy_score,text_len,ambiguous,emotion
0,0,cv-valid-train/sample-000000.mp3,learn to recognize omens and follow them the o...,1,0,,,,,[45 31 99 25 21 16 4 13 12],12,yes,"['disgust', 'trust']"
1,1,cv-valid-train/sample-000001.mp3,everything in the universe evolved he said,1,0,,,,,[25 16 77 17 8 12 7 9 7],7,yes,"['fear', 'trust']"
2,2,cv-valid-train/sample-000002.mp3,you came so that you could learn about your dr...,1,0,,,,,[ 62 25 109 26 11 16 14 14 14],14,no,"['trust', 'joy']"
3,3,cv-valid-train/sample-000003.mp3,so now i fear nothing because it was those ome...,1,0,,,,,[ 53 29 120 24 12 18 10 17 15],15,yes,"['joy', 'anger']"
4,4,cv-valid-train/sample-000004.mp3,if you start your emails with greetings let me...,3,2,,,,,[ 67 26 150 35 13 20 11 22 17],17,no,['trust']


In [6]:
# Load the test-split metadata; the first CSV column becomes the row index.
valid_test = pd.read_csv('common-voice/cv-valid-test.csv',index_col=0)
# Preview the first rows as the cell output.
valid_test.head()

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration,energy_score,text_len
0,cv-valid-test/sample-000000.mp3,without the dataset the article is useless,1,0,,,,,[36 18 86 20 3 13 5 14 7],7
1,cv-valid-test/sample-000001.mp3,i've got to go to him,1,0,twenties,male,,,[15 12 42 3 7 6 2 6 6],6
2,cv-valid-test/sample-000002.mp3,and you know it,1,0,,,,,[11 9 15 4 2 4 3 4 4],4
3,cv-valid-test/sample-000003.mp3,down below in the darkness were hundreds of pe...,4,0,twenties,male,us,,[ 45 21 118 26 14 17 5 16 12],12
4,cv-valid-test/sample-000004.mp3,hold your nose to keep the smell from disablin...,2,0,,,,,[ 60 30 125 27 21 16 10 16 12],12


In [7]:
def get_score(text):
    """Sum the per-symbol feature vectors of every word in ``text``.

    The text is tokenized with ``split_text``. Tokens that parse as integers
    are spelled out with ``change_number_to_string``. Each word is converted
    with ``conversion.convert`` and the IPA string it returns is scanned left
    to right; at each position a two-character symbol is used when
    ``eSPEDict`` has an entry for it, otherwise the single character is
    looked up.

    Words whose IPA string contains ``'*'`` are skipped and not counted
    (presumably the converter's marker for a missing transcription --
    TODO confirm).

    Args:
        text: the sentence to score.

    Returns:
        A numpy integer array of length 9: the 8 accumulated feature sums
        followed by the number of words that were scored.

    Raises:
        KeyError: if a single IPA character has no entry in ``eSPEDict``.
    """
    word_count = 0
    # Use the builtin int: the ``np.int`` alias was deprecated in NumPy 1.20
    # and removed in 1.24, where it raises AttributeError.
    score = np.zeros((8,), dtype=int)
    for word in split_text(text):
        if is_number(word):
            word = change_number_to_string(word)
        _, ipa = conversion.convert(word)
        # Strip these five characters individually (the string is iterated
        # character by character, it is not a regex): '[', 'ˌ', 'ˈ', ' ', ']'.
        for mark in '[ˌˈ ]':
            ipa = ipa.replace(mark, '')
        if '*' in ipa:
            continue
        word_count += 1
        pos = 0
        while pos < len(ipa):
            pair = ipa[pos:pos + 2]
            # Prefer a two-character symbol when one is available in the table;
            # at the last character ``pair`` has length 1 and falls through.
            if len(pair) == 2 and pair in eSPEDict:
                score += eSPEDict[pair]
                pos += 2
            else:
                score += eSPEDict[ipa[pos]]
                pos += 1
    # Append the count of scored words as the final element.
    return np.append(score, word_count)

def len_words(text):
    """Count the whitespace-separated words in ``text``.

    Splitting is done on any run of whitespace, so an empty or blank string
    yields 0 and repeated, leading or trailing spaces do not add phantom
    words to the count.

    Args:
        text: the string to measure.

    Returns:
        The number of words as an int.
    """
    return len(text.split())

import requests
import json

def get_emotion(text,
                url='http://140.114.77.14:8080/webresources/jammin/emotion',
                timeout=30,
                retries=1):
    """Query the emotion web service for ``text``.

    The text is POSTed as a JSON document with ``lang`` set to ``"und"``.
    A failed request is retried up to ``retries`` times before the error is
    re-raised, and every attempt is bounded by ``timeout`` so a stalled
    connection cannot block the caller forever.

    Args:
        text: the sentence to classify.
        url: endpoint of the emotion service.
        timeout: seconds to wait for the service on each attempt.
        retries: number of additional attempts after the first failure.

    Returns:
        A tuple ``(ambiguous, names)`` where ``ambiguous`` is the value of the
        response's ``'ambiguous'`` field and ``names`` is the list of the
        ``'name'`` values of every entry in the response's ``'groups'`` field.

    Raises:
        requests.RequestException: if all attempts to reach the service fail.
    """
    body = json.dumps({"lang": "und", "text": text})
    failures = 0
    while True:
        try:
            response = requests.post(url, data=body, timeout=timeout)
            break
        except requests.RequestException:
            # Give up once the retry budget is spent; otherwise try again.
            if failures >= retries:
                raise
            failures += 1
    result = response.json()
    return result['ambiguous'], [group['name'] for group in result['groups']]

In [8]:
#valid_dev['energy_score'] = valid_dev['text'].apply(get_score)

In [9]:
#valid_train['energy_score'] = valid_train['text'].apply(get_score)

In [10]:
#valid_test['energy_score'] = valid_test['text'].apply(get_score)

In [11]:
#valid_train['text_len'] = valid_train['text'].apply(len_words)
#valid_dev['text_len'] = valid_dev['text'].apply(len_words)
#valid_test['text_len'] = valid_test['text'].apply(len_words)

# Get the emotion labels for each text sample

In [12]:
# for i, item in valid_dev.iterrows():
#     ambiguous, emotion = get_emotion(item['text'])
#     valid_dev.loc[i,'ambiguous'] =  ambiguous
#     valid_dev.loc[i,'emotion'] = str(emotion)
#     if i % 100 == 0:
#         #valid_dev.to_csv('common-voice/cv-valid-dev.csv')
#         print('Saved at index',i)
# #valid_dev.to_csv('common-voice/cv-valid-dev.csv')
# valid_dev.head()
    

In [15]:
# valid_train['ambiguous'], valid_train['emotion'] = valid_train['text'].apply(get_emotion)
# Only rows whose 'emotion' is still missing are processed: a missing value
# is NaN, and NaN is the one value that does not compare equal to itself.
unlabelled = valid_train[valid_train.emotion != valid_train.emotion]
for row_id, row in unlabelled.iterrows():
    ambiguous, emotion = get_emotion(row['text'])
    valid_train.loc[row_id, 'ambiguous'] = ambiguous
    valid_train.loc[row_id, 'emotion'] = str(emotion)
    # Progress message every 100 rows.
    if row_id % 100 == 0:
        print('at index', row_id)
    # Checkpoint the frame to disk every 500 rows.
    if row_id % 500 == 0:
        valid_train.to_csv('common-voice/cv-valid-train.csv')
        print('Saved at index', row_id)
# Final save once every missing row has been filled in.
valid_train.to_csv('common-voice/cv-valid-train.csv')
valid_train.head()

Unnamed: 0,Unnamed: 0.1,filename,text,up_votes,down_votes,age,gender,accent,duration,energy_score,text_len,ambiguous,emotion
0,0,cv-valid-train/sample-000000.mp3,learn to recognize omens and follow them the o...,1,0,,,,,[45 31 99 25 21 16 4 13 12],12,yes,"['disgust', 'trust']"
1,1,cv-valid-train/sample-000001.mp3,everything in the universe evolved he said,1,0,,,,,[25 16 77 17 8 12 7 9 7],7,yes,"['fear', 'trust']"
2,2,cv-valid-train/sample-000002.mp3,you came so that you could learn about your dr...,1,0,,,,,[ 62 25 109 26 11 16 14 14 14],14,no,"['trust', 'joy']"
3,3,cv-valid-train/sample-000003.mp3,so now i fear nothing because it was those ome...,1,0,,,,,[ 53 29 120 24 12 18 10 17 15],15,yes,"['joy', 'anger']"
4,4,cv-valid-train/sample-000004.mp3,if you start your emails with greetings let me...,3,2,,,,,[ 67 26 150 35 13 20 11 22 17],17,no,['trust']


In [16]:
# Label every row of the test split with the emotion service's answer.
for i, item in valid_test.iterrows():
    ambiguous, emotion = get_emotion(item['text'])
    valid_test.loc[i, 'ambiguous'] = ambiguous
    valid_test.loc[i, 'emotion'] = str(emotion)
    # Progress message every 100 rows, checkpoint to disk every 500 rows.
    if i % 100 == 0:
        print('at index', i)
        if i % 500 == 0:
            # Save the test frame itself: writing valid_train here would
            # overwrite the test CSV with the training data.
            valid_test.to_csv('common-voice/cv-valid-test.csv')
            print('Saved at index', i)
# Final save and preview of the frame that was just labelled.
valid_test.to_csv('common-voice/cv-valid-test.csv')
valid_test.head()

at index 0
Saved at index 0


KeyboardInterrupt: 