In [1]:
import pandas as pd
import os
import librosa
import numpy as np
import scipy
import re


#Data handling
from sklearn.model_selection import train_test_split

#Pickling
from six.moves import cPickle as pickle

#Metrics
from sklearn.metrics import recall_score, precision_score, f1_score

#HTTP
import requests
import json

#
import collections

import matplotlib.pyplot as plt
import matplotlib.style as ms
import librosa.display
import IPython.display as ipd
import seaborn
ms.use('seaborn-muted')
%matplotlib inline


In [205]:
def get_pattern(text):
    text = json.dumps(text)
    url = 'http://192.168.2.101:7878/api/get_patt'
    data = dict(input_tweets = text)
    resp = requests.post(url=url, data=data)
    r = json.loads(resp.text)
    return map(lambda x: x['pattern'],r)
    
def get_deep_emotion(text):
    text = json.dumps(text)
    url = 'http://192.168.2.101:7878/api/get_emo'
    data = dict(input_tweets = text)
    resp = requests.post(url=url, data=data)
    r = json.loads(resp.text)
    return r

def clean_text(text, remove_actions = True):
    punct_str = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~«»“…‘”'
    if(remove_actions):
        text = re.sub(r" ?\[[^)]+\]", "", text)
    for p in punct_str:
        text = text.replace(p,' ')
    text = re.sub(' +', ' ', text)
    return text.lower().strip()

def get_f1_score(precision,recall):
    return 2 * (precision * recall) / (precision + recall)

def get_patterns_load(data,patterns_df, emotion):
    index = data[data.emotion == emotion ]['index']
    patterns = patterns_df.loc[list(index)]
    load = []
    for pat in patterns.iterrows():
        load = load + list(pat[1].dropna())
    return load


def extract_patterns(data,extract=False):
    if(extract):
        patterns = {}
        for index, row in data.iterrows():
            patterns[row['index']] = set(get_pattern([row['text']])[0].values())
            print('Extracted pattern from '+ row['index'] + ' index:'+ str(index))
            print('Size: ', len(patterns[row['index']]), 'Patterns size', len(patterns))
        try:
            print('Saving Pickle')
            with open('pickles/patterns/pattern.pickle','wb') as f:
                save = {
                    'patterns' : patterns
                }
                pickle.dump(save,f,pickle.HIGHEST_PROTOCOL)
                print('Successfully saved in pattern.pickle')
                return patterns
        except Exception as e:
            print('Unable to save data to pickle', e)
            print('Patterns probably not saved.')
            return patterns
    else:
        try:
            with open('pickles/patterns/pattern.pickle','rb') as f:
                save = pickle.load(f)
                patterns = save['patterns']
                del save
                returning = {}
                for key in list(data['index']):
                    returning[key] = patterns[key]
                return returning
        except Exception as e:
            print('Error loading base datasets pickle: ', e)
            

def build_emotions_counter(data,patterns_df):
    emotions_counter ={}
    emotions_list = list(data['emotion'].unique())
    for emotion in emotions_list:
        load = get_patterns_load(data,patterns_df,emotion)
        emotions_counter[emotion] = collections.Counter(load)
    return emotions_counter

def build_frequencyframe(all_patterns,emotions_counter):
    df_patt = {}
    for pattern in all_patterns:
        df_patt[pattern] = {}
        for emotion in emotions_counter:
            df_patt[pattern][emotion] = emotions_counter[emotion][pattern]
    return pd.DataFrame(df_patt).T

def build_pfief(df_patt):
    ief = ((df_patt+1).rdiv(df_patt.sum(axis=1)+1, axis=0)+1).apply(np.log10)
    pf = ((df_patt.sum(axis=0)+1)/(df_patt+1)).apply(np.log10)
    return ief * pf

def balance_data(data):
    min_sample = min(data.groupby('emotion').count()['index'])
    emotions_list = list(data['emotion'].unique())
    samples = []
    for emotion in emotions_list:
        samples.append(data[data.emotion == emotion].sample(n=min_sample))
    result = pd.concat(samples).sample(frac=1)
    return result
        
def two_emotions(data,emotional_mapping,emotion1,emotion2):
    emotion_code = emotional_mapping[emotion1]
    emotion_sample = data[data.emotion_code == emotion_code]
    emotion_code2 = emotional_mapping[emotion2]
    emotion_sample2 = data[data.emotion_code == emotion_code2]
    if(len(emotion_sample2) < len(emotion_sample)):
        emotion_sample = emotion_sample.sample(n=len(emotion_sample2))
    else:
        emotion_sample2 = emotion_sample2.sample(n=len(emotion_sample))
    sample = pd.concat([emotion_sample,emotion_sample2]).sample(frac=1)
    return sample

def filter_word_count(data, n_count):
    return data[map(lambda x: len(x.split(' ')) >= n_count,data['text'])]

def remove_empty_patterns(data,patterns):
    empty_patterns = [k for k, v in patterns.iteritems() if len(v) < 1]
    patterns = { k:v for k, v in patterns.iteritems() if len(v) > 1 }
    data = filter(lambda x: x[1]['index'] not in empty_patterns ,data.iterrows())
    data = pd.DataFrame.from_items(data).T
    return data,patterns

In [None]:
# full = generate_IEMOCAP_df()
data = pd.read_csv('data/IEMOCAP_sentences.csv',index_col=0)

emotional_mapping = {'ang': 0, 'sad': 1, 'exc': 2, 'neu': 3,'fru': 4,'hap': 5,'fea': 6,'sur': 7,'dis': 8, 'xxx':9,'oth':10}
data['emotion_code'] = data['emotion'].map( emotional_mapping ).astype(int)

# Take away fear, surprise,disgust, xxx and others. Not enough data
data = data[data.emotion_code < 6]

# Clean Transcripts
data['text'] = data['text'].apply(clean_text)

# Filter Word Count
data = filter_word_count(data, 3)

patterns = extract_patterns(data) 
data,patterns = remove_empty_patterns(data,patterns)
data.groupby('emotion').count()['index']

In [186]:
# data = two_emotions(data,emotional_mapping,'neu','hap')
# Balance Data
data = balance_data(data)
y = data.emotion_code
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2)

In [188]:
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))


2491
2491
623
623


In [189]:
patterns = extract_patterns(X_train) 
# X_train,patterns = remove_empty_patterns(X_train,patterns)

patterns_df = pd.DataFrame.from_dict(patterns, orient='index')
emotions_counter = build_emotions_counter(X_train,patterns_df)
all_patterns = []
for pat in patterns_df.iterrows():
    all_patterns = all_patterns + list(pat[1].dropna())

In [190]:
df_patt = build_frequencyframe(all_patterns,emotions_counter)
em_df = build_pfief(df_patt)

In [191]:
df_patt.head(5)

Unnamed: 0,ang,exc,fru,hap,neu,sad
.+ .+ a,81,84,77,70,70,76
.+ .+ and,100,62,66,56,52,95
.+ .+ for,34,30,52,29,24,32
.+ .+ i,123,124,155,113,96,144
.+ .+ if,28,11,27,13,14,10


In [192]:
em_df.head(5)

Unnamed: 0,ang,exc,fru,hap,neu,sad
.+ .+ a,1.869813,1.784834,1.952301,1.948413,1.943972,1.940252
.+ .+ and,1.583129,2.098516,2.089434,2.172285,2.263011,1.633768
.+ .+ for,2.202732,2.323134,1.704206,2.31504,2.567713,2.27672
.+ .+ i,1.789243,1.735523,1.555995,1.788325,1.974476,1.608623
.+ .+ if,1.808011,3.019062,1.86872,2.718888,2.610125,3.208357


# Score

In [193]:
trainpatterns = patterns
patterns_list = np.array(list(em_df.index))
frequency_vectors = []
for index in trainpatterns:
    frequency_vectors.append(np.isin(patterns_list,np.array(list(trainpatterns[index]))))
train_vectors = pd.DataFrame(frequency_vectors,columns=patterns_list,index=trainpatterns.keys())
train_vectors = train_vectors * 1

In [194]:
em_matrix = em_df.T.as_matrix()
emotional_scores = []
for index, vector in train_vectors.iterrows():
    emotional_scores.append(em_matrix.dot(vector))
emotions_list = list(em_df.columns)
scores = pd.DataFrame(emotional_scores,columns=emotions_list,index=trainpatterns.keys())
scores['pred_emotion'] = map(lambda x: x[1].idxmax(),scores.iterrows())
scores['pred_code'] = scores['pred_emotion'].map(emotional_mapping).astype(int)
# scores
pred_y = list(scores['pred_code'])

In [195]:
print(len(y_train))
print(len(pred_y))

2491
2491


In [196]:
precision = precision_score(list(y_train),pred_y,average='macro')
recall = recall_score(list(y_train),pred_y,average='macro')
f1 = get_f1_score(precision,recall)
print('Precision Macro',precision)
print('Recall Macro',recall)
print('F1 Macro',f1)
print(' ')
precision = precision_score(list(y_train),pred_y,average='micro')
recall = recall_score(list(y_train),pred_y,average='micro')
f1 = get_f1_score(precision,recall)
print('Precision Micro',precision)
print('Recall Micro',recall)
print('F1 Micro',f1)
print(' ')
precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

('Precision Macro', 0.1636746849366546)
('Recall Macro', 0.1619048063983861)
('F1 Macro', 0.16278493506039735)
 
('Precision Micro', 0.1613809714973906)
('Recall Micro', 0.1613809714973906)
('F1 Micro', 0.1613809714973906)
 
('Precision Weighted', 0.1638268748752634)
('Recall Weighted', 0.1613809714973906)
('F1 Weighted', 0.16259472530964506)


#  Testing

In [197]:
testpatterns = extract_patterns(X_test)
# X_test,testpatterns = remove_empty_patterns(X_test,testpatterns)
patterns_list = np.array(list(em_df.index))
frequency_vectors = []
for index in testpatterns:
    frequency_vectors.append(np.isin(patterns_list,np.array(list(testpatterns[index]))))
test_vectors = pd.DataFrame(frequency_vectors,columns=patterns_list,index=testpatterns.keys())
test_vectors = test_vectors * 1

In [198]:
test_vectors.head(5)

Unnamed: 0,.+ .+ a,.+ .+ and,.+ .+ for,.+ .+ i,.+ .+ if,.+ .+ in,.+ .+ is,.+ .+ me,.+ .+ my,.+ .+ of,...,your .+ while,your .+ with,your .+ you,your .+ your,your eyes .+,your life .+,your name .+,your own .+,yours .+,yourself .+
Ses04M_impro06_F002,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ses05M_impro07_F026,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ses04F_script01_3_F009,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ses03F_script01_3_F005,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Ses01M_script01_3_M019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [199]:
em_matrix = em_df.T.as_matrix()
emotional_scores = []
for index, vector in test_vectors.iterrows():
    emotional_scores.append(em_matrix.dot(vector))

In [200]:
scores = pd.DataFrame(emotional_scores,columns=emotions_list,index=testpatterns.keys())
scores['pred_emotion'] = map(lambda x: x[1].idxmax(),scores.iterrows())
scores['pred_code'] = scores['pred_emotion'].map(emotional_mapping).astype(int)

In [201]:
pred_y = list(scores['pred_code'])

In [202]:
scores.head(5)

Unnamed: 0,ang,exc,fru,hap,neu,sad,pred_emotion,pred_code
Ses04M_impro06_F002,99.139289,89.965659,73.52643,84.657479,84.969327,76.523179,ang,0
Ses05M_impro07_F026,100.640608,65.73221,79.23218,64.317904,74.536743,73.048238,ang,0
Ses04F_script01_3_F009,63.751999,54.420359,54.272814,49.548197,61.404683,62.270964,ang,0
Ses03F_script01_3_F005,105.21351,114.618531,91.952336,116.690682,101.55503,104.836141,hap,5
Ses01M_script01_3_M019,7.033284,8.993313,7.439021,7.139038,7.12081,7.330418,exc,2


In [203]:
print(len(y_test))
print(len(pred_y))

623
623


In [204]:
precision = precision_score(list(y_test),pred_y,average='macro')
recall = recall_score(list(y_test),pred_y,average='macro')
f1 = get_f1_score(precision,recall)
print('Precision Macro',precision)
print('Recall Macro',recall)
print('F1 Macro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='micro')
recall = recall_score(list(y_test),pred_y,average='micro')
f1 = get_f1_score(precision,recall)
print('Precision Micro',precision)
print('Recall Micro',recall)
print('F1 Micro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

('Precision Macro', 0.15013141725320098)
('Recall Macro', 0.16084251151503925)
('F1 Macro', 0.15530249949868674)
 
('Precision Micro', 0.1653290529695024)
('Recall Micro', 0.1653290529695024)
('F1 Micro', 0.1653290529695024)
 
('Precision Weighted', 0.15215266819067705)
('Recall Weighted', 0.1653290529695024)
('F1 Weighted', 0.15846743205764563)


# Sad & Hap

In [75]:
precision = precision_score(list(y_test),pred_y,average='macro')
recall = recall_score(list(y_test),pred_y,average='macro')
f1 = get_f1_score(precision,recall)
print('Precision Macro',precision)
print('Recall Macro',recall)
print('F1 Macro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='micro')
recall = recall_score(list(y_test),pred_y,average='micro')
f1 = get_f1_score(precision,recall)
print('Precision Micro',precision)
print('Recall Micro',recall)
print('F1 Micro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

('Precision Macro', 0.5055895790803927)
('Recall Macro', 0.5053285145028265)
('F1 Macro', 0.5054590130822941)
 
('Precision Micro', 0.5)
('Recall Micro', 0.5)
('F1 Micro', 0.5)
 
('Precision Weighted', 0.5068047049674346)
('Recall Weighted', 0.5)
('F1 Weighted', 0.5033793569566477)


# Anger & Hap

In [95]:
precision = precision_score(list(y_test),pred_y,average='macro')
recall = recall_score(list(y_test),pred_y,average='macro')
f1 = get_f1_score(precision,recall)
print('Precision Macro',precision)
print('Recall Macro',recall)
print('F1 Macro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='micro')
recall = recall_score(list(y_test),pred_y,average='micro')
f1 = get_f1_score(precision,recall)
print('Precision Micro',precision)
print('Recall Micro',recall)
print('F1 Micro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

('Precision Macro', 0.546664167916042)
('Recall Macro', 0.546149569085349)
('F1 Macro', 0.5464067473400861)
 
('Precision Micro', 0.5432692307692307)
('Recall Micro', 0.5432692307692307)
('F1 Micro', 0.5432692307692307)
 
('Precision Weighted', 0.5480787250605467)
('Recall Weighted', 0.5432692307692307)
('F1 Weighted', 0.5456633803620117)


# Anger & Sadness

In [115]:
precision = precision_score(list(y_test),pred_y,average='macro')
recall = recall_score(list(y_test),pred_y,average='macro')
f1 = get_f1_score(precision,recall)
print('Precision Macro',precision)
print('Recall Macro',recall)
print('F1 Macro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='micro')
recall = recall_score(list(y_test),pred_y,average='micro')
f1 = get_f1_score(precision,recall)
print('Precision Micro',precision)
print('Recall Micro',recall)
print('F1 Micro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

('Precision Macro', 0.5026881720430108)
('Recall Macro', 0.5026881720430108)
('F1 Macro', 0.5026881720430108)
 
('Precision Micro', 0.5026737967914439)
('Recall Micro', 0.5026737967914439)
('F1 Micro', 0.5026737967914439)
 
('Precision Weighted', 0.5027025472945776)
('Recall Weighted', 0.5026737967914439)
('F1 Weighted', 0.5026881716319253)


# Neutral & Happiness

In [134]:
precision = precision_score(list(y_test),pred_y,average='macro')
recall = recall_score(list(y_test),pred_y,average='macro')
f1 = get_f1_score(precision,recall)
print('Precision Macro',precision)
print('Recall Macro',recall)
print('F1 Macro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='micro')
recall = recall_score(list(y_test),pred_y,average='micro')
f1 = get_f1_score(precision,recall)
print('Precision Micro',precision)
print('Recall Micro',recall)
print('F1 Micro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

('Precision Macro', 0.5026881720430108)
('Recall Macro', 0.5026881720430108)
('F1 Macro', 0.5026881720430108)
 
('Precision Micro', 0.5026737967914439)
('Recall Micro', 0.5026737967914439)
('F1 Micro', 0.5026737967914439)
 
('Precision Weighted', 0.5027025472945776)
('Recall Weighted', 0.5026737967914439)
('F1 Weighted', 0.5026881716319253)


# Neutral & Sadness

In [316]:
precision = precision_score(list(y_test),pred_y,average='macro')
recall = recall_score(list(y_test),pred_y,average='macro')
f1 = get_f1_score(precision,recall)
print('Precision Macro',precision)
print('Recall Macro',recall)
print('F1 Macro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='micro')
recall = recall_score(list(y_test),pred_y,average='micro')
f1 = get_f1_score(precision,recall)
print('Precision Micro',precision)
print('Recall Micro',recall)
print('F1 Micro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

('Precision Macro', 0.47599318351262115)
('Recall Macro', 0.4760416666666667)
('F1 Macro', 0.47601742385512175)
 
('Precision Micro', 0.4769585253456221)
('Recall Micro', 0.4769585253456221)
('F1 Micro', 0.4769585253456221)
 
('Precision Weighted', 0.4765562995818717)
('Recall Weighted', 0.4769585253456221)
('F1 Weighted', 0.4767573276273286)


# Neutral & Anger

In [348]:
precision = precision_score(list(y_test),pred_y,average='macro')
recall = recall_score(list(y_test),pred_y,average='macro')
f1 = get_f1_score(precision,recall)
print('Precision Macro',precision)
print('Recall Macro',recall)
print('F1 Macro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='micro')
recall = recall_score(list(y_test),pred_y,average='micro')
f1 = get_f1_score(precision,recall)
print('Precision Micro',precision)
print('Recall Micro',recall)
print('F1 Micro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

('Precision Macro', 0.46908434525763043)
('Recall Macro', 0.4709975369458128)
('F1 Macro', 0.4700389943019242)
 
('Precision Micro', 0.47737556561085975)
('Recall Micro', 0.47737556561085975)
('F1 Micro', 0.47737556561085975)
 
('Precision Weighted', 0.47071297782701477)
('Recall Weighted', 0.47737556561085975)
('F1 Weighted', 0.4740208614181354)


In [None]:
# TODO
#Word count higher than 3
#Filter out empty patterns

In [349]:
data['text']

5977      what who says that bananas don't grow overnight
8253    what do you mean as you can who makes up these...
6006           well i guess you're not trying hard enough
4582                 oh very amusing indeed amanda listen
1622                                             what huh
9810          twice is every time we've tried that's ever
1978    dumber than you i mean they're a lot worse off...
1339                     i'm going to ask her to marry me
5837                      that's how you want to leave it
5955    well i can't be in charge of this i mean i got...
2653                           stop it go away i hate you
4928                          what is the matter with you
573                                           whose is it
1814                           that's very amusing indeed
1993    no it didn't um it didn't come out and i don't...
4966                                          yes sir yes
5775    okay what is it that i did wrong on the form t...
8214          

In [352]:
patterns_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,195,196,197,198,199,200,201,202,203,204
Ses01F_script02_2_F041,the .+,we .+,the .+ .+,right .+,.+ wrong,.+ but .+,.+ but we,.+ the right,.+ but,so .+,...,,,,,,,,,,
Ses05F_script01_1_M013,.+ mom,.+ years,for .+,.+ back,.+ comes,back .+,.+ after,.+ after .+,three .+,nobody .+,...,,,,,,,,,,
Ses05F_script03_2_M041,.+ music,very .+,.+ indeed,music .+,,,,,,,...,,,,,,,,,,
Ses04F_impro07_M076,you .+ to,you .+,.+ .+ you,.+ for it,.+ but .+,.+ it,i .+,.+ yeah,.+ have .+,.+ but,...,,,,,,,,,,
Ses04F_impro07_M077,.+ .+ you,.+ campus,.+ do you,.+ live,.+ year,.+ on,.+ going to,.+ going,.+ to,well .+,...,,,,,,,,,,
Ses05F_script03_2_F041,,,,,,,,,,,...,,,,,,,,,,
Ses04F_impro07_M071,the .+,.+ the .+,the .+ .+,.+ want,thing .+,want to .+,and .+,get .+,.+ to,.+ and,...,,,,,,,,,,
Ses04F_impro07_M072,of .+,.+ of .+,.+ boys,.+ of,drunk .+,lots of .+,,,,,...,,,,,,,,,,
Ses03F_impro08_M021,about .+,three .+,.+ dollars,.+ three,,,,,,,...,,,,,,,,,,
Ses01M_script01_1_M000,.+ it,.+ saw,he .+ it,saw .+,he .+,,,,,,...,,,,,,,,,,
