In [118]:
import pandas as pd
import numpy as np
import re
import os
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

#Pickling
from six.moves import cPickle as pickle
from nltk.corpus import stopwords

%matplotlib inline

# Row labels of the sentences CSV that have known alignment problems.
# load_data() refers to these module-level lists by name.

# Row reported as having no alignment file. The drop that would use this list
# is currently commented out inside load_data(), so the row is NOT removed.
no_alignment_file = [4764]
# Row reported as having a wrong alignment file. load_data() removes it with
# data.drop(wrong_alignment).
wrong_alignment = [3730]

In [119]:
from keras.layers import Activation, Input, Dense, Flatten, Dropout, Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.merge import concatenate
from keras import regularizers
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from keras.preprocessing.text import Tokenizer

from gensim.models import word2vec
from gensim.models import KeyedVectors
from glove import Corpus, Glove

# Data Load Functions

In [120]:
def extract_patterns(data,extract=False):
    """Return a dict mapping each ``data['index']`` value to its patterns.

    Parameters
    ----------
    data : DataFrame with an ``index`` column (and a ``text`` column when
        ``extract`` is true).
    extract : when true, patterns are recomputed row by row with
        ``get_pattern`` (defined outside this cell) and written to
        ``pickles/patterns/pattern.pickle``; when false (default) they are
        read back from that pickle and restricted to the keys in ``data``.

    Returns
    -------
    dict, or ``None`` when the cached pickle cannot be read or lacks one of
    the requested keys (the error is printed, not raised).
    """
    pickle_path = 'pickles/patterns/pattern.pickle'

    if not extract:
        # Cached path: load everything, then keep only the requested keys.
        try:
            with open(pickle_path,'rb') as handle:
                stored = pickle.load(handle)['patterns']
                return {key: stored[key] for key in list(data['index'])}
        except Exception as e:
            print('Error loading base datasets pickle: ', e)
        return None

    # Extraction path: one set of pattern values per row.
    patterns = {}
    for index, row in data.iterrows():
        found = set(get_pattern([row['text']])[0].values())
        utterance_id = row['index']
        patterns[utterance_id] = found
        print('Extracted pattern from '+ utterance_id + ' index:'+ str(index))
        print('Size: ', len(patterns[utterance_id]), 'Patterns size', len(patterns))

    # Saving is best effort: the freshly extracted patterns are returned
    # whether or not the pickle could be written.
    try:
        print('Saving Pickle')
        with open(pickle_path,'wb') as handle:
            pickle.dump({'patterns' : patterns},handle,pickle.HIGHEST_PROTOCOL)
            print('Successfully saved in pattern.pickle')
    except Exception as e:
        print('Unable to save data to pickle', e)
        print('Patterns probably not saved.')
    return patterns
            
def clean_text(text):
    """Normalise a transcript: strip punctuation, squeeze spaces, lowercase.

    Every character listed in ``punct_str`` (including the tab) is turned into
    a space, runs of spaces are collapsed to one, and the result is lowercased
    and trimmed. Apostrophes and square brackets are not in the list, so they
    are left in place.
    """
    punct_str = '!"#$%&()*+,-./:;<=>?@\\^_`{|}~«»“…‘”\t'
    # One character class covering exactly the characters above.
    punct_class = '[' + re.escape(punct_str) + ']'
    spaced = re.sub(punct_class, ' ', text)
    squeezed = re.sub(' +', ' ', spaced)
    return squeezed.lower().strip()

def filter_word_count(data, n_count):
    """Keep only the rows whose ``text`` has at least ``n_count`` words.

    Parameters
    ----------
    data : DataFrame with a ``text`` column of strings.
    n_count : minimum number of whitespace-separated words a row must have.

    Returns
    -------
    DataFrame containing the qualifying rows, with the original index labels
    and columns.
    """
    # split() with no argument counts '' as zero words and does not count
    # repeated spaces as extra (empty) words.
    # The explicit bool dtype matters for an empty frame: a plain empty list
    # would be read by pandas as "select no columns" and drop every column.
    keep = np.array([len(text.split()) >= n_count for text in data['text']], dtype=bool)
    return data[keep]

def remove_empty_patterns(data,patterns):
    """Drop utterances for which no pattern was found.

    Parameters
    ----------
    data : DataFrame with an ``index`` column holding the utterance keys.
    patterns : dict mapping utterance key -> collection of patterns.

    Returns
    -------
    (data, patterns) : the rows of ``data`` whose key does not map to an empty
        collection, and a new dict holding only the non-empty entries. Neither
        input is modified; row labels, column order and dtypes are preserved.
    """
    empty_keys = [key for key, found in patterns.items() if len(found) < 1]
    kept_patterns = {key: found for key, found in patterns.items() if len(found) >= 1}
    # A boolean mask replaces the iterrows()/DataFrame.from_items round trip:
    # from_items no longer exists in current pandas, and rebuilding the frame
    # from row Series turned every column into object dtype.
    kept_rows = data[~data['index'].isin(empty_keys)]
    return kept_rows, kept_patterns

In [121]:
def load_data(word_count,emotional_mapping):
    """Load the sentence table and the cached patterns that go with it.

    Parameters
    ----------
    word_count : minimum number of words a transcript must have to be kept
        (passed to ``filter_word_count``).
    emotional_mapping : dict mapping the ``emotion`` label to an integer code.

    Returns
    -------
    (data, patterns) as produced by ``remove_empty_patterns``.
    """
    # full = generate_IEMOCAP_df()
    frame = pd.read_csv('data/IEMOCAP_sentences_votebased.csv',index_col=0)
    frame['emotion_code'] = frame['emotion'].map(emotional_mapping).astype(int)

    # Only codes below 4 are kept (fear, surprise, disgust, xxx and others
    # do not have enough data), then the row with a wrong alignment file is
    # dropped. Rows listed in no_alignment_file are NOT dropped here.
    kept = frame[frame.emotion_code < 4].drop(wrong_alignment)

    # Clean the transcripts, then discard the ones that are too short.
    kept['text'] = kept['text'].apply(clean_text)
    kept = filter_word_count(kept, word_count)

    # Patterns are read from the cached pickle (extract defaults to False);
    # utterances without any pattern are removed from both results.
    cached_patterns = extract_patterns(kept)
    data, patterns = remove_empty_patterns(kept, cached_patterns)
    return data,patterns


In [122]:
# Integer code for every emotion label. load_data() keeps only codes below 4,
# i.e. 'ang', 'sad', 'hap' and 'neu'.
emotional_mapping = {'ang': 0, 'sad': 1, 'hap': 2, 'neu': 3,'fru': 4,'exc': 5,'fea': 6,'sur': 7,'dis': 8, 'xxx':9,'oth':10}
# 3 is the minimum number of words a transcript must contain to be kept.
data, patterns = load_data(3,emotional_mapping)
# Per-emotion row counts of the filtered table (displayed as the cell output).
data.groupby('emotion').count()

Unnamed: 0_level_0,index,start_time,end_time,text,wav_path,alignment_path,valence,arousal,dominance,gender,emotion_code
emotion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ang,1153,1153,1153,1153,1153,1153,1153,1153,1153,1153,1153
hap,689,689,689,689,689,689,689,689,689,689,689
neu,1466,1466,1466,1466,1466,1466,1466,1466,1466,1466,1466
sad,974,974,974,974,974,974,974,974,974,974,974


In [123]:
# Load the precomputed pattern feature table from its pickle.
# The cells below index it by utterance id, and the inspected entry maps
# pattern keys to numeric vectors.
# NOTE(review): if loading fails the error is only printed and
# scaled_feature_table stays None, so the following cells will fail when they
# index it. pickle.load can execute arbitrary code; only use trusted files.
scaled_feature_table = None
try:
    with open('pickles/patterns/scaled_pattern_features4emo.pickle','rb') as f:
        save = pickle.load(f)
        scaled_feature_table = save['feature_table']
        # Release the reference to the outer dict once the table is extracted.
        del save
except Exception as e:
    print('Error loading pattern features pickle: ', e)

In [124]:
# Inspect the feature entry of a single utterance: a dict from pattern key to
# its feature vector (see the recorded output below).
test = scaled_feature_table['Ses01F_impro03_M024']
test

{u'25086_.+ enough': array([ 1.47662568,  1.64670445,  0.19012376, -0.06221796,  0.41093638,
         0.83077829,  0.24317282,  0.63061836, -0.42022054,  0.11148869,
         1.83209683,  1.47504898,  1.56258218, -0.23396023,  0.31952995,
         0.23395897,  0.37763398,  0.72121466, -0.72438772,  0.41479379,
         1.36804605,  0.02490234,  0.06434849,  0.08740234])}

# Parameters

In [125]:
# DATASET
# Fraction of the rows held out as the test set by train_test_split.
TEST_SIZE      = 0.2

# EMBEDDING
# MAX_NUM_WORDS  = 1800
# Width of each padded feature row (the sample vector shown above has 24 values).
EMBEDDING_DIM  = 24
# Number of rows every utterance matrix is zero-padded to.
MAX_SEQ_LENGTH = 500
# NOTE(review): not referenced by any cell visible in this section.
USE_GLOVE      = True

# MODEL
# The three settings below are passed to cnn_model.build_cnn.
FILTER_SIZES   = [2,2,3]
FEATURE_MAPS   = [100,100,100]
DROPOUT_RATE   = 0.5

# LEARNING
# NOTE(review): BATCH_SIZE and NB_EPOCHS are not referenced by any cell
# visible in this section.
BATCH_SIZE     = 200
NB_EPOCHS      = 50
# Number of iterations of the model-building loop.
RUNS           = 5
# Fraction of the training rows held out for validation in each run.
VAL_SIZE       = 0.1

In [126]:
# Hold out TEST_SIZE of the rows for testing; the labels are the emotion codes.
# A fixed random_state makes the split reproducible across kernel restarts and
# matches the seed already used for the validation split further down.
x_train, x_test, y_train, y_test = train_test_split(data, data.emotion_code, test_size=TEST_SIZE, random_state=42)

In [127]:


# The Tokenizer / pad_sequences path is not used here: the model input is
# built from the precomputed pattern feature vectors instead of word indices.

# Number of pattern feature vectors available for each training utterance.
result = [len(scaled_feature_table[x]) for x in x_train['index']]

print('Text informations:')
print('max length: %i / min length: %i / mean length: %i / limit length: %i' % (np.max(result),
                                                                                np.min(result),
                                                                                np.mean(result),
                                                                                MAX_SEQ_LENGTH))

# Build one zero-padded (MAX_SEQ_LENGTH, EMBEDDING_DIM) matrix per training row.
# NOTE: this rebinds `data` from the sentence DataFrame to a list of arrays;
# the cells below rely on that, so re-run the loading cell before re-running
# the train/test split cell.
data = []
for key, row in x_train.iterrows():
    features = scaled_feature_table[row['index']]
    # Rows follow the dict's own iteration order (a bare `sorted(features)`
    # call has no effect); iterate over sorted(features) here if a key-sorted
    # order is wanted.
    # list(...) is required on Python 3, where dict.values() is a view that
    # np.array would wrap in a 0-d object array.
    feat_matrix = np.array(list(features.values()))
    pad = np.zeros((MAX_SEQ_LENGTH,EMBEDDING_DIM))
    # An utterance without any feature vector stays an all-zero matrix.
    if feat_matrix.ndim == 2:
        # Truncate utterances that have more vectors than MAX_SEQ_LENGTH
        # instead of failing on a shape mismatch.
        n_rows = min(feat_matrix.shape[0], MAX_SEQ_LENGTH)
        pad[:n_rows,:feat_matrix.shape[1]] = feat_matrix[:n_rows]
    data.append(pad)


Text informations:
max length: 491 / min length: 1 / mean length: 43 / limit length: 500


In [134]:
# Check the scalar type stored in the padded matrices (first value of the
# first row of the first matrix); the recorded output is numpy.float64.
type(data[0][0][0])

numpy.float64

In [128]:
# Build the CNN RUNS times on a train/validation split of the padded features.
# NOTE(review): the recorded output of this cell is an exception from
# build_cnn ("Please define `num_words` and `embedding_dim` if you not use a
# pre-trained embedding"); with embedding_layer=None the call apparently also
# needs `num_words` - confirm against cnn_model before relying on this cell.
import cnn_model

# NOTE(review): nothing is appended to `histories` in this cell.
histories = []
import time
for i in range(RUNS):
    print('Running iteration %i/%i' % (i+1, RUNS))
    start_time = time.time()
    # random_state is fixed, so every iteration gets the same split.
    X_train, X_val, labels, y_val = train_test_split(data, y_train, test_size=VAL_SIZE, random_state=42)

    # No pre-trained embedding layer is supplied.
    emb_layer = None

    model = cnn_model.build_cnn(
        embedding_layer=emb_layer,
        embedding_dim= EMBEDDING_DIM,
        filter_sizes = FILTER_SIZES,
        feature_maps = FEATURE_MAPS,
        max_seq_length = MAX_SEQ_LENGTH,
        dropout_rate=DROPOUT_RATE
    )

Running iteration 1/5


Exception: Please define `num_words` and `embedding_dim` if you not use a pre-trained embedding