In [5]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

%matplotlib inline

In [25]:
from keras.layers import Activation, Input, Dense, Flatten, Dropout, Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.merge import concatenate
from keras import regularizers
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from gensim.models import word2vec
from gensim.models import KeyedVectors
from glove import Corpus, Glove

In [7]:
def load_data(word_count, emotional_mapping):
    # full = generate_IEMOCAP_df()
    data = pd.read_csv('data/IEMOCAP_sentences.csv',index_col=0)
    data['emotion_code'] = data['emotion'].map( emotional_mapping ).astype(int)
    # Take away fear, surprise,disgust, xxx and others. Not enough data
    data = data[data.emotion_code < 6]
    # Clean Transcripts
    data['text'] = data['text'].apply(clean_text)
    # Filter Word Count
    data = filter_word_count(data, word_count)
#     data,patterns = remove_empty_patterns(data,patterns)
    return data

def clean_text(text):
    punct_str = '!"#$%&()*+,-./:;<=>?@\\^_`{|}~«»“…‘”\t'
    for p in punct_str:
        text = text.replace(p,' ')
    text = re.sub(' +', ' ', text)
    text = re.sub(r"[0-9]+", "", text)
    text = re.sub(".*?\[(.*?)\]","",text) # Take out any [action] text in the transcript
    return text.lower().strip()

def filter_word_count(data, n_count):
    return data[list(map(lambda x: len(x.split(' ')) >= n_count,data['text']))]


In [8]:
emotional_mapping = {'ang': 0, 'sad': 1, 'exc': 2, 'neu': 3,'fru': 4,'hap': 5,'fea': 6,'sur': 7,'dis': 8, 'xxx':9,'oth':10}
data = load_data(3, emotional_mapping)
df = data[['text','emotion_code']]
df.head()

Unnamed: 0,text,emotion_code
2,is there a problem,3
5,well what's the problem let me change it,3
6,what i'm getting an id this is why i'm here my...,4
7,how am i supposed to get an id without an id h...,4
8,i'm here to get an id,4


In [24]:
glove

<module 'glove.glove' from '/Users/roblescoulter/anaconda3/lib/python3.6/site-packages/glove/glove.py'>

## Parameter

In [9]:
# DATASET
TEST_SIZE      = 0.2

# EMBEDDING
MAX_NUM_WORDS  = 2500 # 2954, 2000, 2700
EMBEDDING_DIM  = 200
MAX_SEQ_LENGTH = 100
USE_GLOVE      = True

# MODEL
FILTER_SIZES   = [3,4,5]
FEATURE_MAPS   = [10,10,10]
DROPOUT_RATE   = 0.5

# LEARNING
BATCH_SIZE     = 200
NB_EPOCHS      = 40
RUNS           = 5
VAL_SIZE       = 0.1

## Preprocessing

In [10]:
x_train, x_test, y_train, y_test = train_test_split(data.text, data.emotion_code, test_size=TEST_SIZE)

In [11]:
def max_length(lines):
    return max([len(s.split()) for s in lines])

tokenizer = Tokenizer()#num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

length = max_length(x_train)
word_index = tokenizer.word_index

result = [len(x.split()) for x in x_train]
print('Text informations:')
print('max length: %i / min length: %i / mean length: %i / limit length: %i' % (np.max(result),
                                                                                np.min(result),
                                                                                np.mean(result),
                                                                                MAX_SEQ_LENGTH))

print('vocabulary size: %i / limit: %i' % (len(word_index), MAX_NUM_WORDS))

# Padding all sequences to same length of `MAX_SEQ_LENGTH`
data   = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post')

Text informations:
max length: 100 / min length: 3 / mean length: 13 / limit length: 100
vocabulary size: 2930 / limit: 2500


# Embeddings

In [15]:
def create_glove_embeddings(data, use_text8 = False):
    sentences = [x.split() for x in data]
    print(sentences)
    
def create_word2vec_embeddings(data = None, use_text8 = False):
    model = None
    if(use_text8):
        model = KeyedVectors.load_word2vec_format('models/text8.model.bin',binary=True)
    else:
        if(data != None):  
            model = word2vec.Word2Vec(data, size=EMBEDDING_DIM)
        else:
            print('No data found. Using text8 Corpus')
            model = KeyedVectors.load_word2vec_format('models/text8.model.bin',binary=True)
    
    embeddings_index = {}
    for word in model.wv.index2word:
        embeddings_index[word] = model.wv[word]
        
    embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))
    
    for word, i in tokenizer.word_index.items():
        if i >= MAX_NUM_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if(embedding_vector is not None):
            embedding_matrix[i] = embedding_vector
    return Embedding(input_dim=MAX_NUM_WORDS, output_dim=EMBEDDING_DIM, input_length = MAX_SEQ_LENGTH,
                    weights= [embedding_matrix], trainable=True)

In [148]:
word_embeddings = create_word2vec_embeddings(x_train)

[('so', 0.9997955560684204),
 ("'cause", 0.9997822046279907),
 ('um', 0.9997684955596924),
 ('hear', 0.9997516870498657),
 ('enough', 0.9997501373291016),
 ('live', 0.9997497797012329),
 ('though', 0.999740481376648),
 ('because', 0.9997397065162659),
 ('augie', 0.9997377991676331),
 ('stupid', 0.9997369647026062)]

In [26]:
import itertools

In [None]:
sentences = list(itertools.islice(word2vec.Text8Corpus('data/text8'),None))
corpus = Corpus()
corpus.fit(sentences, window=10)
glove = Glove(no_components=100,learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30,no_threads=4,verbose=True)
glove.add_dictionary(corpus.dictionary)
# model = word2vec.Word2Vec(sentences, size=EMBEDDING_DIM)
# model.save('models/text8.model')
# model.wv.save_word2vec_format('models/text8.model.bin', binary=True)

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


In [None]:
# model1 = KeyedVectors.load_word2vec_format('models/text8.model.bin',binary=True)
# # model1