SOURCES:

- <b>dailydialog</b>: 2017, 102k <br>
- <b>emotion-stimulus</b>: 2015<br>
- <b>isear</b>: 1990<br>


Based on the DWDS-API (see "Data collecting/Emotiondataset_builder.py"):
- german_emotionlexicon.csv

Combining the datasets, we get:
- fullset.csv

# Model Training
Here we introduce the model we used and describe how we trained it.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder

# Load the combined emotion dataset.
fullset = pd.read_csv('fullset.csv')

# NOTE: this cell used to call `shuffle(fullset, random_state=42)` without
# keeping the return value. sklearn's `shuffle` returns a shuffled copy and
# leaves its input untouched, so that call never changed `fullset`.
# It is removed rather than turned into an assignment on purpose: the later
# `train_test_split(..., random_state=42)` already shuffles with a fixed seed,
# and reordering the rows here would change which samples end up in the test
# split (and the row order any previously trained weights were built against).

# Drop the unnamed first CSV column (presumably a saved index - confirm).
del fullset['Unnamed: 0']

### Text cleaning

In [2]:
# Ordered (regex, replacement) pairs applied to `text_de`, one after another.
# Order matters: later patterns see the output of earlier ones.
# Regexes that contain backslash escapes are raw strings so that Python does
# not try to interpret `\w`, `\d`, `\S` ... as string escapes (the original
# non-raw '\w*\d\w*' triggers an invalid-escape-sequence warning).
_CLEANING_STEPS = [
    # typographic quotes, guillemets and the ellipsis character -> space
    ('[»„‘’“”…]', ' '),
    # any run of word characters that contains a digit -> placeholder "Nummer"
    (r'\w*\d\w*', 'Nummer'),
    # URLs (http://, https:// or www. up to the next whitespace) -> space
    (r"https?://\S+|www\.\S+", ' '),
    # NOTE(review): `w{1-3}` is not a quantifier; it matches the literal text
    # "w{1-3}". This pattern therefore only fires on a non-ASCII character
    # followed by exactly that text, i.e. effectively never. It is kept
    # byte-for-byte so the cleaned corpus (and the tokenizer vocabulary built
    # from it) stays identical; confirm the intent before changing it -
    # something like `\w{1,3}` would also eat German umlaut words.
    ('[\u0080-\uffff]w{1-3}', ' '),
    # runs of non-ASCII characters that are not word characters (e.g. special
    # spaces, dashes, symbols) -> space. `\w` is Unicode-aware for str
    # patterns, so accented letters are kept. The `{1,3}` sits inside the
    # character class and is just the literal characters { 1 , 3 }.
    (r"[^\x00-\x7F\w{1,3}]+", ' '),
    # hashtags -> space
    (r"(#[\d\w\.]+)", ' '),
    # @usernames -> space
    (r"(@[\d\w\.]+)", ' '),
]

for _pattern, _replacement in _CLEANING_STEPS:
    fullset['text_de'] = fullset['text_de'].replace({_pattern: _replacement}, regex=True)

### Embedding

In [3]:
# Encode every emotion name as an integer class id, stored next to the text.
lb_make = LabelEncoder()
fullset["label_id"] = lb_make.fit(fullset["Emotion"]).transform(fullset["Emotion"])

In [4]:
# Vocabulary cap handed to the tokenizer (most frequent words are kept);
# also used as the embedding layer's input dimension.
MAX_NB_WORDS = 15000
# Width every encoded text is padded / truncated to.
MAX_SEQUENCE_LENGTH = 150
# Length of the embedding vector learned per token.
HIDDEN_DIM = 150

# Fit a lower-casing word tokenizer on the cleaned text column.
tokenizer = Tokenizer(lower=True, num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(fullset['text_de'].values)

# Full word -> index mapping seen during fitting (not limited by the cap above).
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 17087 unique tokens.


In [5]:
# Features: integer-encode each text, then pad/truncate to a fixed width.
X = pad_sequences(
    tokenizer.texts_to_sequences(fullset['text_de'].values),
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of data tensor:', X.shape)

# Targets: one-hot encode the emotion column once and reuse the frame below.
onehot_labels = pd.get_dummies(fullset['Emotion'])
Y = onehot_labels.values
print('Shape of label tensor:', Y.shape)

# Lookup table of the distinct one-hot rows (one per emotion, in order of
# first appearance); the column names carry the emotion names.
labels = (
    onehot_labels
    .drop_duplicates(subset=onehot_labels.columns)
    .reset_index(drop=True)
)

Shape of data tensor: (11157, 150)
Shape of label tensor: (11157, 5)


### Test and train of the model

In [7]:
# Hold out 10% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.10,
    random_state=42,
)

In [8]:
#save model
from __future__ import absolute_import, division, print_function
import os

# Target path for weights written during training, and its parent directory.
checkpoint_path = "model_pretrain/model"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Keras callback that writes the model weights (weights only) to
# `checkpoint_path` and logs a message each time it saves.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
)

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding,SpatialDropout1D, LSTM,Conv1D,MaxPooling1D
from tensorflow.keras import layers

def emotion_model(vocab_size=None, embedding_dim=None, sequence_length=None, num_classes=5):
    """Build and compile the Embedding -> LSTM -> softmax emotion classifier.

    Calling it without arguments builds the same network as before; the
    parameters only exist so the architecture is not tied to notebook globals
    and a hard-coded class count.

    Args:
        vocab_size: Input dimension of the embedding layer. Defaults to
            ``MAX_NB_WORDS``.
        embedding_dim: Length of each embedding vector. Defaults to
            ``HIDDEN_DIM``.
        sequence_length: Number of tokens per input sequence. Defaults to
            ``MAX_SEQUENCE_LENGTH``, which is the width the inputs are padded
            to (so it equals the previously used ``X.shape[1]``).
        num_classes: Number of output classes of the softmax layer.
            Defaults to 5.

    Returns:
        A compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    # Resolve defaults at call time so the function can be defined before
    # (or re-used independently of) the configuration constants.
    if vocab_size is None:
        vocab_size = MAX_NB_WORDS
    if embedding_dim is None:
        embedding_dim = HIDDEN_DIM
    if sequence_length is None:
        sequence_length = MAX_SEQUENCE_LENGTH

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=sequence_length))
    # Drops whole embedding channels rather than single activations.
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    # One probability per emotion class.
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [12]:
# Build a fresh, untrained network ...
model = emotion_model()

# ... and restore previously trained weights into it.
# NOTE(review): weights are read from "model/pretrained_model", whereas the
# checkpoint callback defined earlier writes to "model_pretrain/model" -
# confirm which location is the intended one.
model.load_weights(filepath="model/pretrained_model")

# To train from scratch instead of loading weights:
# epochs = 7
# batch_size = 64
# model.fit(X_train, Y_train,epochs=epochs, batch_size=batch_size,validation_split=0.2, callbacks=[cp_callback])

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x118e724efd0>

In [13]:
# Score the model on the held-out split; with the compile settings used for
# this model the result holds the loss first and the accuracy second.
accr = model.evaluate(X_test, Y_test)

_report_template = '\n Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'
print(_report_template.format(accr[0], accr[1]))


 Test set
  Loss: 1.226
  Accuracy: 0.678
