## *Tools & Libraries*

In [1]:
import numpy as np
import pandas as pd
import gensim
import os
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## *Get the data*

In [2]:
data = pd.read_csv("cleaned_hm.csv")
data.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,affection
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,affection
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,exercise
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,affection


## *Basic data cleaning*

In [3]:
data["predicted_category"].value_counts()

affection           34168
achievement         33993
enjoy_the_moment    11144
bonding             10727
leisure              7458
nature               1843
exercise             1202
Name: predicted_category, dtype: int64

In [4]:
data["num_sentence"].value_counts()

1     83711
2      9542
3      3847
4      1624
5       821
6       336
7       183
8       107
10       68
9        61
11       35
13       26
12       21
16       17
18       17
14       14
17       14
19       12
21       10
25        7
15        7
23        7
24        5
26        5
22        4
29        3
31        3
30        3
20        3
27        2
32        2
37        2
40        2
56        1
46        1
53        1
51        1
48        1
69        1
35        1
45        1
44        1
42        1
58        1
34        1
28        1
60        1
Name: num_sentence, dtype: int64

In [5]:
# deleting happy moments with more than 10 sentences
mod_data = data.loc[data['num_sentence'] <= 10]
mod_data["predicted_category"].value_counts()

affection           34020
achievement         33966
enjoy_the_moment    11115
bonding             10700
leisure              7458
nature               1839
exercise             1202
Name: predicted_category, dtype: int64

In [11]:
## categorical to numerical
encode = {
    "affection" : 0,
    "achievement"  : 1,       
    "bonding" : 2,    
    "enjoy_the_moment" : 3,     
    "leisure"  : 4,    
    "nature" : 5,    
    "exercise" : 6
}

In [12]:
mod_data["predicted_category"] = mod_data["predicted_category"].apply(lambda x: encode[x])
mod_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,0
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,0
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,6
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,2
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,0


## *Data cleaning for NLP*

In [7]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

happy_lines = list()
lines = mod_data["cleaned_hm"].values.tolist()

for line in lines:
    # tokenize the text
    tokens = word_tokenize(line)
    tokens = [w.lower() for w in tokens]
    # remove puntuations
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove non alphabetic characters
    words = [word for word in stripped if word.isalpha()]
    happy_lines.append(words)

happy_lines

[['i',
  'went',
  'on',
  'a',
  'successful',
  'date',
  'with',
  'someone',
  'i',
  'felt',
  'sympathy',
  'and',
  'connection',
  'with'],
 ['i',
  'was',
  'happy',
  'when',
  'my',
  'son',
  'got',
  'marks',
  'in',
  'his',
  'examination'],
 ['i', 'went', 'to', 'the', 'gym', 'this', 'morning', 'and', 'did', 'yoga'],
 ['we',
  'had',
  'a',
  'serious',
  'talk',
  'with',
  'some',
  'friends',
  'of',
  'ours',
  'who',
  'have',
  'been',
  'flaky',
  'lately',
  'they',
  'understood',
  'and',
  'we',
  'had',
  'a',
  'good',
  'evening',
  'hanging',
  'out'],
 ['i',
  'went',
  'with',
  'grandchildren',
  'to',
  'butterfly',
  'display',
  'at',
  'crohn',
  'conservatory'],
 ['i', 'meditated', 'last', 'night'],
 ['i',
  'made',
  'a',
  'new',
  'recipe',
  'for',
  'peasant',
  'bread',
  'and',
  'it',
  'came',
  'out',
  'spectacular'],
 ['i',
  'got',
  'gift',
  'from',
  'my',
  'elder',
  'brother',
  'which',
  'was',
  'really',
  'surprising',
  'me

## *Training Word2Vec model on HappyDB*

In [8]:
EMBEDDING_DIM = 100
model = gensim.models.Word2Vec(sentences=happy_lines, size=EMBEDDING_DIM, window=5, workers=4, min_count=1)
vocab_words = list(model.wv.vocab)
print(len(vocab_words))

26183


## *Saving the word vectors*

In [9]:
filename = "happydb_word2vec.txt"
model.wv.save_word2vec_format(filename, binary=False)

## *Loading the word vectors*

In [10]:
embeddings_index = {}
f = open(os.path.join('', 'happydb_word2vec.txt'),  encoding = "utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()

## *Train-Test split*

In [14]:
VALIDATION_SPLIT = 0.2
max_length = 55

tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(happy_lines)
sequences = tokenizer_obj.texts_to_sequences(happy_lines)

word_index = tokenizer_obj.word_index
print('Found %s unique tokens.' % len(word_index))

review_pad = pad_sequences(sequences, maxlen=max_length, padding='post')
sentiment =  mod_data['predicted_category'].values

indices = np.arange(review_pad.shape[0])
np.random.shuffle(indices)
review_pad = review_pad[indices]
sentiment = sentiment[indices]

n_values = np.max(sentiment) + 1
Y = np.eye(n_values)[sentiment]

num_validation_samples = int(VALIDATION_SPLIT * review_pad.shape[0])

X_train_pad = review_pad[:-num_validation_samples]
y_train = Y[:-num_validation_samples]
X_test_pad = review_pad[-num_validation_samples:]
y_test = Y[-num_validation_samples:]

Found 26183 unique tokens.


In [15]:
print('Shape of X_train_pad tensor:', X_train_pad.shape)
print('Shape of y_train tensor:', y_train.shape)

print('Shape of X_test_pad tensor:', X_test_pad.shape)
print('Shape of y_test tensor:', y_test.shape)

Shape of X_train_pad tensor: (80240, 55)
Shape of y_train tensor: (80240, 7)
Shape of X_test_pad tensor: (20060, 55)
Shape of y_test tensor: (20060, 7)


## *Making the embedding matrix*

In [16]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
print(num_words)

26184


## *GRU model*

In [17]:
model = Sequential()
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_length,
                            trainable=False)
model.add(embedding_layer)
model.add(GRU(units=64,  dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(7, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
print(model.summary())

Summary of the built model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 55, 100)           2618400   
_________________________________________________________________
gru_1 (GRU)                  (None, 64)                31680     
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 455       
Total params: 2,650,535
Trainable params: 32,135
Non-trainable params: 2,618,400
_________________________________________________________________
None


## *Save the best model*

In [18]:
checkpoint = ModelCheckpoint('model-{epoch:03d}-{acc:03f}-{val_acc:03f}.h5', verbose=1, monitor='val_acc',save_best_only=True, mode='auto')  

## *Training*

In [19]:
model.fit(X_train_pad, y_train, batch_size=128, epochs=15, validation_data=(X_test_pad, y_test), callbacks=[checkpoint], verbose=2)

Train on 80240 samples, validate on 20060 samples
Epoch 1/15
 - 67s - loss: 1.3175 - acc: 0.4845 - val_loss: 0.6625 - val_acc: 0.7714

Epoch 00001: val_acc improved from -inf to 0.77144, saving model to model-001-0.484459-0.771436.h5
Epoch 2/15
 - 56s - loss: 0.5469 - acc: 0.8115 - val_loss: 0.4394 - val_acc: 0.8431

Epoch 00002: val_acc improved from 0.77144 to 0.84307, saving model to model-002-0.811503-0.843071.h5
Epoch 3/15
 - 58s - loss: 0.4476 - acc: 0.8397 - val_loss: 0.3945 - val_acc: 0.8563

Epoch 00003: val_acc improved from 0.84307 to 0.85628, saving model to model-003-0.839718-0.856281.h5
Epoch 4/15
 - 56s - loss: 0.4141 - acc: 0.8504 - val_loss: 0.3766 - val_acc: 0.8626

Epoch 00004: val_acc improved from 0.85628 to 0.86256, saving model to model-004-0.850436-0.862562.h5
Epoch 5/15
 - 55s - loss: 0.3924 - acc: 0.8571 - val_loss: 0.3728 - val_acc: 0.8635

Epoch 00005: val_acc improved from 0.86256 to 0.86351, saving model to model-005-0.857091-0.863509.h5
Epoch 6/15
 - 55s 

<keras.callbacks.History at 0x27556bd3710>