# Validate API Data

Validate the data collected from the API and create an emotion-labeled dataset.

In [1]:
# Add project path to the PYTHONPATH

import os
import sys
from pathlib import Path

# Expose the directory one level above the working directory to the import
# system, so project-local modules can be imported from this notebook.
sys.path.append(Path(os.getcwd(), os.pardir).resolve().as_posix())

In [2]:
import json
from pathlib import Path

## Load Relations

Load the relations between queries and emotions

In [3]:
# Absolute location of the query -> emotion mapping, one directory above the notebook.
relations_path = Path('..', 'query_relations.json').resolve()

In [4]:
# The file is still read as raw bytes; json.loads accepts bytes and detects
# the UTF encoding itself, exactly as json.load does on a binary handle.
relations = json.loads(relations_path.read_bytes())

## Load Tokenizer

Load the tokenizer that was created during model training.

In [5]:
import pickle

In [6]:
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# this path must only ever point at a tokenizer written by a trusted run.
tokenizer_path = Path('..', 'datasets', 'sentiment140', 'tokenizer.pickle').resolve()
tokenizer = pickle.loads(tokenizer_path.read_bytes())

## Load Model

Rebuild the model architecture and load the saved weights into it.

In [7]:
from tensorflow.keras.layers import Input, Embedding, GRU
from tensorflow.keras.layers import Dropout, GlobalMaxPooling1D
from tensorflow.keras.layers import Bidirectional, Dense
from tensorflow.keras.models import Sequential

In [8]:
# Architecture hyperparameters. Only the weights are restored from disk
# further down, so these values have to describe the same architecture the
# weights were saved from.

# Embedding vocabulary size: the number of known words plus one extra slot
# for index 0, which the Keras tokenizer never assigns to a word.
input_dim = len(tokenizer.word_index) + 1
# `num_words` is None when the tokenizer was created without a vocabulary
# cap; min(None, int) would raise TypeError, so only apply the cap if set.
if tokenizer.num_words is not None:
    input_dim = min(tokenizer.num_words, input_dim)

embedding_dim = 200        # width of each embedding vector
input_length = 100         # number of tokens per (padded) input sequence
gru_units = 128            # units per GRU direction
gru_dropout = 0.1          # dropout on the GRU inputs
recurrent_dropout = 0.1    # dropout on the GRU recurrent state
dropout = 0.1              # dropout after the hidden dense layer

In [9]:
# The network is declared as one ordered list of layers: token embedding ->
# bidirectional GRU over the whole sequence -> max-pool over time ->
# small dense head -> single sigmoid output.
model = Sequential([
    Embedding(
        input_dim=input_dim,
        output_dim=embedding_dim,
        input_shape=(input_length,),
    ),
    Bidirectional(
        GRU(
            gru_units,
            return_sequences=True,
            dropout=gru_dropout,
            recurrent_dropout=recurrent_dropout,
        )
    ),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dropout(dropout),
    Dense(1, activation='sigmoid'),
])

In [10]:
# summary() prints the layer table itself and returns None, so wrapping it
# in print() would only add a stray "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 200)          2000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 256)          253440    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                8224      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 2,261,697
Trainable params: 2,261,697
Non-trainable params: 0
______________________________________________

In [11]:
# Restore the saved parameters into the architecture built above.
weights_path = (Path('..') / 'models' / 'sentiment_analysis' / 'model_weights.h5').resolve()
model.load_weights(weights_path.as_posix())

## Group data by emotion


In [12]:
import os
import re
import pandas as pd
from tqdm import tqdm

In [13]:
# Directory holding one CSV export per search query.
files_dir = Path('..', 'datasets', 'tweepy').resolve()

In [15]:
# Build one DataFrame per emotion from the per-query CSV files.
#
# Frames are first collected in a list per emotion and concatenated once at
# the end, instead of re-concatenating an ever-growing frame per file.
frames_by_emotion = {}
skipped_files = []

# Sorted so the row order of the grouped frames does not depend on the
# arbitrary order in which the OS lists the directory.
filenames = sorted(os.listdir(files_dir))
for filename in tqdm(filenames):
    # The query is taken from the file name: either a "#..." token running
    # up to the first dot, or a ":...:" token.
    found = re.findall(r'(#[^.]+|:.+:)', filename)
    if not found:
        # Not a query export (e.g. a hidden system file); indexing [0] here
        # would raise IndexError, so the file is skipped and reported below.
        skipped_files.append(filename)
        continue

    query = found[0]
    # An unknown query still raises KeyError: a file that cannot be mapped
    # to an emotion should be noticed, not silently dropped.
    emotion = relations[query]

    file_data = pd.read_csv(os.path.join(files_dir, filename))
    frames_by_emotion.setdefault(emotion, []).append(file_data)

emotion_data_dict = {
    emotion: pd.concat(frames)
    for emotion, frames in frames_by_emotion.items()
}

if skipped_files:
    print('Skipped files without a query in their name: ' + ', '.join(skipped_files))

100%|██████████| 29/29 [00:00<00:00, 66.96it/s]


## Predict emotion and filter data

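Score every row of each group created in the step above with the model, then keep only the rows whose score lies on the same side of the group's mean score as the nearest extreme (0 or 1).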

In [16]:
import re
import numpy as np
from emoji import demojize
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nlp import preprocess

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vladislavklyuev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
def get_score_range(mean):
    """Return the ``(low, high)`` score interval selected by ``mean``.

    A mean below 0.5 yields ``(0.0, mean)``; any other mean yields
    ``(mean, 1.0)``.
    """
    return (0.0, mean) if mean < 0.5 else (mean, 1.0)

In [18]:
# For every emotion group: score each text with the model, derive the
# accepted score interval from the group's mean score, and keep only the
# rows whose score falls inside that interval.
result_data = []   # one filtered, labeled DataFrame per emotion
messages = []      # printed after the loop so they do not break the progress bar

with tqdm(total=len(emotion_data_dict)) as t:
    for emotion, dataset in emotion_data_dict.items():
        t.set_description('Processing "' + emotion + '" data')

        # Same text pipeline as above: clean, split into tokens, map to ids,
        # then pad to the sequence length the model was built with.
        cleaned_texts = preprocess(dataset.text, quiet=True)
        predict_sequences = [text.split() for text in cleaned_texts]
        list_tokenized_predict = tokenizer.texts_to_sequences(predict_sequences)
        x_predict = pad_sequences(list_tokenized_predict, maxlen=input_length)

        # The model ends in Dense(1), so predict() returns shape (rows, 1).
        # Flatten it to one score per row so the mask below is a plain 1-D
        # row filter instead of a 2-D array.
        scores = model.predict(x_predict).ravel()
        mean = np.mean(scores)
        low, high = get_score_range(mean)
        messages.append(emotion.capitalize() + ": Score Range: {:4f} - {:4f}".format(low, high))

        # The mask is positional, so the duplicated index labels left over
        # from concatenating several files do not matter. Copy before adding
        # the label so the grouped source frame is never touched.
        selected = dataset[(scores >= low) & (scores <= high)].copy()
        selected.insert(0, 'label', emotion)

        result_data.append(selected)
        t.update()

for message in messages:
    print(message)

Processing "fear" data: 100%|██████████| 4/4 [02:44<00:00, 41.18s/it]   

Joy: Score Range: 0.791465 - 1.000000
Anger: Score Range: 0.000000 - 0.458358
Sadness: Score Range: 0.000000 - 0.244089
Fear: Score Range: 0.000000 - 0.498132





## Save dataset

Save the resulting data

In [19]:
# Merge the per-emotion frames and write them to a single CSV file.
# Nothing is written when no data was produced.
if len(result_data) > 0:
    # This cell rebinds `result_data` from a list of frames to one frame.
    # On a re-run it is already a DataFrame, and passing that to pd.concat
    # would raise, so only concatenate while it is still the list.
    if not isinstance(result_data, pd.DataFrame):
        result_data = pd.concat(result_data)

    path = Path('../datasets/sentiment_analysis/dataset.csv').resolve()
    # Make sure the target directory exists before writing.
    path.parent.mkdir(parents=True, exist_ok=True)
    # index=False: the row index is not written to the file.
    result_data.to_csv(path, index=False)

    print('Files saved under "' + path.as_posix() + '"')

Files saved under "/Users/vladislavklyuev/Desktop/Thesis/Realisation/emotion-from-tweet-1.0.0/datasets/sentiment_analysis/dataset.csv"
