In [1]:
# CNN
import torch.nn.functional as F
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn

# others
import numpy as np
import matplotlib.pyplot as plt
import time
import os
from PIL import Image
from tempfile import TemporaryDirectory
import time

# dataset
import torchvision
from torchvision import datasets, models, transforms
from torchvision.datasets import Flowers102

# read file 
import pandas as pd

# label
from scipy.io import loadmat
import json

# Use TweetEval emotion recognition dataset 

In [2]:
root = '../../Data/tweeteval/datasets/emotion/'
mapping_file = os.path.join(root, 'mapping.txt')
test_labels_file = os.path.join(root, 'test_labels.txt')
test_text_file = os.path.join(root, 'test_text.txt')
train_labels_file = os.path.join(root, 'train_labels.txt')
train_text_file = os.path.join(root, 'train_text.txt')
val_labels_file = os.path.join(root, 'val_labels.txt')
val_text_file = os.path.join(root, 'val_text.txt')

In [3]:
mapping_pd = pd.read_csv(mapping_file, sep='\t', header=None)
test_label_pd = pd.read_csv(test_labels_file, sep='\t', header=None)
test_dataset = open(test_text_file).read().split('\n')[:-1] # remove last empty line 
train_label_pd = pd.read_csv(train_labels_file, sep='\t', header=None)
train_dataset = open(train_text_file).read().split('\n')[:-1] # remove last empty line
val_label_pd = pd.read_csv(val_labels_file, sep='\t', header=None)
val_dataset = open(val_text_file).read().split('\n')[:-1] # remove last empty line

In [4]:
print(f'len(train_dataset)= {len(train_dataset)}')
print(f'len(train_label_pd)= {len(train_label_pd)}')
print(f'=== train_label_pd === \n{train_label_pd.value_counts()}')

print(f'len(train_dataset)= {len(test_dataset)}')
print(f'len(train_label_pd)= {len(test_label_pd)}')
print(f'=== train_label_pd === \n{test_label_pd.value_counts()}')

len(train_dataset)= 3257
len(train_label_pd)= 3257
=== train_label_pd === 
0    1400
3     855
1     708
2     294
Name: count, dtype: int64
len(train_dataset)= 1421
len(train_label_pd)= 1421
=== train_label_pd === 
0    558
3    382
1    358
2    123
Name: count, dtype: int64


# Task 2
- Use spaCy to split the tweets into words.

Before start using spaCy
```
conda install -c conda-forge spacy
python -m spacy download en_core_web_sm
```

In [5]:
import spacy 
from collections import Counter

# use spacy to tokenize the sentence with english model 
nlp = spacy.load("en_core_web_sm")

# join all the sentence together 
text = ' '.join(train_dataset)

# use spacy to tokenize the sentence 
doc = nlp(text)

# filter out the punctuation and stop words
word_freq = Counter(token.text for token in doc \
                    if not token.is_punct and \
                        not token.is_stop and \
                            not token.is_space )
word_freq

Counter({'@user': 2019,
         'like': 212,
         'amp': 148,
         'people': 126,
         'know': 96,
         'think': 92,
         'sad': 90,
         'got': 85,
         'day': 81,
         'u': 80,
         'time': 78,
         '✨': 75,
         '😂': 75,
         'want': 74,
         'life': 73,
         'going': 69,
         'feel': 67,
         'angry': 66,
         '2': 65,
         'depression': 65,
         'fear': 64,
         'love': 64,
         'good': 63,
         'today': 61,
         'work': 59,
         'fucking': 57,
         'sadness': 56,
         'lost': 56,
         'm': 55,
         'need': 55,
         'new': 53,
         'way': 52,
         'man': 52,
         'anger': 52,
         'awful': 52,
         'terrorism': 52,
         'anxiety': 51,
         'right': 49,
         'let': 48,
         'horrible': 47,
         'rage': 47,
         'fuming': 46,
         'shocking': 44,
         'nightmare': 44,
         'terrible': 43,
         'little': 43,
 

In [45]:
# 選擇最常見的 5000 個單詞作為詞彙表
most_common_words = word_freq.most_common(5000)
# 建立詞彙到索引的映射 e.g. hello -> 1, like -> 2 ...
vocab = {word[0]: idx for idx, word in enumerate(most_common_words)}

[(k, v) for k, v in vocab.items() if v == 4999]

[('Paul.should', 4999)]

In [46]:
# 轉換單詞為索引，超出詞彙表的單詞用佔位索引 5001 代替 因為我們會收集前 5000 個單詞，如果沒有在裡面則用 5001 代替
placeholder_index = 5000
indexed_dataset = []
for tweet in train_dataset:
    indexed_words = []
    for token in nlp(tweet):
        if not token.is_punct and not token.is_stop and not token.is_space:
            word = token.text
            if word in vocab:
                indexed_words.append(vocab[word])
            else:
                indexed_words.append(placeholder_index)
    indexed_dataset.append(indexed_words)

# 打印轉換後的數據
print(indexed_dataset)

[[2013, 3615, 269, 3616, 3617, 1426, 717, 86], [1069, 339, 2014, 2015, 44, 2016], [606, 3618, 340, 162, 3619, 3620, 2017, 3621], [3622, 25, 3623, 25, 196, 3624, 537, 3625], [163, 52, 9, 341, 3626, 140, 1427, 3627], [0, 1428, 380, 270, 3628, 607, 7, 297, 2018], [2019, 3629, 1429, 342, 1070, 718, 3630, 343, 872, 381, 718], [3631, 271, 3632, 1430, 3633, 344, 3634, 210], [0, 2020, 42, 2021, 2022, 17, 3635], [0, 480, 2023, 433], [0, 0, 608, 2024, 74, 10, 111, 609, 71, 39, 244], [3636, 3637, 2, 719, 3638, 272, 3639, 211, 0, 3640, 3641, 873, 65, 3642, 53, 273], [141, 1071, 3643, 720, 3644, 874, 382, 2025, 1431, 245, 1432, 2, 224, 116, 23], [25, 2026, 0, 2027], [3645, 1072, 3646, 383, 3647], [0, 3648, 538, 539, 298, 1433, 538, 1434, 20, 2028], [0, 3649, 610, 611, 721, 3650, 1073, 3651, 3652, 101, 274, 3653, 3654], [1435, 3655, 2029, 14, 117, 176, 1074, 3656], [0, 3657, 39, 46, 118, 3658, 2030, 125, 118, 3659], [0, 0, 0, 3660, 2031, 1436, 3661, 1437, 3662, 3663, 49], [212, 3664, 26, 1438, 3665,

- Follow the example described here. Use the same architecture, but:
  - only use the last output of the LSTM in the loss function
  - use an embedding dim of 128
  - use a hidden dim of 256.  

In [8]:
mapping = dict(zip(mapping_pd[0], mapping_pd[1]))
mapping

{0: 'anger', 1: 'joy', 2: 'optimism', 3: 'sadness'}

In [9]:
EMBEDDING_DIM = 128 # 將word轉換成維度為128的向量
HIDDEN_DIM = 256 # 在RNN或LSTM中模型中隱藏曾神經元的數量大小
word_to_ix = vocab # 詞彙表
tag_to_ix = mapping # 標籤表

In [88]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, dropout=0.0):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        # 把每個詞都轉換成詞向量
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, dropout=dropout)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence) # 將輸入的句子中的每個詞都轉換成詞向量
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1)) # 將詞向量作為LSTM模型的輸入 得到LSTM曾的輸出和隱藏狀態
        # Take only the last output of the LSTM
        last_output = lstm_out[-1].view(1, -1)  # Selecting the last output
        tag_space = self.hidden2tag(last_output) # 將LSTM模型的最後輸出轉換成 詞標籤 空間
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [47]:
len(word_to_ix)+1

5001

In [56]:
# vocab_size 要添加 1 因為如果 sentence 中有出現沒在 vocab 中的單字，使用 5000 來代替，所以要加 1
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix)+1, len(tag_to_ix))
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [12]:
# for sentence to sequence 
def prepare_sentence_sequence(seq, to_ix):
    idx = []
    # use spacy to tokenize the sentence 
    for token in nlp(seq):
        # filter out the punctuation and stop words and space 
        if not token.is_punct and not token.is_stop and not token.is_space:
            word = token.text
            # if the token is in the top 5000 words in the vocab, add its index to the list
            if word in to_ix:
                idx.append(to_ix[word])
            else:
                # else add the index of the placeholder token
                idx.append(placeholder_index)
    return torch.tensor(idx, dtype=torch.long)

In [63]:
def one_hot_encode(val, to_ix):
    result = []
    for k, v in to_ix.items(): 
        if val == k:
            result.append(1)
        else:
            result.append(0)
    return torch.tensor(result, dtype=torch.float32)

In [64]:
print(mapping)
print(one_hot_encode(2, tag_to_ix))

{0: 'anger', 1: 'joy', 2: 'optimism', 3: 'sadness'}
tensor([0., 0., 1., 0.])


In [68]:

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i. ; j有3個、i有5個字，i, j 表示第i個字的第j個tag的分數
# Here we don't need to train, so the code is wrapped in torch.no_grad()
sentence_idx = 1
with torch.no_grad():
    inputs = prepare_sentence_sequence(train_dataset[sentence_idx], word_to_ix)
    labels = one_hot_encode(train_label_pd[0][sentence_idx], tag_to_ix)
    outputs = model(inputs)
    _, preds = torch.max(outputs, 1)
    result_idx = torch.argmax(outputs).item()
    loss = loss_function(outputs[0], labels)

    print(f'First Sentense = {train_dataset[sentence_idx]}')
    print(f'Sentense to tensor = {inputs}')
    print(f'Sentense of result to tensor = {labels}')
    print(f'tag_scores = {outputs}')
    print(f'loss = {loss}')
    print(f'preds = {preds}')
    print(f'result = {result_idx}, ans = {train_label_pd[0][sentence_idx]}')

First Sentense = My roommate: it's okay that we can't spell because we have autocorrect. #terrible #firstworldprobs 
Sentense to tensor = tensor([1070,  340, 2015, 2016,   45, 2017])
Sentense of result to tensor = tensor([1., 0., 0., 0.])
tag_scores = tensor([[-1.3280, -1.4272, -1.4998, -1.3026]])
loss = 1.32795250415802
preds = tensor([3])
result = 3, ans = 0


In [35]:
dataloaders = {'train': train_dataset, 'test': test_dataset}
resultloaders = {'train': train_label_pd[0].tolist(), 'test': test_label_pd[0].tolist()}
dataset_sizes = {x: len(dataloaders[x]) for x in ['train', 'test']}
dataset_sizes

{'train': 3257, 'test': 1421}

In [36]:
phase = 'train'

len(resultloaders[phase])

3257

In [69]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=1):
    since = time.time()
    with TemporaryDirectory() as tempdir:
        best_model_params_path = os.path.join(tempdir, 'best_model_params.pt')

        torch.save(model.state_dict(), best_model_params_path)
        best_acc = 0.0

        for epoch in range(num_epochs):
            print(f'Epoch {epoch}/{num_epochs - 1}')
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'test']:
                if phase == 'train':
                    model.train()
                else: 
                    model.eval()
                
                running_loss = 0.0
                running_corrects = 0

                # Iterate over data.
                for input, label in zip(dataloaders[phase], resultloaders[phase]):
                    inputs_vector = prepare_sentence_sequence(input, word_to_ix)
                    labels_vector = one_hot_encode(label, tag_to_ix)
                    
                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs_vector) # 取得針對每個emotion的預測結果tensor (e.g. tensor([[-1.3948, -1.4476, -1.3804, -1.3261]]))
                        pred = torch.argmax(outputs).item() # 取得最大值的index (e.g. 2)
                        loss = criterion(outputs[0], labels_vector) # 外面還有一層，只需取得內層 [-1.3948, -1.4476, -1.3804, -1.3261] 與 [0, 0, 1, 0] 的計算loss

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    running_loss += loss.item()
                    if pred == label:
                        running_corrects += 1

                if phase == 'train':
                    scheduler.step()

                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects / dataset_sizes[phase]
                print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f} Time elapsed: {round((time.time() - since))} sec.')
                
                # deep copy the model
                if phase == 'test' and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    torch.save(model.state_dict(), best_model_params_path)

            print()

        time_elapsed = time.time() - since
        print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
        print(f'Best val Acc: {best_acc:4f}')

        # load best model weights
        model.load_state_dict(torch.load(best_model_params_path))
    return model
            

In [70]:
model = train_model(model, loss_function, optimizer, exp_lr_scheduler, num_epochs=30)

Epoch 0/29
----------
train Loss: 1.2157 Acc: 0.4642 Time elapsed: 25 sec.
test Loss: 1.2095 Acc: 0.4553 Time elapsed: 32 sec.

Epoch 1/29
----------
train Loss: 1.1019 Acc: 0.5333 Time elapsed: 58 sec.
test Loss: 1.1816 Acc: 0.4708 Time elapsed: 65 sec.

Epoch 2/29
----------
train Loss: 1.0151 Acc: 0.5812 Time elapsed: 92 sec.
test Loss: 1.1603 Acc: 0.4898 Time elapsed: 99 sec.

Epoch 3/29
----------
train Loss: 0.9054 Acc: 0.6386 Time elapsed: 125 sec.
test Loss: 1.1535 Acc: 0.5201 Time elapsed: 131 sec.

Epoch 4/29
----------
train Loss: 0.7659 Acc: 0.7000 Time elapsed: 157 sec.
test Loss: 1.1363 Acc: 0.5348 Time elapsed: 164 sec.

Epoch 5/29
----------
train Loss: 0.6118 Acc: 0.7753 Time elapsed: 191 sec.
test Loss: 1.1445 Acc: 0.5531 Time elapsed: 198 sec.

Epoch 6/29
----------
train Loss: 0.4562 Acc: 0.8410 Time elapsed: 225 sec.
test Loss: 1.2045 Acc: 0.5714 Time elapsed: 232 sec.

Epoch 7/29
----------
train Loss: 0.2953 Acc: 0.9137 Time elapsed: 258 sec.
test Loss: 1.1451 Ac

In [78]:
class GRUTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, dropout=0.0):
        super(GRUTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.gru = nn.GRU(embedding_dim, hidden_dim, dropout=dropout)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence) # 將輸入的句子中的每個詞都轉換成詞向量
        lstm_out, _ = self.gru(embeds.view(len(sentence), 1, -1)) # 將詞向量作為LSTM模型的輸入 得到LSTM曾的輸出和隱藏狀態
        # Take only the last output of the LSTM
        last_output = lstm_out[-1].view(1, -1)  # Selecting the last output
        tag_space = self.hidden2tag(last_output) # 將LSTM模型的最後輸出轉換成 詞標籤 空間
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [85]:
# vocab_size 要添加 2 因為如果 sentence 中有出現沒在 vocab 中的單字，使用 5001 來代替，所以要加 1
modelGRU = GRUTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix)+1, len(tag_to_ix), dropout=0.5)
loss_function_gru = nn.CrossEntropyLoss()
optimizer_gru = optim.SGD(modelGRU.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler_gru = lr_scheduler.StepLR(optimizer_gru, step_size=7, gamma=0.1)

In [86]:
modelGRU = train_model(modelGRU, loss_function_gru, optimizer_gru, exp_lr_scheduler_gru, num_epochs=30)

Epoch 0/29
----------
train Loss: 1.1921 Acc: 0.4842 Time elapsed: 27 sec.
test Loss: 1.2221 Acc: 0.4588 Time elapsed: 34 sec.

Epoch 1/29
----------
train Loss: 1.0732 Acc: 0.5502 Time elapsed: 62 sec.
test Loss: 1.1963 Acc: 0.4701 Time elapsed: 69 sec.

Epoch 2/29
----------
train Loss: 0.9747 Acc: 0.5972 Time elapsed: 96 sec.
test Loss: 1.1551 Acc: 0.5053 Time elapsed: 104 sec.

Epoch 3/29
----------
train Loss: 0.8445 Acc: 0.6702 Time elapsed: 131 sec.
test Loss: 1.1211 Acc: 0.5327 Time elapsed: 138 sec.

Epoch 4/29
----------
train Loss: 0.6843 Acc: 0.7393 Time elapsed: 166 sec.
test Loss: 1.1305 Acc: 0.5707 Time elapsed: 173 sec.

Epoch 5/29
----------
train Loss: 0.5111 Acc: 0.8142 Time elapsed: 200 sec.
test Loss: 1.2057 Acc: 0.5672 Time elapsed: 207 sec.

Epoch 6/29
----------
train Loss: 0.3387 Acc: 0.8901 Time elapsed: 234 sec.
test Loss: 1.3223 Acc: 0.5693 Time elapsed: 241 sec.

Epoch 7/29
----------
train Loss: 0.1779 Acc: 0.9472 Time elapsed: 269 sec.
test Loss: 1.2429 A

In [89]:
# vocab_size 要添加 2 因為如果 sentence 中有出現沒在 vocab 中的單字，使用 5001 來代替，所以要加 1
model_LSTM = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix)+1, len(tag_to_ix), dropout=0.5)
loss_function_LSTM = nn.CrossEntropyLoss()
optimizer_LSTM = optim.SGD(model_LSTM.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler_LSTM = lr_scheduler.StepLR(optimizer_LSTM, step_size=7, gamma=0.1)



In [90]:
modelLSTM = train_model(model_LSTM, loss_function_LSTM, optimizer_LSTM, exp_lr_scheduler_LSTM, num_epochs=30)

Epoch 0/29
----------
train Loss: 1.2062 Acc: 0.4673 Time elapsed: 27 sec.
test Loss: 1.1884 Acc: 0.4659 Time elapsed: 34 sec.

Epoch 1/29
----------
train Loss: 1.0779 Acc: 0.5379 Time elapsed: 62 sec.
test Loss: 1.1504 Acc: 0.4891 Time elapsed: 69 sec.

Epoch 2/29
----------
train Loss: 0.9885 Acc: 0.5840 Time elapsed: 97 sec.
test Loss: 1.1279 Acc: 0.5236 Time elapsed: 104 sec.

Epoch 3/29
----------
train Loss: 0.8893 Acc: 0.6371 Time elapsed: 132 sec.
test Loss: 1.1053 Acc: 0.5369 Time elapsed: 139 sec.

Epoch 4/29
----------
train Loss: 0.7683 Acc: 0.7003 Time elapsed: 168 sec.
test Loss: 1.0772 Acc: 0.5658 Time elapsed: 175 sec.

Epoch 5/29
----------
train Loss: 0.6276 Acc: 0.7679 Time elapsed: 204 sec.
test Loss: 1.0637 Acc: 0.5749 Time elapsed: 211 sec.

Epoch 6/29
----------
train Loss: 0.4785 Acc: 0.8268 Time elapsed: 240 sec.
test Loss: 1.0931 Acc: 0.5890 Time elapsed: 247 sec.

Epoch 7/29
----------
train Loss: 0.2978 Acc: 0.9140 Time elapsed: 275 sec.
test Loss: 1.0633 A