In [1]:
pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
pip install TextBlob

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Import Libraries

In [1]:
import emoji
from textblob import TextBlob
import re
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm
import itertools
import numpy as np
import pandas as pd
from IPython.display import clear_output
import spacy
import matplotlib.pyplot as plt
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [2]:
nlp = spacy.blank("en")
%matplotlib inline

## Set GPU Device

In [3]:
# use the GPU
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


## Load Texts and Labels


In [4]:
class TextLoader:
    """Download the files and load them into a dataframe.

    This TextLoader performs file loading through tags filtering and concatting
    texts by positive and negative tags.
    
    Attributes:
    ----------
    pos_label: Pre-defined positive tag
    neg_label: Pre-defined negative tag

    """
    def __init__(self, data_name, pos_label, neg_label):
        self.data_name = data_name
        self.pos_label = pos_label
        self.neg_label = neg_label

    # import the dataset (txt file) line by line
    def load_text(self):
        raw_data = []
        for item in ["_text.txt", "_labels.txt"]:
            with open(self.data_name + item, 'rb') as f:
                texts = []
                for line in f:
                    texts.append(line.decode(errors='ignore').lower().strip())
            raw_data.append(texts)
        return raw_data

    def sample_text(self):
        raw_data = self.load_text()
        raw_texts = raw_data[0]
        raw_labels = raw_data[1]

        pos_indexs = [i for i, x in enumerate(raw_labels) if x == self.pos_label]
        neg_indexs = [i for i, x in enumerate(raw_labels) if x == self.neg_label]
        pos_raw_text = [raw_texts[i] for i in pos_indexs]
        neg_raw_text = [raw_texts[i] for i in neg_indexs]

        # concat negative and positive texts 
        texts = pos_raw_text + neg_raw_text

        # we know the order in texts variable, so we can label it accordingly
        labels = np.array([1]*len(pos_raw_text) + [0]*len(neg_raw_text))

        return texts, labels

## Preprocessing

In [5]:
class TweetCleaner:
    """Clean tweet text data.

    This Cleaner performes text pre-processing through a series of 
    operations while creating a NLP application.

    Attributes
    ----------
    text: series of length n_tweets
    """
     
    def __init__(self, text):
        self.text = text
        self.ps = PorterStemmer()

    def preprocess(self):
        corpus = []

        for text in self.text:
            text = text.replace('\\n', ' ')     # Filtering Line breaks
            text = re.sub('@\S+', '', text)     # Filtering User names
            text = re.sub('#', '', text)        # Change Tags into texts
            new_text = [self.ps.stem(word) for word in text.split()]    # Stemming
            text = ' '.join(new_text)           
            text = emoji.demojize(text)         # turn emoji into text
            text = re.sub(':', ' ', text)       # Change Tags into texts
            text = re.sub('[^a-z _]+', '', text) # Filtering Symbols and nums
            corpus.append(text)
        
        print("Data preprocessing done.")
        return corpus

## Tokenizing

In [6]:
class ContextGenerator:
    """Generate the context
    """  
    def __init__(self):
        return

    def tokenize(self, texts):
        max_len = 0
        tokenized_texts = []
        word2idx = {}

        # Add <pad> and <unk> tokens to the vocabulary
        word2idx['<pad>'] = 0
        word2idx['<unk>'] = 1

        # Building our vocab from the corpus starting from index 0
        idx = 2
        for sent in texts:
            tokenized_sent = nlp(sent)
            # Add `tokenized_sent` to `tokenized_texts`
            tokenized_texts.append(tokenized_sent)
            # Add new token to `word2idx`
            for token in tokenized_sent:
                # string any token objects are different things, be careful.
                if token.text not in word2idx:
                    word2idx[token.text] = idx
                    idx += 1

                # Update `max_len`
            max_len = max(max_len, len(tokenized_sent))
            
        return tokenized_texts, word2idx, max_len

    def encode(self, tokenized_texts, word2idx, max_len):
        input_ids = []
        for tokenized_sent in tokenized_texts:
            # Pad sentences to max_len
            tokenized_padded_sent = list(tokenized_sent) + ['<pad>'] * (max_len - len(tokenized_sent))
    
            # Encode tokens to input_ids
            input_id = [word2idx.get(str(token)) for token in tokenized_padded_sent]
            input_ids.append(input_id)
        
        return np.array(input_ids)

## DataLoader

In [7]:
class Loader:
    """Create DataLoader in Torch.

    This Loader encapsulates TweetCleaner, ContextGenerator and generate
    Torch DataLoader.

    Attributes
    ----------
    dl: object of TweetCleaner
    """
    def __init__(self, dl):
        self.dl = dl
    
    def loader(self):
        texts, labels = self.dl.sample_text()
        cleaner = TweetCleaner(texts)
        texts = cleaner.preprocess()

        cg = ContextGenerator()
        tokenized_texts, word2idx, max_len = cg.tokenize(texts)
        input_ids = cg.encode(tokenized_texts, word2idx, max_len)

        # Convert data type to torch.Tensor
        inputs = torch.from_numpy(input_ids)
        labels = torch.from_numpy(labels)

        # Specify batch_size
        batch_size = 2

        # Create DataLoader for training data
        data = TensorDataset(inputs, labels)
        sampler = RandomSampler(data)
        dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)

        return dataloader, len(word2idx)

## CNN Model Class

In [8]:
class CNN(nn.Module):
    def __init__(self,
                 vocab_size=0,
                 embed_dim=300,
                 filter_sizes=[3, 4, 5],
                 num_filters=[100, 100, 100],
                 num_classes=2,
                 dropout=0.5):
        """
        The constructor for CNN class.
        Args:
            vocab_size (int): Vocabulary size.
            embed_dim (int): Dimension of word vectors.
                when pretrained word embeddings are not used. Default: 300
            filter_sizes (List[int]): List of filter sizes. Default: [3, 4, 5]
            num_filters (List[int]): List of number of filters, has the same
                length as `filter_sizes`. Default: [100, 100, 100]
            n_classes (int): Number of classes. Default: 2
            dropout (float): Dropout rate. Default: 0.5
        """

        super(CNN, self).__init__()
        # Random Embedding layer
        self.embed_dim = embed_dim
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=self.embed_dim,
                                          padding_idx=0,
                                          max_norm=5.0)
        # Conv Network
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels=num_filters[i],
                      kernel_size=filter_sizes[i])
            for i in range(len(filter_sizes))
        ])
        # Fully-connected layer and Dropout
        self.fc = nn.Linear(np.sum(num_filters), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        """Perform a forward pass through the network.

        Args:
            input_ids (torch.Tensor): A tensor of token ids with shape
                (batch_size, max_sent_length)

        Returns:
            logits (torch.Tensor): Output logits with shape (batch_size,
                n_classes)
        """

        # Get embeddings from `input_ids`. Output shape: (b, max_len, embed_dim)
        x_embed = self.embedding(input_ids).float()

        # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
        # Output shape: (b, embed_dim, max_len)
        x_reshaped = x_embed.permute(0, 2, 1)

        # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

        # Max pooling. Output shape: (b, num_filters[i], 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]
        
        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (b, sum(num_filters))
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)
        
        # Compute logits. Output shape: (b, n_classes)
        logits = self.fc(self.dropout(x_fc))

        return logits

## DataLoader Implement for dataset 1 (Joy-Anger)

In [9]:
train_dl = TextLoader("train", "1", "0")
test_dl = TextLoader("test", "1", "0")
val_dl = TextLoader("val", "1", "0")

train_DL = Loader(train_dl)
test_DL = Loader(test_dl)
val_DL = Loader(val_dl)

trainloader, size = train_DL.loader()
testloader, item1 = test_DL.loader()
valloader, item2= val_DL.loader()

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


## DataLoader Implement for dataset 2 (Joy-Sadness)

In [10]:
train_dl_2 = TextLoader("train", "1", "3")
test_dl_2 = TextLoader("test", "1", "3")
val_dl_2 = TextLoader("val", "1", "3")

train_DL_2 = Loader(train_dl_2)
test_DL_2 = Loader(test_dl_2)
val_DL_2 = Loader(val_dl_2)

trainloader_2, size_2 = train_DL_2.loader()
testloader_2, item1 = test_DL_2.loader()
valloader_2, item2= val_DL_2.loader()

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


## Implement CNN Model for both datasets

In [11]:
import torch.optim as optim

# Instantiate CNN model
model = CNN(embed_dim=300,
            vocab_size=size,
            filter_sizes=[3, 5, 7],
            num_filters=[100, 100, 100],
            num_classes=2,
            dropout=0.2)
    
# Send model to `device` (GPU/CPU)
model.to(device)

# Instantiate Adadelta optimizer
optimizer = optim.Adadelta(model.parameters(),
                               lr=0.01)

In [12]:
import torch.optim as optim
# Instantiate CNN model
model_2 = CNN(embed_dim=300,
            vocab_size=size_2,
            filter_sizes=[3, 5, 7],
            num_filters=[100, 100, 100],
            num_classes=2,
            dropout=0.5)
    
# Send model to `device` (GPU/CPU)
model_2.to(device)

# Instantiate Adadelta optimizer
optimizer = optim.Adam(model_2.parameters(),
                               lr=0.01)

## Model Training Function

In [13]:
import random
import time

def training(model):
    # Specify loss function
    loss_fn = nn.CrossEntropyLoss()

    # Start training loop
    print("Start training...\n")
    print(f"{'Epoch':^7} | {'Train Loss':^12}")
    print("-"*60)

    avg_val_losses = []
    for epoch_i in range(30):
        total_loss = 0
        # Put the model into the training mode
        model.train()
        for step, batch in enumerate(trainloader_2):
        
            # Load batch to GPU
            b_input_ids, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()
            
            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)

            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Update parameters
            optimizer.step()

            # Calculate the average loss over the entire training data
            avg_train_loss = total_loss / len(trainloader_2) 
        print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.6f}")
    return model

## Training on dataset 1 (Joy-Anger)

In [12]:
model = training(model)

Start training...

 Epoch  |  Train Loss 
------------------------------------------------------------
   1    |   0.653515  
   2    |   0.638520  
   3    |   0.627388  
   4    |   0.619170  
   5    |   0.607263  
   6    |   0.598289  
   7    |   0.596140  
   8    |   0.582525  
   9    |   0.572793  
  10    |   0.556810  
  11    |   0.553192  
  12    |   0.546828  
  13    |   0.531434  
  14    |   0.520958  
  15    |   0.512214  
  16    |   0.499813  
  17    |   0.490283  
  18    |   0.479410  
  19    |   0.469557  
  20    |   0.453643  
  21    |   0.441909  
  22    |   0.427926  
  23    |   0.413588  
  24    |   0.406225  
  25    |   0.389115  
  26    |   0.385705  
  27    |   0.369956  
  28    |   0.362443  
  29    |   0.349529  
  30    |   0.339331  


## Training on dataset 2 (Joy-Sadness)

In [14]:
model_2 = training(model_2)

Start training...

 Epoch  |  Train Loss 
------------------------------------------------------------
   1    |   9.792317  
   2    |   7.531464  
   3    |   3.841768  
   4    |   2.760264  
   5    |   3.595385  
   6    |   4.363451  
   7    |   4.173997  
   8    |   3.812296  
   9    |   4.160670  
  10    |   3.702138  
  11    |   7.077987  
  12    |   4.935472  
  13    |   4.310239  
  14    |   2.552106  
  15    |   3.575094  
  16    |   2.767444  
  17    |   2.406714  
  18    |   2.269345  
  19    |   3.105965  
  20    |   7.041555  
  21    |   3.585576  
  22    |   3.568698  
  23    |   5.641461  
  24    |   2.841337  
  25    |   2.979980  
  26    |   3.597410  
  27    |   3.010501  
  28    |   1.742867  
  29    |   4.502463  
  30    |   3.319356  


In [21]:
pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Evaluating Function

In [17]:
from torchmetrics.classification import BinaryF1Score
from torchmetrics import Accuracy
classes = ('joy','sadness')

def evaluate(loader, model):
    with torch.no_grad():
        # prepare to count predictions for each class
        correct_pred = {classname: 0 for classname in classes}
        total_pred = {classname: 0 for classname in classes}

        targets = torch.empty(0, dtype=torch.float64)
        outputs = torch.empty(0, dtype=torch.float64)
        preds = torch.empty(0, dtype=torch.float64)
        outputs = outputs.cuda(device)
        targets = targets.cuda(device)
        preds = preds.cuda(device)

        for inputs, label in enumerate(loader):
            b_input_ids, b_labels = tuple(t.to(device) for t in label)
            output = model(b_input_ids)
            _, predicted = torch.max(output, 1)
            
            # collect the correct predictions for each class
            for label, prediction in zip(b_labels, predicted):
                if label == prediction:
                    correct_pred[classes[label]] += 1
                total_pred[classes[label]] += 1
            
            outputs = torch.cat((outputs, output))
            targets = torch.cat((targets, b_labels))
            preds = torch.cat((preds, predicted))
        
        accuracy = 0
        for classname, correct_count in correct_pred.items():
            accuracy = 100 * float(correct_count) / total_pred[classname]
            print(f'Accuracy for class: {classname:5s} is {accuracy:.1f} %')

    f1 = BinaryF1Score().to(device)
    f1_score = f1(preds, targets)
    

    print(f'F1_macro of the network: {f1_score}')

## Evaluation on Model 1
accuracy and F1-macro on the training and development set

### Joy-Anger Training Dataset

In [14]:
evaluate(trainloader, model.to(device))

Accuracy for class: joy   is 98.7 %
Accuracy for class: anger is 80.9 %
F1_macro of the network: 0.8821138143539429


### Joy-Anger Validating Dataset

In [15]:
evaluate(valloader, model.to(device))

Accuracy for class: joy   is 63.1 %
Accuracy for class: anger is 28.9 %
F1_macro of the network: 0.30434781312942505


## Evaluation on Model 2
In this part, we tuned 14 groups of parameters and found the best 3 choices. The output only shows one of them because the codes are not encapsulated perfectly. We wrote OOP-oriented classes most of the time, but there are still some repetitive codes. Optimizing the project codes is time-consuming, however. In this situation, we tuned the parameters manually and wrote the results into a table in the lab report.

### Joy-Anger Testing Dataset
accuracy and F1-macro on the testing set with model 2.

In [16]:
evaluate(testloader, model_2.to(device))

Accuracy for class: joy   is 20.4 %
Accuracy for class: anger is 87.2 %
F1_macro of the network: 0.5601436495780945


### Joy-Sadness Testing Dataset
accuracy and F1-macro on the testing set with model 2.

In [18]:
evaluate(testloader_2, model_2.to(device))

Accuracy for class: joy   is 9.7 %
Accuracy for class: sadness is 86.3 %
F1_macro of the network: 0.6106719374656677
