# Assignment Two:  Sentiment Classification

For this exercise you will be using the "SemEval 2017 task 4" corpus provided on the module website, available through the following link: https://warwick.ac.uk/fac/sci/dcs/teaching/material/cs918/semeval-tweets.tar.bz2 You will focus particularly on Subtask A, i.e. classifying the overall sentiment of a tweet as positive, negative or neutral.

You are requested to produce a Jupyter notebook for the coursework submission. The input to your program is the SemEval data downloaded. Note that TAs need to run your program on their own machine by using the original SemEval data. As such, don’t submit a Python program that takes as input some preprocessed files.

#### Define Utility Functions

In [1]:
from collections.abc import Generator
from os import path, getcwd, listdir, makedirs


def read_file_lines_from(file_path: str, /) -> Generator[str, None, None]:
    """
    Read lines from a file and yield each line as a string.
    The path to the file is relative to the current working directory.

    Args:
        file_path (str): The path to the file to be read.

    Yields:
        str: Each line of the file, stripped of leading and trailing whitespace.
    """
    full_path = path.join(getcwd(), file_path)
    buffer_size = 1024 * 1024
    with open(full_path, 'r', buffering=buffer_size, encoding='utf8') as file:
        for line in file:
            yield line.strip()


def ls(dir_path: str, /) -> tuple[str, ...]:
    """
    List all files and directories in the specified directory.
    """
    full_path = path.join(getcwd(), dir_path)
    return tuple(listdir(full_path))


def mkdir(dir_path: str, /) -> None:
    """
    Create a directory.
    """
    full_path = path.join(getcwd(), dir_path)
    if not path.exists(full_path):
        makedirs(full_path)


def path_exists(location: str, /) -> bool:
    """
    Check if the specified path exists.
    """
    full_path = path.join(getcwd(), location)
    return path.exists(full_path)

In [2]:
from collections.abc import Callable
from threading import Thread, Lock
from typing import final


@final
class BackgroundTask:
    """
    Represents a background task that can be executed concurrently.

    Args:
        task (Callable): The function or method to be executed as a background task.
        *args: Variable length argument list to be passed to the task.
        **kwargs: Arbitrary keyword arguments to be passed to the task.
    """

    def __init__(self, task: Callable[..., None], *args, **kwargs):
        self.__task = Thread(
            target=task,
            args=args,
            kwargs=kwargs,
            daemon=True
        )
        with Lock():
            self.__task.start()

    def wait(self) -> None:
        """
        Waits for the background task to complete.
        """
        return self.__task.join()

In [3]:
def unzip_file_to(file_path: str, /, destination: str) -> None:
    """
    Unzip a file to a specified destination.

    Args:
        file_path (str): The path to the file to be unzipped.
        destination (str): The path to the directory where the file will be unzipped.
    """
    import zipfile
    full_path = path.join(getcwd(), file_path)
    with zipfile.ZipFile(full_path, 'r') as zip_ref:
        zip_ref.extractall(destination)

In [4]:
from typing import Any, Final, Optional
from shelve import open as shelve_open


class GlobalCache:
    """
    A simple global cache for storing data in memory.
    """

    __runtime_cache: Final[dict[str, Any]] = {}
    __cache_file_name: Final[str] = 'cache'

    def put(self, key: str, value: object, /) -> None:
        """
        Put a value into the cache.

        Args:
            key (str): The key to be used to store the value.
            value (object): The value to be stored.
        """

        self.__runtime_cache[key] = value
        with shelve_open(self.__cache_file_name, 'c') as cache:
            cache[key] = value

    def get(self, key: str, /) -> Optional[Any]:
        """
        Get a value from the cache.

        Args:
            key (str): The key to be used to retrieve the value.

        Returns:
            object: The value stored in the cache.
        """

        if key in self.__runtime_cache:
            return self.__runtime_cache[key]

        with shelve_open(self.__cache_file_name, 'c') as cache:
            return cache.get(key)

    def remove(self, key: str, /) -> None:
        """
        Remove a value from the cache.

        Args:
            key (str): The key to be used to remove the value.
        """

        self.__runtime_cache.pop(key, None)
        with shelve_open(self.__cache_file_name, 'c') as cache:
            del cache[key]

    def clear(self) -> None:
        """
        Clear the cache.
        """

        self.__runtime_cache.clear()
        with shelve_open(self.__cache_file_name, 'c') as cache:
            cache.clear()

#### Package imports for Application logic

In [5]:
import pandas as pd
import numpy as np
import csv
import regex as re
import contractions
import torch

from os import cpu_count
from typing import Final, final
from types import NoneType
from string import punctuation, digits
from functools import lru_cache
from concurrent.futures import ProcessPoolExecutor, as_completed
from enum import Enum
from collections import defaultdict, Counter, OrderedDict
from collections.abc import Sequence
from copy import copy
from huggingface_hub import hf_hub_download
from emoji import demojize
from nltk.downloader import download as nltk_download
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel, BertConfig, get_linear_schedule_with_warmup
from tqdm import tqdm

#### Define global instances

In [6]:
dataset_base_path: Final[str] = 'data'
glove_data_dir: Final[str] = f"{dataset_base_path}/glove"
target_glove_file_name: Final[str] = "glove.6B.100d.txt"
target_bert_model_name: Final[str] = 'bert-base-multilingual-uncased'
target_mixtral_model_name: Final[str] = 'mistralai/Mistral-7B-Instruct-v0.2'

# names of the test set files
test_set_names: Final[tuple[str, ...]] = (
    'twitter-test1.txt',
    'twitter-test2.txt',
    'twitter-test3.txt',
)
training_data_file_name: Final[str] = 'twitter-training-data.txt'
devlopment_data_file_name: Final[str] = 'twitter-dev-data.txt'


@final
class Sentiment(Enum):
    """
    An enumeration of the three possible sentiment values.
    """
    positive = 2
    negative = 1
    neutral = 0

    @classmethod
    @lru_cache
    def gts(cls) -> tuple[str, ...]:
        return tuple(cls.__members__.keys())


global_cache = GlobalCache()

TweetID = str
ShouldMarkedAsBackground = NoneType

#### Define data retrieval functions

In [7]:
@lru_cache(typed=True)
def get_tweets_from(file_name_: str, /) -> tuple[dict[TweetID, str], tuple[Sentiment, ...]]:
    """
    Read tweets from a file and return dictionaries containing tweet IDs, contents, and sentiments.

    Parameters:
    - file_name_ (str): The name of the file to read tweets from.

    Returns:
    - A tuple containing two dictionaries:
        - id_gts (dict[TweetID, str]): A dictionary mapping tweet IDs to their contents.
        - id_sentiments (dict[TweetID, Sentiment]): A dictionary mapping tweet IDs to their sentiments.
    """
    id_gts: OrderedDict[TweetID, str] = OrderedDict()
    sentiments: list[Sentiment] = []
    lines = read_file_lines_from(f'{dataset_base_path}/{file_name_}')
    for line in lines:
        fields = line.split('\t')
        tweet_id = fields[0]
        gt = fields[1]
        content = ' '.join(fields[2:])
        id_gts[tweet_id] = content
        sentiments.append(Sentiment[gt])

    return id_gts, tuple(sentiments)

#### Define GloVe data preparation functions

In [8]:
def prepare_glove_data() -> ShouldMarkedAsBackground:
    if path_exists(glove_data_dir) and len(ls(glove_data_dir)) == 4:
        return

    glove_data_pack_name = 'glove.6B.zip'

    hf_hub_download(
        repo_id='stanfordnlp/glove',
        filename=glove_data_pack_name,
        local_dir=dataset_base_path,
        revision='1db2080b2d94def6e5b0386a523102f9d8849e9d',
    )

    # perform shell command using python code since the thread management can be done in python.
    mkdir(glove_data_dir)
    unzip_file_to(
        f'{dataset_base_path}/{glove_data_pack_name}',
        destination=glove_data_dir
    )

In [9]:
@lru_cache(typed=True)
def parse_glove_data(file_name_: str) -> tuple[dict[str, int], np.ndarray]:
    """
    Parse the GloVe data from a given file.

    Args:
        file_name_ (str): The name of the file containing the GloVe data.

    Returns:
        tuple[dict[str, int], np.ndarray]: A tuple containing two elements:
            - A dictionary mapping words to their corresponding indices.
            - A numpy array containing the word vectors.
    """
    file_frame = pd.read_csv(
        f"{glove_data_dir}/{file_name_}",
        delimiter=' ',
        quoting=csv.QUOTE_NONE,
        header=None,
        encoding='utf-8',
        skip_blank_lines=True,
    )
    
    vectors = file_frame.iloc[:, 1:].to_numpy(dtype=np.float64)
    
    # Add a custom <pad> to the top of the vectors
    vectors = np.insert(vectors, 0, np.zeros(vectors.shape[1]), axis=0)

    return (
        file_frame.reset_index().set_index(0)['index'].to_dict(),
        vectors
    )

#### Define data preprocessing functions

In [10]:
def filter_text(src: str, /, *, patterns: Sequence[re.Pattern]) -> str:
    """
    Filters the given source text by removing all occurrences of the specified patterns.

    Args:
        src (str): The source text to be filtered.
        patterns (Sequence[Pattern]): A sequence of regular expression patterns to be removed from the source text.

    Returns:
        str: The filtered text with all occurrences of the specified patterns removed.
    """
    filtered = copy(src)

    for pattern in patterns:
        filtered = pattern.sub('', filtered)

    return filtered


def process_texts(src_dict: dict[Any, str], callable: Callable, *args, **kwargs) -> dict[Any, str]:
    """
    Process a dictionary of texts using a callable function in parallel using a thread pool executor.

    Args:
        src_dict (dict[Any, str]): A dictionary containing the texts to be processed.
        callable (Callable[[str], Any]): A callable function that will be applied to each text.
        *args: Variable length argument list to be passed to the callable function.
        **kargs: Arbitrary keyword arguments to be passed to the callable function.

    Returns:
        dict[Any, str]: A dictionary containing the processed texts.
    """

    multi_process = False

    if multi_process:
        with ProcessPoolExecutor(max_workers=(cpu_count() or 1)) as executor:
            future_to_key = {
                executor.submit(callable, src_dict[key], *args, **kwargs): key for key in src_dict
            }
            return {
                future_to_key[future]: future.result()
                for future in as_completed(future_to_key)
            }

    return {
        key: callable(value, *args, **kwargs)
        for key, value in src_dict.items()
    }


def run_pipelines(
    callables: Sequence[Callable[[str], str]],
    /,
    *,
    tweets: dict[TweetID, str]
) -> dict[str, str]:
    """
    Run a sequence of callables on a dictionary of texts in parallel using a thread pool executor.

    Args:
        callables (Sequence[Callable[[dict[str, str]], dict[str, str]]]): A sequence of callable functions to be applied to the dictionary of texts.
        tweets (dict[str, str]): A dictionary containing the texts to be processed.

    Returns:
        dict[str, str]: A dictionary containing the processed texts.
    """
    processed = copy(tweets)

    for callable in callables:
        processed = process_texts(processed, callable)

    return processed

#### Define confusion matrix function

In [11]:
def show_confusion(*, predict_results: dict[TweetID, Sentiment], test_set_file_name_: str) -> None:
    """
    Display the confusion matrix based on the predicted results and the sentiment labels from the test set file.

    Args:
        predict_results (dict[TweetID, Sentiment]): A dictionary containing the predicted sentiment for each tweet ID.
        test_set_file_name_ (str): The file name of the test set containing the sentiment labels for each tweet ID.
    """
    id_tweets, sentiments = get_tweets_from(test_set_file_name_)

    conf: Final[dict[Sentiment, dict[Sentiment, int]]] = defaultdict(
        lambda: {
            Sentiment.positive: 0,
            Sentiment.negative: 0,
            Sentiment.neutral: 0,
        }
    )

    for tweet_id, sentiment in zip(id_tweets, sentiments):
        if tweet_id in predict_results:
            pred = predict_results[tweet_id]
        else:
            pred = Sentiment.neutral
        conf[pred][sentiment] += 1

    print(''.ljust(12) + '  '.join(Sentiment.gts()))

    for c1 in Sentiment:
        print(c1.name.ljust(12), end='')
        for c2 in Sentiment:
            if c1_sum := sum(conf[c1].values()) > 0:
                p = conf[c1][c2] / float(c1_sum)
                print(f"{p:.3f}     ", end='')
            else:
                print('0.000     ', end='')
        print()
    print()

#### Define evaluation functions

In [12]:
def evaluate(predict_results: dict[TweetID, Sentiment], test_set_file_name_: str, classifier_name_: str) -> None:
    """
    Evaluate the performance of a sentiment classifier by comparing the predicted results with the ground truth sentiment labels.

    Parameters:
        - predict_results: A dictionary mapping TweetIDs to predicted Sentiments.
        - test_set_file_name_: The name of the test set file.
        - classifier_name_: The name of the classifier.
    """
    id_tweets, sentiments = get_tweets_from(test_set_file_name_)

    acc_by_class: Final[dict[Sentiment, dict[str, int]]] = defaultdict(
        lambda: {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
    )

    for tweet_id, sentiment in zip(id_tweets, sentiments):
        if tweet_id in predict_results:
            pred = predict_results[tweet_id]
        else:
            pred = Sentiment.neutral

        if sentiment == pred:
            acc_by_class[sentiment]['tp'] += 1
        else:
            acc_by_class[sentiment]['fn'] += 1
            acc_by_class[pred]['fp'] += 1

    cat_count = 0
    item_count = 0
    macro: dict[str, float] = {'p': 0.0, 'r': 0.0, 'f1': 0.0}
    micro: dict[str, float] = {'p': 0.0, 'r': 0.0, 'f1': 0.0}
    sem_eval_macro: dict[str, float] = {'p': 0.0, 'r': 0.0, 'f1': 0.0}

    micro_tp = 0.0
    micro_fp = 0.0
    micro_tn = 0.0
    micro_fn = 0.0

    cat_f1s: dict[Sentiment, float] = {}

    for cat, acc in acc_by_class.items():
        cat_count += 1

        micro_tp += acc['tp']
        micro_fp += acc['fp']
        micro_tn += acc['tn']
        micro_fn += acc['fn']

        p = 0.0
        if (acc['tp'] + acc['fp']) > 0:
            p = float(acc['tp']) / (acc['tp'] + acc['fp'])

        r = 0.0
        if (acc['tp'] + acc['fn']) > 0:
            r = float(acc['tp']) / (acc['tp'] + acc['fn'])

        f1 = 0.0
        if (p + r) > 0:
            f1 = 2 * p * r / (p + r)

        cat_f1s[cat] = f1

        n = acc['tp'] + acc['fn']

        macro['p'] += p
        macro['r'] += r
        macro['f1'] += f1

        if cat in ['positive', 'negative']:
            sem_eval_macro['p'] += p
            sem_eval_macro['r'] += r
            sem_eval_macro['f1'] += f1

        item_count += n

    micro['p'] = micro_tp / (micro_tp + micro_fp)
    micro['r'] = micro_tp / (micro_tp + micro_fn)
    micro['f1'] = 2 * micro['p'] * micro['r'] / (micro['p'] + micro['r'])

    sem_eval_macro_f1 = sem_eval_macro['f1'] / 2

    print(
        f"{test_set_file_name_} ({classifier_name_}): {sem_eval_macro_f1:.3f}"
    )

#### Load training set, dev set and testing set
Here, you need to load the training set, the development set and the test set. For better classification results, you may need to preprocess tweets before sending them to the classifiers.

In [13]:
training_data, training_sentiments = get_tweets_from(training_data_file_name)
dev_data, dev_sentiments = get_tweets_from(devlopment_data_file_name)
test_datas, test_sentiments = zip(*[
    get_tweets_from(file_name)
    for file_name in test_set_names
])

#### Download network resources

In [14]:
glove_prepare_task = BackgroundTask(prepare_glove_data)


def download_nltk_resources(resource_names: Sequence[str]) -> ShouldMarkedAsBackground:
    for resource_name in resource_names:
        nltk_download(resource_name, quiet=True)


nltk_prepare_task = BackgroundTask(
    download_nltk_resources,
    ('stopwords', 'vader_lexicon', 'punkt', 'wordnet',)
)


def pre_load_bert_tokenizer() -> ShouldMarkedAsBackground:
    BertTokenizer.from_pretrained(target_bert_model_name)
    BertModel.from_pretrained(target_bert_model_name)


bert_prepare_task = BackgroundTask(pre_load_bert_tokenizer)

#### Lowercase the tweets

In [15]:
# Lowercase all the words.

def lowercase_tweet(tweet: str, /) -> str:
    return tweet.lower()

#### Regexp filter

In [16]:
# Filter the tweets based on the selected regexp patterns.

re_flags = re.IGNORECASE | re.MULTILINE

pattern_html_tags = re.compile(r'<[^>]+?>', re_flags)
pattern_mentions = re.compile(r'@[a-zA-Z0-9_]+', re_flags)
pattern_hashtags = re.compile(r'#[a-zA-Z0-9_]+', re_flags)
pattern_alphanumeric = re.compile(r'[^a-zA-Z0-9 ]+?', re_flags)
pattern_only_one_char = re.compile(r'\b[a-zA-Z0-9]\b', re_flags)
pattern_fully_numeric = re.compile(r'\b([0-9]+)\b', re_flags)

pattern_punctuation = re.compile(
    "[" + re.escape(punctuation+"“”…‘’") + "]+?",
    re_flags
)
custom_removal_puncuations = punctuation+'“”…‘’'
punctuation_translator = str.maketrans(
    custom_removal_puncuations,
    ' ' * len(custom_removal_puncuations)
)

pattern_url = re.compile(
    r'(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?'
    r'(//(?:[a-zA-Z0-9-._~%!$&\'()*+,;=:]*(?::[a-zA-Z0-9-._~%!$&\'()*+,;=:]+)?@)?'
    r'(?:\[[0-9a-fA-F:.]+]|(?:[a-zA-Z0-9-]+\.)*[a-zA-Z]{2,}|[0-9.]+|localhost)'
    r'(?::\d+)?)(/[a-zA-Z0-9-._~%!$&\'()*+,;=:@]*/?)*'
    r'(?:\?[a-zA-Z0-9-._~%!$&\'()*+,;=:@/]*)?'
    r'(?:#[a-zA-Z0-9-._~%!$&\'()*+,;=:@/]*)?',
    re_flags
)

# all_emojis = tuple(EMOJI_DATA.keys())
# pattern_emojis = re.compile('|'.join(map(re.escape, all_emojis)) + '?')

pattern_ampm = re.compile(r'([0-9]+(am|pm))')
pattern_ordinals = re.compile(r'([0-9]+)(?:st|nd|rd|th)')

selected_filter_patterns: tuple[re.Pattern[str], ...] = (
    pattern_url,
    pattern_html_tags,
    pattern_mentions,
    pattern_hashtags,
    pattern_fully_numeric,
    pattern_only_one_char,
    pattern_ampm,
    pattern_ordinals,
)


def regexp_filter(tweet: str, /) -> str:
    return filter_text(tweet, patterns=selected_filter_patterns)


def punctuation_filter(tweet: str, /) -> str:
    return tweet.translate(punctuation_translator)

#### Remove emojis

In [17]:
def remove_emojis(tweet: str, /) -> str:
    return demojize(tweet, delimiters=('', ''))

#### Remove numbers

In [18]:
remove_digits = str.maketrans('', '', digits)


def remove_numbers(tweet: str, /) -> str:
    return tweet.translate(remove_digits)

#### Remove non-English words

In [19]:
from langdetect import detect

def remove_non_en(tweet: str, /) -> str:
    try:
        if detect(tweet) == 'en':
            return tweet
        else:
            return ''
    except:
        return ''

#### Tokenization & Lemmatization

In [20]:
nltk_prepare_task.wait()

lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer(
    reduce_len=True,
    strip_handles=True,
    preserve_case=False
)

custom_preserved_words: frozenset[str] = frozenset((
    'not', "wouldn't", 'nor',
    'but', "weren't", 'couldn',
    'didn', 'doesn', 'hadn',
    'hasn', 'haven', 'isn', 'mightn',
    "aren't", 'mustn', 'needn',
    "shan't", 'shouldn', 'wasn',
    "mustn't", "didn't", "doesn't",
    "mightn't", "isn't", "shouldn't",
    "mustn't", "haven", "isn",
    "didn't", "aren't", "wouldn't",
    "shouldn't", "couldn't", "hadn't",
    "don't", "won't", "can't",
    "hasn't", "wasn't", 'the'
))
custom_stop_words: frozenset[str] = frozenset((
    "call", 'upon', 'still', 'nevertheless', 
    'down', 'every', 'forty', '‘re', 'always', 
    'whole', 'side', 'now', 'however', 
    'an', 'show', 'least', 'give', 'below', 
    'did', 'sometimes', 'which', "'s", 
    'nowhere', 'per', 'hereupon', 'yours', 
    'she', 'moreover', 'eight', 'somewhere', 
    'within', 'whereby', 'few', 'has', 'so', 
    'have', 'for', 'noone', 'top', 'were', 
    'those', 'thence', 'eleven', 'after', 
    '’ll', 'others', 'ourselves', 
    'themselves', 'though', 'that', 'just', '’s', 
    'before', 'had', 'toward', 'another', 'should', 
    'herself', 'and', 'these', 'such', 'elsewhere',
    'further', 'next', 'indeed', 'bottom', 'anyone',
    'his', 'each', 'then', 'both', 'became', 'third',
    'whom', '‘ve', 'mine', 'take', 'many', 'anywhere',
    'to', 'well', 'thereafter', 'besides', 'almost',
    'front', 'fifteen', 'towards', 'be', 'herein',
    'two', 'using', 'whatever', 'please', 'perhaps',
    'full', 'ca', 'we', 'latterly', 'here', 'therefore',
    'us', 'how', 'was', 'made', 'or', 'may', '’re',
    'namely', "'ve", 'anyway', 'amongst', 'used', 'ever',
    'of', 'there', 'than', 'why', 'really', 'whither', 'in',
    'only', 'wherein', 'last', 'under', 'own', 'therein',
    'go', 'seems', '‘m', 'wherever', 'either', 'someone',
    'up', 'doing', 'on', 'rather', 'ours', 'again', 'same',
    'over', '‘s', 'latter', 'during', 'done', "'re", 
    'put', "'m", 'much', 'among', 'seemed', 'into', 
    'once', 'my', 'otherwise', 'part', 'everywhere',
    'myself', 'must', 'will', 'am', 'although',
    'as', 'beyond', 'are', 'too', 'becomes', 'does', 
    'a', 'everyone', 'some', 'regarding', '‘ll', 
    'throughout', 'yourselves', 'him', "'d", 'it',
    'himself', 'whether', 'move', '’m', 'hereafter',
    're', 'while', 'whoever', 'your', 'first', 'amount',
    'twelve', 'serious', 'other', 'any', 'off', 'seeming',
    'four', 'itself', 'nothing', 'beforehand', 'make', 'out',
    'very', 'already', 'various', 'until', 'hers', 'they', 
    'them', 'where', 'would', 'since', 'everything', 'at', 
    'together', 'yet', 'more', 'six', 'back', 'with', 'thereupon',
    'becoming', 'around', 'due', 'keep', 'somehow', 'across',
    'all', 'when', 'i', 'empty', 'nine', 'five', 'get', 'see',
    'been', 'name', 'between', 'hence', 'ten', 'several', 'from',
    'whereupon', 'through', 'hereby', "'ll", 'alone', 'something',
    'formerly','above', 'onto', 'except', 'enough', 'become', 
    'behind', '’d', 'its', 'most', 'n’t', 'might', 'whereas',
    'anything', 'if', 'her', 'via', 'fifty', 'is', 'thereby', 
    'twenty', 'often', 'whereafter', 'their', 'also', 'anyhow', 
    'our', 'could', 'because', 'who', 'beside', 'by', 'whence', 
    'being', 'meanwhile', 'this', 'afterwards', 'whenever', 'mostly',
    'what', 'one', 'nobody', 'seem', 'do', '‘d', 'say',
    'thus', 'unless', 'along', 'yourself', 'former', 'thru',
    'he', 'hundred', 'three', 'sixty', 'me', 'sometime', 'amp',
    'whose', 'you', 'quite', '’ve', 'about', 'even',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
    'saturday', 'sunday', 'january', 'february', 'march', 'april',
    'may', 'june', 'july', 'august', 'september', 'october',
    'november', 'december'
))
stop_words = (frozenset(stopwords.words('english')) | custom_stop_words) - custom_preserved_words


def nltk_tokenize_anti_stopwords_lemmatize(tweet: str, /) -> str:
    wordnet.ensure_loaded()
    tokens = tokenizer.tokenize(tweet)
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )

#### Fix contractions

In [21]:
def fix_contractions(tweet: str, /) -> str:
    return f'{contractions.fix(tweet)}'

#### Run all preprocessing steps

In [22]:
all_pipelines = (
    lowercase_tweet,
    fix_contractions,
    regexp_filter,
    remove_emojis,
    remove_numbers,
    punctuation_filter,
    nltk_tokenize_anti_stopwords_lemmatize,
)

cleaned_training_tweets = run_pipelines(
    all_pipelines,
    tweets=training_data
)

cleaned_dev_tweets = run_pipelines(
    all_pipelines,
    tweets=dev_data
)

cleaned_test_tweets = [
    run_pipelines(
        all_pipelines,
        tweets=test_data
    )
    for test_data in test_datas
]

#### Show the top 10 most frequent words in the training set

In [23]:
all_words = ' '.join(cleaned_training_tweets.values()).split()
word_freq = Counter(all_words)

freq_frame = pd.DataFrame(
    word_freq.most_common(20),
    columns=['word', 'freq']
)

print(freq_frame)

        word   freq
0        the  34055
1        not   6834
2   tomorrow   5911
3        but   3702
4        day   3650
5      going   3282
6      night   2558
7       like   2405
8       time   2367
9       want   1848
10       new   1699
11      game   1666
12      know   1481
13     today   1443
14      good   1364
15     think   1252
16   tonight   1240
17      come   1230
18    cannot   1157
19      love   1132


In [24]:
# TODO: Dev code.
from pprint import pprint
import random
# generate a random number between 0 and 100

window_size = 15
rand_num = random.randint(0, len(cleaned_training_tweets) - window_size)

pprint(list(cleaned_training_tweets.values())[rand_num:rand_num+window_size])
print(rand_num)

['josh hamilton got the mandatory boo returned angel stadium daily news',
 'highlight grayson allen lead duke past william mary',
 'happy birthday know singapore but happy birthday',
 'kicking the world cup black v tonga yeeeaah looking forward laxing popcorn '
 'glass bubbly',
 'never ending gold rush china china the largest producer gold the biggest '
 'china daily',
 'funny thing think david price going bombed tomorrow win the',
 'playing foo fighter cold day the sun',
 'samsung tease round smartwatch heavy apple watch influence samsung unpacked '
 'event brough',
 'pierce the veil northern light sold albany ny oct',
 'backing hdd preparation the tb sshd coming new p',
 'wheel dad gf tonight but worth kenny chesney jason aldean',
 'think work buying complete manga like fullmetal alchemist naruto not sure',
 'ready madonna the',
 'cannot believe mariah carey want christmas playing right',
 'pancake delicious perfect sleep tomorrow going ascot day long']
128


#### TF-IDF Vectorization

In [25]:
from scipy.sparse import csr_matrix

vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    encoding='utf-8',
    lowercase=False,
    min_df=3,
    max_df=0.8,
    max_features=4940,
    ngram_range=(1, 7)
)

training_matrix = np.asarray(
    csr_matrix(
        vectorizer.fit_transform(cleaned_training_tweets.values())
    ).todense()
)
dev_matrix = np.asarray(
    csr_matrix(vectorizer.transform(cleaned_dev_tweets.values())).todense()
)
test_matrix = [
    np.asarray(csr_matrix(vectorizer.transform(test_data.values())).todense())
    for test_data in cleaned_test_tweets
]

print(training_matrix.shape)

(45101, 4940)


#### Build sentiment classifiers
You need to create your own classifiers (at least 3 classifiers). For each classifier, you can choose between the bag-of-word features and the word-embedding-based features. Each classifier has to be evaluated over 3 test sets. Make sure your classifier produce consistent performance across the test sets. Marking will be based on the performance over all 5 test sets (2 of them are not provided to you).

##### SVM

In [26]:
# SVM

from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

model = LinearSVC(verbose=False, max_iter=1000, dual=False)
model.fit(
    training_matrix,
    [sentiments.value for sentiments in training_sentiments]
)

for x in range(3):
    svm_predictions = model.predict(test_matrix[x])
    svm_report = classification_report(
        [sentiments.value for sentiments in test_sentiments[x]],
        svm_predictions,
        target_names=Sentiment.gts(),
        digits=5
    )

    print(svm_report)

              precision    recall  f1-score   support

    positive    0.59484   0.78191   0.67567      1504
    negative    0.62171   0.33932   0.43902       557
     neutral    0.72640   0.61769   0.66765      1470

    accuracy                        0.64373      3531
   macro avg    0.64765   0.57964   0.59411      3531
weighted avg    0.65385   0.64373   0.63500      3531

              precision    recall  f1-score   support

    positive    0.54842   0.72795   0.62556       669
    negative    0.60909   0.33168   0.42949       202
     neutral    0.75556   0.65784   0.70332       982

    accuracy                        0.64760      1853
   macro avg    0.63769   0.57249   0.58612      1853
weighted avg    0.66481   0.64760   0.64540      1853

              precision    recall  f1-score   support

    positive    0.55296   0.77009   0.64371       983
    negative    0.56502   0.34711   0.43003       363
     neutral    0.71919   0.54792   0.62198      1033

    accuracy        

#### Navie Bayes

In [27]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report

model = BernoulliNB(
    alpha=0.1,
    fit_prior=True,
)
model.fit(
    training_matrix,
    [sentiments.value for sentiments in training_sentiments]
)

for x in range(3):
    svm_predictions = model.predict(test_matrix[x])
    svm_report = classification_report(
        [sentiments.value for sentiments in test_sentiments[x]],
        svm_predictions,
        target_names=Sentiment.gts(),
        digits=5
    )

    print(svm_report)

              precision    recall  f1-score   support

    positive    0.57676   0.71941   0.64024      1504
    negative    0.54662   0.30521   0.39171       557
     neutral    0.67485   0.61701   0.64463      1470

    accuracy                        0.61144      3531
   macro avg    0.59941   0.54721   0.55886      3531
weighted avg    0.61284   0.61144   0.60286      3531

              precision    recall  f1-score   support

    positive    0.53286   0.67862   0.59698       669
    negative    0.57647   0.24257   0.34146       202
     neutral    0.72817   0.67923   0.70285       982

    accuracy                        0.63141      1853
   macro avg    0.61250   0.53348   0.54709      1853
weighted avg    0.64112   0.63141   0.62523      1853

              precision    recall  f1-score   support

    positive    0.54476   0.73042   0.62408       983
    negative    0.48485   0.30854   0.37710       363
     neutral    0.68675   0.55179   0.61192      1033

    accuracy        

#### LSTM Classifier

In [28]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


##### GloVe Embedding Vectorization

In [29]:
@final
class GloVeVectorizer:
    def __init__(self, word_indexes: dict[str, int], word_vectors: np.ndarray):
        self.word_indexes = word_indexes
        self.word_vectors = word_vectors

    def __vectorize_impl(self, tokens: Sequence[str], /) -> np.ndarray:
        vecs = np.zeros(
            (len(tokens), self.word_vectors.shape[1]),
            dtype=np.float64
        )
        for i, token in enumerate(tokens):
            if token in self.word_indexes:
                vecs[i] = self.word_vectors[self.word_indexes[token]]
        return vecs

    def __ordinalization_impl(self, tokens: Sequence[str], /) -> np.ndarray:
        return np.asarray([
            self.word_indexes.get(token, 400001) # 400001 is the index of the <UNK> token.
            for token in tokens
        ])

    @staticmethod
    def __largest_element_count(element_sets: Sequence[Sequence | np.ndarray], /) -> int:
        return max(
            len(element)
            for element in element_sets
        )

    @staticmethod
    def __expand_2d_to_size(
        _2d_targets: Sequence[np.ndarray],
        /,
        *,
        size: int
    ) -> list[np.ndarray]:
        assert size > 0
        assert len(_2d_targets) > 0
        return [
            np.vstack((
                _2d_target,
                np.zeros(
                    (size - _2d_target.shape[0], _2d_target.shape[1]),
                    dtype=np.float64
                )
            ))
            for _2d_target in _2d_targets
        ]

    @staticmethod
    def __expand_1d_to_size(
        _1d_targets: Sequence[np.ndarray],
        /,
        *,
        size: int
    ) -> list[np.ndarray]:
        assert size > 0
        assert len(_1d_targets) > 0
        return [
            np.pad(
                _1d_target,
                (0, size - _1d_target.shape[0]),
                mode='constant',
                constant_values=0 # 0 is the index of the <PAD> token. (self added)
            )
            for _1d_target in _1d_targets
        ]

    def vectorize(self, tweets: dict[TweetID, str], /) -> np.ndarray:
        tokens = [
            self.__vectorize_impl(tweet.split())
            for tweet in tweets.values()
        ]
        return np.asarray(
            self.__expand_2d_to_size(
                tokens,
                size=self.__largest_element_count(tokens)
            )
        )

    def ordinalization(self, tweets: dict[TweetID, str], /) -> np.ndarray:
        indexes = [
            self.__ordinalization_impl(tweet.split())
            for tweet in tweets.values()
        ]
        return np.asarray(
            self.__expand_1d_to_size(
                indexes,
                size=self.__largest_element_count(indexes)
            ),
            dtype=np.int32
        )

##### Define LSTM model

In [30]:
class AttentionMechanism(torch.nn.Module):
    """
    A module that implements an attention mechanism.

    Args:
        hidden_size (int): The size of the hidden state.

    Attributes:
        hidden_size (int): The size of the hidden state.
        attention_weights (torch.Tensor): The attention weights.

    Methods:
        forward(lstm_output: torch.Tensor) -> torch.Tensor: Performs the forward pass of the attention mechanism.

    """

    def __init__(self, hidden_size: int) -> None:
        super(AttentionMechanism, self).__init__()
        self.hidden_size = hidden_size
        self.attention_weights = torch.nn.Parameter(
            torch.Tensor(hidden_size, 1)
        )
        torch.nn.init.xavier_uniform_(self.attention_weights.data, gain=1.414)

    def forward(self, lstm_output: torch.Tensor) -> torch.Tensor:
        attention_scores = torch.matmul(lstm_output, self.attention_weights)
        attention_scores = attention_scores.squeeze(2)
        attention_weights = torch.softmax(attention_scores, dim=1)
        weighted_output = lstm_output * attention_weights.unsqueeze(2)
        return weighted_output.sum(1)


class SentimentLSTMModel(torch.nn.Module):
    """
    A sentiment analysis LSTM model.

    This model takes in a sequence of input tokens and predicts the sentiment of the input text.
    It consists of an embedding layer, a bidirectional LSTM layer, an attention mechanism, and a fully connected layer.

    Args:
        vocab_size (int): The size of the vocabulary.
        embedding_dim (int): The dimension of the word embeddings.
        hidden_size (int): The size of the hidden state of the LSTM.
        output_size (int): The number of output classes.
        pretrained_embeddings (torch.Tensor): Pretrained word embeddings.

    Attributes:
        hidden_size (int): The size of the hidden state of the LSTM.
        embedding (torch.nn.Embedding): The embedding layer.
        lstm (torch.nn.LSTM): The bidirectional LSTM layer.
        attention (AttentionMechanism): The attention mechanism.
        fc (torch.nn.Linear): The fully connected layer.
        dropout (torch.nn.Dropout): The dropout layer.
    """

    def __init__(
        self,
        embedding_dim: int,
        hidden_size: int,
        output_size: int,
        pretrained_embeddings: torch.Tensor
    ):
        super(SentimentLSTMModel, self).__init__()

        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding.from_pretrained(
            embeddings=pretrained_embeddings,
            padding_idx=0,
            freeze=False
        )
        self.embedding.weight.requires_grad = False

        self.lstm = torch.nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=2,
            dropout=0.5,
            batch_first=True,
            bidirectional=True
        )

        self.attention = AttentionMechanism(hidden_size * 2)

        self.dropout = torch.nn.Dropout(0.5)

        self.fc = torch.nn.Linear(hidden_size * 2, output_size)
        
        self.batch_norm = torch.nn.BatchNorm1d(output_size)

    def forward(self, x: torch.Tensor, /) -> torch.Tensor:
        embedded = self.dropout(self.embedding(x))
        lstm_out, _ = self.lstm(embedded)
        attn_out = self.attention(lstm_out)
        out = self.fc(self.dropout(attn_out))
        return self.batch_norm(out)

##### Define LSTM model training and evaluation functions

In [31]:
def calculate_accuracy(predictions: torch.Tensor, labels: torch.Tensor) -> float:
    _, predicted_labels = torch.max(predictions, dim=1)
    correct = (predicted_labels == labels).float()
    accuracy = correct.sum() / len(correct)
    return accuracy.item()


def process_epoch(
    dataset_loader: DataLoader,
    model: SentimentLSTMModel,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    is_training: bool = True
) -> tuple[float, float]:

    if is_training:
        model.train()
    else:
        model.eval()

    total_loss, total_accuracy, batch_count = 0.0, 0.0, 0
    for inputs, labels in dataset_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        with torch.set_grad_enabled(is_training):
            predictions = model(inputs)
            loss = criterion(predictions, labels)
            if is_training:
                optimizer.zero_grad(set_to_none=True)
                loss.backward()
                optimizer.step()

        total_loss += loss.item()
        total_accuracy += calculate_accuracy(predictions, labels)
        batch_count += 1

    avg_loss = total_loss / batch_count
    avg_accuracy = total_accuracy / batch_count
    return avg_loss, avg_accuracy


def run_training_loop(
    model: SentimentLSTMModel,
    train_loader: DataLoader,
    valid_loader: DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    n_epochs: int,
    early_stopping_patience: int
) -> None:
    best_valid_loss = float('inf')
    patience_counter = 0

    for epoch in range(n_epochs):
        train_loss, train_accuracy = process_epoch(
            train_loader,
            model,
            optimizer,
            criterion,
            is_training=True
        )
        valid_loss, valid_accuracy = process_epoch(
            valid_loader,
            model,
            optimizer,
            criterion,
            is_training=False
        )

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            patience_counter = 0
            torch.save(model.state_dict(), 'best_lstm_model.pt')
        else:
            patience_counter += 1
            if patience_counter >= early_stopping_patience:
                print("Early stopping triggered.")
                break

        print(
            f'Epoch {epoch + 1}/{n_epochs} '
            f'- Train Loss: {train_loss:.4f}, '
            f'Train Accuracy: {train_accuracy:.4f}, '
            f'Valid Loss: {valid_loss:.4f}, '
            f'Valid Accuracy: {valid_accuracy:.4f}'
        )


def train_torch_model(
    model: torch.nn.Module,
    train_loader: DataLoader,
    loss_fn: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
) -> float:
    """Train the model for one epoch.

    Args:
        model: The neural network model.
        train_loader: DataLoader for the training dataset.
        loss_fn: The loss function.
        optimizer: The optimizer.
        device: The device to run the training on.

    Returns:
        The average loss for this training epoch.
    """
    model.train()
    total_loss = 0.0

    for tweets, sentiments in train_loader:
        optimizer.zero_grad(set_to_none=True)
        with torch.set_grad_enabled(True), torch.cuda.amp.autocast():
            outputs = model(tweets)
            loss = loss_fn(outputs, sentiments)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    return average_loss


def evaluate_torch_model(
    model: torch.nn.Module,
    val_loader: DataLoader,
    loss_fn: torch.nn.Module,
) -> tuple[float, float]:
    """Evaluate the model performance on the validation set.

    Args:
        model: The neural network model.
        val_loader: DataLoader for the validation dataset.
        device: The device to run the evaluation on.

    Returns:
        A tuple of accuracy and the average loss on the validation set.
    """
    model.eval()
    correct = 0
    total = 0
    total_loss = 0.0

    for tweets, sentiments in val_loader:
        with torch.no_grad(), torch.cuda.amp.autocast():
            outputs = model(tweets)
            loss = loss_fn(outputs, sentiments)
        total_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += sentiments.size(0)
        correct += (predicted == sentiments).sum().item()

    accuracy = correct / total
    average_loss = total_loss / len(val_loader)
    return accuracy, average_loss

##### Get GloVe Embedding vector

In [32]:
glove_word_indexes: Optional[dict[str, int]] = global_cache.get("glove_word_indexes")
glove_word_vectors: Optional[np.ndarray] = global_cache.get("glove_word_vectors")

if (glove_word_indexes is None) or (glove_word_vectors is None):
    glove_prepare_task.wait()
    glove_word_indexes, glove_word_vectors = parse_glove_data(target_glove_file_name)

    try:
        global_cache.put("glove_word_indexes", glove_word_indexes)
        global_cache.put("glove_word_vectors", glove_word_vectors)
    except Exception as e:
        print(e)

##### Setup the model, loss function, and optimizer

In [33]:
assert glove_word_indexes is not None
assert glove_word_vectors is not None

glove_vectorizer = GloVeVectorizer(glove_word_indexes, glove_word_vectors)

training_glove_indexes = glove_vectorizer.ordinalization(
    cleaned_training_tweets
)

# Calculate how many tokens in the training set are not in the GloVe word vectors.
training_glove_tokens = np.count_nonzero(training_glove_indexes == 400001)
print(f"Training set tokens not in GloVe: {training_glove_tokens}")
print(len(training_glove_indexes))

dev_glove_indexes = glove_vectorizer.ordinalization(cleaned_dev_tweets)

test_glove_indexs = [
    glove_vectorizer.ordinalization(test_data)
    for test_data in cleaned_test_tweets
]

training_dataset = TensorDataset(
    torch.from_numpy(training_glove_indexes).to_dense().to(device),
    torch.tensor(
        [sentiments.value for sentiments in training_sentiments]
    ).to_dense().to(device)
)
dev_dataset = TensorDataset(
    torch.from_numpy(dev_glove_indexes).to_dense().to(device),
    torch.tensor(
        [sentiments.value for sentiments in dev_sentiments]
    ).to_dense().to(device)
)
test_datasets = [
    TensorDataset(
        torch.from_numpy(test_glove_index).to_dense().to(device),
        torch.tensor(
            [sentiments.value for sentiments in test_sentiments]
        ).to_dense().to(device)
    )
    for test_glove_index, test_sentiments in zip(
        test_glove_indexs,
        test_sentiments
    )
]

lstm_model = SentimentLSTMModel(
    embedding_dim=glove_word_vectors.shape[1],
    hidden_size=256,
    output_size=len(Sentiment.gts()),
    pretrained_embeddings=torch.from_numpy(glove_word_vectors).float()
).to(device)

lstm_loss_fn = torch.nn.CrossEntropyLoss(
    ignore_index=400001
).to(device)
lstm_optimizer = torch.optim.AdamW(
    lstm_model.parameters(),
    lr=1e-2,
    weight_decay=1e-4
)

print(lstm_model)

Training set tokens not in GloVe: 8298
45101
SentimentLSTMModel(
  (embedding): Embedding(400002, 100, padding_idx=0)
  (lstm): LSTM(100, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (attention): AttentionMechanism()
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=3, bias=True)
  (batch_norm): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


##### train the model

In [34]:
epochs = 20
batch_size = 512
workers = cpu_count() or 1

training_loader = DataLoader(
    training_dataset,
    batch_size=batch_size,
    shuffle=True,
)
dev_loader = DataLoader(
    dev_dataset,
    batch_size=batch_size,
    shuffle=False,
)
test_loaders = [
    DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
    )
    for test_dataset in test_datasets
]
torch.jit.enable_onednn_fusion(True)
torch.backends.cudnn.benchmark = True

for epoch in range(epochs):
    train_loss = train_torch_model(
        lstm_model,
        training_loader,
        lstm_loss_fn,
        lstm_optimizer
    )
    dev_accuracy, dev_loss = evaluate_torch_model(
        lstm_model,
        dev_loader,
        lstm_loss_fn
    )
    print(
        f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.5f}, "
        f"Dev Loss: {dev_loss:.5f}, Dev Accuracy: {dev_accuracy:.5f}"
    )

test_accuracies = [
    evaluate_torch_model(
        lstm_model,
        test_loader,
        lstm_loss_fn
    )
    for test_loader in test_loaders
]

for i, (test_accuracy, _) in enumerate(test_accuracies):
    print(f"Test {i+1} Accuracy: {test_accuracy:.5f}")

Epoch 1/20, Train Loss: 0.93655, Dev Loss: 0.85045, Dev Accuracy: 0.60150
Epoch 2/20, Train Loss: 0.84788, Dev Loss: 0.77900, Dev Accuracy: 0.63200
Epoch 3/20, Train Loss: 0.82878, Dev Loss: 0.75911, Dev Accuracy: 0.64850
Epoch 4/20, Train Loss: 0.81021, Dev Loss: 0.75249, Dev Accuracy: 0.64600
Epoch 5/20, Train Loss: 0.79623, Dev Loss: 0.73259, Dev Accuracy: 0.65950
Epoch 6/20, Train Loss: 0.78665, Dev Loss: 0.73459, Dev Accuracy: 0.66600
Epoch 7/20, Train Loss: 0.77795, Dev Loss: 0.72510, Dev Accuracy: 0.66850
Epoch 8/20, Train Loss: 0.77262, Dev Loss: 0.73088, Dev Accuracy: 0.65800
Epoch 9/20, Train Loss: 0.77104, Dev Loss: 0.72714, Dev Accuracy: 0.66950
Epoch 10/20, Train Loss: 0.76195, Dev Loss: 0.72265, Dev Accuracy: 0.66450
Epoch 11/20, Train Loss: 0.75909, Dev Loss: 0.72807, Dev Accuracy: 0.67300
Epoch 12/20, Train Loss: 0.75556, Dev Loss: 0.73383, Dev Accuracy: 0.66650
Epoch 13/20, Train Loss: 0.75062, Dev Loss: 0.71130, Dev Accuracy: 0.67850
Epoch 14/20, Train Loss: 0.74488, 

#### BERT Classifier

In [41]:
bert_prepare_task.wait()

bert_tokenizer: BertTokenizer = BertTokenizer.from_pretrained(
    target_bert_model_name)

bert_model_pretrained: BertModel = BertModel.from_pretrained(
    target_bert_model_name
)  # type: ignore

In [42]:
def preprocess_bert() -> tuple[TensorDataset, TensorDataset, list[TensorDataset]]:
    training_bert = bert_tokenizer(
        list(cleaned_training_tweets.values()),
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    dev_bert = bert_tokenizer(
        list(cleaned_dev_tweets.values()),
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    test_berts = [
        bert_tokenizer(
            list(test_data.values()),
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        for test_data in cleaned_test_tweets
    ]

    training_bert_dataset = TensorDataset(
        training_bert.input_ids.to_dense().to(device),
        training_bert.attention_mask.to_dense().to(device),
        torch.tensor(
            [sentiments.value for sentiments in training_sentiments]
        ).to_dense().to(device)
    )
    dev_bert_dataset = TensorDataset(
        dev_bert.input_ids.to_dense().to(device),
        dev_bert.attention_mask.to_dense().to(device),
        torch.tensor(
            [sentiments.value for sentiments in dev_sentiments]
        ).to_dense().to(device)
    )
    test_bert_datasets = [
        TensorDataset(
            test_bert.input_ids.to_dense().to(device),
            test_bert.attention_mask.to_dense().to(device),
            torch.tensor(
                [sentiments.value for sentiments in test_sentiments]
            ).to_dense().to(device)
        )
        for test_bert, test_sentiments in zip(
            test_berts,
            test_sentiments
        )
    ]

    return training_bert_dataset, dev_bert_dataset, test_bert_datasets

In [43]:
class SentimentBERTModel(torch.nn.Module):
    def __init__(self, freeze: bool = False, config: BertConfig = BertConfig()):
        super(SentimentBERTModel, self).__init__()
        label_count = len(Sentiment.gts())
        custom_hidden_size = 256
        self.bert = bert_model_pretrained
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(config.hidden_size, custom_hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(custom_hidden_size, label_count),
            torch.nn.Dropout(config.hidden_dropout_prob),
            torch.nn.BatchNorm1d(label_count)
        )

        if freeze:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state_cls = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(last_hidden_state_cls)
        logits = self.classifier(pooled_output)
        return logits

In [44]:
def bert_train(
    model: torch.nn.Module,
    train_loader: DataLoader,
    optimizer: torch.optim.Optimizer,
    scheduler: torch.optim.lr_scheduler.LambdaLR,
    loss_fn: torch.nn.Module,
) -> float:
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader, desc="Bert Training"):
        batch = [item for item in batch]
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
        }
        labels = batch[2]

        optimizer.zero_grad(set_to_none=True)

        with torch.set_grad_enabled(True), torch.cuda.amp.autocast():
            outputs = model(**inputs)
            loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    return average_loss


def bert_evaluate(
    model: torch.nn.Module,
    validation_loader: DataLoader,
    loss_fn: torch.nn.Module,
) -> tuple[float, float]:
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    
    with torch.no_grad(), torch.cuda.amp.autocast():
        for batch in tqdm(validation_loader, desc="Bert Evaluating"):
            batch = [item for item in batch]
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
            }
            labels = batch[2]

            outputs = model(**inputs)
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()
            correct += (torch.argmax(outputs, dim=1) == labels).sum().item()
            total += labels.size(0)

    average_loss = total_loss / len(validation_loader)
    accuracy = correct / total
    return average_loss, accuracy

In [45]:
epochs = 20
batch_size = 32

bert_model = SentimentBERTModel(freeze=True).to(device)
bert_optimizer = torch.optim.AdamW(bert_model.parameters(), lr=2e-5)
bert_looser = torch.nn.CrossEntropyLoss().to(device)

print(bert_model)

total_steps = len(cleaned_training_tweets) * epochs
bert_scheduler = get_linear_schedule_with_warmup(
    bert_optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

training_bert_dataset, dev_bert_dataset, test_bert_datasets = preprocess_bert()

training_bert_loader = DataLoader(
    training_bert_dataset,
    batch_size=batch_size,
    shuffle=True
)
dev_bert_loader = DataLoader(
    dev_bert_dataset,
    batch_size=batch_size,
    shuffle=False
)
test_bert_loaders = [
    DataLoader(
        test_bert_dataset,
        batch_size=batch_size,
        shuffle=False
    )
    for test_bert_dataset in test_bert_datasets
]

SentimentBERTModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [46]:
for epoch in range(epochs):
    bert_train_loss = bert_train(
        bert_model,
        training_bert_loader,
        bert_optimizer,
        bert_scheduler,
        bert_looser
    )
    bert_dev_loss, bert_dev_accuracy = bert_evaluate(
        bert_model,
        dev_bert_loader,
        bert_looser
    )
    print(
        f"Epoch {epoch+1}/{epochs}, Train Loss: {bert_train_loss:.5f}, "
        f"Dev Loss: {bert_dev_loss:.5f}, "
        f"Dev Accuracy: {bert_dev_accuracy:.5f}"
    )

bert_test_results = [
    bert_evaluate(
        bert_model,
        test_bert_loader,
        bert_looser
    )
    for test_bert_loader in test_bert_loaders
]

for i, (test_loss, test_accuracy) in enumerate(bert_test_results):
    print(f"Test {i+1} Loss: {test_loss:.5f}, Test {i+1} Accuracy: {test_accuracy:.5f}")

Bert Training: 100%|██████████| 1410/1410 [01:19<00:00, 17.74it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 1/20, Train Loss: 1.11937, Dev Loss: 0.96240, Dev Accuracy: 0.53550


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.27it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 2/20, Train Loss: 1.02891, Dev Loss: 0.94133, Dev Accuracy: 0.53550


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.26it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 3/20, Train Loss: 1.00876, Dev Loss: 0.93975, Dev Accuracy: 0.53900


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.26it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 4/20, Train Loss: 0.99448, Dev Loss: 0.93596, Dev Accuracy: 0.55050


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.25it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 5/20, Train Loss: 0.98747, Dev Loss: 0.91360, Dev Accuracy: 0.55500


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.25it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 6/20, Train Loss: 0.97815, Dev Loss: 0.89665, Dev Accuracy: 0.57950


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.25it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 7/20, Train Loss: 0.97189, Dev Loss: 0.90067, Dev Accuracy: 0.56000


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.25it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 8/20, Train Loss: 0.96291, Dev Loss: 0.88619, Dev Accuracy: 0.58100


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.24it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 9/20, Train Loss: 0.95903, Dev Loss: 0.90733, Dev Accuracy: 0.55700


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.25it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 10/20, Train Loss: 0.95434, Dev Loss: 0.89884, Dev Accuracy: 0.56450


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.24it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 11/20, Train Loss: 0.94655, Dev Loss: 0.87950, Dev Accuracy: 0.58750


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.23it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 12/20, Train Loss: 0.94839, Dev Loss: 0.88305, Dev Accuracy: 0.58300


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.23it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.28it/s]


Epoch 13/20, Train Loss: 0.93863, Dev Loss: 0.88554, Dev Accuracy: 0.57650


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.23it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.28it/s]


Epoch 14/20, Train Loss: 0.93705, Dev Loss: 0.87803, Dev Accuracy: 0.57500


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.22it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 15/20, Train Loss: 0.93646, Dev Loss: 0.87726, Dev Accuracy: 0.58550


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.23it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 16/20, Train Loss: 0.93162, Dev Loss: 0.87599, Dev Accuracy: 0.57350


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.24it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 17/20, Train Loss: 0.93038, Dev Loss: 0.88928, Dev Accuracy: 0.57500


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.23it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 18/20, Train Loss: 0.92698, Dev Loss: 0.87322, Dev Accuracy: 0.58250


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.23it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 19/20, Train Loss: 0.92501, Dev Loss: 0.87616, Dev Accuracy: 0.57550


Bert Training: 100%|██████████| 1410/1410 [01:21<00:00, 17.23it/s]
Bert Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 20/20, Train Loss: 0.92256, Dev Loss: 0.86651, Dev Accuracy: 0.58250


Bert Evaluating: 100%|██████████| 111/111 [00:06<00:00, 17.95it/s]
Bert Evaluating: 100%|██████████| 58/58 [00:02<00:00, 23.13it/s]
Bert Evaluating: 100%|██████████| 75/75 [00:03<00:00, 23.36it/s]

Test 1 Loss: 0.85966, Test 1 Accuracy: 0.59615
Test 2 Loss: 0.82451, Test 2 Accuracy: 0.60982
Test 3 Loss: 0.87440, Test 3 Accuracy: 0.58806



