# Assignment Two:  Sentiment Classification

For this exercise you will be using the "SemEval 2017 task 4" corpus provided on the module website, available through the following link: <https://warwick.ac.uk/fac/sci/dcs/teaching/material/cs918/semeval-tweets.tar.bz2>. You will focus particularly on Subtask A, i.e. classifying the overall sentiment of a tweet as positive, negative or neutral.

You are required to produce a Jupyter notebook for the coursework submission. The input to your program is the downloaded SemEval data. Note that the TAs need to run your program on their own machines using the original SemEval data. As such, don’t submit a Python program that takes preprocessed files as input.

#### Import necessary packages
You may import more packages here.

In [None]:
# Import necessary packages
from os.path import join
from utils.file import read_file_lines_from

In [None]:
# Define test sets: the provided files twitter-test1.txt .. twitter-test3.txt
test_set_names: tuple[str, ...] = tuple(
    f'twitter-test{index}.txt' for index in range(1, 4)
)

In [None]:
# Sentiment labels known to the evaluation code in this cell. The order of this
# tuple fixes the row/column order of the matrix printed by confusion().
gts: tuple[str, ...] = ('positive', 'negative', 'neutral')

# Skeleton: Evaluation code for the test sets
def read_test(test_set_name_: str) -> dict[str, str]:
    """
    Read in the test_set and return a dictionary of ground-truth labels.

    Every non-blank line is split on tabs; the first field is used as the tweet
    id and the second field as the ground-truth sentiment. Blank lines (for
    example a trailing empty line at the end of the file) are skipped instead
    of raising an IndexError.

    :param test_set_name_: str, the file name of the test_set to compare; it is
        looked up relative to the 'data/' directory
    :return: a dictionary formatted as {<tweet_id>:<sentiment>, ... }; when a
        tweet id occurs more than once, the last occurrence wins
    :raises IndexError: if a non-blank line has fewer than two tab-separated fields
    """
    id_gts: dict[str, str] = {}
    for line in read_file_lines_from(f'data/{test_set_name_}'):
        if not line.strip():
            continue

        fields = line.split('\t')
        # Strip the label so that it still matches the known sentiment names
        # if it is the last field of a line that kept its line terminator.
        id_gts[fields[0]] = fields[1].strip()

    return id_gts


def confusion(id_predicts_: dict[str, str], test_set_name_: str) -> None:
    """
    Print the row-normalised confusion matrix between predicts and test_set.

    Rows are predicted labels and columns are ground-truth labels, both in the
    order of ``gts``. Each cell is the fraction of the tweets predicted as the
    row label whose ground truth is the column label. A label that was never
    predicted is printed as a row of 0.000. Tweets of the test set that are
    missing from ``id_predicts_`` are counted as predicted 'neutral'.

    :param id_predicts_: a dictionary of predictions formatted as {<tweet_id>:<sentiment>, ... }
    :param test_set_name_: str, the file name of the test_set to compare
    :raises KeyError: if a predicted or ground-truth label is not one of ``gts``
    """
    id_gts: dict[str, str] = read_test(test_set_name_)

    # conf[predicted][ground_truth] -> number of tweets
    conf: dict[str, dict[str, int]] = {
        pred_label: dict.fromkeys(gts, 0) for pred_label in gts
    }

    for tweet_id, gt in id_gts.items():
        pred = id_predicts_.get(tweet_id, 'neutral')
        conf[pred][gt] += 1

    # Header line with the ground-truth labels
    print(''.ljust(12) + '  '.join(gts))

    for pred_label in gts:
        row = conf[pred_label]
        # The row total is the same for every cell of the row: compute it once
        row_total = sum(row.values())

        print(pred_label.ljust(12), end='')
        for gt_label in gts:
            if row_total > 0:
                print(f"{row[gt_label] / float(row_total):.3f}     ", end='')
            else:
                print('0.000     ', end='')
        print()

    print()


def evaluate(id_predicts_: dict[str, str], test_set_name_: str, classifier_: str) -> None:
    """
    Print the macro-F1 score of {'positive', 'negative'} between predicts and test_set.

    Per-class true positives, false positives and false negatives are counted
    over every tweet of the test set. Tweets that are missing from
    ``id_predicts_`` are counted as predicted 'neutral'. The printed score is
    the mean of the F1 scores of the 'positive' and 'negative' classes; the
    'neutral' class only contributes through the errors it causes in those two.
    A class with no predictions or no ground-truth items gets precision or
    recall 0 instead of dividing by zero, so an empty test set prints 0.000.

    :param id_predicts_: a dictionary of predictions formatted as {<tweet_id>:<sentiment>, ... }
    :param test_set_name_: str, the file name of the test_set to compare
    :param classifier_: str, the name of the classifier
    :raises KeyError: if a predicted or ground-truth label is not one of ``gts``
    """
    id_gts: dict[str, str] = read_test(test_set_name_)

    counts_by_class: dict[str, dict[str, int]] = {
        label: {'tp': 0, 'fp': 0, 'fn': 0} for label in gts
    }

    for tweet_id, gt in id_gts.items():
        pred = id_predicts_.get(tweet_id, 'neutral')

        if gt == pred:
            counts_by_class[gt]['tp'] += 1
        else:
            # A miss is a false negative for the true class and a false
            # positive for the predicted class.
            counts_by_class[gt]['fn'] += 1
            counts_by_class[pred]['fp'] += 1

    f1_sum = 0.0
    for cat, counts in counts_by_class.items():
        # Only the positive and negative classes enter the SemEval score
        if cat not in ('positive', 'negative'):
            continue

        tp = counts['tp']
        predicted = tp + counts['fp']
        actual = tp + counts['fn']

        p = tp / predicted if predicted > 0 else 0.0
        r = tp / actual if actual > 0 else 0.0
        f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0.0

        f1_sum += f1

    sem_eval_macro_f1 = f1_sum / 2

    print(f"{test_set_name_} ({classifier_}): {sem_eval_macro_f1:.3f}")

#### Load training set, dev set and testing set
Here, you need to load the training set, the development set and the test set. For better classification results, you may need to preprocess tweets before sending them to the classifiers.

In [None]:
# Load training set, dev set and testing set
# Four containers, each keyed by the dataset file name.
tweets: dict[str, list[...]] = {}
tweet_ids: dict[str, list[...]] = {}
tweet_gts: dict[str, list[...]] = {}
data: dict[str, list[...]] = {}

for dataset in ('twitter-training-data.txt', *test_set_names):
    # Every dataset starts with its own empty list in each container
    for _container in (data, tweets, tweet_ids, tweet_gts):
        _container[dataset] = []

    # write code to read in the datasets here

#### Build sentiment classifiers
You need to create your own classifiers (at least 3 classifiers). For each classifier, you can choose between bag-of-words features and word-embedding-based features. Each classifier has to be evaluated over the 3 test sets. Make sure your classifiers produce consistent performance across the test sets. Marking will be based on the performance over all 5 test sets (2 of them are not provided to you).

In [None]:
# Build traditional sentiment classifiers. An example classifier name 'svm' is given
# in the code below. You should replace the other two classifier names
# with your own choices. For features used for classifier training,
# the 'bow' feature is given in the code. But you could also explore the
# use of other features.
#
# Every (classifier, features) combination is trained once and then scored on
# each of the test sets.
for classifier in ('svm', '<classifier-2-name>', '<classifier-3-name>',):
    for features in ('bow', '<feature-2-name>',):
        # Skeleton: Creation and training of the classifiers
        if classifier == 'svm':
            # write the svm classifier here
            print('Training ' + classifier)
        elif classifier == '<classifier-2-name>':
            # write the classifier 2 here
            print('Training ' + classifier)
        elif classifier == '<classifier-3-name>':
            # write the classifier 3 here
            print('Training ' + classifier)
        elif classifier == 'LSTM':
            # write the LSTM classifier here
            # This branch only runs once 'LSTM' is added to the classifier
            # tuple above; the 'bow' features are skipped for it.
            if features == 'bow':
                continue
            print('Training ' + classifier)
        else:
            # Skip evaluation for names that have no branch above
            print('Unknown classifier name: ' + classifier)
            continue

        # Prediction performance of the classifiers
        for test_set_name in test_set_names:
            # Expected format: {<tweet_id>: <sentiment>, ...}; tweets left out
            # are scored by evaluate() as 'neutral'.
            id_predicts = {}
            # write the prediction and evaluation code here

            # NOTE(review): read_test() prefixes 'data/' to this path, so the
            # file is looked up under data/semeval-tweets/ - confirm that this
            # matches where the archive was extracted.
            test_set_path = join('semeval-tweets', test_set_name)
            evaluate(id_predicts, test_set_path, features + '-' + classifier)