# Assignment Two:  Sentiment Classification

For this exercise you will be using the "SemEval 2017 task 4" corpus provided on the module website, available through the following link: https://warwick.ac.uk/fac/sci/dcs/teaching/material/cs918/semeval-tweets.tar.bz2 You will focus particularly on Subtask A, i.e. classifying the overall sentiment of a tweet as positive, negative or neutral.

You are requested to produce a Jupyter notebook for the coursework submission. The input to your program is the SemEval data downloaded. Note that TAs need to run your program on their own machine by using the original SemEval data. As such, don’t submit a Python program that takes as input some preprocessed files.

#### Define Utility Functions

In [79]:
from collections.abc import Generator
from os import path, getcwd, listdir, makedirs


def read_file_lines_from(file_path: str, /) -> Generator[str, None, None]:
    """
    Read lines from a file and yield each line as a string.
    The path to the file is relative to the current working directory.

    Args:
        file_path (str): The path to the file to be read.

    Yields:
        str: Each line of the file, stripped of leading and trailing whitespace.
    """
    full_path = path.join(getcwd(), file_path)
    buffer_size = 1024 * 1024
    with open(full_path, 'r', buffering=buffer_size, encoding='utf8') as file:
        for line in file:
            yield line.strip()


def ls(dir_path: str, /) -> tuple[str, ...]:
    """
    List all files and directories in the specified directory.
    """
    full_path = path.join(getcwd(), dir_path)
    return tuple(listdir(full_path))


def mkdir(dir_path: str, /) -> None:
    """
    Create a directory.
    """
    full_path = path.join(getcwd(), dir_path)
    if not path.exists(full_path):
        makedirs(full_path)


def path_exists(location: str, /) -> bool:
    """
    Check if the specified path exists.
    """
    full_path = path.join(getcwd(), location)
    return path.exists(full_path)

In [80]:
from collections.abc import Callable
from threading import Thread, Lock
from typing import final


@final
class BackgroundTask:
    """
    Represents a background task that can be executed concurrently.

    Args:
        task (Callable): The function or method to be executed as a background task.
        *args: Variable length argument list to be passed to the task.
        **kwargs: Arbitrary keyword arguments to be passed to the task.
    """

    def __init__(self, task: Callable[..., None], *args, **kwargs):
        self.__task = Thread(
            target=task,
            args=args,
            kwargs=kwargs,
            daemon=True
        )
        with Lock():
            self.__task.start()

    def wait(self) -> None:
        """
        Waits for the background task to complete.
        """
        return self.__task.join()

In [81]:
def unzip_file_to(file_path: str, /, destination: str) -> None:
    """
    Unzip a file to a specified destination.

    Args:
        file_path (str): The path to the file to be unzipped.
        destination (str): The path to the directory where the file will be unzipped.
    """
    import zipfile
    full_path = path.join(getcwd(), file_path)
    with zipfile.ZipFile(full_path, 'r') as zip_ref:
        zip_ref.extractall(destination)

In [82]:
from typing import Any, Final, Optional
from shelve import open as shelve_open


class GlobalCache:
    """
    A simple global cache for storing data in memory.
    """

    __runtime_cache: Final[dict[str, Any]] = {}
    __cache_file_name: Final[str] = 'cache'

    def put(self, key: str, value: object, /) -> None:
        """
        Put a value into the cache.

        Args:
            key (str): The key to be used to store the value.
            value (object): The value to be stored.
        """

        self.__runtime_cache[key] = value
        with shelve_open(self.__cache_file_name, 'c') as cache:
            cache[key] = value

    def get(self, key: str, /) -> Optional[Any]:
        """
        Get a value from the cache.

        Args:
            key (str): The key to be used to retrieve the value.

        Returns:
            object: The value stored in the cache.
        """

        if key in self.__runtime_cache:
            return self.__runtime_cache[key]

        with shelve_open(self.__cache_file_name, 'c') as cache:
            return cache.get(key)

    def remove(self, key: str, /) -> None:
        """
        Remove a value from the cache.

        Args:
            key (str): The key to be used to remove the value.
        """

        self.__runtime_cache.pop(key, None)
        with shelve_open(self.__cache_file_name, 'c') as cache:
            del cache[key]

    def clear(self) -> None:
        """
        Clear the cache.
        """

        self.__runtime_cache.clear()
        with shelve_open(self.__cache_file_name, 'c') as cache:
            cache.clear()

#### Package imports for Application logic

In [83]:
import pandas as pd
import numpy as np
import csv
import regex as re
import contractions

from os import cpu_count
from typing import Final, final
from types import NoneType
from string import punctuation
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor, Future
from enum import Enum
from collections import defaultdict, Counter
from collections.abc import Sequence
from copy import copy
from huggingface_hub import hf_hub_download
from emoji import EMOJI_DATA, demojize
from nltk.downloader import download as nltk_download
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

#### Define global instances

In [84]:
dataset_base_path: Final[str] = 'data'
glove_data_dir: Final[str] = f"{dataset_base_path}/glove"
target_glove_file_name: Final[str] = "glove.6B.100d.txt"

# names of the test set files
test_set_names: Final[tuple[str, ...]] = (
    'twitter-test1.txt',
    'twitter-test2.txt',
    'twitter-test3.txt',
)
training_data_file_name: Final[str] = 'twitter-training-data.txt'
devlopment_data_file_name: Final[str] = 'twitter-dev-data.txt'


@final
class Sentiment(Enum):
    """
    An enumeration of the three possible sentiment values.
    """
    positive = 1
    negative = -1
    neutral = 0

    @classmethod
    @lru_cache
    def gts(cls) -> tuple[str, ...]:
        return tuple(cls.__members__.keys())


global_cache = GlobalCache()

TweetID = str
ShouldMarkedAsBackground = NoneType

#### Define data retrieval functions

In [85]:
@lru_cache(typed=True)
def get_tweets_from(file_name_: str, /) -> tuple[dict[TweetID, str], dict[TweetID, Sentiment]]:
    """
    Read tweets from a file and return dictionaries containing tweet IDs, contents, and sentiments.

    Parameters:
    - file_name_ (str): The name of the file to read tweets from.

    Returns:
    - A tuple containing two dictionaries:
        - id_gts (dict[TweetID, str]): A dictionary mapping tweet IDs to their contents.
        - id_sentiments (dict[TweetID, Sentiment]): A dictionary mapping tweet IDs to their sentiments.
    """
    id_gts: dict[TweetID, str] = {}
    id_sentiments: dict[TweetID, Sentiment] = {}
    lines = read_file_lines_from(f'{dataset_base_path}/{file_name_}')
    for line in lines:
        fields = line.split('\t')
        tweet_id = fields[0]
        gt = fields[1]
        content = ' '.join(fields[2:])
        id_gts[tweet_id] = content
        id_sentiments[tweet_id] = Sentiment[gt]

    return id_gts, id_sentiments

#### Define GloVe data preparation functions

In [86]:
def prepare_glove_data() -> ShouldMarkedAsBackground:
    if path_exists(glove_data_dir) and len(ls(glove_data_dir)) == 4:
        return

    glove_data_pack_name = 'glove.6B.zip'

    hf_hub_download(
        repo_id='stanfordnlp/glove',
        filename=glove_data_pack_name,
        local_dir=dataset_base_path,
        revision='1db2080b2d94def6e5b0386a523102f9d8849e9d',
    )

    # perform shell command using python code since the thread management can be done in python.
    mkdir(glove_data_dir)
    unzip_file_to(
        f'{dataset_base_path}/{glove_data_pack_name}',
        destination=glove_data_dir
    )

In [87]:
@lru_cache(typed=True)
def parse_glove_data(file_name_: str) -> tuple[dict[int, str], np.ndarray]:
    """
    Parse the GloVe data from a given file.

    Args:
        file_name_ (str): The name of the file containing the GloVe data.

    Returns:
        tuple[dict[int, str], np.ndarray]: A tuple containing a dictionary mapping integers to words and a NumPy array of word vectors.
    """
    file_frame = pd.read_csv(
        f"{glove_data_dir}/{file_name_}",
        delimiter=' ',
        quoting=csv.QUOTE_NONE,
        header=None,
        encoding='utf-8',
        skip_blank_lines=True,
    )

    return file_frame[0].to_dict(), file_frame.iloc[:, 1:].to_numpy(dtype=np.float64)

#### Define data preprocessing functions

In [88]:
def filter_text(src: str, /, *, patterns: Sequence[re.Pattern]) -> str:
    """
    Filters the given source text by removing all occurrences of the specified patterns.

    Args:
        src (str): The source text to be filtered.
        patterns (Sequence[Pattern]): A sequence of regular expression patterns to be removed from the source text.

    Returns:
        str: The filtered text with all occurrences of the specified patterns removed.
    """
    filtered = copy(src)

    for pattern in patterns:
        filtered = pattern.sub('', filtered)

    return filtered


def process_texts(src_dict: dict[Any, str], callable: Callable, *args, **kargs) -> dict[Any, str]:
    """
    Process a dictionary of texts using a callable function in parallel using a thread pool executor.

    Args:
        src_dict (dict[Any, str]): A dictionary containing the texts to be processed.
        callable (Callable[[str], Any]): A callable function that will be applied to each text.
        *args: Variable length argument list to be passed to the callable function.
        **kargs: Arbitrary keyword arguments to be passed to the callable function.

    Returns:
        dict[Any, str]: A dictionary containing the processed texts.
    """

    multi_threaded = False

    result: dict[Any, Future[str]] = {}

    if multi_threaded:
        with ThreadPoolExecutor(max_workers=(cpu_count() or 1)+4) as executor:
            for key, value in src_dict.items():
                result[key] = executor.submit(
                    callable,
                    value,
                    *args,
                    **kargs,
                )

        return {key: future.result() for key, future in result.items()}

    return {
        key: callable(value, *args, **kargs)
        for key, value in src_dict.items()
    }


def run_pipelines(
    callables: Sequence[Callable[[str], str]],
    /,
    *,
    tweets: dict[TweetID, str]
) -> dict[str, str]:
    """
    Run a sequence of callables on a dictionary of texts in parallel using a thread pool executor.

    Args:
        callables (Sequence[Callable[[dict[str, str]], dict[str, str]]]): A sequence of callable functions to be applied to the dictionary of texts.
        tweets (dict[str, str]): A dictionary containing the texts to be processed.

    Returns:
        dict[str, str]: A dictionary containing the processed texts.
    """
    processed = copy(tweets)

    for callable in callables:
        processed = process_texts(processed, callable)

    return processed

#### Define confusion matrix function

In [89]:
def show_confusion(*, predict_results: dict[TweetID, Sentiment], test_set_file_name_: str) -> None:
    """
    Display the confusion matrix based on the predicted results and the sentiment labels from the test set file.

    Args:
        predict_results (dict[TweetID, Sentiment]): A dictionary containing the predicted sentiment for each tweet ID.
        test_set_file_name_ (str): The file name of the test set containing the sentiment labels for each tweet ID.
    """
    _, id_sentiments = get_tweets_from(test_set_file_name_)

    conf: Final[dict[Sentiment, dict[Sentiment, int]]] = defaultdict(
        lambda: {
            Sentiment.positive: 0,
            Sentiment.negative: 0,
            Sentiment.neutral: 0,
        }
    )

    for tweet_id, sentiment in id_sentiments.items():
        if tweet_id in predict_results:
            pred = predict_results[tweet_id]
        else:
            pred = Sentiment.neutral
        conf[pred][sentiment] += 1

    print(''.ljust(12) + '  '.join(Sentiment.gts()))

    for c1 in Sentiment:
        print(c1.name.ljust(12), end='')
        for c2 in Sentiment:
            if c1_sum := sum(conf[c1].values()) > 0:
                p = conf[c1][c2] / float(c1_sum)
                print(f"{p:.3f}     ", end='')
            else:
                print('0.000     ', end='')
        print()
    print()

#### Define evaluation functions

In [90]:
def evaluate(predict_results: dict[TweetID, Sentiment], test_set_file_name_: str, classifier_name_: str) -> None:
    """
    Evaluate the performance of a sentiment classifier by comparing the predicted results with the ground truth sentiment labels.

    Parameters:
        - predict_results: A dictionary mapping TweetIDs to predicted Sentiments.
        - test_set_file_name_: The name of the test set file.
        - classifier_name_: The name of the classifier.
    """
    _, id_sentiments = get_tweets_from(test_set_file_name_)

    acc_by_class: Final[dict[Sentiment, dict[str, int]]] = defaultdict(
        lambda: {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
    )

    for tweet_id, sentiment in id_sentiments.items():
        if tweet_id in predict_results:
            pred = predict_results[tweet_id]
        else:
            pred = Sentiment.neutral

        if sentiment == pred:
            acc_by_class[sentiment]['tp'] += 1
        else:
            acc_by_class[sentiment]['fn'] += 1
            acc_by_class[pred]['fp'] += 1

    cat_count = 0
    item_count = 0
    macro: dict[str, float] = {'p': 0.0, 'r': 0.0, 'f1': 0.0}
    micro: dict[str, float] = {'p': 0.0, 'r': 0.0, 'f1': 0.0}
    sem_eval_macro: dict[str, float] = {'p': 0.0, 'r': 0.0, 'f1': 0.0}

    micro_tp = 0.0
    micro_fp = 0.0
    micro_tn = 0.0
    micro_fn = 0.0

    cat_f1s: dict[Sentiment, float] = {}

    for cat, acc in acc_by_class.items():
        cat_count += 1

        micro_tp += acc['tp']
        micro_fp += acc['fp']
        micro_tn += acc['tn']
        micro_fn += acc['fn']

        p = 0.0
        if (acc['tp'] + acc['fp']) > 0:
            p = float(acc['tp']) / (acc['tp'] + acc['fp'])

        r = 0.0
        if (acc['tp'] + acc['fn']) > 0:
            r = float(acc['tp']) / (acc['tp'] + acc['fn'])

        f1 = 0.0
        if (p + r) > 0:
            f1 = 2 * p * r / (p + r)

        cat_f1s[cat] = f1

        n = acc['tp'] + acc['fn']

        macro['p'] += p
        macro['r'] += r
        macro['f1'] += f1

        if cat in ['positive', 'negative']:
            sem_eval_macro['p'] += p
            sem_eval_macro['r'] += r
            sem_eval_macro['f1'] += f1

        item_count += n

    micro['p'] = micro_tp / (micro_tp + micro_fp)
    micro['r'] = micro_tp / (micro_tp + micro_fn)
    micro['f1'] = 2 * micro['p'] * micro['r'] / (micro['p'] + micro['r'])

    sem_eval_macro_f1 = sem_eval_macro['f1'] / 2

    print(
        f"{test_set_file_name_} ({classifier_name_}): {sem_eval_macro_f1:.3f}"
    )

#### Load training set, dev set and testing set
Here, you need to load the training set, the development set and the test set. For better classification results, you may need to preprocess tweets before sending them to the classifiers.

In [91]:
training_data, training_sentiments = get_tweets_from(training_data_file_name)
dev_data, dev_sentiments = get_tweets_from(devlopment_data_file_name)
test_datas, test_sentiments = zip(*[
    get_tweets_from(file_name)
    for file_name in test_set_names
])

#### Download network resources

In [92]:
glove_prepare_task = BackgroundTask(prepare_glove_data)


def download_nltk_resources(resource_names: Sequence[str]) -> ShouldMarkedAsBackground:
    for resource_name in resource_names:
        nltk_download(resource_name, quiet=True)


nltk_prepare_task = BackgroundTask(
    download_nltk_resources,
    ('stopwords', 'vader_lexicon', 'punkt', 'wordnet',)
)

In [93]:
# TODO: move this cell to the top of the place where glove is used

glove_word_indexes = global_cache.get('glove_word_indexes')
glove_word_vectors = global_cache.get('glove_word_vectors')

if glove_word_indexes is None or glove_word_vectors is None:
    glove_prepare_task.wait()
    glove_word_indexes, glove_word_vectors = parse_glove_data(
        target_glove_file_name
    )
    global_cache.put('glove_word_indexes', glove_word_indexes)
    global_cache.put('glove_word_vectors', glove_word_vectors)

#### Data Preprocessing

In [94]:
# Lowercase all the words.

def lowercase_tweet(tweet: str, /) -> str:
    return tweet.lower()

In [95]:
# Filter the tweets based on the selected regexp patterns.

re_flags = re.IGNORECASE | re.MULTILINE

pattern_html_tags = re.compile(r'<[^>]+?>', re_flags)
pattern_mentions = re.compile(r'@[a-zA-Z0-9_]+', re_flags)
pattern_hashtags = re.compile(r'#[a-zA-Z0-9_]+', re_flags)
pattern_alphanumeric = re.compile(r'[^a-zA-Z0-9 ]+?', re_flags)
pattern_only_one_char = re.compile(r'\b[a-zA-Z0-9]\b', re_flags)
pattern_fully_numeric = re.compile(r'\b\d+?\b', re_flags)

pattern_punctuation = re.compile(
    "[" + re.escape(punctuation+"“”…‘’") + "]+?",
    re_flags
)

pattern_url = re.compile(
    r'(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?'
    r'(//(?:[a-zA-Z0-9-._~%!$&\'()*+,;=:]*(?::[a-zA-Z0-9-._~%!$&\'()*+,;=:]+)?@)?'
    r'(?:\[[0-9a-fA-F:.]+]|(?:[a-zA-Z0-9-]+\.)*[a-zA-Z]{2,}|[0-9.]+|localhost)'
    r'(?::\d+)?)(/[a-zA-Z0-9-._~%!$&\'()*+,;=:@]*/?)*'
    r'(?:\?[a-zA-Z0-9-._~%!$&\'()*+,;=:@/]*)?'
    r'(?:#[a-zA-Z0-9-._~%!$&\'()*+,;=:@/]*)?',
    re_flags
)

all_emojis = tuple(EMOJI_DATA.keys())
pattern_emojis = re.compile('|'.join(map(re.escape, all_emojis)) + '?')

pattern_ampm = re.compile(r'([0-9]+(am|pm))')

selected_filter_patterns: tuple[re.Pattern[str], ...] = (
    pattern_url,
    pattern_html_tags,
    pattern_mentions,
    pattern_hashtags,
    pattern_punctuation,
    pattern_fully_numeric,
    pattern_ampm,
)


def regexp_filter(tweet: str, /) -> str:
    return filter_text(tweet, patterns=selected_filter_patterns)

#### Remove emojis

In [96]:
def remove_emojis(tweet: str, /) -> str:
    return demojize(tweet, delimiters=('', ''))

#### Remove non-English words

In [97]:
from langdetect import detect

def remove_non_en(tweet: str, /) -> str:
    try:
        if detect(tweet) == 'en':
            return tweet
        else:
            return ''
    except:
        return ''

#### Tokenization & Lemmatization

In [98]:
nltk_prepare_task.wait()

lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer(
    reduce_len=True,
    strip_handles=True,
    preserve_case=False
)
stop_words = frozenset(stopwords.words('english'))


def nltk_tokenize_anti_stopwords_lemmatize(tweet: str, /) -> str:
    wordnet.ensure_loaded()
    tokens = tokenizer.tokenize(tweet)
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )

#### Fix contractions

In [99]:
def fix_contractions(tweet: str, /) -> str:
    return f'{contractions.fix(tweet)}'

#### Run all preprocessing steps

In [100]:
all_pipelines = (
    lowercase_tweet,
    regexp_filter,
    remove_emojis,
    # remove_non_en, TODO: optimize for performance
    nltk_tokenize_anti_stopwords_lemmatize,
    fix_contractions,
)

cleaned_training_tweets = run_pipelines(
    all_pipelines,
    tweets=training_data
)

cleaned_dev_tweets = run_pipelines(
    all_pipelines,
    tweets=dev_data
)

cleaned_test_tweets = [
    run_pipelines(
        all_pipelines,
        tweets=test_data
    )
    for test_data in test_datas
]

#### Show the top 10 most frequent words in the training set

In [101]:
all_words = ' '.join(cleaned_training_tweets.values()).split()
word_freq = Counter(all_words)

freq_frame = pd.DataFrame(
    word_freq.most_common(20),
    columns=['word', 'freq']
)

print(freq_frame)

        word  freq
0   tomorrow  5829
1        may  5503
2        not  3796
3        day  3538
4          i  3417
5      going  3274
6         am  2937
7        you  2848
8      night  2511
9        see  2466
10      like  2392
11       get  2361
12      time  2297
13    sunday  2162
14       1st  2158
15    friday  2077
16       one  1933
17        go  1860
18      want  1847
19       new  1692


In [102]:
# TODO: Dev code.
from pprint import pprint
import random
# generate a random number between 0 and 100

window_size = 15
rand_num = random.randint(0, len(cleaned_training_tweets) - window_size)

pprint(list(cleaned_training_tweets.values())[rand_num:rand_num+window_size])
print(rand_num)

['post seinfeld tv today first thing tomorrow',
 'ru showing black v tonga game friday nite sound definitely showing league '
 'right',
 'bold prediction think tim howard going to get hattie tomorrow long range '
 'effort goal',
 'bomb squad bake sale thursday 25th do not miss',
 'july 23rd national hot dog day come celebrate you blake brock deck',
 'happy birthday gorgeous may day magical movie magic mike xxl p',
 'look eye vine',
 'remember iron maiden album drop friday',
 'gucci inexpressibles ego acceptation may embody twosome meaningful open door '
 'junta elenchus extan',
 'omg happy 2nd birthday baby i am obsessed little monkey prince george stay '
 'adorbs cutie',
 'pointless committee one need right kim jongun',
 'lol keller insinuating ric flair may fed meltzer cena holding charlotte '
 'story',
 'become 1st footballer sportsowned legendary rapper jayz',
 'good day vids tomorrow stasis long dark rebirth minecraft',
 'fake ua activist buy fake follower cheek call faker']
13458

#### TF-IDF Vectorization

In [103]:
from scipy.sparse import csr_matrix

vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    encoding='utf-8',
    lowercase=False,
    min_df=3,
    max_df=0.8,
)

training_matrix = np.asarray(
    csr_matrix(
        vectorizer.fit_transform(cleaned_training_tweets.values())
    ).todense()
)
dev_matrix = np.asarray(
    csr_matrix(vectorizer.transform(cleaned_dev_tweets.values())).todense()
)
test_matrix = [
    np.asarray(csr_matrix(vectorizer.transform(test_data.values())).todense())
    for test_data in cleaned_test_tweets
]

print(training_matrix.shape)

(45101, 11313)


#### Build sentiment classifiers
You need to create your own classifiers (at least 3 classifiers). For each classifier, you can choose between the bag-of-word features and the word-embedding-based features. Each classifier has to be evaluated over 3 test sets. Make sure your classifier produce consistent performance across the test sets. Marking will be based on the performance over all 5 test sets (2 of them are not provided to you).

In [104]:
# SVM

from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

model = LinearSVC(dual=False, verbose=False, max_iter=1000)
model.fit(
    training_matrix,
    [sentiments.name for sentiments in training_sentiments.values()]
)
svm_predictions = model.predict(test_matrix[0])

svm_report = classification_report(
    [sentiments.name for sentiments in test_sentiments[0].values()],
    svm_predictions,
    target_names=Sentiment.gts(),
    digits=5
)

print(svm_report)

              precision    recall  f1-score   support

    positive    0.57460   0.32496   0.41514       557
    negative    0.57030   0.72008   0.63650      1504
     neutral    0.65831   0.58980   0.62217      1470

    accuracy                        0.60351      3531
   macro avg    0.60107   0.54494   0.55794      3531
weighted avg    0.60762   0.60351   0.59562      3531



#### LSTM Classifier

In [105]:
import torch

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")



Using cuda device


#### DEAD CODE (too ugly)

In [106]:
# # For features used for classifier training, the 'bow' feature is given in the code.
# # But you could also explore the use of other features.
# for classifier in ('svm', '<classifier-2-name>', '<classifier-3-name>',):
#     for features in ('bow', '<feature-2-name>',):
#         # Skeleton: Creation and training of the classifiers
#         if classifier == 'svm':
            
#             print('Training ' + classifier)
#         elif classifier == '<classifier-2-name>':
#             # write the classifier 2 here
#             print('Training ' + classifier)
#         elif classifier == '<classifier-3-name>':
#             # write the classifier 3 here
#             print('Training ' + classifier)
#         elif classifier == 'LSTM':
#             # write the LSTM classifier here
#             if features == 'bow':
#                 continue
#             print('Training ' + classifier)
#         else:
#             print('Unknown classifier name' + classifier)
#             continue

#         # Prediction performance of the classifiers
#         for test_set_name in test_set_names:
#             id_predicts = {}
#             # write the prediction and evaluation code here
#             evaluate(id_predicts, test_set_name, features + '-' + classifier)
# custom vectorizer
# BOW, unigram, bigram, trigram, n-gram (features)