# Lecture 1

In [None]:
import re

# Input text; it is consumed from the left until nothing remains.
line = 'A cat sat on the mat. His name was Måns.'

# Results: recognised tokens, and stretches of text that neither pattern accepts.
tokens = []
unmatchable = []

# Pre-compiled patterns: a token is a run of word characters,
# skippable material is a run of whitespace.
token_pat = re.compile(r'\w+')
skippable_pat = re.compile(r'\s+')  # typically spaces

while line:
    # Case 1: the line begins with skippable material -> drop it.
    skippable_match = skippable_pat.search(line)
    if skippable_match is not None and skippable_match.start() == 0:
        line = line[skippable_match.end():]
        continue

    # Case 2: the line begins with a token -> record it and move past it.
    token_match = token_pat.search(line)
    if token_match is not None and token_match.start() == 0:
        tokens.append(token_match.group())
        line = line[token_match.end():]
        continue

    # Case 3: the line begins with material that neither pattern accepts.
    # It runs up to the nearest upcoming skippable/token match, or to the
    # end of the line if there is no further match at all.
    upcoming_starts = [m.start() for m in (skippable_match, token_match) if m is not None]
    unmatchable_end = min(upcoming_starts, default=len(line))
    unmatchable.append(line[:unmatchable_end])
    line = line[unmatchable_end:]

print(tokens)
print(unmatchable)

# Lecture 2

Exercise: N-gram Language Modelling
First Year Projects
4 May 2021


In this exercise, you will get familiar with n-gram language models using the
nltk library.

You can work with the corpora from the two tasks you selected for your project.
Additionally, you can download a dataset of news editorials collected and
distributed by the organisers of the Conference on Machine Translation from
here:
http://data.statmt.org/news-commentary/v16/training-monolingual/news-commentary-v16.en.gz

This is a sizeable data set (albeit still quite small by the standards of the
NLP community), and if you find that your computer becomes intolerably slow or
runs into memory problems, you can just use a subset of the corpus for the
exercises.

1. Remove a subset of about 5000 sentences from the news commentary dataset to
be used for evaluation. For the TweetEval datasets, you can use the standard
training/validation split.

2. Load and tokenise your datasets so that, for each of the corpora, you get a
list of sentences, each sentence represented by a list of tokens. Use the same
tokeniser for all datasets.

3. Follow the instructions at
http://www.nltk.org/api/nltk.lm.html#module-nltk.lm
to train maximum-likelihood language models of varying orders (e.g., n=2..6) for
each of your corpora. Make a note of the size of the n-gram lists for each
n-gram order. You might also plot them.

IMPORTANT: We will want to compare perplexities across different corpora. This
only produces meaningful results if all the models use exactly the same
vocabulary. To ensure that, create a vocabulary from the largest of your
datasets and use it for ALL the corpora.

4. Use the lm.generate function to generate some example text from each of your
models and compare.

5. Use lm.score and lm.logscore to calculate the scores of a couple of n-grams
you find in the texts with the different models. Try 1 or 2 n-grams that consist
entirely of function words and 1 or 2 n-grams that contain specific content
words or names.

6. Use the lm.perplexity function to compute the perplexity of the validation
sets from each dataset with each of the language models and compare. Keep in
mind that LOWER perplexity is BETTER.

7. Repeat the steps above with another language model implementation, such as
Laplace, WittenBellInterpolated or KneserNeyInterpolated.
In [None]:
import collections
import nltk.tokenize
import numpy
import pandas
import pickle
import seaborn
import matplotlib.pyplot as plt


# The commented-out snippet below tokenises the raw corpus and caches the flat
# token list as a pickle, which is what the code further down loads.
# NOTE(review): as written, the outer `for line in f` reads the first line of
# the file before the generator re-iterates over `f`, so that first line is
# never tokenised. Drop the outer loop if the cache is ever regenerated.
#
# tok = nltk.tokenize.TreebankWordTokenizer()
#
# corpus = []
# with open('news-commentary-v16.en', 'r') as f:
#     for line in f:
#         corpus.extend(t for line in f for t in tok.tokenize(line))
#
# with open('ncv16-list.pkl', 'wb') as f:
#     pickle.dump(corpus, f)

# Load the cached token list.
# CAUTION: unpickling can execute arbitrary code -- only load a pickle file
# that you created yourself.
with open('ncv16-list.pkl', 'rb') as f:
    corpus = pickle.load(f)

# Count every token; most_common() lists them from most to least frequent.
voc = collections.Counter(corpus)
frq = pandas.DataFrame(voc.most_common(), columns=['token', 'frequency'])

# Index in the sorted list, i.e. the frequency rank (1 = most frequent token)
frq['idx'] = frq.index + 1

# Frequency normalised by corpus size
frq['norm_freq'] = frq.frequency / len(corpus)

# Cumulative normalised frequency
frq['cumul_frq'] = frq.norm_freq.cumsum()

seaborn.set_theme(style='whitegrid')

# Plot: Cumulative frequency by index
seaborn.relplot(x='idx', y='cumul_frq', data=frq)
plt.show()

# Plot: Cumulative frequency by index, top 10000 tokens
seaborn.relplot(x='idx', y='cumul_frq', data=frq[:10000], kind='line')
plt.show()

# Plot: Log-log plot for Zipf's law
# Zipf's law relates frequency to rank with rank 1 being the MOST frequent
# token, so the rank is the position in the descending-frequency list (idx).
# Ranking the frequencies in ascending order would give rank 1 to the rarest
# token and mirror the plot.
frq['log_frq'] = numpy.log(frq.frequency)
frq['log_rank'] = numpy.log(frq.idx)
seaborn.relplot(x='log_rank', y='log_frq', data=frq)
plt.show()

# Lecture 3

In [None]:
import os
import random


# Draw a random subset of each TweetEval training set and write its texts,
# labels and line indices to a parallel directory tree under `outroot`.

inroot = 'tweeteval/datasets'
outroot = 'iaa-sets'

corpora = [
    'emoji',
    'emotion',
    'hate',
    'irony',
    'offensive',
    'sentiment',
    'stance/abortion',
    'stance/atheism',
    'stance/climate',
    'stance/feminist',
    'stance/hillary'
]

# Number of examples to draw from each corpus.
iaa_size = 120

# Dedicated, seeded generator so that re-running this cell reproduces exactly
# the same samples instead of overwriting the output files with new ones.
# The seed value itself is arbitrary; change it to draw a different sample.
iaa_seed = 2021
rng = random.Random(iaa_seed)

for crp in corpora:
    indir = inroot + '/' + crp
    outdir = outroot + '/' + crp

    # Tweets contain emoji and other non-ASCII characters, so do not rely on
    # the platform's default encoding (assumes the input files are UTF-8).
    with open(indir + '/train_text.txt', 'r', encoding='utf-8') as f:
        train_text = [line.rstrip('\n') for line in f]
    with open(indir + '/train_labels.txt', 'r', encoding='utf-8') as f:
        train_labels = [line.rstrip('\n') for line in f]

    # Texts and labels are parallel files: one example per line in each.
    train_size = len(train_text)
    assert len(train_labels) == train_size

    # Sample line indices without replacement (raises ValueError if the corpus
    # has fewer than iaa_size examples), then pick the matching texts and
    # labels in their original file order.
    smpl = set(rng.sample(range(train_size), iaa_size))
    iaa_text = [t for i, t in enumerate(train_text) if i in smpl]
    iaa_labels = [t for i, t in enumerate(train_labels) if i in smpl]

    os.makedirs(outdir, exist_ok=True)
    with open(outdir + '/iaa_text.txt', 'w', encoding='utf-8') as f:
        print('\n'.join(iaa_text), file=f)
    with open(outdir + '/iaa_labels.txt', 'w', encoding='utf-8') as f:
        print('\n'.join(iaa_labels), file=f)
    # The indices are stored sorted, matching the order of the two files above.
    with open(outdir + '/iaa_indices.txt', 'w', encoding='utf-8') as f:
        print('\n'.join(str(i) for i in sorted(smpl)), file=f)