### Install and import Stacy

In [1]:
#!pip install spacy scipy numpy
#!python -m spacy download en_core_web_lg
#!pip install transformers

In [2]:
import spacy
spacy.prefer_gpu()

False

In [3]:
nlp = spacy.load("en_core_web_lg")
stopwords = nlp.Defaults.stop_words

### Import Data from files

In [4]:
# Training Data
import csv

bodies: dict[int, str] = {}

with open("work/fnc-1/train_bodies.csv") as bodies_file:
    bodies_reader = csv.reader(bodies_file)
    
    # Get rid of key from first entry in CSV file
    _ = next(bodies_reader)
             
    for body in bodies_reader:
        body_id = int(body[0])
        body_str = body[1]
        bodies[body_id] = body_str

        
        
articles: list[tuple[str, str, str]] = []
with open("work/fnc-1/train_stances.csv") as stances_file:
    stances_reader = csv.reader(stances_file)

    # Get rid of key from first entry in CSV file
    _ = next(stances_reader)
    
    for article in stances_reader:
        articles.append((article[0], bodies[int(article[1])], article[2]))
bodies = None


# TF-IDF

### Tokenise text

In [5]:
def tokenise(articles):
    import numpy
    headlines = map(lambda article: article[0], articles)
    bodies = map(lambda article: article[1], articles)
    stances = map(lambda article: article[2], articles)

    processed_articles = zip(
            nlp.pipe(headlines, disable = ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']),
            nlp.pipe(bodies, disable = ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']),
            stances
        )
    return processed_articles

### Get Inverse document frequency for each Term

In [6]:
processed_articles = tokenise(articles)
token_freq = {}

import numpy as np
from itertools import chain
from tqdm import tqdm

all_docs = tqdm(chain.from_iterable(map(lambda a: (a[0], a[1]), processed_articles)))
document_count = 0

for doc in all_docs:
    document_count += 1
    already_added = set()
    
    for token in doc:
        if token.text not in already_added:
            already_added.add(token.text)
            try:
                token_freq[token.text] += 1
            except KeyError:
                token_freq[token.text] = 1

for word, count in tqdm(token_freq.items()):
    token_freq[word] = np.log(document_count/count)

99944it [00:54, 1820.08it/s]
100% 29150/29150 [00:00<00:00, 712402.62it/s]


### Give each term an ID

In [7]:
tokens = list(token_freq.keys())
vector_size = len(tokens)

### Get vector pair for each headline - body pair

In [49]:
from scipy import sparse

processed_articles = tokenise(articles)

def calculate_td_idf(text):
    vector = sparse.dok_array((vector_size, 1))
    for token in text:
        i = tokens.index(token.text)
        value = token_freq[token.text]
        vector[i, 0] += value

    return vector
def td_idf(processed_articles):
    for article in processed_articles:
        headline = calculate_td_idf(article[0])
        body = calculate_td_idf(article[1])
        stance = article[2]
        yield headline, body, stance
    
    
tf_idf_articles = td_idf(processed_articles)

# Transformer

In [9]:
from transformers import BertModel, BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
def tokenise_and_transform(articles):
    import numpy
    headlines = map(lambda article: article[0], articles)
    bodies = map(lambda article: article[1], articles)
    stances = map(lambda article: article[2], articles)

    processed_articles = zip(
            map(lambda x: model(**x), map(lambda x: tokenizer(x, return_tensors="pt"), headlines)),
            map(lambda x: model(**x), map(lambda x: tokenizer(x, return_tensors="pt"), bodies)),
            stances
        )
    return processed_articles

In [11]:
transformer_articles = tokenise_and_transform(articles)


# Related/Unrelated Classification

In [56]:
from scipy.sparse import vstack
import itertools
x_idf = []
y_idf = []
l = 5000
for article in tqdm(itertools.islice(tf_idf_articles, l), total=l):
    x_idf.append(vstack((article[0], article[1])).toarray().squeeze(1))
    if article[2] == "unrelated":
        y_idf.append(0)
    else:
        y_idf.append(1)

100% 5000/5000 [02:05<00:00, 39.91it/s]


In [58]:
from torch import cat
import itertools
x_trf = []
y_trf = []
l = 5000
for article in tqdm(itertools.islice(transformer_articles, l), total=l):
    x_idf.append(cat((article[0].last_hidden_state, article[1].last_hidden_state)).squeeze(1))
    if article[2] == "unrelated":
        y_idf.append(0)
    else:
        y_idf.append(1)

  0% 0/5000 [00:00<?, ?it/s]


RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 19 but got size 100 for tensor number 1 in the list.

### ML - TF-IDF

In [51]:
from scipy.sparse import vstack

from sklearn.ensemble import RandomForestClassifier

mltfidf = RandomForestClassifier(verbose=True, n_estimators=100, n_jobs=12)
mltfidf.fit(x_idf,y_idf)




[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    5.6s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:   17.4s finished


RandomForestClassifier(n_jobs=12, verbose=True)

In [52]:
test_x_idf = []
test_y_idf = []
l = 500
for article in tqdm(itertools.islice(tf_idf_articles, l), total=l):
    test_x_idf.append(vstack((article[0], article[1])).toarray().squeeze(1))
    if article[2] == "unrelated":
        test_y_idf.append(0)
    else:
        test_y_idf.append(1)
    
test_y_pred = mltfidf.predict(test_x_idf)
count = len(test_y_pred)
correct = sum(y == t for y,t in zip(test_y_pred, test_y_idf))
print(f"Accuracy = {correct}/{count} = {correct/count}")

100% 500/500 [00:13<00:00, 36.86it/s]

Accuracy = 412/500 = 0.824



[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


# ML - Transformer

### DL - TF-IDF

In [None]:
import torch
import math

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1)
)
loss_fn = torch.nn.MSELoss(reduction='sum')



# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use RMSprop; the optim package contains many other
# optimization algorithms. The first argument to the RMSprop constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
while 1:
    
    
    y_pred = model(x)

    loss = loss_fn(y_pred, y)


    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

### DL - Transformer

In [None]:
import torch
import math

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1)
)
loss_fn = torch.nn.MSELoss(reduction='sum')



# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use RMSprop; the optim package contains many other
# optimization algorithms. The first argument to the RMSprop constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
while 1:
    
    
    y_pred = model(x)

    loss = loss_fn(y_pred, y)


    optimizer.zero_grad()
    loss.backward()
    optimizer.step()