### Data loading utils

In [5]:
# See: https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

from re import I
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Integer class id assigned to each stance label.
STANCE_MAP = {
    'agree':0,
    'disagree':1,
    'discuss':2,
    'unrelated':3,
}

class FakeNewsDataset(Dataset):
    '''
    Map-style dataset that pairs each headline with its article body.

    Item ``idx`` is ``((headline, body), label)`` where ``label`` is the
    STANCE_MAP id of that row's stance.

    The stances frame is unpacked positionally, so it must have exactly three
    columns in the order (headline, body id, stance); the stance column must be
    named 'Stance' when ``related_only`` is used. The bodies frame must have
    the columns 'Body ID' and 'articleBody'.
    '''
    def __init__(self, stances_file=None, bodies_file=None, related_only=False):
        '''
        stances_file: path of the stances CSV, or None to fill in later via set_df
        bodies_file: path of the bodies CSV, or None to fill in later via set_df
        related_only: if True, drop every row whose 'Stance' is 'unrelated'

        Raises ValueError if related_only is True but no stances_file is given:
        the filter is only applied here, never to frames passed to set_df.
        '''
        self.stances = pd.read_csv(stances_file) if stances_file else None
        self.bodies = pd.read_csv(bodies_file) if bodies_file else None
        if related_only:
            if self.stances is None:
                raise ValueError('related_only=True requires stances_file')
            # Boolean mask instead of an in-place drop; __getitem__ is
            # positional (iloc), so the surviving index labels do not matter.
            self.stances = self.stances[self.stances['Stance'] != 'unrelated']

    def set_df(self, stances_df, bodies_df):
        '''
        Replace both frames with already-loaded DataFrames (used as-is, no filtering).
        '''
        self.stances = stances_df
        self.bodies = bodies_df

    def __len__(self):
        '''
        Number of stance rows; 0 while no stances frame has been set.
        '''
        return 0 if self.stances is None else len(self.stances)

    def __getitem__(self, idx):
        '''
        Return ((headline, body), label) for the stance row at position idx.

        Raises IndexError if idx is out of range or if no row of the bodies
        frame has the row's body id.
        '''
        headline, body_id, stance = self.stances.iloc[idx]
        matches = self.bodies.loc[self.bodies['Body ID'] == body_id, 'articleBody']
        if matches.empty:
            raise IndexError(f'no article body with Body ID {body_id!r} (stance row {idx})')
        # If several bodies share the id, the first one is used.
        return (headline, matches.iloc[0]), STANCE_MAP[stance]

# Training set with the 'unrelated' rows dropped, wrapped in a DataLoader
# that is left on its default settings.
data = FakeNewsDataset(
    stances_file='../data/combined_stances_train.csv',
    bodies_file='../data/combined_bodies_train.csv',
    related_only=True,
)
dataloader = DataLoader(dataset=data)

In [14]:
from nltk.tokenize import sent_tokenize

MAX_LEN = 50
PAD_TOKEN = '[PAD]'

def pad_truncate(X, K=MAX_LEN, pad_token=PAD_TOKEN):
    '''
    Return a new list of length K built from X: X is cut to its first K items
    when it is longer, and right-padded with pad_token when it is shorter.
    X: sequence to be padded or truncated (list, tuple, ...); it is not modified
    K: the length you want the output list to be
    pad_token: filler item appended for every missing position
    '''
    # Slice first, then copy into a list so tuples and other sequences work too.
    items = list(X[:K])
    # A non-positive repeat count yields an empty list, i.e. no padding.
    return items + [pad_token] * (K - len(items))

def pad_tokenize(X):
    '''
    Preprocess the bodies: split each text into sentences, then fix the
    number of sentences per text.
    X: iterable of texts
    Returns one list per text, holding its sent_tokenize sentences after
    pad_truncate with the default length.
    '''
    return [pad_truncate(sent_tokenize(text)) for text in X]

### Where does the similarity lie?

In [6]:
from sentence_transformers import SentenceTransformer
# Instantiate the SentenceTransformer model identified by this name; its
# `encode` method is what the similarity loop uses to embed headlines and
# body sentences.
sim_encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [27]:
import numpy as np
SUBSET_SIZE = 1000
# One row per sample, one column per sentence slot. Slots that are never
# written (padding positions, rows that are never reached) stay 0.
sims = np.zeros((SUBSET_SIZE, MAX_LEN))
for i, ((h, b), y) in enumerate(dataloader):
    # Valid rows are 0..SUBSET_SIZE-1, so stop as soon as i reaches SUBSET_SIZE,
    # and do it before any tokenising or encoding work is spent on the batch.
    if i >= SUBSET_SIZE:
        break
    print(f'i:{i}/{SUBSET_SIZE}')
    # DataLoader(data) is left on its default batch size (1 per the PyTorch
    # docs), so element 0 of each batched field is the sample itself.
    sentences = pad_tokenize(b)[0]
    keep = [j for j, sentence in enumerate(sentences) if sentence != '[PAD]']
    if not keep:
        # Body produced no sentences: leave the whole row at 0.
        continue
    # Encode the headline as a plain string so the result is a 1-D vector.
    head_encoding = sim_encoder.encode(h[0])
    # Encode all real sentences of this body in a single call.
    body_encodings = sim_encoder.encode([sentences[j] for j in keep])
    # Dot product of every sentence embedding with the headline embedding.
    sims[i, keep] = body_encodings @ head_encoding

i:0/1000
j:0/1
j:1/1
j:2/1
j:3/1
j:4/1
j:5/1
j:6/1
j:7/1
j:8/1
j:9/1
j:10/1
j:11/1
j:12/1
j:13/1
j:14/1
j:15/1
j:16/1
i:1/1000
j:0/1
j:1/1
j:2/1
j:3/1
j:4/1
j:5/1
j:6/1
j:7/1
j:8/1
j:9/1
j:10/1
j:11/1
j:12/1
j:13/1
j:14/1
j:15/1
j:16/1
j:17/1
j:18/1
j:19/1
j:20/1
j:21/1
j:22/1
j:23/1
j:24/1
j:25/1
j:26/1
j:27/1
j:28/1
j:29/1
j:30/1
j:31/1
j:32/1
j:33/1
j:34/1
i:2/1000
j:0/1
j:1/1
j:2/1
j:3/1
j:4/1
j:5/1
j:6/1
j:7/1
j:8/1
j:9/1
j:10/1
j:11/1
j:12/1
j:13/1
j:14/1
j:15/1
j:16/1
j:17/1
j:18/1
j:19/1
i:3/1000
j:0/1
j:1/1
j:2/1
j:3/1
j:4/1
j:5/1
j:6/1
j:7/1
j:8/1
j:9/1
j:10/1
i:4/1000
j:0/1
j:1/1
j:2/1


KeyboardInterrupt: 

In [11]:
# `temp` is a second name for the same array object as `sims` (no copy is
# made), so in-place writes through either name are visible through both.
# NOTE(review): if this was meant as a snapshot, it needs an explicit copy.
temp = sims