In [30]:
import torch
import json
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from utils import get_sorted_tweets, get_target_words


In [2]:
# Load BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).cuda()
tweets = get_sorted_tweets()
target_words = get_target_words()
tsv_file_path = 'data/annotator.tsv'
# Read the TSV file into a pandas DataFrame
df = pd.read_csv(tsv_file_path, sep=' ', header=None).to_numpy()
annotator = {item[0]: item[1] for item in df}

In [3]:
def generate_vector_from_context(word, text):
    tok_w = tokenizer(word, return_tensors='pt', add_special_tokens=False)
    tok = int(tok_w['input_ids'].flatten()[0])
    len_tok = len(tok_w['input_ids'].flatten())
    tok_t = tokenizer(text, return_tensors='pt', padding='max_length')
    ids = tok_t['input_ids'].flatten().tolist()
    if tok in ids:
        idx = ids.index(tok)
    else:
        raise ValueError(f'{tok} from {tok_w} not in list {ids}. \n text: {text} word {word} \n tokenizer decode: {tokenizer.decode(ids)}')
    for item in tok_t:
        tok_t[item] = tok_t[item].to('cuda')
    vec = model(**tok_t)['last_hidden_state'].squeeze(0)[idx:idx+len_tok].cpu().detach().numpy()
    vec = np.average(vec, axis=0)
    return vec

In [4]:
def avg_vector_by_year(year):
    target_word_vectors = {wrd: [] for wrd in target_words}
    data = tweets[year]
    for t in data:
        word = t['word']
        text = t['text']
        try:
            vec = generate_vector_from_context(word, text)
        except ValueError:
            pass
        target_word_vectors[word].append(vec)
    for wrd in target_words:
        vecs = np.array(target_word_vectors[wrd])
        target_word_vectors[wrd] = np.average(vecs, axis=0)
    
    return target_word_vectors



In [5]:
vecs_2019 = avg_vector_by_year('2019')
vecs_2020 = avg_vector_by_year('2020')
vecs_2021 = avg_vector_by_year('2021')

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


In [14]:
np.nan in vecs_2021

False

In [34]:
def find_correlation(A, B):
    bert_pred = []
    ground_truth = []
    for key in A:
        a = A[key]
        b = B[key]
        if type(a) != np.ndarray or type(b) != np.ndarray:
            continue
        cos_sim = float(cosine_similarity([a], [b]).flatten()[0])
        cos_dist = 1 - cos_sim
        bert_pred.append(cos_dist)
        ground_truth.append(annotator[key])
    return pearsonr(bert_pred, ground_truth)[1]

In [37]:
find_correlation(vecs_2020, vecs_2021)

0.012166736321230068

In [38]:
def load_data(data_path, labels_path):
    # Load tweet instances
    with open(data_path, 'r', encoding='utf-8') as file:
        data_instances = [json.loads(line) for line in file]

    # Load labels
    with open(labels_path, 'r', encoding='utf-8') as file:
        labels = dict(line.strip().split('\t') for line in file)

    return data_instances, labels

In [52]:
train_data_path = 'data/train.data.jl'
train_labels_path = 'data/train.labels.tsv'

val_data_path = 'data/validation.data.jl'
val_labels_path = 'data/validation.labels.tsv'

test_data_path = 'data/trial.data.jl'
test_labels_path = 'data/trial.gold.tsv'

data_instances, labels = [], {}
tr_data, tr_labels = load_data(train_data_path, train_labels_path)
data_instances.extend(tr_data)
labels = labels | tr_labels

val_data, val_labels = load_data(val_data_path, val_labels_path)
data_instances.extend(val_data)
labels = labels | val_labels

te_data, te_labels = load_data(test_data_path, test_labels_path)
data_instances.extend(te_data)
labels = labels | te_labels

pairs = {item['id']: [item['tweet1']['text'], item['tweet2']['text'], item['word']] for item in data_instances}

In [46]:
def find_acc(threshold):
    correct = 0
    count = 0
    for key in pairs:
        label = labels[key]
        t1, t2, word = pairs[key]
        try:
            vec1 = generate_vector_from_context(word, t1)
            vec2 = generate_vector_from_context(word, t2)
        except ValueError:
            continue
        res = float(cosine_similarity([vec1], [vec2]).flatten()[0])
        res = 1 if res > threshold else 0
        if res == int(label):
            correct += 1
        count += 1

    return correct / count


In [54]:
for threshold in range(10):
    i = threshold / 10
    print(find_acc(i))
# best threshold 0.7

0.4515778019586507
0.4515778019586507
0.4515778019586507
0.4515778019586507
0.45973884657236125
0.47279651795429817
0.5337323177366703
0.6447225244831338
0.6142546245919478
0.5522306855277476
