In [None]:
import json
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer

In [7]:
# Sentence encoder shared by the whole notebook; get_inference_results() reads
# this module-level name when it calls model.encode(...).
model = SentenceTransformer('all-mpnet-base-v2')

In [23]:
# Path prefixes. Every file is located with
# os.path.join(home_directory, data_directory, <relative path>).
data_directory = "data/"
home_directory = "../"

In [22]:
# Split files, relative to home_directory/data_directory.
# NOTE(review): the train and dev paths carry a "data/task_4/" prefix but the
# test path does not -- confirm where the gold test file actually lives.
train_file_path = "data/task_4/task_4_news_media_factuality_train.tsv"
dev_file_path = "data/task_4/task_4_news_media_factuality_dev.tsv"
test_gold_file_path = "task_4_news_media_factuality_test.tsv"

In [62]:
# Load the tab-separated training index (one row per news source).
train_raw_data = pd.read_csv(
    os.path.join(home_directory, data_directory, train_file_path),
    sep="\t",
)

In [63]:
# Preview the first rows of the training index.
train_raw_data.head()

Unnamed: 0,source,json_file_path,label
0,newschannel5.com,data/task_4/train_json/newschannel5.com.json,2
1,acpeds.org,data/task_4/train_json/acpeds.org.json,0
2,altnewsmedia.net,data/task_4/train_json/altnewsmedia.net.json,1
3,nationalaffairs.com,data/task_4/train_json/nationalaffairs.com.json,2
4,duluthnewstribune.com,data/task_4/train_json/duluthnewstribune.com.json,2


In [66]:
def get_inference_results(sentences, task = "bias"):
    """Embed ``sentences`` and score them with a newly created linear head.

    The embeddings come from the module-level ``model``. The classification
    head is an ``nn.LazyLinear`` built inside this call and never trained
    here, so the returned scores come from randomly initialised weights.

    Parameters
    ----------
    sentences
        Input forwarded unchanged to ``model.encode``.
    task : str, default "bias"
        ``"bias"`` selects the two-class head labelled
        ``['Factual', 'Not Factual']`` (column ``'Factuality'``); any other
        value selects the three-class head labelled
        ``['Left', 'Center', 'Right']`` (column ``'Bias'``).

    Returns
    -------
    pandas.DataFrame
        One row per class label, with the label in the ``'Factuality'`` or
        ``'Bias'`` column and the softmax outputs for that class in
        ``'Scores'``.
    """
    embeddings = torch.from_numpy(model.encode(sentences))

    # NOTE(review): this mapping looks inverted -- task == "bias" yields the
    # Factuality labels and everything else yields the Bias labels. It is kept
    # as-is because existing calls rely on the default returning the
    # 'Factuality' frame; confirm the intended behaviour before swapping.
    if task == "bias":
        labels = ['Factual', 'Not Factual']
        colname = 'Factuality'
    else:
        labels = ['Left', 'Center', 'Right']
        colname = 'Bias'
    linear = nn.LazyLinear(len(labels))

    # Inference only: skip autograd bookkeeping so no graph is retained for
    # the whole batch of embeddings.
    with torch.no_grad():
        probabilities = torch.softmax(linear(embeddings), dim=-1)

    # Transpose so each entry of `scores` holds one class's values.
    scores = probabilities.T.tolist()
    return pd.DataFrame({
        colname: labels,
        'Scores': scores
    })

def get_articles(raw_data, tune = False):
    """Pull article text (and optionally labels) out of one source's record.

    Parameters
    ----------
    raw_data
        Mapping with an ``'articles'`` entry; each article is indexed by
        ``'content'`` and, when ``tune`` is truthy, by ``'label'``.
    tune : bool, default False
        When truthy, return ``[content, label]`` pairs instead of bare content.

    Returns
    -------
    list
        Article contents, or ``[content, label]`` lists when ``tune`` is truthy.
    """
    articles = raw_data['articles']
    if tune:
        return [[article['content'], article['label']] for article in articles]
    return [article['content'] for article in articles]

In [67]:
def get_split_data(split_data):
    """Collect the article contents of every news source listed in a split.

    Parameters
    ----------
    split_data
        Table with a ``'json_file_path'`` column; each value is a path,
        relative to ``home_directory``/``data_directory``, of a JSON file that
        ``get_articles`` can read.

    Returns
    -------
    list
        Article contents from all sources, concatenated in row order.
    """
    data = []
    for news_src in split_data['json_file_path']:
        json_path = os.path.join(home_directory, data_directory, news_src)
        # Use a context manager so each handle is closed promptly instead of
        # leaking one open file per news source.
        with open(json_path) as json_file:
            raw_data = json.load(json_file)
        data.extend(get_articles(raw_data))
    return data

In [68]:
# Flatten every training source's articles into one list of article contents.
train_data = get_split_data(train_raw_data)

In [70]:
# With the default task ("bias") get_inference_results returns the
# 'Factuality' frame (labels 'Factual' / 'Not Factual').
factresults = get_inference_results(train_data)



In [75]:
# Average of the first row's scores (the first label's softmax outputs).
# The linear head in get_inference_results is freshly initialised on every
# call and no seed is set in the visible cells, so expect this value to vary
# between runs.
np.mean(factresults['Scores'][0])

0.4853795408662624