In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to the
# project's own modules are picked up without restarting the kernel.
# Re-running this cell on a live kernel only prints an
# "already loaded" notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import math
import traceback
from tqdm import tqdm
import numpy as np
import torch

In [4]:
import asyncioConfig as asyncC
from models.ClassifierInterface import ClassifierInterface
from requestsConfig import GetSession
from gdelt.GdeltConsumer import GdeltConsumer
from articleContent.ArticleConsumer import ArticleConsumer
from dataset import Dataset

In [9]:
# Dependencies setup
# A single session object is shared by both consumers.
session = GetSession()
gdeltConsumer = GdeltConsumer.getConsumer(session)
articleConsumer = ArticleConsumer.getConsumer(session)
# Project helper; presumably prepares asyncio for use inside the notebook
# kernel -- TODO confirm against asyncioConfig.
asyncC.asyncioSetup()

# Select the torch device once: CUDA when available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available" if device.type == "cuda" else "GPU not available, CPU used")

GPU is available


In [6]:
def TrainingDataset(fromDataset: Dataset, lineCount: int):
    """Build a training subset holding at least ``lineCount`` lines.

    Articles are taken from the start of ``fromDataset`` in order. Whole
    articles are copied across until the running total of
    ``len(article.content)`` reaches ``lineCount``, so the result may overshoot
    the target by part of the last article. If the source runs out first,
    every article is returned.

    Args:
        fromDataset: Source dataset; only ``len()`` and integer indexing are
            used on it.
        lineCount: Minimum number of lines wanted. A value ``<= 0`` yields an
            empty dataset.

    Returns:
        A new ``Dataset`` containing the leading articles of ``fromDataset``.
    """
    subset = Dataset([])
    collectedLines = 0
    for position in range(len(fromDataset)):
        # Stop as soon as the target is met; the article that crossed the
        # threshold has already been added in full.
        if collectedLines >= lineCount:
            break
        article = fromDataset[position]
        subset.append(article)
        collectedLines += len(article.content)
    return subset

def Predict(dataset: Dataset, classifier: ClassifierInterface, labels: list[int|float]):
    """Evaluate ``classifier`` on ``dataset`` and print summary metrics.

    For every article whose label is not ``None`` the prediction is the mean
    of ``classifier.predict(article)``. The evaluation mode is chosen from the
    first non-``None`` label:

    * integer labels (Python ``int`` or NumPy integer): the mean is rounded to
      the nearest integer and an accuracy line is printed in addition to the
      error metrics;
    * any other numeric label (e.g. ``float``): the mean is compared as-is, so
      the error metrics are not distorted by rounding, and no accuracy line is
      printed.

    All averages are taken over the articles that were actually evaluated;
    articles with a ``None`` label are skipped and do not count.

    Args:
        dataset: Articles to evaluate, iterated in order.
        classifier: Model exposing ``predict(article)``; its output must be
            accepted by ``np.mean``.
        labels: Ground-truth values aligned by index with ``dataset``. Entries
            may be ``None`` to mark unlabelled articles.

    Returns:
        None. Results are written with ``print``. Nothing is printed when
        ``labels`` is ``None`` or empty.

    Raises:
        IndexError: If ``labels`` is shorter than ``dataset``.
    """
    # len() instead of truthiness so array-like label containers work too.
    if labels is None or len(labels) == 0:
        return

    # labels[0] may itself be None, so derive the mode from the first real label.
    firstLabel = next((label for label in labels if label is not None), None)
    isClassification = isinstance(firstLabel, (int, np.integer))

    correct = 0
    error = 0
    squaredError = 0
    evaluated = 0

    for i, article in enumerate(tqdm(dataset)):
        actual = labels[i]
        if actual is None:
            continue
        prediction = np.mean(classifier.predict(article))
        if isClassification:
            # Integer labels: snap the averaged output to the nearest integer.
            prediction = round(prediction)
            if prediction == actual:
                correct += 1
        residual = abs(actual - prediction)
        error += residual
        squaredError += residual**2
        evaluated += 1

    # Nothing was scored (empty dataset or only unlabelled articles): avoid
    # dividing by zero and say so explicitly.
    if evaluated == 0:
        print('No labelled articles to evaluate')
        return

    if isClassification:
        print(f'Correct predictions {correct}/{evaluated}, {(correct/evaluated)*100}%')
    print(f'Error {error}, mean error {error/evaluated}')
    print(f'Root Mean Square Error {math.sqrt(squaredError/evaluated)}')

In [None]:
def main():
    """Load or train each configured model, then print its evaluation metrics.

    The corpus is read from ``clearCorpus.json`` and its leading articles
    (at least 30000 lines, see ``TrainingDataset``) form the training split;
    the remaining articles are used for evaluation. For each model name a
    saved checkpoint is tried first. If loading raises, the model is trained
    on the training split and saved. A model whose training fails is reported
    and skipped.
    """
    ds = Dataset.load('clearCorpus.json')
    tds = TrainingDataset(ds, 30000)
    from models.NieBert import NieBert
    labels = NieBert.getRegressionLabels(ds)
    weights = NieBert.getRegressionWeights(ds)
    # Other candidates, kept for reference (batch sizes below line up with them):
    # 'roberta-large', 'google/electra-large-discriminator',
    # 'microsoft/deberta-v3-large', 'albert-xxlarge-v2'
    modelNames = ['bert-base-uncased']
    perDeviceTrainBatchSizes = [16, 4, 4, 8, 2]
    for modelName, batchSize in zip(modelNames, perDeviceTrainBatchSizes):
        # '/' in hub-style names is replaced so the name is a single path component.
        checkpointName = f'{modelName}-20-3'.replace('/', '-')
        try:
            model = NieBert.load(checkpointName)
        except Exception:
            # Any load failure is treated as "no usable checkpoint": train instead.
            print(f'Now training {modelName}')
            try:
                model = NieBert.trainFromDataset(
                    tds,
                    labels[:len(tds)],
                    weights[:len(tds)],
                    modelName,
                    per_device_train_batch_size=batchSize,
                    # Keeps batchSize * accumulation steps at 16.
                    gradient_accumulation_steps=(16//batchSize),
                )
                print(f'Saving model {modelName}')
                model.save(checkpointName)
            except Exception as e:
                print(f'EXCEPTION for {modelName}: {e}')
                traceback.print_exc()
                continue
        else:
            print(f'Model {modelName} loaded')
        print(f'Predictions for {modelName}')
        # Evaluate only on the articles that were not part of the training split.
        Predict(ds[len(tds):], model, labels[len(tds):])