In [None]:
!pip install pandas datasets transformers[torch] scikit-learn tqdm
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install accelerate -U

In [None]:
import inspect
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments

In [None]:
# Load the pretrained RoBERTa checkpoint with a sequence-classification head.
# num_labels=2 matches the binary 0/1 labels used for training; it is also the
# library default and is spelled out here only to make the intent explicit.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# `model_max_length` is the tokenizer setting that `truncation=True` falls back
# to when the tokenizer is called without an explicit max_length. Passing
# `max_length` to from_pretrained does not set that limit.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', model_max_length=512)

In [None]:
from modules.elastic import ArticleSearchQuery
from modules.objects import FullArticle
from modules.config import BaseConfig

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Project configuration object; later cells read the article index name and the
# Elasticsearch article client from it.
# NOTE(review): presumably BaseConfig picks up its settings from the environment
# populated above, which is why load_dotenv() runs first — confirm in modules.config.
config_options = BaseConfig()

In [None]:
# Show which Elasticsearch index this run is configured to read from.
print(config_options.ELASTICSEARCH_ARTICLE_INDEX)

# Fetch the articles through the configured client
# (query_all_documents() is expected to return every stored article — TODO confirm).
article_client = config_options.es_article_client
articles = article_client.query_all_documents()

# Number of articles fetched.
print(len(articles))

In [None]:
# Keep only the articles that have a truthy ml.incident value and a non-empty
# summary, and bucket them by raw label in the same pass.
# Going by the variable names, raw label 2 means "incident" and 1 means "not incident".
pre_classified_articles = []
incident = []
not_incident = []

for candidate in articles:
    if not (candidate.ml.incident and candidate.summary):
        continue
    pre_classified_articles.append(candidate)
    if candidate.ml.incident == 2:
        incident.append(candidate)
    elif candidate.ml.incident == 1:
        not_incident.append(candidate)

# Distinct raw label values over *all* fetched articles, as a sanity check.
seen_labels = {candidate.ml.incident for candidate in articles}

print(len(pre_classified_articles), len(incident), len(not_incident), seen_labels)

In [None]:
# Run locally: export the pre-classified articles to a gzip-compressed JSON file that can be uploaded to the cloud

#import json, gzip
#with gzip.open("./classified.gz", "wt", encoding="utf-8") as f:
#    json.dump([article.model_dump(mode="json") for article in pre_classified_articles], f)

# Run in the cloud: load the exported file back; the articles are then plain dicts rather than article objects

#import json, gzip
#with gzip.open("classified.gz", 'r') as f:
#    pre_classified_articles = json.loads(f.read().decode('utf-8'))

In [None]:

# Re-code the raw labels into the 0/1 classes used for training and split each
# class into a training and a validation part.
#
# The re-coding builds new records instead of mutating `pre_classified_articles`
# in place: mutating would flip every label each time this cell is re-run.


def _as_record(article):
    """Return `article` as a plain nested dict.

    Articles loaded from the gzip export are already dicts and are returned
    unchanged; article objects fetched from Elasticsearch are converted with
    `model_dump(mode="json")`, the same call the export uses.
    """
    if isinstance(article, dict):
        return article
    return article.model_dump(mode="json")


def _with_binary_label(record):
    """Return a copy of `record` with ml.incident re-coded to 0 or 1.

    Raw label 1 becomes 0; every other raw label becomes 1.
    """
    binary_label = 0 if record['ml']['incident'] == 1 else 1
    return {**record, 'ml': {**record['ml'], 'incident': binary_label}}


def _split_at_fraction(items, fraction):
    """Split `items` into (first `fraction` of the list, remainder)."""
    cut = int(fraction * len(items))
    return items[:cut], items[cut:]


labelled_articles = [
    _with_binary_label(_as_record(article)) for article in pre_classified_articles
]

incident = [article for article in labelled_articles if article['ml']['incident'] == 1]
not_incident = [article for article in labelled_articles if article['ml']['incident'] == 0]

train_size = 0.7

# Splitting each class separately keeps the class ratio the same in both parts.
incident_train, incident_val = _split_at_fraction(incident, train_size)
not_incident_train, not_incident_val = _split_at_fraction(not_incident, train_size)

training_source = incident_train + not_incident_train
val_source = incident_val + not_incident_val

# Report the labelled total; `articles` does not exist when the data was loaded
# from the gzip export instead of Elasticsearch.
print(len(labelled_articles), len(training_source), len(val_source))

In [None]:
def gen_data(articles):
  """Yield one training example per article dict.

  Each example is {"text": <article summary>, "label": <article ml.incident value>}.
  """
  yield from (
    {"text": entry['summary'], "label": entry["ml"]["incident"]}
    for entry in articles
  )

# Build the Hugging Face datasets for the two splits. Without gen_kwargs,
# from_generator calls the generator callable with no arguments, so each article
# list is bound to gen_data through a lambda.
# NOTE(review): from_generator caches its result; confirm the cache is refreshed
# when training_source / val_source change between runs.
train_data = Dataset.from_generator(lambda: gen_data(training_source))
val_data = Dataset.from_generator(lambda: gen_data(val_source))

In [None]:
# Tokenize a batch of examples and return the encoded inputs for the model.
def tokenization(batched_text):
    """Encode the 'text' column of a batch with the notebook's tokenizer.

    Sequences are padded to the longest one in the batch (padding=True) and
    truncated to the tokenizer's maximum length (truncation=True).
    """
    texts = batched_text['text']
    encoded = tokenizer(texts, padding = True, truncation=True)
    return encoded


# Tokenize each split as one single batch (batch_size equals the split size), so
# padding=True pads every example of a split to the same length.
train_data, val_data = [
    split.map(tokenization, batched = True, batch_size = len(split))
    for split in (train_data, val_data)
]

# Expose only the model inputs and the label, as torch tensors.
model_columns = ['input_ids', 'attention_mask', 'label']
for split in (train_data, val_data):
    split.set_format('torch', columns=list(model_columns))

In [None]:
# Evaluation metrics for the Trainer.
# Every prediction object handed to compute_metrics is kept in this list so the
# raw predictions can be inspected after training.
saved_preds = []

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: object exposing `predictions` (per-class scores; the arg-max over
            the last axis is taken as the predicted class) and `label_ids`
            (the true labels).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
        F1, precision and recall use average='binary'.

    Side effects:
        Appends `pred` to the module-level `saved_preds` list.
    """
    # Appending does not rebind the name, so no `global` declaration is needed.
    # The prediction object is stored rather than printed: printing it dumps
    # the full prediction arrays into the cell output on every evaluation.
    saved_preds.append(pred)

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old spelling was later removed. The install cell does not pin
# a version, so use whichever keyword the installed TrainingArguments accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./",
    num_train_epochs=2,
    # 8 examples per step x 16 accumulated steps = 128 examples per optimizer update (per device).
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 16,
    per_device_eval_batch_size= 8,
    # Evaluate and checkpoint once per epoch; load_best_model_at_end needs the
    # two strategies to match.
    save_strategy="epoch",
    disable_tqdm = False,
    load_best_model_at_end=True,
    warmup_steps=5,
    weight_decay=0.01,
    logging_steps = 8,
    dataloader_num_workers = 2,
    run_name = 'roberta-classification',
    **{_eval_strategy_key: "epoch"},
)

# Trainer wiring: fine-tune `model` on train_data, evaluate on val_data and
# report the metrics computed by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data
)

In [None]:
# Fine-tune the model using the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate on the dataset passed as eval_dataset (val_data); the returned
# metrics are displayed as the cell output.
trainer.evaluate()