# Data Processing

In [12]:
# Install the necessary libraries when running for the first time
# (use %pip rather than !pip so the packages are installed into this kernel's environment)
# %pip install datasets transformers evaluate

In [13]:
# Import the required libraries

import os
import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset
from typing import List, Dict
from datetime import datetime
import matplotlib.pyplot as plt
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

In [14]:
def load_dataset(fields: List[str], files: Dict[str, str], mapping: Dict[str, int]) -> Dict[str, List[dict]]:
    """Read JSON-lines files and turn every record into a ``{"text", "label"}`` example.

    Args:
        fields: Column names whose values are concatenated, in the given order,
            into the example text. A list value is joined with single spaces,
            a string value is appended unchanged. No separator is inserted
            between consecutive fields.
        files: Maps a split name (e.g. ``"train"``) to the path of a
            JSON-lines file.
        mapping: Maps the first entry of a record's ``"tags"`` list to its
            integer label.

    Returns:
        A dict from split name to a list of ``{"text": str, "label": int}``
        dicts, one per record of the corresponding file.

    Raises:
        NotImplementedError: If a requested field holds a value that is
            neither a list nor a string.
        KeyError: If a requested field is missing from a record or a tag is
            not a key of ``mapping``.
    """
    def encode_label(label: str) -> int:
        return mapping[label]

    def build_text(row) -> str:
        parts = []
        for field in fields:
            value = row[field]
            if isinstance(value, list):
                parts.append(' '.join(value))
            # Check the *value*, not the column name: the name is always a str,
            # which previously made the error branch below unreachable.
            elif isinstance(value, str):
                parts.append(value)
            else:
                # NotImplementedError is the exception; NotImplemented is a
                # constant for binary operators and cannot be raised.
                raise NotImplementedError(
                    f"Unsupported value type {type(value).__name__!r} in field {field!r}")
        return ''.join(parts)

    def load_data(file: str) -> List[dict]:
        df = pd.read_json(file, lines=True)
        return [
            {"text": build_text(row), "label": encode_label(row["tags"][0])}
            for _, row in df.iterrows()
        ]

    return {split: load_data(path) for split, path in files.items()}

In [15]:
def preprocess_dataset(dataset: List[Dict[str, List[str]]], model_base: str, model_name: str):
    """Tokenize the ``"text"`` column of every split of ``dataset``.

    Args:
        dataset: Accessed as a mapping from split name to a list of record
            dicts (each list is handed to ``Dataset.from_list``).
        model_base: Model family; a value starting with ``"roberta"`` selects
            ``RobertaTokenizer``, one starting with ``"bert"`` selects
            ``AutoTokenizer``.
        model_name: Identifier passed to the tokenizer's ``from_pretrained``.

    Returns:
        A dict from split name to the tokenized ``Dataset`` of that split.

    Raises:
        NotImplementedError: If ``model_base`` matches neither family.
    """
    # Choose the tokenizer class from the model family prefix.
    if model_base.startswith("roberta"):
        from transformers import RobertaTokenizer as tokenizer_cls
    elif model_base.startswith("bert"):
        from transformers import AutoTokenizer as tokenizer_cls
    else:
        raise NotImplementedError

    tokenizer = tokenizer_cls.from_pretrained(model_name)

    # Tokenizer function bound to the tokenizer selected above.
    def tokenize_batch(batch):
        return tokenizer(batch["text"], truncation=True)

    # Convert each split to a Dataset and tokenize it in batches.
    return {
        split: Dataset.from_list(records).map(tokenize_batch, batched=True)
        for split, records in dataset.items()
    }

# Model definition and Configurations

In [16]:
def create_model(model_base: str, model_name: str, file_name: str, dataset):
    """Fine-tune a sequence classifier on ``dataset`` and return the trainer.

    Args:
        model_base: Model family; a value starting with ``"roberta"`` selects
            ``RobertaTokenizer``, one starting with ``"bert"`` selects
            ``AutoTokenizer``.
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the classification model.
        file_name: Run name; appended to ``"../data/checkpoints/"`` to form
            the checkpoint output directory.
        dataset: Mapping of split name to dataset. The ``"train"`` and
            ``"validation"`` entries are used for training and evaluation;
            the first entry's ``unique("label")`` determines the class count.

    Returns:
        The ``Trainer`` after ``train()`` has been called on it.

    Raises:
        NotImplementedError: If ``model_base`` matches neither family.
    """
    # The number of classes is the number of distinct labels in the first split.
    first_split = list(dataset.keys())[0]
    n_classes = len(dataset[first_split].unique("label"))

    if model_base.startswith("roberta"):
        from transformers import RobertaTokenizer as tokenizer_cls
    elif model_base.startswith("bert"):
        from transformers import AutoTokenizer as tokenizer_cls
    else:
        raise NotImplementedError

    tokenizer = tokenizer_cls.from_pretrained(model_name)

    # Weighted F1 is reported at every evaluation.
    f1_metric = evaluate.load("f1")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        return f1_metric.compute(
            predictions=np.argmax(logits, axis=-1),
            references=labels,
            average="weighted",
        )

    # Evaluate and checkpoint once per epoch; the checkpoint with the lowest
    # loss is reloaded when training finishes.
    training_args = TrainingArguments(
        output_dir="../data/checkpoints/"+file_name,
        logging_dir='../data/checkpoints/',
        logging_steps=10,
        learning_rate=2e-5,
        weight_decay=0.01,
        num_train_epochs=5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='loss',
        greater_is_better=False,
    )

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=n_classes)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    return trainer

In [17]:
def config_name(model, fields, mapping):
    """Build a run name of the form ``<model>-<n_fields>_<type>-<timestamp>``.

    ``<type>`` is ``"multiclass"`` when the label values in ``mapping`` sum to
    more than 1; otherwise it is the first key of ``mapping`` whose value is 1.
    ``<timestamp>`` is the current local time formatted as
    ``%Y-%m-%d-T%H-%M-%S``.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d-T%H-%M-%S")
    is_multiclass = sum(mapping.values()) > 1
    # In the one-against-the-others case the positive class names the run.
    kind = "multiclass" if is_multiclass else [key for key, value in mapping.items() if value == 1][0]
    return "-".join([model, str(len(fields)) + "_" + kind, timestamp])


In [18]:
models = ["roberta-base", "roberta-news-full"]
field_config = [["postText"], ["postText", "targetTitle"], ["postText", "targetTitle", "targetParagraphs"]]
types = ["multiclass", "one_against_the_others"]
classes = ["passage", "phrase", "multi"]

# Configurations
# Every (model, fields) pair yields one multiclass run plus one binary
# "one against the others" run per class.
configs = []
for model in models:
    for fields in field_config:
        # `type_` (not `type`): a module-level loop variable named `type`
        # would replace the builtin for every cell executed afterwards.
        for type_ in types:
            if type_ == "multiclass":
                # Each class gets its own label: passage=0, phrase=1, multi=2.
                mapping = {c: index for index, c in enumerate(classes)}
                configs.append((model, fields, mapping))

            elif type_ == "one_against_the_others":
                for class_ in classes:
                    # The selected class is labelled 1, every other class 0.
                    mapping = {c: int(c == class_) for c in classes}
                    configs.append((model, fields, mapping))

# Train Model

In [21]:
# LOGGING = True
MODEL_BASE = "roberta"
PROJECT_NAME = "MSCI-project"
MODELS_DIR = "../models"


data_paths = {
    "train": "../data/input/train.jsonl",
    "validation": "../data/input/val.jsonl"}


def _strip_timestamp(name):
    """Drop the six trailing '-'-separated parts (the timestamp appended by config_name)."""
    return "-".join(name.split("-")[:-6])


# Listing the directory below would fail on a fresh checkout without it.
os.makedirs(MODELS_DIR, exist_ok=True)

for model, fields, mapping in configs:
    file_name = config_name(model, fields, mapping)

    # ignore trained configs (compared without the timestamp suffix);
    # re-listed on every iteration so models saved by this loop are seen too
    models_trained = [_strip_timestamp(name) for name in os.listdir(MODELS_DIR)]
    if _strip_timestamp(file_name) in models_trained:
        continue

    print("Start training")
    print("Configs:", model, fields, mapping)
    print("Name:", file_name)

    # Names ending in "full" refer to checkpoints stored under MODELS_DIR.
    # os.path.join supplies the path separator that plain concatenation
    # ("../models" + model) left out. The resolved path is used for both the
    # tokenizer and the model so that they are loaded from the same place.
    model_path = os.path.join(MODELS_DIR, model) if model.endswith("full") else model

    # Load dataset
    dataset_raw = load_dataset(fields=fields, files=data_paths, mapping=mapping)
    dataset = preprocess_dataset(dataset=dataset_raw, model_base=MODEL_BASE, model_name=model_path)

    trainer_trained = create_model(MODEL_BASE, model_path, file_name, dataset)
    print(trainer_trained.evaluate())
    trainer_trained.save_model(os.path.join(MODELS_DIR, file_name))

Start training
Configs: roberta-base ['postText'] {'passage': 0, 'phrase': 1, 'multi': 2}
Name: roberta-base-1_multiclass-2023-07-28-T13-43-41


Map: 100%|██████████| 3200/3200 [00:00<00:00, 5525.16 examples/s]
Map: 100%|██████████| 400/400 [00:00<00:00, 6281.08 examples/s]
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

Problem at: /tmp/ipykernel_8905/3288082876.py 33 <module>


wandb: ERROR Error while calling W&B API: project not found (<Response [404]>)


CommError: It appears that you do not have permission to access the requested resource. Please reach out to the project owner to grant you access. If you have the correct permissions, verify that there are no issues with your networking setup.(Error 404: Not Found)