In [None]:
import json

import pandas as pd
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    BloomModel, 
    AutoModel, 
    BloomForCausalLM, 
    TrainingArguments, 
    Trainer,
    pipeline)
from transformers.trainer_utils import get_last_checkpoint

# Data loading and preparation

In [None]:
print("Loading and prepping data...")


def read_text(path):
    """Return the entire contents of the text file at ``path``."""
    with open(path, "r") as handle:
        return handle.read()


# Each race contributes a pair of files: the raw results (an .xml file, even
# though the variables are named "json*") and a hand-written .txt summary.

# Race 1
# NOTE(review): the .xml name contains an en dash ("–") while the .txt name
# uses a plain hyphen ("-") — confirm both match the files on disk.
json1_path = "../data/CMS_VWSC_–_WSC_Round_9_1.2_Hours_of_Sebring.xml"
input_data_1 = read_text(json1_path)
summary1_path = "../data/CMS_VWSC_-_WSC_Round_9_1.2_Hours_of_Sebring.txt"
summary_1 = read_text(summary1_path)

# Race 2
json2_path = "../data/NARS_at_The_Green_Hell_2020.xml"
input_data_2 = read_text(json2_path)
summary2_path = "../data/NARS_at_The_Green_Hell_2020.txt"
summary_2 = read_text(summary2_path)

# Race 3
json3_path = "../data/VWSC_2.4_Hours_of_Le_Mans_2020.xml" 
input_data_3 = read_text(json3_path)
summary3_path = "../data/VWSC_2.4_Hours_of_Le_Mans_2020.txt"
summary_3 = read_text(summary3_path)

# Race 4 is loaded here but is not part of the training frame built below.
json4_path = "../data/VWSC_60_Minutes_of_Laguna_Seca_2020.xml"
input_data_4 = read_text(json4_path)
summary4_path = "../data/VWSC_60_Minutes_of_Laguna_Seca_2020.txt"
summary_4 = read_text(summary4_path)

# One row per race: raw results in "input", reference summary in "output".
df_train = pd.DataFrame(
    {
        "input": [input_data_1, input_data_2, input_data_3],
        "output": [summary_1, summary_2, summary_3],
    }
)
# Not the last expression of the cell, so this preview is not displayed.
df_train.head()

# Persist the training pairs as JSON Lines (one record per line).
jsonl_filename = "230315-cms.jsonl"
df_train.to_json(jsonl_filename, orient="records", lines=True)

# Model training setup

In [None]:
print("Setting up model training...")
# Run-level configuration shared by the cells below.
SEED_VALUE = 42
# Hugging Face model identifier used for both the tokenizer and the model.
MODEL_NAME = "bigscience/bloom-1b1"
# Per-device batch size for training and evaluation.
BATCH_SIZE = 2
# Number of training epochs; also embedded in the output directory name.
EPOCHS = 20

# Marker placed between the input data and the summary in every prompt.
task_designator = "Summary:"
# Every tokenized example is truncated/padded to exactly this many tokens.
context_length = 2048
padding = "max_length"

# Load the pretrained tokenizer and causal-LM weights for MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BloomForCausalLM.from_pretrained(MODEL_NAME)

In [None]:
# Training hyper-parameters.
#
# Evaluation and logging are tied to epochs rather than to large step counts:
# the training set in this notebook is three examples, so at BATCH_SIZE = 2 and
# EPOCHS = 20 the whole run is only a few dozen optimizer steps. Step-based
# triggers of 5000 (eval) and 1000 (logging) would never fire. Using "epoch"
# also keeps evaluation, logging and checkpointing on the same schedule.
args = TrainingArguments(
    output_dir=f"fine-tuned/bloom_1b1_summarizer_{EPOCHS}_epochs",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    num_train_epochs=EPOCHS,
    learning_rate=5e-6,
    fp16=False,
    # Write a checkpoint after every epoch but keep only the two most recent.
    save_strategy="epoch",
    save_total_limit=2,
    # Make the configured seed explicit instead of relying on the library default.
    seed=SEED_VALUE,
)

In [None]:
# Load the JSON Lines file written earlier as a dataset with two splits.
# NOTE(review): "validation" points at the same file as "train", so any
# evaluation loss is measured on the training examples, not on held-out data.
data = load_dataset("json", data_files={"train":[jsonl_filename], "validation":[jsonl_filename]})
# data  # uncomment to inspect the loaded splits

In [None]:
def tokenize(element):
    """Turn one dataset record into a padded causal-LM training example.

    The record's "input" and "output" fields are combined into a single
    prompt of the form ``Data: <input>\\n<task_designator> <output><eos>``,
    which is tokenized, truncated and padded to ``context_length`` tokens.

    Args:
        element: A record with an "input" field (the raw results text) and
            an "output" field (the reference summary).

    Returns:
        The tokenizer output with an added "labels" entry: a copy of
        "input_ids" in which every padding token is replaced by -100.
    """
    # Concatenate the pieces of the input into one string. For a plain string
    # this is the string itself; the join keeps working for any iterable of
    # parts, as the previous character-by-character loop did.
    source_text = "".join(str(piece) for piece in element["input"])

    # NOTE(review): the summary and EOS token sit at the END of the text, and
    # the text is truncated to context_length tokens. If the tokenizer
    # truncates from the right (the usual default — confirm), any input longer
    # than the context window loses its summary entirely.
    text = "Data: " + source_text + "\n" + task_designator + " " + element["output"] + tokenizer.eos_token
    output = tokenizer(
        text,
        truncation=True,
        padding=padding,
        max_length=context_length,
    )

    # Labels mirror the input ids, with padding positions set to -100.
    pad_id = tokenizer.pad_token_id
    output["labels"] = [
        -100 if token_id == pad_id else token_id
        for token_id in output["input_ids"]
    ]
    return output

In [None]:
# Apply `tokenize` to every split and drop the original columns of the train
# split, leaving only what `tokenize` returns.
source_columns = data["train"].column_names
tokenized_datasets = data.map(tokenize, remove_columns=source_columns)
# tokenized_datasets

In [None]:
# Bundle the model, tokenizer, training arguments and tokenized splits into a
# Trainer; the "train" split is used for training and "validation" for eval.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# Run fine-tuning with the configuration held by `trainer`.
trainer.train()

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [None]:
# Resolve the checkpoint to load from what is actually on disk. The number in
# a "checkpoint-<step>" directory name depends on how many optimizer steps the
# run took (dataset size, batch size, number of devices), and older
# checkpoints may already have been pruned, so a hard-coded step is brittle.
checkpoint_root = f"fine-tuned/bloom_1b1_summarizer_{EPOCHS}_epochs"
checkpoint = get_last_checkpoint(checkpoint_root)
if checkpoint is None:
    raise FileNotFoundError(
        f"No checkpoint-* directory found in {checkpoint_root!r}; run training first."
    )

In [None]:
# Build a text-generation pipeline from the saved checkpoint on `device`.
generator = pipeline(
    "text-generation", model=checkpoint, device=device
)

In [None]:
def generate_summary(example):
    """Run the fine-tuned generator on ``example`` and return its text output.

    The prompt is "Data: " followed by ``example``, a newline, and the
    module-level ``task_designator``. Generation is capped at 400 new tokens.

    Args:
        example: The input text to summarise (must be a string).

    Returns:
        The "generated_text" field of the first result returned by the
        module-level ``generator``.
    """
    prompt = "".join(("Data: ", example, "\n", task_designator))

    # currently generating 400 max tokens
    first_result = generator(prompt, max_new_tokens=400)[0]
    return first_result["generated_text"]

In [None]:
# Read the results file to summarise at inference time; `ip` holds its path
# and `results_to_generate` holds its full text.
ip = "../data/VWSC_80_Minutes_of_Bahrain___Season_End_Report.xml"
with open(ip, "r") as file:
    results_to_generate = file.read()

In [None]:
print("")
print("*******************************")
print("Generating summary...")
# Summarise the file's CONTENTS (`results_to_generate`), not its path (`ip`):
# passing the path would make the model summarise the file-name string.
op = generate_summary(results_to_generate)
# Keep only the text that follows the last occurrence of the task designator.
pred_op = op.split(task_designator)[-1]

In [None]:
# Show the generated summary text.
print(pred_op)