# Transformer as Regression Model

In [None]:
# Language code used to pick the train/dev/test CSV paths from `paths` further down.
LANG = "arq" # eng, arq, esp

### Package, data, and preprocessing

In [None]:
!pip install datasets
!pip install accelerate -U
!pip install transformers -U

In [None]:
# Get the Files (Track A - English)
!mkdir STR_Data
!wget https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/eng/eng_train.csv
!wget https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/eng/eng_dev.csv
!wget https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/eng/eng_dev_with_labels.csv
!wget https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/eng/eng_test.csv

!mv eng_train.csv STR_Data/eng_train.csv
!mv eng_dev.csv STR_Data/eng_dev.csv
!mv eng_dev_with_labels.csv STR_Data/eng_dev_with_labels.csv
!mv eng_test.csv STR_Data/eng_test.csv

# Get the Files (Track A - Spanish)
!mkdir STR_Data
!wget https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/esp/esp_train.csv
!wget https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/esp/esp_dev.csv
!wget https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/esp/esp_dev_with_labels.csv
!wget https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/esp/esp_test.csv

!mv esp_train.csv STR_Data/esp_train.csv
!mv esp_dev.csv STR_Data/esp_dev.csv
!mv esp_dev_with_labels.csv STR_Data/esp_dev_with_labels.csv
!mv esp_test.csv STR_Data/esp_test.csv

# Get the Files (Track A - Arabic)
!mkdir STR_Data
!wget https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/arq/arq_train.csv
!wget https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/arq/arq_dev.csv
!wget https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/arq/arq_dev_with_labels.csv
!wget https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/arq/arq_test.csv

!mv arq_train.csv STR_Data/arq_train.csv
!mv arq_dev.csv STR_Data/arq_dev.csv
!mv arq_dev_with_labels.csv STR_Data/arq_dev_with_labels.csv
!mv arq_test.csv STR_Data/arq_test.csv

In [None]:
!ls STR_Data

In [None]:
from datasets import load_dataset

# Train / validation / submission CSV locations for each language code.
# All three languages follow the same file-naming scheme under STR_Data.
paths = {
    lang: {
        'train_path': f"STR_Data/{lang}_train.csv",
        'valid_path': f"STR_Data/{lang}_dev_with_labels.csv",
        'submit_path': f"STR_Data/{lang}_test.csv",
    }
    for lang in ('eng', 'esp', 'arq')
}

# NOTE(review): column_names is passed explicitly; if the CSVs carry a header
# line it will be read as a data row - confirm against the downloaded files.
data_files = {
    "train": paths[LANG]['train_path'],
    "valid": paths[LANG]['valid_path'],
}
dataset = load_dataset(
    "csv",
    data_files=data_files,
    column_names=['PairID', 'Text', 'Score'],
)
# The "train" entry is replaced by the result of an 80/20 train_test_split of
# itself; later cells index that result with 'train' and 'test'.
dataset['train'] = dataset['train'].train_test_split(test_size=0.2)
dataset

In [None]:
# Load test (Only for submission)
from datasets import load_dataset

# Use the test CSV registered in `paths` for the language selected by LANG.
# NOTE(review): column_names is passed explicitly; if the CSV carries a header
# line it will be read as a data row - confirm against the downloaded file.
submit_files = {"test": paths[LANG]['submit_path']}
submit_dataset = load_dataset("csv", data_files=submit_files, column_names=['PairID', 'Text'])
submit_dataset = submit_dataset['test']

In [None]:
def is_float(sample):
  """Return True if `sample` can be converted with float(), else False.

  Only the exceptions float() itself raises for bad input are treated as
  "not a float"; anything else (e.g. KeyboardInterrupt) propagates.
  """
  try:
    float(sample)
  except (TypeError, ValueError, OverflowError):
    return False
  return True

# Drop every row whose Score cannot be parsed as a float.
def _has_numeric_score(example):
  return is_float(example["Score"])

for _split_name in ('train', 'valid'):
  dataset[_split_name] = dataset[_split_name].filter(_has_numeric_score)

In [None]:
dataset

### Setting up the model

In [None]:
# @title Hyperparameters
# base model: bert-base-cased
# Model id passed to from_pretrained() for both the tokenizer and the model.
BASE_MODEL = "roberta-base" # @param {type:"string"}
# Learning rate handed to TrainingArguments.
LEARNING_RATE = 3e-5
# Token length that preprocess_function truncates/pads every example to.
MAX_LENGTH = 128
# Per-device batch size for both training and evaluation.
BATCH_SIZE = 16
# Number of training epochs.
EPOCHS = 3

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

def tokenize_function(examples):
    """Tokenize the "Text" field, truncating and padding to MAX_LENGTH tokens.

    max_length is passed explicitly so this helper produces the same sequence
    length as preprocess_function instead of the tokenizer's default maximum.
    """
    return tokenizer(
        examples["Text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader

# num_labels=1 requests a single output value per input; the rest of the
# notebook reads that value as the predicted relatedness score.
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)

In [None]:
def preprocess_function(examples):
    """Tokenize one example and attach its score as a float label.

    Reads the "Text" and "Score" fields, tokenizes the text with truncation
    and padding to MAX_LENGTH, and returns the tokenizer output with an extra
    "label" entry holding float(Score).
    """
    raw_score = examples["Score"]
    encoded = tokenizer(
        examples["Text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    encoded["label"] = float(raw_score)
    return encoded

# dataset["train"] holds the 'train'/'test' splits produced by train_test_split,
# so one map call preprocesses both; looping over `dataset` only repeated the
# identical work once per top-level split.
train_data = dataset["train"].map(preprocess_function, remove_columns=["PairID", "Text", "Score"])

In [None]:
dataset

### Training

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def compute_metrics_for_regression(eval_pred):
    """Compute regression metrics for a (logits, labels) pair.

    Labels are reshaped to a column vector before comparison.
    NOTE(review): assumes logits already have shape (n, 1) - confirm.

    Returns a dict with "mse", "mae", "r2" and "accuracy", where accuracy is
    the fraction of samples whose squared error is below 0.25.
    """
    predictions, targets = eval_pred
    targets = targets.reshape(-1, 1)

    metrics = {
        "mse": mean_squared_error(targets, predictions),
        "mae": mean_absolute_error(targets, predictions),
        "r2": r2_score(targets, predictions),
    }

    squared_errors = ((predictions - targets).flatten() ** 2).tolist()
    close_enough = sum(1 for err in squared_errors if err < 0.25)
    metrics["accuracy"] = close_enough / len(squared_errors)

    return metrics

In [None]:
# @title Training Arguments
from transformers import TrainingArguments, Trainer

# NOTE(review): newer transformers releases name `evaluation_strategy`
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="/content/models/arq-roberta-fine-tuned-regression",  # @param {type:"string"}
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # MAE is an error, so lower is better. Without greater_is_better=False the
    # Trainer treats a larger value as better for any metric whose name does
    # not end in "loss" and would reload the worst checkpoint.
    # (Flip this to True if switching the metric to "accuracy".)
    metric_for_best_model="mae",
    greater_is_better=False,
    load_best_model_at_end=True,
    weight_decay=0.01,
)

In [None]:
import torch

class RegressionTrainer(Trainer):
    """Trainer whose loss is the mean squared error on the first output column."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the MSE between the model's first logit column and the labels.

        Args:
            model: model called as ``model(**inputs)``.
            inputs: batch dict; its "labels" entry is popped before the forward pass.
            return_outputs: when true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: accepted for compatibility with Trainer versions
                that pass this keyword to compute_loss; not used here.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # outputs[0] is the logits tensor; take its first column as the prediction.
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Fine-tune on the 'train' part of train_data and evaluate on its 'test' part,
# reporting the metrics from compute_metrics_for_regression.
trainer = RegressionTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data['train'],
    eval_dataset=train_data['test'],
    compute_metrics=compute_metrics_for_regression,
)

trainer.train()

In [None]:
# @title Save model
# model_path is interpolated into the save directory name below, giving
# "/content/STR-RegRoberta-Arq-last" for the current value.
model_path = "Roberta-Arq" # @param {type:"string"}
save_path = f"/content/STR-Reg{model_path}-last"
trainer.save_model(save_path)

In [None]:
# Save model to GDrive
!cp -r /content/STR-RegRoberta-Arq-last /content/drive/MyDrive/

### Test

Make sure to run the **Setting up the model** section before this phase.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Load the model from the following path only if you are using the weights of a previously trained model. If you are running this right after training the model, skip the next two cells.

In [None]:
model_path = "/content/drive/MyDrive/STR-RegBERT"

In [None]:
# This will load the model which is just the BERT (for now) with a classification head on top
model = AutoModelForSequenceClassification.from_pretrained(model_path, local_files_only=True)

In [None]:
from tqdm import *
from scipy.stats import spearmanr, pearsonr
import torch

test_dataset = dataset["valid"]

data_ = test_dataset["Text"]
true_scores = test_dataset["Score"]
pred_scores = []

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# The inputs are moved to `device` below, so the model has to be there as well
# (a model freshly loaded from disk starts on the CPU). eval() disables dropout.
model.to(device)
model.eval()

for sample in tqdm(data_):
  # Truncate to MAX_LENGTH, the same limit used when preprocessing the training data.
  inputs = tokenizer(sample, truncation=True, max_length=MAX_LENGTH, return_tensors="pt").to(device)
  with torch.no_grad():
      logits = model(**inputs).logits
  # One sample, one output value: take element [0][0] as the predicted score.
  pred_scores.append(logits.tolist()[0][0])


In [None]:
# Convert every gold score to a Python float.
tr_scores = [float(raw_score) for raw_score in true_scores]

In [None]:
print("Pearson Correlation:", round(pearsonr(tr_scores, pred_scores)[0], 2))

Pearson Correlation: 0.32


In [None]:
# Print each predicted score on its own line.
for score in pred_scores:
  print(score)

In [None]:
# Show each validation text next to its gold score and the model's score.
separator = "-" * 200
for position, text in enumerate(data_):
  print(text)
  print("Human Score:", tr_scores[position])
  print("Machine Score:", f"{pred_scores[position]:.2f}")
  print(separator)

In [None]:
# Plot
import matplotlib.pyplot as plt
import numpy as np


def plotGraph(y_test,y_pred,regressorName):
    """Scatter-plot true and predicted values against their sample index.

    Args:
        y_test: sequence of true values (drawn in blue).
        y_pred: sequence of predicted values (drawn in red).
        regressorName: text used as the plot title.
    """
    # Both series share the x axis (sample index); labels feed the legend so
    # the two colors can be told apart.
    plt.scatter(range(len(y_test)), y_test, color='blue', label='True')
    plt.scatter(range(len(y_pred)), y_pred, color='red', label='Predicted')
    plt.title(regressorName)
    plt.legend()
    plt.show()


plotGraph(tr_scores, pred_scores, "Machine scores vs. True scores")

In [None]:
# Quantile bins: split the predicted scores into 4 equal-frequency intervals
# (q = 4) and display which interval each prediction falls into.
import pandas as pd
pd.qcut(pred_scores, q = 4, precision = 0)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

# Creating bins for scores
bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
categories = ['0-0.2', '0.2-0.4', '0.4-0.6', '0.6-0.8', '0.8-1.0']

# Assigning each score to a category.
# include_lowest=True keeps scores of exactly 0 in the first bin; by default
# pd.cut uses half-open (left, right] intervals and would turn them into NaN.
true_categories = pd.cut(tr_scores, bins, labels=categories, include_lowest=True)
# Predictions are clipped to the bin range first so that values outside [0, 1]
# land in the outermost bins instead of being dropped as NaN.
clipped_predictions = np.clip(pred_scores, bins[0], bins[-1])
predicted_categories = pd.cut(clipped_predictions, bins, labels=categories, include_lowest=True)

# Creating a confusion matrix (cross-tabulation)
confusion_matrix = pd.crosstab(true_categories, predicted_categories, rownames=['True'], colnames=['Predicted'])

print("Confusion Matrix:")
print(confusion_matrix)

# Calculating Mean Squared Error (on the raw, unclipped predictions)
mse = mean_squared_error(tr_scores, pred_scores)
print("\nMean Squared Error:", mse)
# Plotting the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix, annot=True, fmt='d', cmap='Blues', cbar=True, square=True)
plt.title('Confusion Matrix (Regression)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
!pip install datasets
!python3 /content/train.py

In [None]:
model

### Submission

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
model_path = "/content/drive/MyDrive/STR-RegBERT"

In [None]:
# This will load the model which is just the model with a classification head on top
model = AutoModelForSequenceClassification.from_pretrained(model_path, local_files_only=True)

In [None]:
from tqdm import *
from scipy.stats import spearmanr, pearsonr
import torch

data__ = submit_dataset
# data__ = dataset['valid']
data_text = data__["Text"]
# Read the id column once instead of fetching a full row per sample.
pair_ids = data__["PairID"]

pred_scores = []

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# The inputs are moved to `device` below, so the model has to be there as well
# (a model freshly loaded from disk starts on the CPU). eval() disables dropout.
model.to(device)
model.eval()

for p_id, sample in tqdm(zip(pair_ids, data_text), total=len(data_text)):
  # Truncate to MAX_LENGTH, the same limit used when preprocessing the training data.
  inputs = tokenizer(sample, truncation=True, max_length=MAX_LENGTH, return_tensors="pt").to(device)
  with torch.no_grad():
      logits = model(**inputs).logits
  # One sample, one output value: take element [0][0] as the predicted score.
  pred_scores.append([p_id, logits.tolist()[0][0]])

In [None]:
import pandas as pd
# Each entry of pred_scores is a [PairID, score] pair, matching these two columns.
columns = ['PairID', 'Pred_Score']
df = pd.DataFrame(pred_scores, columns=columns)
print(df)

In [None]:
# Name the predictions file after the language selected by LANG, so runs for
# different languages do not all write to the same "eng" file.
eval_filename = f'pred_{LANG}.csv'
df[['PairID', 'Pred_Score']].to_csv(eval_filename, index=False)
print(f"Saved the {eval_filename} file.")

# T5 Paraphraser

### Set-up

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
import torch

# https://huggingface.co/humarin/chatgpt_paraphraser_on_T5_base?text=they+live+on+47483+conneticuit+drive+in+nashville
PARAPHRASER_MODEL = "humarin/chatgpt_paraphraser_on_T5_base"

# Fall back to the CPU when no GPU is available instead of failing on .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(PARAPHRASER_MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(PARAPHRASER_MODEL).to(device)

### Paraphraser

In [None]:
# Can modify different elements

def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=2, # Can be used to output multiple examples to choose from
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Generate paraphrases of `question` with the notebook-level model and tokenizer.

    The text is prefixed with "paraphrase: ", tokenized (truncated to
    `max_length`) and passed to `model.generate` with the given beam-search
    settings. `temperature` is accepted but not forwarded to generation.

    Returns:
        The decoded output sequences as a list of strings, special tokens removed.
    """
    prompt = f'paraphrase: {question}'
    encoded = tokenizer(
        prompt,
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    )
    input_ids = encoded.input_ids.to(device)

    generation_options = dict(
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        max_length=max_length,
        diversity_penalty=diversity_penalty,
    )
    generated = model.generate(input_ids, **generation_options)

    return tokenizer.batch_decode(generated, skip_special_tokens=True)


### Inference

In [None]:
# Get the data
!wget https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/eng/eng_train.csv

In [None]:
# Read the data
df = pd.read_csv("/content/eng_train.csv")
# df = df.reset_index() # Pair with number of rows

In [None]:
# Paraphrase the data

# Numeric suffix of the last PairID; new rows continue the numbering from it.
pairID = df.tail(1)["PairID"].iloc[0].split("-")[2] # Continue from the dataset iteration
next_id = int(pairID) + 1

# Collect every paraphrased pair and concatenate once at the end. Concatenating
# onto `df` inside the loop kept only the last new row, and the id never advanced.
new_rows = []
for _, row in df.iterrows():
    s1, s2 = row['Text'].split("\n")
    new_s1 = paraphrase(s1)[0]
    new_s2 = paraphrase(s2)[0]

    new_rows.append({
        # zfill keeps whatever zero padding the existing ids use (no-op otherwise).
        'PairID': "ENG-train-" + str(next_id).zfill(len(pairID)),
        'Text': new_s1 + "\n" + new_s2,
        'Score': row['Score'],
    })
    next_id += 1

result_df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index = True)

In [None]:
# result_df = result_df.reset_index()
# result_df.reindex(range(result_df.index.max() + 1))
result_df.tail(3)

In [None]:
result_df.to_csv("/content/eng_data_new.csv", index=False)