In [1]:
!pip install transformers datasets



In [2]:
pip install transformers datasets scikit-learn pandas openpyxl



In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
from google.colab import files

uploaded = files.upload()


Saving cool.annotated.filtered.cleaned.xlsx to cool.annotated.filtered.cleaned.xlsx
Saving cool.unannotated.filtered.xlsx to cool.unannotated.filtered.xlsx


In [5]:
import pandas as pd

# Load the uploaded Excel files
annotated_df = pd.read_excel("cool.annotated.filtered.cleaned.xlsx")
unannotated_df = pd.read_excel("cool.unannotated.filtered.xlsx")

# Optional: preview a few rows
annotated_df.head(), unannotated_df.head()


(                                         occurrences interpretation  \
 0  we sat , after the long hot drive , drinking c...          Basic   
 1  are observed here on the cool earth , whilesta...          Basic   
 2  in. long . preferred habitat -_- rocky , rich ...          Basic   
 3  red berries clustered in a head . preferred ha...          Basic   
 4  persons are , however , very sensitive to drau...          Basic   
 
         syntax genre decade  
 0  Attributive  acad   1900  
 1  Attributive  acad   1900  
 2  Attributive  acad   1900  
 3  Attributive  acad   1900  
 4  Attributive  acad   1900  ,
                    filename genre  year  \
 0  wlp_acad_1900.inline.txt  acad  1900   
 1  wlp_acad_1900.inline.txt  acad  1900   
 2  wlp_acad_1900.inline.txt  acad  1900   
 3  wlp_acad_1900.inline.txt  acad  1900   
 4  wlp_acad_1900.inline.txt  acad  1900   
 
                                        match_context match  
 0  we sat , after the long hot drive , drinking c.

In [6]:
from sklearn.preprocessing import LabelEncoder

# Clean whitespace and inconsistent capitalization
annotated_df["interpretation"] = annotated_df["interpretation"].str.strip().str.capitalize()

# Encode the cleaned labels into numeric values
label_encoder = LabelEncoder()
annotated_df["label"] = label_encoder.fit_transform(annotated_df["interpretation"])

# Optional: Show the mapping from label names to integers
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_mapping)


Label Mapping: {'Basic': np.int64(0), 'Emotion': np.int64(1), 'Nonliteral': np.int64(2)}


In [7]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

# Load pretrained tokenizer and model (3 labels for classification)
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=3)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Split into training and test sets
train_df, test_df = train_test_split(annotated_df, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset format (renaming 'occurrences' → 'text')
train_dataset = Dataset.from_pandas(train_df[["occurrences", "label"]].rename(columns={"occurrences": "text"}))
test_dataset = Dataset.from_pandas(test_df[["occurrences", "label"]].rename(columns={"occurrences": "text"}))


In [9]:
# Tokenization function
def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Format datasets for PyTorch
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/806 [00:00<?, ? examples/s]

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./cool_model_xlm",
    eval_strategy="epoch",  # Evaluate at each epoch
    save_strategy="epoch",  # Save at each epoch
    load_best_model_at_end=True,
    num_train_epochs=4,
)



In [11]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "xlm-roberta-base" # Or any other model you want to use
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3) # Adjust num_labels as needed
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Tokenize both train and test datasets using your tokenizer
tokenized_train = train_dataset.map(lambda examples: tokenizer(examples['text'], truncation=True, padding="max_length"), batched=True)
tokenized_test = test_dataset.map(lambda examples: tokenizer(examples['text'], truncation=True, padding="max_length"), batched=True)


Map:   0%|          | 0/806 [00:00<?, ? examples/s]

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

In [13]:
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])


In [17]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = evaluate.load("accuracy").compute(predictions=predictions, references=labels)
    f1 = evaluate.load("f1").compute(predictions=predictions, references=labels, average="weighted")
    precision = evaluate.load("precision").compute(predictions=predictions, references=labels, average="weighted")
    recall = evaluate.load("recall").compute(predictions=predictions, references=labels, average="weighted")
    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"],
        "precision": precision["precision"],
        "recall": recall["recall"]
    }

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)



In [18]:
pip install evaluate scikit-learn

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [19]:
# Import necessary libraries
import numpy as np
import evaluate

# Define the function that will compute the metrics
def compute_metrics(eval_pred):
    """
    Computes accuracy, F1, precision, and recall for a given set of predictions.
    """
    # The 'eval_pred' object is a tuple containing the model's raw output (logits)
    # and the true labels.
    logits, labels = eval_pred

    # The logits are the raw scores. To get the final prediction, we take the
    # class with the highest score (the argmax).
    predictions = np.argmax(logits, axis=-1)

    # Load the metric calculators from the 'evaluate' library
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")

    # Calculate the scores
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    # For multi-class metrics, 'average="weighted"' accounts for class imbalance.
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")

    # Return the results as a dictionary
    return {
        "accuracy": accuracy['accuracy'],
        "f1": f1['f1'],
        "precision": precision['precision'],
        "recall": recall['recall']
    }

In [22]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir="./cool_model_xlm",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    num_train_epochs=4,  # or 3 — it will stop early if needed
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_strategy="epoch",
    save_total_limit=1,  # saves only the best model
    seed=42
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)


In [23]:
trainer.train()


  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7696,0.710607,0.648515,0.632498,0.783469,0.648515
2,0.618,0.635,0.777228,0.762935,0.774627,0.777228
3,0.546,0.589498,0.787129,0.78646,0.785846,0.787129
4,0.4886,0.607508,0.792079,0.791802,0.809871,0.792079


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=204, training_loss=0.6055495785731896, metrics={'train_runtime': 606.248, 'train_samples_per_second': 5.318, 'train_steps_per_second': 0.336, 'total_flos': 848277658755072.0, 'train_loss': 0.6055495785731896, 'epoch': 4.0})

In [24]:
from datasets import Dataset

# Prepare the unannotated dataframe for tokenization
unannotated_dataset = Dataset.from_pandas(
    unannotated_df[["match_context"]].rename(columns={"match_context": "text"})
)

# Tokenize the text
tokenized_unannotated = unannotated_dataset.map(
    lambda examples: tokenizer(examples["text"], truncation=True, padding="max_length"),
    batched=True
)

# Format for PyTorch
tokenized_unannotated.set_format("torch", columns=["input_ids", "attention_mask"])


Map:   0%|          | 0/19149 [00:00<?, ? examples/s]

In [25]:
# Run predictions
predictions = trainer.predict(tokenized_unannotated)

# Get predicted class index (highest score)
predicted_class_ids = predictions.predictions.argmax(axis=1)


  return forward_call(*args, **kwargs)


In [27]:
# Convert numeric predictions back to label names
predicted_labels = label_encoder.inverse_transform(predicted_class_ids)

# Add predictions to the unannotated DataFrame
unannotated_df["predicted_interpretation"] = predicted_labels

# Save to Excel
unannotated_df.to_excel("cool.unannotated.with_predictions.xlsx", index=False)

# Confirmation message
print("✅ Predictions saved to: cool.unannotated.with_predictions.xlsx")

✅ Predictions saved to: cool.unannotated.with_predictions.xlsx


In [28]:
from google.colab import files
files.download("cool.unannotated.with_predictions.xlsx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
import numpy as np

# Get full probability scores (logits → softmax to get probabilities)
from scipy.special import softmax
probabilities = softmax(predictions.predictions, axis=1)

# Create a score column: confidence value of the predicted class
prediction_scores = np.max(probabilities, axis=1)  # max prob = confidence score

In [30]:
# Convert class indices to label names
predicted_labels = label_encoder.inverse_transform(predictions.predictions.argmax(axis=1))

# Add both to DataFrame
unannotated_df["predicted_interpretation"] = predicted_labels
unannotated_df["prediction_score"] = prediction_scores


In [31]:
unannotated_df.to_excel("cool.unannotated.with_predictions.xlsx", index=False)
print("✅ File updated with prediction scores.")


✅ File updated with prediction scores.


In [32]:
from google.colab import files
files.download("cool.unannotated.with_predictions.xlsx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>