In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer, TrainingArguments, Trainer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
import PIL
import os
from tqdm import tqdm
import pandas as pd
import torchvision.transforms as transforms
import regex
import numpy as np
from sklearn.model_selection import train_test_split
import random
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [3]:
np.random.seed(0)
random.seed(0)
torch.manual_seed(0)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
elco_df = pd.read_csv('../../data/ELCo.csv')
device

'cpu'

In [16]:
class EmojisDataset(Dataset):
    def __init__(self, elco_df, tokenizer):
        super().__init__()
        self.elco_df = elco_df
        self.emoji_descriptions = [self.preprocess_emoji_description(desc) for desc in elco_df["Description"]]
        self.raw_emoji_descriptions = elco_df["Description"].values
        self.en = [self.preprocess_en(en) for en in elco_df["EN"].values]
        self.tokenizer = tokenizer
        
    def preprocess_emoji_description(self, text):
        text = text.replace('\'\'', '').lower()
        split_text = regex.findall(r'\':?(.*?):?\'', text)
        return split_text
    def preprocess_en(self, text):
        return text.lower().strip()
    
    def preprocess_prompt(self, prompt):
        inputs = self.tokenizer(prompt["prompt"], truncation=True, padding="max_length", max_length=64)
        targets = self.tokenizer(prompt["target"], truncation=True, padding="max_length", max_length=4)
        inputs["labels"] = targets["input_ids"]
        return inputs
    
    def __len__(self):
        return len(self.emoji_descriptions)
    
    def __getitem__(self, index):
        emoji_description = " ".join(self.emoji_descriptions[index])
        prompt = f"emoji group is [{emoji_description}], overall meaning is {self.en[index]}. Select from composition types: [Direct, Metaphorical, Semantic list, Reduplication, Single]. The composition type is:"
        target = self.elco_df["Composition strategy"].values[index]
        prompt_dict ={
            "prompt": prompt,     
            "target": target
            }
        inputs = self.preprocess_prompt(prompt_dict)
        inputs["text"] = prompt
        inputs["target"] = target
        return inputs
        

In [9]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

train_df, validate_df = train_test_split(elco_df, test_size=0.2, random_state=42, stratify=elco_df["Composition strategy"])
train_dataset = EmojisDataset(elco_df=train_df, tokenizer=tokenizer)
validate_datset = EmojisDataset(elco_df=validate_df, tokenizer=tokenizer)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [13]:
tokenizer.convert_ids_to_tokens(validate_datset[0]["input_ids"])

['▁',
 'e',
 'm',
 'oji',
 '▁group',
 '▁is',
 '▁[',
 'cross',
 '_',
 'mark',
 '▁six',
 '-',
 't',
 'hir',
 't',
 'y',
 '],',
 '▁overall',
 '▁meaning',
 '▁is',
 '▁wrong',
 '▁time',
 '.',
 '▁Select',
 '▁from',
 '▁composition',
 '▁types',
 ':',
 '▁[',
 'Direct',
 ',',
 '▁Meta',
 'phor',
 'ical',
 ',',
 '▁Se',
 'man',
 'tic',
 '▁list',
 ',',
 '▁Red',
 'u',
 'plication',
 ',',
 '▁Single',
 '].',
 '▁The',
 '▁composition',
 '▁type',
 '▁is',
 ':',
 '</s>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>']

In [10]:
training_args = TrainingArguments(
    output_dir="./emoji_composition_model",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_datset,
)



In [11]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.5798,0.401191
2,0.5008,0.347852
3,0.4842,0.332514


TrainOutput(global_step=498, training_loss=0.9454755744780882, metrics={'train_runtime': 409.9758, 'train_samples_per_second': 9.688, 'train_steps_per_second': 1.215, 'total_flos': 67197204430848.0, 'train_loss': 0.9454755744780882, 'epoch': 3.0})

In [17]:
overall_dataset = EmojisDataset(elco_df=elco_df, tokenizer=tokenizer)

In [20]:
def predict(model, tokenizer, input_text):
    inputs = tokenizer(input_text, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=5)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction

correct_predictions = 0
for i in tqdm(range(len(overall_dataset)), total=len(overall_dataset)):
    input_text = overall_dataset[i]["text"]
    target = overall_dataset[i]["target"]
    prediction = predict(model, tokenizer, input_text)

    # print(f"Input: {input_text}")
    # print(f"Target: {target}")
    # print(f"Prediction: {prediction}")
    # print("Correct prediction")
    if prediction == target:
        correct_predictions += 1
print(f"Accuracy: {correct_predictions / len(overall_dataset)}")

100%|██████████| 1655/1655 [01:46<00:00, 15.51it/s]

Accuracy: 0.5099697885196375



