In [263]:
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
from transformers import BartTokenizer, BartForConditionalGeneration, AutoModelForCausalLM, \
    DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline, PreTrainedTokenizerFast
from peft import get_peft_model, LoraConfig, TaskType
from tokenizers import normalizers, pre_tokenizers, Tokenizer, models, trainers

In [264]:
ROW_NUMBER = 500000
VALIDATION_SIZE = (ROW_NUMBER*20)//100

In [265]:
dataset_old = load_dataset('KomeijiForce/Text2Emoji')
sliced_train_dataset = dataset_old['train'].select(range(ROW_NUMBER))

dataset = dataset_old.copy()
dataset['train'] = sliced_train_dataset
dataset

{'train': Dataset({
     features: ['text', 'emoji', 'topic'],
     num_rows: 1000
 })}

In [266]:
dct = {"text": [], "emoji": [], "topic": []}
with open("./data/gpt_translate_1.txt", 'r', encoding='utf-8') as file:
    for line in file:
        inx = line.rfind(",")
        text, emoji = line[:inx], line[inx+1:]
        dct["text"].append(text.strip())
        dct["emoji"].append(emoji.strip())
        dct['topic'].append("None")

In [267]:
extension_data = Dataset.from_dict(dct)
extended_dataset = concatenate_datasets([extension_data, dataset['train']])
dataset['train'] = extended_dataset
dataset

{'train': Dataset({
     features: ['text', 'emoji', 'topic'],
     num_rows: 1197
 })}

In [None]:
NEW_ROW = dataset['train'].num_rows
VALIDATION = (NEW_ROW*20)//100

In [268]:
def transform_features(example):
    return {
        "output": example["emoji"],
        "input": example["text"]
    }

transformed_train = dataset["train"].map(transform_features, remove_columns=["topic", "emoji", 'text'])
train_test_split = transformed_train.shuffle(seed=42).train_test_split(test_size=VALIDATION)

final_data = DatasetDict({
    "train": train_test_split["train"].select(range(NEW_ROW - VALIDATION)),
    "validation": train_test_split["test"]
})

print(final_data)

Map:   0%|          | 0/1197 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['output', 'input'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['output', 'input'],
        num_rows: 200
    })
})


### Tokenizer training

In [269]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

In [270]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)

model = get_peft_model(model, lora_config)

In [271]:
model.print_trainable_parameters()

trainable params: 1,179,648 || all params: 407,471,104 || trainable%: 0.2895


In [272]:
tokenizer_input = BartTokenizer.from_pretrained('facebook/bart-large')
tokenizer_output = Tokenizer.from_file( "./tokenizers/hf_tok_emoji.json" )
tokenizer_output.enable_padding(length = 120)
tokenizer_output.enable_truncation(max_length = 120)

In [273]:
tokenizer_output.pad_token = tokenizer_input.pad_token

In [274]:
def tokenize_dataset(sample):
    max_length = 120
    input = tokenizer_input(sample['input'], padding='max_length', max_length=max_length, truncation=True)
    label_tokens = [tokenizer_output.encode(' '.join(list(output_str))).ids for output_str in sample['output']]
    input["labels"] = label_tokens
    return input


In [275]:
tokenized_dataset = final_data.map(tokenize_dataset, batched=True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [276]:
cleared_dataset = tokenized_dataset.map(lambda x: x, remove_columns=["output", 'input'])

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [277]:
cleared_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

In [None]:

training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    evaluation_strategy="steps",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    predict_with_generate=True,
    logging_steps=1000,
    save_steps=500,
    eval_steps=8000,
    warmup_steps=2000,
    overwrite_output_dir=True,
    save_total_limit=3,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=cleared_dataset['train'],
    eval_dataset=cleared_dataset['validation'],
)

trainer.train()



Step,Training Loss,Validation Loss
64,6.3955,6.868536


In [None]:
import os

save_directory = "./model"
if not os.path.exists(save_directory):
  os.mkdir(save_directory)
model.save_pretrained(save_directory)

In [None]:
device = "cuda"
model = AutoModelForCausalLM.from_pretrained(save_directory).to(device)

In [None]:
input_text = "Car travelling"
inputs = tokenizer_input(input_text, return_tensors="pt").to(device)
outputs = model.generate(
    inputs.input_ids.to(device),
    max_length=50,
    temperature=0.7,
    do_sample=True
)
outputs = outputs.cpu()
output_ids = outputs[0].tolist()

print("Input:")
print(input_text)
generated_text = tokenizer_output.decode(output_ids, skip_special_tokens=True)
generated_text = "".join([text.strip() for text in generated_text.split(" ")])
print("Generated Output:")
print(generated_text)