In [21]:
%pip install datasets



In [1]:
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
from transformers import BartTokenizer, BartForConditionalGeneration, AutoModelForCausalLM, \
    Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import get_peft_model, LoraConfig, TaskType
from tokenizers import Tokenizer

  from .autonotebook import tqdm as notebook_tqdm
2024-12-19 03:46:25.792399: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734572785.820272  258933 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734572785.832255  258933 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-19 03:46:25.871719: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [108]:
dataset_old = load_dataset('KomeijiForce/Text2Emoji')

ROW_NUMBER = 15000
VALIDATION_SIZE = (ROW_NUMBER*20)//100

In [109]:

sliced_train_dataset = dataset_old['train'].select(range(ROW_NUMBER))

dataset = dataset_old.copy()
dataset['train'] = sliced_train_dataset
dataset

{'train': Dataset({
     features: ['text', 'emoji', 'topic'],
     num_rows: 15000
 })}

In [110]:
dct = {"text": [], "emoji": [], "topic": []}
index = 0
with open("./data/gpt_translate_2.txt", 'r', encoding='utf-8') as file:
    for line in file:
        inx = line.rfind(",")
        text, emoji = line[:inx], line[inx+1:]
        dct["text"].append(text.strip())
        dct["emoji"].append(emoji.strip())
        dct['topic'].append("None")
        index += 1
        if index >= 5000:
          break

In [111]:
print(len(dct['text']))

5000


In [112]:
extension_data = Dataset.from_dict(dct)
# extended_dataset = concatenate_datasets([extension_data, dataset['train']])
# dataset['train'] = extended_dataset
dataset

{'train': Dataset({
     features: ['text', 'emoji', 'topic'],
     num_rows: 15000
 })}

In [113]:
NEW_ROW = dataset['train'].num_rows
VALIDATION = (NEW_ROW*20)//100

In [114]:
def transform_features(example):
    return {
        "output": example["emoji"],
        "input": example["text"]
    }

transformed_train = dataset["train"].map(transform_features, remove_columns=["topic", "emoji", 'text'])
train_test_split = transformed_train.shuffle(seed=42).train_test_split(test_size=VALIDATION)

final_data = DatasetDict({
    "train": train_test_split["train"].select(range(NEW_ROW - VALIDATION)),
    "validation": train_test_split["test"]
})

print(final_data)

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['output', 'input'],
        num_rows: 12000
    })
    validation: Dataset({
        features: ['output', 'input'],
        num_rows: 3000
    })
})


### Tokenizer training

In [115]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

In [116]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)

model = get_peft_model(model, lora_config)

In [117]:
model.print_trainable_parameters()

trainable params: 1,179,648 || all params: 407,471,104 || trainable%: 0.2895


In [118]:
tokenizer_input = BartTokenizer.from_pretrained('facebook/bart-large')
tokenizer_output = Tokenizer.from_file( "./tokenizers/hf_tok_emoji.json" )
tokenizer_output.enable_padding(length = 120)
tokenizer_output.enable_truncation(max_length = 120)

In [119]:
tokenizer_output.pad_token = tokenizer_input.pad_token

In [120]:
final_data = final_data.filter(lambda example: all(value is not None for value in example.values()))

Filter:   0%|          | 0/12000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [121]:
def tokenize_dataset(sample):
    max_length = 120
    input = tokenizer_input(sample['input'], padding='max_length', max_length=max_length, truncation=True)
    label_tokens = [tokenizer_output.encode(' '.join(list(output_str))).ids for output_str in sample['output']]
    input["labels"] = label_tokens
    return input


In [122]:
shuffled_dataset = final_data.shuffle(seed=42)
tokenized_dataset = shuffled_dataset.map(tokenize_dataset, batched=True)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [123]:
cleared_dataset = tokenized_dataset.map(lambda x: x, remove_columns=["output", 'input'])

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [124]:
cleared_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 12000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [129]:

training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    evaluation_strategy="steps",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    predict_with_generate=True,
    logging_steps=1000,
    save_steps=500,
    eval_steps=8000,
    warmup_steps=2000,
    overwrite_output_dir=True,
    save_total_limit=3,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=cleared_dataset['train'],
    eval_dataset=cleared_dataset['validation'],
)

trainer.train()



Step,Training Loss,Validation Loss
8000,0.3669,0.349451
16000,0.3443,0.334062


TrainOutput(global_step=18000, training_loss=0.3668614247639974, metrics={'train_runtime': 3270.0194, 'train_samples_per_second': 11.009, 'train_steps_per_second': 5.505, 'total_flos': 9173049016320000.0, 'train_loss': 0.3668614247639974, 'epoch': 3.0})

In [130]:
import os

save_directory = "./model_15k"
if not os.path.exists(save_directory):
  os.mkdir(save_directory)
model.save_pretrained(save_directory)

In [131]:
device = "cuda"
model = AutoModelForCausalLM.from_pretrained(save_directory).to(device)

Loading adapter weights from ./model_15k led to unexpected keys not found in the model: model.encoder.layers.0.self_attn.q_proj.lora_A.default.weight, model.encoder.layers.0.self_attn.q_proj.lora_B.default.weight, model.encoder.layers.0.self_attn.v_proj.lora_A.default.weight, model.encoder.layers.0.self_attn.v_proj.lora_B.default.weight, model.encoder.layers.1.self_attn.q_proj.lora_A.default.weight, model.encoder.layers.1.self_attn.q_proj.lora_B.default.weight, model.encoder.layers.1.self_attn.v_proj.lora_A.default.weight, model.encoder.layers.1.self_attn.v_proj.lora_B.default.weight, model.encoder.layers.10.self_attn.q_proj.lora_A.default.weight, model.encoder.layers.10.self_attn.q_proj.lora_B.default.weight, model.encoder.layers.10.self_attn.v_proj.lora_A.default.weight, model.encoder.layers.10.self_attn.v_proj.lora_B.default.weight, model.encoder.layers.11.self_attn.q_proj.lora_A.default.weight, model.encoder.layers.11.self_attn.q_proj.lora_B.default.weight, model.encoder.layers.11.

In [133]:
input_text = "Gorilla jumping on trees"
inputs = tokenizer_input(input_text, return_tensors="pt").to(device)
outputs = model.generate(
    inputs.input_ids.to(device),
    max_length=50,
    temperature=0.7,
    do_sample=True
)
outputs = outputs.cpu()
output_ids = outputs[0].tolist()

print("Input:")
print(input_text)
generated_text = tokenizer_output.decode(output_ids, skip_special_tokens=True)
generated_text = "".join([text.strip() for text in generated_text.split(" ")])
print("Generated Output:")
print(generated_text)

Input:
Gorilla jumping on trees
Generated Output:
🧁🇨🌊👦👦😍👦👦👦🖼👗🖼🖼🔩🔩👗👦🖼🎊🖼🖼🎊🖼🔩🖼👗🔩🖼🔩👦🎊🔩🎊🎊👦
