# Fine-Tuning

In [None]:
# Read the raw corpus. Judging by the sample lines, each record looks like
# "<sentence>,<emoji sequence>" with the emoji after the final comma
# (TODO confirm against the full data file).
with open('data/gpt_translate_2.txt', 'r', encoding='utf-8') as f:
    data = f.readlines()

texts = []
labels = []

for line in data:
    # Split on the LAST comma. The sentences themselves contain commas
    # (including " ,"), so splitting on the first " ," cuts the sentence in
    # half and leaves the remainder glued onto the emoji label.
    text, sep, emoji = line.rpartition(',')
    text = text.strip()
    emoji = emoji.strip()
    # Skip lines with no comma at all, or with an empty sentence / emoji part.
    if sep and text and emoji:
        texts.append(text)
        labels.append(emoji)

print(texts[:5])
print(labels[:5])


["Mat's childhood took place in a little African village in south Botswana", 'Conduct of the Persian Gulf War: Final Report to the Congress', 'It is even more striking that both EFhd2 and tau exhibit F-actin-bundling activity', 'Obesity leads to insulin resistance', 'The menstrual period']
['where he was initiated to sacred dancings and musical habits of Congalaabawanaa.,🌍👦🏾🏝️🌍🎶🥁🕺🏾', 'Appendix C: Intelligence, April 1992. back to article 2.,🔍📚', 'that actin dynamics are altered in tauopathies and that EFhd2 was found in synapses [6,7].,😮\u200d💨🔝🚶\u200d♂️👀🔍🤍🦠🏼📏🔗🩺🏢🥱💫🛠🧬🔄🔬🍃😶\u200d🌠🐇🔝🎑🔓📘', 'high blood pressure and ultimately to heart disease.,❤️🔺💔', "a metaphor alluding to sexual-abstinence . ',🩸🔴😅🚫💑"]


In [None]:
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Pad/truncate every sentence to a common length (at most 128 tokens).
encoding = tokenizer(texts, truncation=True, padding=True, max_length=128)

# Sort the distinct labels before numbering them. Iteration order of a set of
# strings changes between interpreter sessions (hash randomisation), so an
# unsorted set would give every kernel restart a different label -> index map.
emoji_labels = sorted(set(labels))
label_map = {emoji: idx for idx, emoji in enumerate(emoji_labels)}

labels_numeric = [label_map[label] for label in labels]

# Fixed random_state so the train/validation partition is reproducible.
# NOTE: only input_ids are carried forward; encoding['attention_mask'] is not
# stored in the datasets built below.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    encoding['input_ids'], labels_numeric, test_size=0.2, random_state=42
)

train_inputs = torch.tensor(train_texts)
val_inputs = torch.tensor(val_texts)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

train_data = TensorDataset(train_inputs, train_labels)
val_data = TensorDataset(val_inputs, val_labels)

# Shuffle only the training batches; validation order stays fixed.
train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=16)


In [None]:
from transformers import BertForSequenceClassification
from torch.optim import AdamW

# Register the emoji <-> class-index mapping on the model config so that it is
# part of the model's configuration (and therefore travels with the model when
# it is saved and reloaded), instead of living only in the notebook variable
# `label_map`.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_map),
    id2label={idx: emoji for emoji, idx in label_map.items()},
    label2id=label_map,
)

optimizer = AdamW(model.parameters(), lr=1e-5)

def train(model, train_dataloader, opt=None):
    """Run one training epoch and report the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, labels=..., [attention_mask=...])``
            that returns an object exposing a ``.loss`` tensor.
        train_dataloader: Iterable yielding ``(input_ids, labels)`` tensor pairs.
        opt: Optimizer to step. When omitted, the notebook-level ``optimizer``
            variable is used, which keeps existing two-argument calls working.

    Returns:
        The mean loss over all batches, as a float.

    Raises:
        ValueError: If ``train_dataloader`` yields no batches.
    """
    active_opt = optimizer if opt is None else opt

    # Keep the batch on the same device as the model (no-op on CPU).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    # The batches contain padded input_ids only, so rebuild the attention mask
    # from the pad token id; without it the model attends to padding tokens.
    pad_id = getattr(getattr(model, "config", None), "pad_token_id", None)

    model.train()
    total_loss = 0.0
    num_batches = 0
    for input_ids, labels in train_dataloader:
        if device is not None:
            input_ids = input_ids.to(device)
            labels = labels.to(device)

        forward_kwargs = {"labels": labels}
        if pad_id is not None:
            forward_kwargs["attention_mask"] = (input_ids != pad_id).long()

        active_opt.zero_grad()

        outputs = model(input_ids, **forward_kwargs)
        loss = outputs.loss
        loss.backward()

        active_opt.step()
        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("train_dataloader yielded no batches")

    mean_loss = total_loss / num_batches
    print(f"Training loss: {mean_loss}")
    return mean_loss

# Training driver: a single pass over the training batches.
# Widen the range to train for more epochs.
for epoch in range(1):
    print(f"Epoch {epoch + 1}")
    train(model=model, train_dataloader=train_dataloader)


In [None]:
# Persist the model first, then the tokenizer, into the same directory so the
# pair can later be restored together with from_pretrained().
for artifact in (model, tokenizer):
    artifact.save_pretrained('path_to_save_model')


In [None]:

# Reload the fine-tuned model and its tokenizer from disk.
model = BertForSequenceClassification.from_pretrained('path_to_save_model')
tokenizer = BertTokenizer.from_pretrained('path_to_save_model')
model.eval()  # make inference mode explicit (disables dropout)

# Tokenise with the same truncation limit used for the training data so that
# over-long inputs are cut instead of exceeding the model's position limit.
inputs = tokenizer("Some text to predict", return_tensors='pt', truncation=True, max_length=128)

# No gradients are needed for prediction; skip building the autograd graph.
with torch.no_grad():
    outputs = model(**inputs)

predictions = outputs.logits.argmax(dim=-1)
print(predictions)

# Decode class indices with the names stored in the saved config. If no names
# were registered when the model was created these are generic "LABEL_<n>".
print([model.config.id2label[idx] for idx in predictions.tolist()])
