# Install

In [None]:
# Needed for Colab
#!pip install sentence-transformers

# Imports

In [None]:
import numpy as np
import torch

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, SentencesDataset, losses

# Seed NumPy's global random number generator so NumPy-based randomness repeats across runs.
np.random.seed(seed=21)

# Setup

In [None]:
#Local setup: change the working directory to the parent folder.
# NOTE(review): presumably the parent is the repository root, so that the `src`
# package imported by later cells can be resolved -- confirm.
%cd ..

In [None]:
# Default to the CPU and switch to the GPU only when PyTorch reports CUDA as available.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Using {device} device")

In [None]:
# Base checkpoints to choose from (exactly one model_id line should be active):
#   - all-MiniLM-L6-v2:  small model, fast to train, small memory footprint.
#   - all-mpnet-base-v2: medium-sized model, bigger memory footprint, but more
#     accurate than MiniLM-L6-v2.
model_id = "sentence-transformers/all-MiniLM-L6-v2"
#model_id = "sentence-transformers/all-mpnet-base-v2"

frac = 0.1  # fraction of the training data used to fine-tune the model

# Load the pretrained checkpoint and move it onto the selected device.
model = SentenceTransformer(model_id).to(device)

In [None]:
# Number of passes over the training data during fine-tuning.
EPOCHS = 3
# Local data folder; its last path component is reused when naming the saved model.
MAIN_DIR = "C:/Users/MartijnElands/Documents/Thesis/twister_temp/data/" #local

In [None]:
from src.CustomDataLoader import CustomDataLoader

# Load the "emoji" subset of tweet_eval through the project's loader, then
# convert every available split with to_dataframe.
loader = CustomDataLoader(name="tweet_eval", subset="emoji")
dataset = loader.load_huggingface_data()
subsets = list(dataset.keys())
dfs = loader.to_dataframe(data_dict=dataset, subsets=subsets)

# Pull out the three splits used in this notebook.
df_train, df_validation, df_test = (
    dfs[split_name] for split_name in ("train", "validation", "test")
)

In [None]:
from src.CustomTextDataset import CustomTextDataset

# Wrap the training frame in the project's dataset class, naming the columns
# that hold the label and the text.
training_data = CustomTextDataset(
    file=df_train,
    label_name="label",
    text_name="text",
)

# Fine tuning

In [None]:
#df_train = training_data.get_data()
# Optionally fine-tune on a random subset of the training data. Sampling is
# skipped when frac == 1 so the full frame is used as-is.
# NOTE: df_train is overwritten here, so re-running this cell samples again
# from the already-reduced frame.
if frac != 1:
    df_train = df_train.sample(frac=frac, random_state=1) #take % of the data

# One single-text, labelled example per row. The two columns are iterated
# directly instead of using iterrows(), which builds a Series for every row.
train_examples = [
    InputExample(texts=[text], label=label)
    for text, label in zip(df_train['text'], df_train['label'])
]

In [None]:
# Dedicated generator with a fixed seed, handed to the DataLoader for shuffling.
gen = torch.Generator()
gen.manual_seed(21)

train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=21,
    shuffle=True,
    generator=gen,
)

In [None]:
# Batch-hard triplet loss bound to the model that is being fine-tuned.
train_loss = losses.BatchHardTripletLoss(model)

In [None]:
# Fine-tune the model for EPOCHS epochs on the single (dataloader, loss) objective.
objective = (train_dataloader, train_loss)
model.fit(train_objectives=[objective], epochs=EPOCHS)

# Saving

In [None]:
# Output folder: "/some/drive/<data folder name>-<model name>-<percent of data used>/".
# round() is used instead of int() because frac * 100 is a float product that can
# land just below the intended integer (0.29 * 100 == 28.999999999999996), which
# int() would truncate to 28.
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# later cells read it.
dir = f"/some/drive/{MAIN_DIR.split('/')[-2]}-{model_id.split('/')[-1]}-{round(frac * 100)}/"

In [None]:
# Write the fine-tuned model to the output folder.
model.save(path=dir, model_name="FineTuned")

In [None]:
#Login to push to hub
import os
from dotenv import load_dotenv
from huggingface_hub import login, create_repo

# load_dotenv() runs first, then the token is read from the TOKEN environment variable.
load_dotenv()
TOKEN = os.environ.get('TOKEN')

login(token=TOKEN)

In [None]:
from huggingface_hub import HfApi
api = HfApi()
# Repository name: "<data folder name>-<model name>-<percent of data used>".
# round() is used instead of int() because frac * 100 is a float product that can
# land just below the intended integer (0.29 * 100 == 28.999999999999996).
REPO_ID = f"TheChickenAgent/{MAIN_DIR.split('/')[-2]}-{model_id.split('/')[-1]}-{round(frac * 100)}"
#Example of REPO_ID with the settings in this notebook: "TheChickenAgent/data-all-MiniLM-L6-v2-10"
# exist_ok=True keeps this cell re-runnable once the repository has been created.
api.create_repo(repo_id=REPO_ID, private=True, exist_ok=True)

In [None]:
# Display the output folder path (as the cell's output) that the upload cell reads from.
dir

In [None]:
import os

# Top-level files of the saved model to push. Both weight file names are
# listed; only the ones actually present in the output folder are uploaded.
files_to_push_to_hub = [
    'README.md',
    'config.json',
    'config_sentence_transformers.json',
    'modules.json',
    'model.safetensors',
    'pytorch_model.bin',
    'sentence_bert_config.json',
    'special_tokens_map.json',
    'tokenizer.json',
    'tokenizer_config.json',
    'vocab.txt',
]
weight_files = ('model.safetensors', 'pytorch_model.bin')

# Check the folder before uploading anything, so a missing file cannot leave
# the repository half-pushed.
present = [name for name in files_to_push_to_hub if os.path.isfile(os.path.join(dir, name))]
missing = [name for name in files_to_push_to_hub if name not in present]

if not any(name in present for name in weight_files):
    raise FileNotFoundError(
        f"No model weights ({' / '.join(weight_files)}) found in {dir}"
    )
if missing:
    print(f"Skipping files not found in {dir}: {missing}")

# One commit per file.
for filename in present:
    api.upload_file(
        path_or_fileobj=os.path.join(dir, filename),
        repo_id=REPO_ID,
        path_in_repo=filename,
        repo_type="model",
        commit_message="Pushing model",
        commit_description="Model",
    )

# The pooling module lives in its own sub-folder and is uploaded as a folder.
api.upload_folder(
    folder_path=os.path.join(dir, "1_Pooling"),
    path_in_repo="1_Pooling",
    repo_id=REPO_ID,
    repo_type="model",
)