# Fine-tuning of the pre-trained model

## Important Notices!


*   Since this step requires substantial memory and processing power, upload and run this notebook in Google Colab **after** completing the dataset pre-processing phase;
*   Right after uploading the notebook, check the Colab runtime by clicking the down arrow in the upper-right corner and then selecting "Change runtime type";
*   If the selected hardware accelerator is 'CPU', switch to 'T4 GPU' (or to another GPU of your choice if you have a paid Colab plan);
*   After that, upload the necessary files to the environment by clicking on the "folder" icon (left side of the screen) and dragging the following files from the repository:
    * notebooks/requirements_fine_tuning.txt
    * temp/temp_input.parquet

After these steps, start running this notebook, and do not forget to download the trained model at the end, in the specified directory.


### Installs and imports the required modules

In [None]:
!pip install -q -r requirements_fine_tuning.txt

In [None]:
import sys
import os
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from datasets import Dataset
import joblib

### Pretrained model and dataset loading


In [None]:
# Load the pretrained SentenceTransformer checkpoint (referenced by name) that
# the rest of this notebook fine-tunes and uses to produce embeddings.
model = SentenceTransformer("sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens")
# Bare expression: displays the loaded model as the cell output.
model

In [None]:
# Read the pre-processed dataset from the current working directory (the file
# uploaded to Colab). Later cells rely on the columns 'text', 'headline' and
# 'is_sarcastic'.
df = pd.read_parquet("temp_input.parquet")
# Bare expression: renders the frame as the cell output.
df

### Preparing dataset for fine-tuning

In [None]:
def _tokens_to_text(tokens):
    """Return ``tokens`` as a single string.

    List-like values are joined with single spaces. This covers Python lists
    as well as array-valued cells, which is how list columns commonly come
    back from a parquet file. A missing scalar becomes an empty string rather
    than the literal text "nan"; any other value goes through ``str``.
    """
    if pd.api.types.is_list_like(tokens):
        return " ".join(map(str, tokens))
    return "" if pd.isna(tokens) else str(tokens)


# Maximum number of characters kept per text (optional size limit).
MAX_TEXT_CHARS = 512

# Drop rows without a text or a label. The explicit copy makes the column
# assignments below operate on an independent frame.
df = df.dropna(subset=['text', 'is_sarcastic']).copy()
df['text'] = df['text'].apply(_tokens_to_text)
df['headline'] = df['headline'].apply(_tokens_to_text)
df['is_sarcastic'] = df['is_sarcastic'].astype(int)

# Optional text size limit
df['text'] = df['text'].str[:MAX_TEXT_CHARS]

# 80/20 split, stratified on the label, with a fixed seed for reproducibility.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['is_sarcastic'], random_state=42)




In [None]:
# Build the training Dataset; only the target column needs renaming ('label').
# NOTE(review): every remaining column of train_df is carried into the Dataset
# and is presumably treated as a model input by the trainer, in column order —
# confirm the frame holds only the intended text columns.
train_ds = Dataset.from_pandas(
    train_df.rename(columns={'is_sarcastic': 'label'}),
    preserve_index=False,
)
train_ds

In [None]:
# Build the evaluation Dataset from the held-out split; only the target column
# needs renaming ('label').
# NOTE(review): every remaining column of test_df is carried into the Dataset
# and is presumably treated as a model input by the trainer, in column order —
# confirm the frame holds only the intended text columns.
eval_ds = Dataset.from_pandas(
    test_df.rename(columns={'is_sarcastic': 'label'}),
    preserve_index=False,
)
eval_ds

### Defines training parameters

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` must
            support ``argmax(axis=1)`` (one row of class scores per example).

    Returns:
        A dict with the single key ``"accuracy"``.
    """
    scores, gold = eval_pred
    # The predicted class is the column holding the highest score in each row.
    return {"accuracy": accuracy_score(gold, scores.argmax(axis=1))}

# Intermediate training checkpoints are written to their own directory.
# "finetuned_model_sarcasm" is the folder the final model is saved to, zipped
# and downloaded at the end of the notebook; pointing `output_dir` at it would
# ship the checkpoints together with the exported model.
training_args = SentenceTransformerTrainingArguments(
    output_dir="finetuned_model_sarcasm_checkpoints",
    num_train_epochs=4,
    per_device_train_batch_size=16,
    logging_steps=10,
    save_total_limit=1,   # keep at most one checkpoint on disk
    learning_rate=2e-5,
    warmup_steps=10,
    fp16=False,
    report_to="none",     # no external experiment tracker
)

# Two-class softmax classification objective on top of the sentence embeddings.
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=2,
)

# NOTE(review): no evaluation strategy is configured above, so `eval_ds` is
# presumably only used if `trainer.evaluate()` is called explicitly, and some
# sentence-transformers releases appear to ignore `compute_metrics` — confirm
# against the installed version.
trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    loss=train_loss,
    compute_metrics=compute_metrics,
)

### Trains the model

In [None]:
# Run the fine-tuning loop with the trainer configured in the previous cell.
# As the last expression of the cell, the value returned by `train()` is also
# shown as the cell output.
trainer.train()

### Trains the classifier

In [None]:
# Generating embeddings for the training and test sets
X_train = model.encode(train_df['text'].tolist(), convert_to_tensor=True).cpu().numpy()
X_test = model.encode(test_df['text'].tolist(), convert_to_tensor=True).cpu().numpy()
y_train = train_df['is_sarcastic'].values
y_test = test_df['is_sarcastic'].values

In [None]:
# Fit a logistic-regression classifier on the precomputed training embeddings
# (`fit` returns the estimator itself), then predict the held-out split.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Compute every score first, then print the report in a fixed order.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
# The aggregate scores below are averaged across classes, weighted by support.
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nClassification Report:")
print(report)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (weighted): {precision:.4f}")
print(f"Recall (weighted): {recall:.4f}")
print(f"F1-Score (weighted): {f1:.4f}")

### Saves trained model and classifier

In [None]:
# Folder that receives every exported artifact; the download cell zips a
# folder with this same name.
export_dir = "finetuned_model_sarcasm"

# Persist the fine-tuned SentenceTransformer.
model.save(export_dir)
# Store the scikit-learn classifier inside the same folder. The path is built
# from its components instead of passing one pre-joined string to
# os.path.join, which was a no-op.
# NOTE: joblib files are pickle-based — only load them from trusted sources.
joblib.dump(clf, os.path.join(export_dir, "classifier_logreg.pkl"))

Download the model folder as a zip archive (this may take a few minutes).

In [None]:
# Colab-only helper, imported here so the rest of the notebook does not depend on it.
from google.colab import files
import shutil
import os

folder_to_download = "finetuned_model_sarcasm"

# Fail early with a clear message instead of archiving a folder that does not exist.
if not os.path.isdir(folder_to_download):
    raise FileNotFoundError(
        f"Folder '{folder_to_download}' not found; run the save cell before downloading."
    )

# Compresses the folder by creating the zip from the parent directory, so the
# archive contains the folder itself as its top-level entry. make_archive
# returns the path of the file it created, which is what gets downloaded.
zip_filename = shutil.make_archive(folder_to_download, 'zip', root_dir='.', base_dir=folder_to_download)

# Download the zipped file
files.download(zip_filename)

After downloading, extract the .zip file into the 'models' directory of the repository.
