# LLMs Transfer Learning

### perform sentiment analysis on the IMDb dataset using a pre-trained model from Hugging Face.

Based on the documentation available at: https://huggingface.co

In [1]:
# Install necessary libriaries
# pip install transformers datasets torch accelerate evaluate

# Import necessary libriaries
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
import evaluate
import re
import torch
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Authenticate this Notebook to send the final model to the Hugging Face hub
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Install Git Large File Storage (LFS)
# !apt-get install git-lfs
#pip install git-lfs

## 1. Load the IMDb Dataset

In [5]:
from datasets import load_dataset

data = load_dataset('imdb')
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

## 2. Preprocess text

In [6]:
# Define a function to preprocess our messages
def preprocess_text(text):
    
    ## 1. Lowercase
    text = text.lower()
    
    ## 2. Remove HTML tags
    text = re.sub("<.*?>"," ", text)

    ## 3. Replace contractions with full words
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)

    # Delete newline characters
    text = text.replace('\n', ' ')
    # Remove redundant spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [7]:
# Implement text cleaning

data = data.map(lambda x: {'text': preprocess_text(x['text'])})

## 3. Select a Pre-Trained Model from Hugging Face
I choose `distilbert-base-uncased-finetuned-sst-2-english` pre-trained model for this project task, sentiment analysis.

## 4. Apply Pre-trained Model

In [8]:
# Load pre-trained model and tokenizer from Hugging Face
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

## 5. Prepare data

In [9]:
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True)    #, padding="max_length"

# Tokenize dataset
tokenized_datasets = data.map(tokenize_function, batched=True)

In [10]:
# Split train_dataset into 80% to train and 20% to eval
test_dataset = tokenized_datasets["test"]
train_dataset = tokenized_datasets["train"].train_test_split(test_size=0.2)["train"]
eval_dataset = tokenized_datasets["train"].train_test_split(test_size=0.2)["test"]

## _________
## 6. Enhancing Model Using Transfer Learning
## Fine-tune pre-trained model on the project dataset

### Define training arguments and train the model:

In [13]:
# Create TrainingArguments
repo_name = 'LLM_project'
training_args = TrainingArguments(
                        output_dir=repo_name,
                        push_to_hub=True,
                        evaluation_strategy="epoch",
                        save_strategy="epoch",
                        learning_rate=2e-5,
                        per_device_train_batch_size=16,
                        per_device_eval_batch_size=16,
                        num_train_epochs=3,
                        weight_decay=0.01,
                        logging_steps=100,
                        warmup_steps=100,
                        load_best_model_at_end=True,                
)

# Create a batch of examples
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")


# Load accuracy metric
accuracy_metric = evaluate.load("accuracy")

# Generate evaluation function

def compute_metrics(eval_pred):
    output_labels, actual_labels = eval_pred
    predictions = np.argmax(output_labels, axis=1)   #prediction is the highest output probability
    return accuracy_metric.compute(predictions=predictions, references=actual_labels)

In [14]:
# Build Trainer
trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                tokenizer=tokenizer,
                data_collator=data_collator,
                compute_metrics=compute_metrics,
)

# Train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0743,0.120796,0.9696
2,0.145,0.085196,0.9804
3,0.0322,0.104335,0.9822


TrainOutput(global_step=3750, training_loss=0.08349320179621379, metrics={'train_runtime': 56948.2027, 'train_samples_per_second': 1.054, 'train_steps_per_second': 0.066, 'total_flos': 7829320013314560.0, 'train_loss': 0.08349320179621379, 'epoch': 3.0})

In [15]:
# Evaluate fine-tuned model
trainer.evaluate()

{'eval_loss': 0.0851956456899643,
 'eval_accuracy': 0.9804,
 'eval_runtime': 1591.13,
 'eval_samples_per_second': 3.142,
 'eval_steps_per_second': 0.197,
 'epoch': 3.0}

## 7. Push the Model to Hugging Face Hub

In [16]:
# Once we are happy with the results, push the model to the hub for later use and share with the NLP community
# Push the Model to the Hugging Face Hub
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/ThuyTran102/LLM_project/commit/fa6d719fc10e329bc3f8c7dd82e8932f38171bff', commit_message='End of training', commit_description='', oid='fa6d719fc10e329bc3f8c7dd82e8932f38171bff', pr_url=None, pr_revision=None, pr_num=None)

## 8. Use fine-tuned model for Inference

In [17]:
# Load the pipeline with the fine-tuned model
from transformers import pipeline
sentiment_pipeline = pipeline(model="ThuyTran102/LLM_project")  # model_name = "your-username/your-repo-name"

config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

### Make Predictions on New Text

In [18]:
new_data = ["I loved this movie!", 
            "The movie was okay, but I wouldn't watch it again.", 
            "I didn't like the movie at all.",
            "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."]

# Get predictions
predictions = sentiment_pipeline(new_data)
predictions

[{'label': 'POSITIVE', 'score': 0.9997263550758362},
 {'label': 'NEGATIVE', 'score': 0.991361677646637},
 {'label': 'NEGATIVE', 'score': 0.9985843896865845},
 {'label': 'POSITIVE', 'score': 0.9988806843757629}]

In [19]:
# Display predictions
for text, prediction in zip(new_data, predictions):
    print(f"Text: {text}")
    print(f"Predicted sentiment: {prediction['label']}, Confidence: {prediction['score']}\n")

Text: I loved this movie!
Predicted sentiment: POSITIVE, Confidence: 0.9997263550758362

Text: The movie was okay, but I wouldn't watch it again.
Predicted sentiment: NEGATIVE, Confidence: 0.991361677646637

Text: I didn't like the movie at all.
Predicted sentiment: NEGATIVE, Confidence: 0.9985843896865845

Text: This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.
Predicted sentiment: POSITIVE, Confidence: 0.9988806843757629



### Make Predictions on small unseen Test set

In [20]:
sample_test = data['test']['text'][:10]  # Let's take the first 10 reviews from the test set
predictions = sentiment_pipeline(sample_test)
predictions

[{'label': 'NEGATIVE', 'score': 0.9995478987693787},
 {'label': 'NEGATIVE', 'score': 0.9666439890861511},
 {'label': 'NEGATIVE', 'score': 0.9997088313102722},
 {'label': 'NEGATIVE', 'score': 0.9997569918632507},
 {'label': 'POSITIVE', 'score': 0.9985153079032898},
 {'label': 'NEGATIVE', 'score': 0.9993792772293091},
 {'label': 'NEGATIVE', 'score': 0.9825100302696228},
 {'label': 'NEGATIVE', 'score': 0.9998304843902588},
 {'label': 'NEGATIVE', 'score': 0.9996961355209351},
 {'label': 'NEGATIVE', 'score': 0.9998137354850769}]

In [21]:
# Display predictions
for text, prediction in zip(sample_test, predictions):
    print(f"Text: {text}")
    print(f"Predicted sentiment: {prediction['label']}, Confidence: {prediction['score']}\n")

Text: i love sci-fi and am willing to put up with a lot. sci-fi movies/tv are usually underfunded, under-appreciated and misunderstood. i tried to like this, i really did, but it is to good tv sci-fi as babylon 5 is to star trek (the original). silly prosthetics, cheap cardboard sets, stilted dialogues, cg that does not match the background, and painfully one-dimensional characters cannot be overcome with a isci-fi' setting. (i am sure there are those of you out there who think babylon 5 is good sci-fi tv. it is not. it is clichéd and uninspiring.) while us viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. star trek). it may treat important issues, yet not as a serious philosophy. it is really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. their actions and reactions are wooden and predictable, often painful to watch. the makers of earth know it is rubbish as they h

### Make Predictions on all unseen Test set

In [22]:
# Map predictions to numerical labels
def map_predictions_to_labels(pred_outputs):
    return [0 if pred['label'] == 'NEGATIVE' else 1 for pred in pred_outputs]

# Get predictions
pred_outputs = sentiment_pipeline(test_dataset['text'], truncation=True)
# Convert predictions to numerical labels
pred_labels = map_predictions_to_labels(pred_outputs)

# Get actual_labels
actual_labels = test_dataset['label']

# Load accuracy metric
accuracy_metric = evaluate.load("accuracy")
# Calculate accuracy on evaluation set
accuracy_result = accuracy_metric.compute(predictions=pred_labels, references=actual_labels)
print(f"Accuracy of pre-trained model before fine-tuning: {accuracy_result}")

Accuracy of pre-trained model before fine-tuning: {'accuracy': 0.92848}
