# Requirements

In [None]:
!pip install -U accelerate
!pip install -U transformers[torch]
!pip install datasets

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1
Collecting transformers[torch]
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.2
Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datas

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Add as many imports as you need.
import numpy as np

# Laboratory Exercise - Run Mode (8 points)

## Introduction
This laboratory assignment's primary objective is to fine-tune a pre-trained LLM for binary classification on a dataset consisting of wine reviews. The dataset contains two attributes: **description** and **points**. The description is a brief text describing the wine and the points represent a quality metric ranging from 1 to 100. A wine with at least 90 points is considered **exceptional**. Your task is to predict whether a wine is exceptional based on its review.

## The Wine Reviews Dataset

## Downloading the Wine Reviews Dataset



In [None]:
!pip install gdown



In [None]:
# Download the dataset from Google Drive by file ID.
# NOTE(review): the recorded run of this cell failed with an AttributeError
# inside gdown; the notebook reads the CSV from a mounted Drive folder instead.
!gdown 1fz9449M9Owofy6CYrKzkGwXKesI32B8u

Traceback (most recent call last):
  File "/usr/local/bin/gdown", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/gdown/cli.py", line 151, in main
    filename = download(
  File "/usr/local/lib/python3.10/dist-packages/gdown/download.py", line 203, in download
    filename_from_url = m.groups()[0]
AttributeError: 'NoneType' object has no attribute 'groups'


In [None]:
# Colab-only: mount Google Drive under /content/drive/ so the dataset CSV
# stored in MyDrive can be read from the local filesystem.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


## Exploring the Wine Reviews Dataset

*Note: The dataset is complete, with no missing values in any of its entries.*

Load the dataset into a `pandas` data frame.

In [None]:
# Read the reviews from the mounted Drive. The path is absolute (anchored at
# the '/content/drive/' mount point) so it does not depend on the notebook's
# current working directory.
df = pd.read_csv('/content/drive/MyDrive/wine-reviews.csv')

In [None]:
# Preview the first five rows to check the columns and their contents.
df.head(5)

Unnamed: 0,description,points
0,"Aromas include tropical fruit, broom, brimston...",87
1,"This is ripe and fruity, a wine that is smooth...",87
2,"Tart and snappy, the flavors of lime flesh and...",87
3,"Pineapple rind, lemon pith and orange blossom ...",87
4,"Much like the regular bottling from 2012, this...",87


Explore the dataset using visualizations of your choice.

In [None]:
# Count missing values per column.
df.isna().sum()

description    0
points         0
dtype: int64

## Feature Extraction
Extract the feature **exceptional** for each wine review. If some wine has at least 90 points it is considered **exceptional**.

In [None]:
# Binary target: 1 when the wine scored at least 90 points, otherwise 0.
# A vectorized comparison replaces the row-by-row `apply(lambda ...)`.
df['exceptional'] = (df['points'] >= 90).astype('int64')

## Dataset Splitting
Partition the dataset into training and testing sets with an 80:20 ratio.


In [None]:
# 80:20 train/test split; the fixed random_state keeps the partition reproducible.
review_text = df['description']
is_exceptional = df['exceptional']
X_train, X_test, y_train, y_test = train_test_split(
    review_text,
    is_exceptional,
    train_size=0.8,
    random_state=42,
)

Concatenate the training set's features and label into a single data frame `train_df`.

In [None]:
# Place the review text and its label side by side (column-wise concatenation).
train_df = pd.concat([X_train, y_train], axis='columns')

In [None]:
# Rename the target column from 'exceptional' to 'label', the column name the
# Hugging Face dataset built from this frame exposes.
train_df.columns = ['description', 'label']

Save the data frame `train_df`.

In [None]:
# Write the training frame to disk so it can be reloaded with `load_dataset`.
# The pandas index is written as well; it appears as the 'Unnamed: 0' column
# once the CSV is loaded and is removed after loading.
train_df.to_csv('train.csv')

Concatenate the testing set's features and label into a single data frame `test_df`.

In [None]:
# Place the held-out review text and its label side by side (column-wise concatenation).
test_df = pd.concat([X_test, y_test], axis='columns')

In [None]:
# Use the same column names as the training frame ('description', 'label').
test_df.columns = ['description', 'label']

Save the data frame `test_df`.

In [None]:
# Write the test frame to disk so it can be reloaded with `load_dataset`.
# The pandas index is written as well; it appears as the 'Unnamed: 0' column
# once the CSV is loaded and is removed after loading.
test_df.to_csv('test.csv')

Load the `train_df` and `test_df` into a Hugging Face dataset.

https://huggingface.co/docs/datasets/index

In [None]:
# Map each split name to the CSV file written for it, then load both splits at once.
csv_splits = {'train': 'train.csv', 'test': 'test.csv'}
data = load_dataset('csv', data_files=csv_splits)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# The saved pandas index shows up as an 'Unnamed: 0' column in every split.
# Drop it only while it is still present, so re-running this cell on the
# already-cleaned dataset is a no-op instead of raising.
index_column = 'Unnamed: 0'
if any(index_column in columns for columns in data.column_names.values()):
    data = data.remove_columns([index_column])

In [None]:
# Display the dataset to confirm the splits, their columns and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['description', 'label'],
        num_rows: 103976
    })
    test: Dataset({
        features: ['description', 'label'],
        num_rows: 25995
    })
})

## Tokenization
Tokenize the reviews using `AutoTokenizer`.

https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoTokenizer

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of wine reviews.

    Relies on the notebook-level `tokenizer`, which must exist by the time
    this function is called.

    Args:
        examples: A batch of dataset rows; its "description" entry holds the
            review texts.

    Returns:
        The tokenizer output for the batch, padded with `padding="max_length"`
        and truncated when a review is too long.
    """
    review_texts = examples["description"]
    encoded = tokenizer(review_texts, padding="max_length", truncation=True)
    return encoded

In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize every split, passing the rows to `tokenize_function` in batches.
tokenized_datasets = data.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/103976 [00:00<?, ? examples/s]

Map:   0%|          | 0/25995 [00:00<?, ? examples/s]

In [None]:
# Display the tokenized dataset to confirm the columns added by the tokenizer.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['description', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 103976
    })
    test: Dataset({
        features: ['description', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25995
    })
})

## Fine-tuning an LLM for Binary Classification
Fine-tune a pre-trained LLM model for binary classification on the dataset consisting of wine reviews.

Define the model using `AutoModelForSequenceClassification`.

https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForSequenceClassification

In [None]:
# Pre-trained BERT with a sequence-classification head for the two classes
# (exceptional / not exceptional).
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define the training parameters using `TrainingArguments`.

https://huggingface.co/docs/transformers/v4.36.1/en/main_classes/trainer#transformers.TrainingArguments

In [None]:
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The install cell upgrades transformers to the latest release, so pick
# whichever keyword the installed version actually defines.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=1,
    per_device_train_batch_size=4,  # batch size for training
    per_device_eval_batch_size=4,  # batch size for evaluation
    **{eval_strategy_kwarg: "epoch"},  # evaluate once per epoch
)

Define the training using `Trainer`.

https://huggingface.co/docs/transformers/v4.36.1/en/main_classes/trainer#transformers.Trainer

In [None]:
# Full-dataset Trainer, intentionally left disabled: this notebook fine-tunes
# on the 1,000-example subsets (see `ts`) to keep the runtime manageable.
# trainer = Trainer(
#     model,
#     training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["test"],
# )

Fine-tune (train) the LLM.

In [None]:
# Disabled together with the full-dataset `trainer`; training runs through `ts` instead.
# trainer.train()

If the fine-tuning of the LLM is very time-consuming, you can use the `small_train_dataset` and `small_test_dataset` for training and testing. This notebook takes that route: the cells below are active, and the full-dataset `Trainer` cells above are left commented out.

In [None]:
# Shuffle each split with a fixed seed, then keep the first 1,000 examples.
shuffled_train = tokenized_datasets['train'].shuffle(seed=42)
shuffled_test = tokenized_datasets['test'].shuffle(seed=42)

small_train_dataset = shuffled_train.select(range(1000))
small_test_dataset = shuffled_test.select(range(1000))

In [None]:
# Trainer wired to the 1,000-example train/test subsets.
ts = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_test_dataset,
)

In [None]:
# Fine-tune the model on the training subset configured on `ts`.
ts.train()

Epoch,Training Loss,Validation Loss
1,No log,0.584587


TrainOutput(global_step=250, training_loss=0.5935853271484375, metrics={'train_runtime': 138.862, 'train_samples_per_second': 7.201, 'train_steps_per_second': 1.8, 'total_flos': 263111055360000.0, 'train_loss': 0.5935853271484375, 'epoch': 1.0})

Use the trained model to make predictions for the test set.

In [None]:
# Run the fine-tuned model on the test subset; the result carries the raw
# predictions and the true label ids used in the next cell.
pred = ts.predict(small_test_dataset)

In [None]:
# Raw model outputs and the ground-truth labels from the prediction result.
logits = pred.predictions
labels = pred.label_ids

# The predicted class is the index of the largest logit in each row.
y_pred = np.argmax(logits, axis=-1)

Assess the performance of the model by using different metrics provided by the `scikit-learn` library.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Per-class precision, recall and F1 on the test subset.
report = classification_report(labels, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.79      0.73      0.76       611
           1       0.62      0.69      0.66       389

    accuracy                           0.72      1000
   macro avg       0.71      0.71      0.71      1000
weighted avg       0.73      0.72      0.72      1000



In [None]:
# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(labels, y_pred)
print(cm)

[[449 162]
 [119 270]]


# Laboratory Exercise - Bonus Task (+ 2 points)

As part of the bonus task in this laboratory assignment, your objective is to embed the wine reviews using a Sentence Transformer (https://www.sbert.net/) and then cluster the embedded reviews.

## Requirements

In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/86.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_trans

In [None]:
from sentence_transformers import SentenceTransformer, util

## Example Usage for a Sentence Transformer

In [None]:
# Two sample sentences used to demonstrate the embedding model.
sentences = [
    'This is an example sentence.',
    'Each sentence is converted into an embedding.',
]

In [None]:
# Load the pre-trained all-MiniLM-L6-v2 sentence-embedding model
# (its files are downloaded on first use).
model2 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Encode each sentence into a fixed-size embedding vector (one row per sentence).
embeddings = model2.encode(sentences)

In [None]:
# Cosine similarity between the two sentence embeddings.
similarity = util.cos_sim(embeddings[0], embeddings[1])
print(f'Similarity: {similarity}')

Similarity: tensor([[0.4216]])


In [None]:
# Check the embedding matrix dimensions: (number of sentences, embedding size).
embeddings.shape

(2, 384)