In [1]:
# %%capture
# !pip install git+https://github.com/huggingface/datasets.git
# !pip install rouge_score
# !pip install sentencepiece
# !pip install transformers
# !pip install bert_score
# !pip install seaborn

In [2]:
import torch
from transformers import (
    AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments,T5Model
)
from transformers import T5Tokenizer
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import datasets



In [3]:
RANDOM_SEED = 42
BEAM_SIZE = 4
DEVICE = "cpu"
MODEL_NAME = "t5-large"
DATASET_NAME = "e2e_nlg"
MAX_LENGTH = 128
BATCH_SIZE = 20
SAVE_EVAL_STRATEGY = 'epoch'

loading the data and EDA

In [4]:
data = pd.read_csv('Women_s_Black_Crew_Neck_Basic_Cotton_Tshirt _ Boohoo_UK_processed.csv')

In [5]:
data = data.dropna(subset=['desc'])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1800 entries, 0 to 1801
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        1800 non-null   object
 1   gender      1800 non-null   object
 2   attributes  1279 non-null   object
 3   colour      1800 non-null   object
 4   price       1800 non-null   object
 5   category    1800 non-null   object
 6   desc        1800 non-null   object
dtypes: object(7)
memory usage: 112.5+ KB


split training and testing

In [6]:

data['source'] = data.apply(
    lambda x: '{} '.format(data['name']),
    axis=1)
data = data.drop(
    ['name', 'gender', 'attributes', 'colour', 'price', 'category'], axis=1)

In [7]:
data

Unnamed: 0,desc,source
0,A classic wardrobe staple which no clothing co...,0 High Waisted Disco Den...
1,If you’re going for a top-to-bottom wardrobe r...,0 High Waisted Disco Den...
2,Hit refresh on your casual wardrobe with a ver...,0 High Waisted Disco Den...
3,A seriously comfy addition to your new-season ...,0 High Waisted Disco Den...
4,"A menswear classic with a feminine edge, add a...",0 High Waisted Disco Den...
...,...,...
1797,Introducing your new fave top from our latest ...,0 High Waisted Disco Den...
1798,Introducing your new fave top from our latest ...,0 High Waisted Disco Den...
1799,Swapping out your jeans for something comfier?...,0 High Waisted Disco Den...
1800,Just chilling? Do it right with an oversized h...,0 High Waisted Disco Den...


In [8]:
df_train = data.iloc[:1700]   # First two rows of the dataframe
df_test = data.iloc[1700:]   # Remaining rows of the dataframe

In [9]:
cell_lengths = df_train['source'].str.len()

# Get the length of the longest cell in the 'col1' column
max_length = cell_lengths.max()

print(max_length)

656


In [10]:
datasets_train_test = DatasetDict({
    "train": Dataset.from_pandas(df_train),
    "test": Dataset.from_pandas(df_test)
    })

In [11]:
datasets_train_test

DatasetDict({
    train: Dataset({
        features: ['desc', 'source', '__index_level_0__'],
        num_rows: 1700
    })
    test: Dataset({
        features: ['desc', 'source', '__index_level_0__'],
        num_rows: 100
    })
})

In [12]:
def construct_input_for_batch(batch):
    """Construct input strings from a batch."""
    print(batch["source"])
    source = batch["source"]
    target = batch["desc"]
    return source, target

In [13]:
def batch_tokenize(batch, tokenizer, max_length=32):
    """Construct the batch (source, target) and run them through a tokenizer."""
    source, target = construct_input_for_batch(batch)
    res = {
        "input_ids": tokenizer(source)["input_ids"],
        "labels": tokenizer(
            target,
            padding="max_length",
            truncation=True,
            max_length=max_length
        ),
    }
    return res

In [14]:

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

train_data_tokenized = datasets_train_test['train'].map(
    lambda batch: batch_tokenize(batch, tokenizer, max_length=MAX_LENGTH),
    batched=True
)
valid_data_tokenized = datasets_train_test['test'].map(
    lambda batch: batch_tokenize(batch, tokenizer, max_length=MAX_LENGTH),
    batched=True
)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Map:   0%|          | 0/1700 [00:00<?, ? examples/s]

TypeError: Provided `function` which is applied to all elements of table returns a `dict` of types [<class 'list'>, <class 'transformers.tokenization_utils_base.BatchEncoding'>]. When using `batched=True`, make sure provided `function` returns a `dict` of types like `(<class 'list'>, <class 'numpy.ndarray'>, <class 'pandas.core.series.Series'>, <class 'torch.Tensor'>)`.

In [None]:
meteor_scorer = load_metric('meteor')

def meteor_metric_builder(tokenizer):
    def compute_meteor_metrics(pred):
        """Utility to compute meteor during training."""
        labels_ids = pred.label_ids
        pred_ids = pred.predictions
        # All special tokens are removed.
        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        labels_ids[labels_ids == -100] = tokenizer.pad_token_id
        label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
        # Compute the metric.
        meteor_results = meteor_scorer.compute(predictions=pred_str,
                                       references=label_str)
        return {
            "meteor": round(meteor_results['meteor'], 4),
        }
    return compute_meteor_metrics

meteor_metric_fn = meteor_metric_builder(tokenizer)

In [None]:

if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    DEVICE = torch.ones(1, device=mps_device)
    print(DEVICE)
elif torch.cuda.is_available():
    DEVICE = "cuda:0"
    print(DEVICE)
    
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)

In [None]:
train_args = Seq2SeqTrainingArguments(
    output_dir="t5-v1_1-base-E2E",
    evaluation_strategy=SAVE_EVAL_STRATEGY,
    save_strategy=SAVE_EVAL_STRATEGY,
    logging_steps=5,
    # optimization args, the trainer uses the Adam optimizer
    # and has a linear warmup for the learning rate
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    learning_rate=1e-03,
    num_train_epochs=8,
    warmup_steps=1000,
    # misc args
    seed=RANDOM_SEED,
    disable_tqdm=False,
    load_best_model_at_end=True,
    metric_for_best_model="meteor",
    # generation
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    train_dataset=train_data_tokenized,
    eval_dataset=valid_data_tokenized,
    tokenizer=tokenizer,
    compute_metrics=meteor_metric_fn,
)

trainer._max_length = MAX_LENGTH
trainer._num_beams = BEAM_SIZE

In [None]:
trainer.train()

In [None]:
def beam_generate_sentences(batch,
                            model,
                            tokenizer,
                            num_beams=4,
                            max_length=128,
                            device='cuda:0'):
    """Generate outputs from a model with beam search decoding."""
    # Create batch inputs.
    source, _ = construct_input_for_batch(batch)
    # Use the model's tokenizer to create the batch input_ids.
    batch_features = tokenizer(source, padding=True, return_tensors='pt')
    # Move all inputs to the device.
    batch_features = dict([(k, v.to(device))
                           for k, v in batch_features.items()])

    # Generate with beam search.
    generated_ids = model.generate(
        **batch_features,
        num_beams=num_beams,
        max_length=max_length,
    )

    # Use model tokenizer to decode to text.
    generated_sentences = [
        tokenizer.decode(gen_ids.tolist(), skip_special_tokens=True)
        for gen_ids in generated_ids
    ]
    return generated_sentences

In [None]:
valid_output = [X_test,y_test].map(
    lambda batch: {
        'generated':
        beam_generate_sentences(batch,
                                model,
                                tokenizer,
                                num_beams=BEAM_SIZE,
                                max_length=MAX_LENGTH,
                                device=DEVICE)
    },
    batched=True,
    batch_size=BATCH_SIZE,
)

In [None]:
# Evaluate for ROUGE-2/L
rouge_scorer = load_metric("rouge")

rouge_results = rouge_scorer.compute(
    predictions=valid_output["generated"],
    references=valid_output["human_reference"],
    rouge_types=["rougeL"],
    use_aggregator=True,
    use_stemmer=False,
)
rougeL = rouge_results['rougeL'].mid.fmeasure
f"R-L: {rouge_results['rougeL'].mid.fmeasure:.3f}"

In [None]:
rouge_results

In [None]:
# Evaluate for meteor

meteor_results = meteor_scorer.compute(predictions=valid_output["generated"],
                                       references=valid_output["human_reference"])
meteor = meteor_results['meteor']
meteor_results

In [None]:
bertscore = load_metric("bertscore")
bertscore_results = bertscore.compute(predictions=valid_output["generated"],
                                      references=valid_output["human_reference"],
                                      model_type='distilbert-base-uncased')

In [None]:
bertscore_results

In [None]:
def average(lst):
    return sum(lst) / len(lst)

bert_average_precision = average(bertscore_results['precision'])
bert_average_recall = average(bertscore_results['recall'])
bert_average_f1 = average(bertscore_results['f1'])

f'average_precision: {bert_average_precision}, average_recall: {bert_average_recall},average_f1: {bert_average_f1}'

In [None]:
bert_score_merics = [
    "rougeL", "meteor", "bert_average_precision", "bert_average_recall",
    "bert_average_f1"
]
bert_score_list = [
    rougeL, meteor, bert_average_precision, bert_average_recall,
    bert_average_f1
]

dataf = pd.DataFrame({
    "bert_score_merics": bert_score_merics,
    "bert_score_list": bert_score_list
})

plt.figure(figsize=(12, 6), dpi=80)
sns.barplot(x="bert_score_merics",
            y="bert_score_list",
            data=dataf,
            palette='Blues')
plt.show()

In [None]:
random.choice(valid_output)