In [1]:
from IPython.display import clear_output as cls

In [2]:
!pip install transformers==4.20.0
!pip install keras_nlp==0.3.0
!pip install datasets
!pip install huggingface-hub
!pip install nltk
!pip install rouge-score
cls()

In [3]:
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [4]:
import os
import logging

import nltk
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Only log error messages
tf.get_logger().setLevel(logging.ERROR)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [5]:
# The percentage of the dataset you want to split as train and test
TRAIN_TEST_SPLIT = 0.1

MAX_INPUT_LENGTH = 1024  # Maximum length of the input to the model
MIN_TARGET_LENGTH = 5  # Minimum length of the output by the model
MAX_TARGET_LENGTH = 128  # Maximum length of the output by the model
BATCH_SIZE = 1  # Batch-size for training our model
LEARNING_RATE = 3e-5  # Learning-rate for training our model
MAX_EPOCHS = 1  # Maximum number of epochs we will train the model for

# This notebook is built on the facebook/bart-large-cnn checkpoint from the Hugging Face Model Hub
MODEL_CHECKPOINT = "facebook/bart-large-cnn"

## Define Tokenizer, Model

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

Downloading:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [7]:
from transformers import TFBartForConditionalGeneration

# Load and compile our model
model = TFBartForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large-cnn.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


## Load Dataset

In [8]:
dataset_dir = "/kaggle/input/e-waste-article-summarization/e-waste-summarization-dataset.csv"

In [9]:
from datasets import load_dataset

dataset = load_dataset('csv', data_files=dataset_dir)
dataset = dataset['train'].train_test_split(test_size=TRAIN_TEST_SPLIT)
dataset

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-45fb6e5057ab3895/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-45fb6e5057ab3895/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Text', 'Summary'],
        num_rows: 145
    })
    test: Dataset({
        features: ['Text', 'Summary'],
        num_rows: 17
    })
})

In [10]:
def preprocess_function(examples):
    inputs = examples["Text"]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["Summary"], max_length=MAX_TARGET_LENGTH, truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [11]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['Text', 'Summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 145
    })
    test: Dataset({
        features: ['Text', 'Summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 17
    })
})

In [13]:
# len(tokenized_datasets['train']['input_ids'][0])

In [14]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [15]:
train_dataset = tokenized_datasets["train"].to_tf_dataset(
    batch_size=BATCH_SIZE,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    collate_fn=data_collator,
)

test_dataset = tokenized_datasets["test"].to_tf_dataset(
    batch_size=BATCH_SIZE,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    collate_fn=data_collator,
)

generation_dataset = (
    tokenized_datasets["test"]
    .shuffle()
#     .select(list(range(200)))
    .to_tf_dataset(
        batch_size=BATCH_SIZE,
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=False,
        collate_fn=data_collator,
    )
)

In [16]:
import keras_nlp

rouge_l = keras_nlp.metrics.RougeL()

def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_l(decoded_labels, decoded_predictions)
    # We will print only the F1 score, you can use other aggregation metrics as well
    result = {"RougeL": result["f1_score"]}

    return result

In [17]:
from transformers import KerasMetricCallback

metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=generation_dataset, predict_with_generate=True
)

callbacks = [metric_callback]

In [18]:
from tensorflow.keras.optimizers import Adam

# Lower learning rates are often better for fine-tuning transformers
model.compile(optimizer=Adam(LEARNING_RATE))

model.fit(train_dataset, validation_data=test_dataset, epochs=MAX_EPOCHS, callbacks=callbacks)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.




<keras.callbacks.History at 0x78d2e004ae30>

In [23]:
text = dataset['test'][0]['Text']

In [24]:
from datetime import datetime

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [25]:
start_time = timer(None)
input_ids = tokenizer.encode(text, return_tensors="tf", truncation=True)
generated_sequence = model.generate(input_ids=input_ids)
output_text = tokenizer.decode(generated_sequence.numpy().squeeze(), skip_special_tokens=True)
timer(start_time)


 Time taken: 0 hours 1 minutes and 20.57 seconds.


In [26]:
output_text

'E-waste is a growing waste problem around the world, with over 49 million tons of electronic waste discarded in 2016, with a projected 57 million tons by 2021. To address this issue, there are several steps your company can take, including embracing cloud services like cloud storage, donating electronics you no longer use, checking manufacturers for recycling programs, and ensuring electronic waste disposal is properly disposed of. Certified e-Waste disposal companies can help solve electronic waste management issues, including transportation of large quantities, collection sites, and drop-off facilities that accept electronic waste, as well as quick disposal facilities. The California Department of Resources, known as CalRecycleRecycling and Recovery,'

In [28]:
true_label = dataset['test'][0]['Summary']
true_label

'To improve electronic waste management, companies can take four simple steps. Firstly, embracing cloud services reduces the need for physical electronic devices like servers and circuit boards. Secondly, donating functioning electronics to organizations that refurbish and distribute them to those in need helps keep devices out of landfills. Thirdly, checking with manufacturers for recycling programs allows for proper disposal and potential discounts on future purchases. Lastly, ensuring that electronic waste is properly disposed of by partnering with a certified e-waste disposal company streamlines the process and ensures compliance with regulations. These actions contribute to reducing the amount of e-waste generated and promote responsible and sustainable practices.'

In [30]:
result = rouge_l(output_text, true_label)
# We will print only the F1 score, you can use other aggregation metrics as well
result = {"RougeL": result["f1_score"]}
result

{'RougeL': <tf.Tensor: shape=(), dtype=float32, numpy=0.27936164>}

In [19]:
!nvidia-smi

Fri May 26 06:12:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    38W / 250W |   9167MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [31]:
path = "saved_model_finetuned"

model.save_pretrained(path, from_tf=True)
tokenizer.save_pretrained(path+"/tokenizer/")

('saved_model_finetuned/tokenizer/tokenizer_config.json',
 'saved_model_finetuned/tokenizer/special_tokens_map.json',
 'saved_model_finetuned/tokenizer/vocab.json',
 'saved_model_finetuned/tokenizer/merges.txt',
 'saved_model_finetuned/tokenizer/added_tokens.json',
 'saved_model_finetuned/tokenizer/tokenizer.json')

In [None]:
import zipfile
import os

output_dir = "fine_tuned_model.zip"

with zipfile.ZipFile(output_dir, 'w') as z:
    for root, dirs, files in os.walk(path):
        for file in files:
            file_path = os.path.join(root, file)
            z.write(file_path, file_path[len(folder_path):])

In [38]:
print('completed')

completed
