In [None]:
! pip install transformers
! pip install datasets
! pip install sentencepiece
! pip install rouge_score
! pip install wandb (Optional)(Use Weights and Biases API to build better models faster)
! pip install transformers[torch]
! pip install accelerate -U

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19

In [None]:
!pip install wandb
# log in to WandB
WANDB_INTEGRATION = True
if WANDB_INTEGRATION:
   import wandb
   wandb.login()

Collecting wandb
  Downloading wandb-0.17.4-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.10.0-py2.py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.1/302.1 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x8

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

Sets up the environment to work with transformers, datasets, and other essential libraries to perform sequence-to-sequence language modeling tasks using the Hugging Face library.

In [None]:
import torch
import numpy as np
import datasets
from transformers import (
   AutoModelForSeq2SeqLM,
   AutoTokenizer,
   Seq2SeqTrainingArguments,
   Seq2SeqTrainer,
   DataCollatorForSeq2Seq,
)
from tabulate import tabulate
import nltk
from datetime import datetime


Load the pretrained BART-based model for sequence-to-sequence language modeling, set up the matching tokenizer, and define the maximum lengths for the encoder and decoder sequences.

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Hub identifier of the pretrained checkpoint that gets fine-tuned
pretrained_model_name = "sshleifer/distilbart-xsum-12-3"

# Maximum token lengths used when tokenizing the inputs (encoder) and targets (decoder).
# NOTE(review): targets are truncated to decoder_max_len tokens during preprocessing;
# 6 looks very short - confirm it is intended.
encoder_max_len, decoder_max_len = 256, 6

# Tokenizer and model both come from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name)

Importing and Pre-processing

In [None]:
import pandas as pd

# Load the dataset from the CSV file and keep only its first 200 rows.
# encoding='latin-1' is used to handle invalid characters; on_bad_lines='skip'
# drops malformed lines instead of raising.
geographical_data_df = pd.read_csv(
    '/content/Geographicaldata.csv',
    encoding='latin-1',
    sep=',',
    on_bad_lines='skip',
).head(200)


In [None]:
# Show the column names read from the CSV; later cells index "City Description" and "Feature Extraction".
print(geographical_data_df.columns)

Split the dataset into training and validation sets
Convert the pandas DataFrame to a Dataset object using the Hugging Face datasets library.

Then split it into training and validation sets for further processing and modeling tasks.

In [None]:
import pandas as pd
from datasets import Dataset
# Wrap the pandas DataFrame in a Hugging Face Dataset
geographical_data = Dataset.from_pandas(df=geographical_data_df)
def flatten_example(example):
    """Copy the two spaced column names of one example to space-free keys.

    Returns:
        dict with "CityDescription" and "FeatureExtraction", taken from
        example["City Description"] and example["Feature Extraction"].
    """
    renamed_keys = {
        "City Description": "CityDescription",
        "Feature Extraction": "FeatureExtraction",
    }
    return {new_key: example[old_key] for old_key, new_key in renamed_keys.items()}
def filter_samples(example):
    """Keep only the (description, feature) pairs whose description is present and non-empty.

    Args:
        example: A batch mapping "City Description" and "Feature Extraction" to
            parallel sequences (the function is applied with batched=True).

    Returns:
        dict with "CityDescription" and "FeatureExtraction" lists holding the
        kept pairs in their original order.
    """
    kept_pairs = [
        (desc, feat)
        for desc, feat in zip(example["City Description"], example["Feature Extraction"])
        # A None description (presumably a missing CSV cell) is dropped explicitly,
        # because len(None) would raise TypeError.
        if desc is not None and len(desc) > 0
    ]
    return {
        "CityDescription": [desc for desc, _ in kept_pairs],
        "FeatureExtraction": [feat for _, feat in kept_pairs],
    }

# Apply transformations to the dataset
geographical_data = geographical_data.map(flatten_example)
# filter_samples can return fewer rows than it receives, so the input columns have to be
# dropped: otherwise a batch would mix columns of different lengths and map() would fail.
# Columns returned by the function are kept even though they are listed in remove_columns.
geographical_data = geographical_data.map(
    filter_samples,
    batched=True,
    remove_columns=geographical_data.column_names,
)
# Split the dataset into train and validation sets (fixed seed => reproducible split).
# The splits are looked up by name rather than relying on the order of .values().
dataset_splits = geographical_data.train_test_split(test_size=0.1, seed=42)
train_data_txt, validation_data_txt = dataset_splits["train"], dataset_splits["test"]


Tokenization and Preprocessing for Sequence-to-Sequence Models
Prepare the data for training a Seq2Seq model, where the input is the tokenized and processed "CityDescription," and the target is the tokenized and processed "FeatureExtraction."

In [None]:
def preprocess_batch(batch, tokenizer, max_source_length, max_target_length):
    """Tokenize a batch of city descriptions and their target features.

    Args:
        batch: Mapping with "CityDescription" (sources) and "FeatureExtraction" (targets).
        tokenizer: Callable tokenizer that exposes `pad_token_id`.
        max_source_length: Length the sources are padded/truncated to.
        max_target_length: Length the targets are padded/truncated to.

    Returns:
        dict with every field of the tokenized sources plus "labels": the target
        token ids in which each padding id is replaced by -100.
    """
    sources, targets = batch["CityDescription"], batch["FeatureExtraction"]
    shared_kwargs = {"padding": "max_length", "truncation": True}
    encoded_sources = tokenizer(sources, max_length=max_source_length, **shared_kwargs)
    encoded_targets = tokenizer(targets, max_length=max_target_length, **shared_kwargs)

    features = dict(encoded_sources.items())
    # Padding positions are labelled -100 so that they are ignored in the loss.
    features["labels"] = [
        [-100 if token_id == tokenizer.pad_token_id else token_id for token_id in ids]
        for ids in encoded_targets["input_ids"]
    ]
    return features


Preprocess the training data for a sequence-to-sequence model.

In [None]:
# Tokenize the training split in batches. The original columns are removed, so only
# the fields returned by preprocess_batch remain.
train_data = train_data_txt.map(
    preprocess_batch,
    batched=True,
    fn_kwargs={
        "tokenizer": tokenizer,
        "max_source_length": encoder_max_len,
        "max_target_length": decoder_max_len,
    },
    remove_columns=train_data_txt.column_names,
)


Preprocess the validation data so that it can be used to evaluate the sequence-to-sequence model.

In [None]:
# Tokenize the validation split in batches. The original columns are removed, so only
# the fields returned by preprocess_batch remain.
validation_data = validation_data_txt.map(
    preprocess_batch,
    batched=True,
    fn_kwargs={
        "tokenizer": tokenizer,
        "max_source_length": encoder_max_len,
        "max_target_length": decoder_max_len,
    },
    remove_columns=validation_data_txt.column_names,
)


Build the Model for fine-tuning.
Set up the necessary libraries for training and evaluating Seq2Seq models

In [None]:
import numpy as np
from datasets import load_metric
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer
# Sentence-tokenizer data needed by nltk.sent_tokenize (used in postprocess_text).
# Newer NLTK releases look up "punkt_tab" instead of "punkt", so both are fetched.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource, quiet=True)
metric = load_metric("rouge")  # Recall-Oriented Understudy for Gisting Evaluation
def postprocess_text(preds, labels):
    """Strip every text and put each of its sentences on a separate line.

    rougeLSum expects a newline after each sentence, so each stripped string is
    split with nltk.sent_tokenize and re-joined with newline characters.

    Returns:
        (preds, labels) as two lists of the reformatted strings.
    """

    def one_sentence_per_line(texts):
        # Reformat a list of strings: strip, split into sentences, join with newlines.
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    return one_sentence_per_line(preds), one_sentence_per_line(labels)


Define function to evaluate the performance of the model on generated summaries compared to the reference summaries using the ROUGE metric.

In [None]:
def calculate_metrics(eval_preds):
    """Compute ROUGE F-measures (in percent) and the mean generated length.

    Uses the notebook-level `tokenizer`, `metric` and `postprocess_text`.

    Args:
        eval_preds: (predictions, labels) pair of token-id arrays; predictions
            may also be a tuple whose first element holds the token ids.

    Returns:
        dict mapping every key reported by the metric to its mid F-measure * 100,
        plus "gen_len" (mean number of non-padding prediction tokens); all values
        are rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 cannot be decoded, so replace it with the pad id. Labels use -100 for
    # ignored positions; predictions may contain it as well (the Trainer may pad
    # gathered predictions with -100), so both arrays are cleaned.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    rouge_scores = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Keep the mid F-measure of every ROUGE variant, expressed in percent
    result = {key: value.mid.fmeasure * 100 for key, value in rouge_scores.items()}

    # Generated length = number of non-padding tokens per prediction
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    return {key: round(value, 4) for key, value in result.items()}


Train the model using the provided configuration and datasets.
Define a Seq2SeqTrainingArguments object that encapsulates the training arguments and configuration.

Create a DataCollatorForSeq2Seq object that is responsible for collating and processing the training data.

Build a Seq2SeqTrainer object, which handles the training loop, optimization, logging, and evaluation, and manages the training process.

In [None]:
# Training configuration for a short demo run.
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    num_train_epochs=1,  # demo
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,  # demo
    per_device_eval_batch_size=4,
    # learning_rate=3e-05,
    # The dataset is capped at 200 rows when loaded, so one epoch at batch size 4 is
    # at most ~45 optimizer steps on a single device. A fixed 500-step warmup would
    # never finish (the learning rate would stay far below its target), so the warmup
    # is expressed as a fraction of the total number of steps instead.
    warmup_ratio=0.1,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    predict_with_generate=True,  # evaluate on generated sequences
    logging_dir="logs",
    # Log more often than the run is long; with 50 no training loss would be reported.
    logging_steps=10,
    save_total_limit=3,
)
# Collates the tokenized examples into batches for the model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# Ties together model, arguments, data and the ROUGE-based metric function
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=calculate_metrics,
)


Evaluate pre-trained model before training

In [None]:
# Baseline: evaluate on the trainer's eval_dataset before trainer.train() has been called.
trainer.evaluate()


Train the model on the given dataset

In [None]:
# Fine-tune `model` on the trainer's train_dataset using training_args.
trainer.train()


Evaluate the model after training

In [None]:
# Re-run the same evaluation after training, for comparison with the baseline above.
trainer.evaluate()


In [None]:
def generate_summary(test_data, model):
    """Generate text for every "CityDescription" in test_data with the given model.

    Uses the notebook-level `tokenizer` and `encoder_max_len`; the tokenized
    inputs are moved to the model's device before generation.

    Returns:
        (outputs, output_str): what model.generate returned, and its decoding
        with special tokens skipped.
    """
    encoded = tokenizer(
        test_data["CityDescription"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_len,
        return_tensors="pt",
    )
    device = model.device
    generated = model.generate(
        encoded.input_ids.to(device),
        attention_mask=encoded.attention_mask.to(device),
    )
    return generated, tokenizer.batch_decode(generated, skip_special_tokens=True)
# Fresh copy of the pretrained checkpoint, for a before/after comparison
model_before_fine_tuning = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name)
# Use at most 10 validation rows: select() fails when asked for indices beyond the
# size of the split, which can happen with a small CSV.
test_samples = validation_data_txt.select(range(min(10, len(validation_data_txt))))
# generate_summary returns (token ids, decoded strings); only the strings are kept
features_before_fine_tuning = generate_summary(test_samples, model_before_fine_tuning)[1]
features_after_fine_tuning = generate_summary(test_samples, model)[1]


In [None]:
# Reference targets for the selected validation samples
print("\nTarget Feature:\n")
print(
    tabulate(list(enumerate(test_samples["FeatureExtraction"])), headers=["Id", "Target Feature"])
)
# The input descriptions the features are generated from
print("\nCity Description:\n")
print(tabulate(list(enumerate(test_samples["CityDescription"])), headers=["Id", "City Description"]))
# The generated features were computed in the previous cell but never displayed;
# show them side by side so the effect of fine-tuning is visible.
print("\nGenerated Features:\n")
print(
    tabulate(
        list(
            zip(
                range(len(features_after_fine_tuning)),
                features_after_fine_tuning,
                features_before_fine_tuning,
            )
        ),
        headers=["Id", "After fine-tuning", "Before fine-tuning"],
    )
)


ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics for evaluating automatic summarization of texts as well as machine translation. It works by comparing an automatically produced summary or translation against a set of reference summaries (typically human-produced). The metrics compare the generated summaries to reference (gold-standard) summaries based on overlapping n-grams (sequences of n consecutive words) and other measures. In the evaluation output, eval_rouge1 denotes the ROUGE-1 F1-score, eval_rouge2 represents the ROUGE-2 F1-score, eval_rougeL corresponds to the ROUGE-L F1-score, and eval_rougeLsum indicates the ROUGE-Lsum F1-score.
These scores are used to measure the performance of the summarization model, with higher scores indicating better alignment between the generated summaries and the reference summaries.