In [2]:
%pip install -U transformers
%pip install -U datasets
%pip install pytorch
%pip install torch

Collecting transformers
  Downloading transformers-4.43.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m337.8 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Downloading transformers-4.43.3-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.43.2
    Uninstalling transformers-4.43.2:
      Successfully uninstalled transformers-4.43.2
Successfully installed transformers-4.43.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use update

In [3]:
%pip install -U accelerate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
from datasets import load_dataset

# Load the dataset
cnn_dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:2%]")
cnn_dataset

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 5742
})

In [5]:
cnn_dataset=cnn_dataset.train_test_split(test_size=0.3)

cnn_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 4019
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 1723
    })
})

In [7]:
def clean_text(example):
  for t in ["article"]:
    example[t]=example[t].lower()
    example[t]=example[t].replace("\\", "")
    example[t]=example[t].replace("/", "")
    example[t]=example[t].replace("\n", "")
    example[t]=example[t].replace("``", "")
    example[t]=example[t].replace('"', '')
    example[t]=example[t].replace("--", "")
  for t in ["highlights"]:
    example[t]=example[t].lower()
    example[t]=example[t].replace("\\", "")
    example[t]=example[t].replace("/", "")
    example[t]=example[t].replace("\n", "")
    example[t]=example[t].replace("``", "")
    example[t]=example[t].replace('"', '')
    example[t]=example[t].replace("--", "")


    return example

In [8]:
cleaned_cnn_dataset=cnn_dataset.map(clean_text)
cleaned_cnn_dataset

Map: 100%|██████████| 4019/4019 [00:06<00:00, 666.61 examples/s] 
Map: 100%|██████████| 1723/1723 [00:01<00:00, 1490.82 examples/s]


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 4019
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 1723
    })
})

In [9]:
MODEL_NAME="t5-small"

In [10]:
from transformers import AutoTokenizer

tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME)

In [11]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model= AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [12]:
import torch
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

prefix="summarize: "

def preprocess_function(examples):

  inputs= [prefix + doc for doc in examples["article"]]
  model_inputs= tokenizer(inputs, max_length=1024, truncation=True)
  labels=tokenizer(text_target=examples["highlights"], max_length=128, truncation=True)
  model_inputs["labels"]= labels["input_ids"]

  return model_inputs

In [13]:
tokenized_cnn_dataset=cleaned_cnn_dataset.map(preprocess_function,batched=True)

tokenized_cnn_dataset

Map: 100%|██████████| 4019/4019 [00:11<00:00, 347.85 examples/s]
Map: 100%|██████████| 1723/1723 [00:01<00:00, 1035.39 examples/s]


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4019
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1723
    })
})

In [15]:
from transformers import DataCollatorForSeq2Seq

data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=MODEL_NAME)

In [20]:
import torch

if torch.cuda.is_available():
    print("GPU is available")
else:
    print("GPU is not available")

GPU is available


In [21]:

training_args=Seq2SeqTrainingArguments(
    output_dir="cnn_news_summary_model_trained_on_reduced_data",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size= 1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate= True,
    fp16=True,
)



In [22]:
trainer=Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_cnn_dataset['train'],
    eval_dataset=tokenized_cnn_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [23]:
PYTORCH_CUDA_ALLOC_CONF=expandable_segments=True

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.8541,1.633938
2,1.7951,1.620276
3,1.7479,1.614978


TrainOutput(global_step=12057, training_loss=1.8223612793960944, metrics={'train_runtime': 8308.5043, 'train_samples_per_second': 1.451, 'train_steps_per_second': 1.451, 'total_flos': 2532978383192064.0, 'train_loss': 1.8223612793960944, 'epoch': 3.0})

In [25]:
trainer.evaluate()

{'eval_loss': 1.614978313446045,
 'eval_runtime': 315.1077,
 'eval_samples_per_second': 5.468,
 'eval_steps_per_second': 5.468,
 'epoch': 3.0}

In [26]:
trainer.save_model("my_model")

In [None]:
#test run
from transformers import pipeline
text="SnipSwift is a Chrome extension that retrieves YouTube transcripts and generates both abstractive and extractive summaries. Abstractive summarization creates coherent and easy-to-understand summaries, while extractive summarization extracts key sentences from the text. The extension also offers multilingual translation (supporting languages like Hindi and Gujarati), text-to-speech, and download features. As part of the backend team, I worked on transcript retrieval, summary generation, translation, and integration with the frontend. The backend was built using Flask due to its simplicity and the Python language. For transcript retrieval, we used the YouTube Transcript API, which returns transcripts as a list of dictionaries, each containing the start time and text of a segment. We used a fine-tuned T5 Small model for abstractive summarization, chosen for its efficiency and effectiveness in generating summaries. This model, based on the transformer architecture, was fine-tuned on a fraction of the CNN News dataset and integrated using the pipeline function from the Hugging Face Transformers library. For extractive summarization, we utilized SBERT with the Paraphrase-MiniLM-L6-v2 model, which generates sentence embeddings to identify the most important sentences. The Paraphrase-MiniLM is optimized for capturing semantic similarity and is computationally efficient. We also integrated the Googletrans library for translation tasks. REST API endpoints were set up for seamless communication between the frontend and backend. Despite challenges with limited computational resources, which we addressed using Google Colab and a teammate's GPU-enabled laptop, we successfully developed and integrated all features. This project enhanced my skills in backend development and provided valuable insights into various NLP tasks."
text="summarize: "+text
summarizer=pipeline("summarization",model="my_model",truncation=True,device=0)
pred=summarizer(text)
result=pred[0]['summary_text']
print(result)