In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

file_path = '/content/drive/MyDrive/Hindi_train.csv'

df_train = pd.read_csv(file_path)

In [None]:
new_df = df_train[df_train['Id'].str.startswith("hindi_2023_train")].copy()


In [None]:
len(new_df)

9857

In [None]:
df_without_ids = new_df.drop('Id',axis=1)
df = df_without_ids.drop_duplicates()

In [None]:
from sklearn.model_selection import train_test_split
train, val = train_test_split(new_df, test_size=0.2)

In [None]:
train.shape

(7885, 4)

In [None]:
!pip install transformers datasets evaluate torch rouge_score
!pip install rouge bert_score
!pip install accelerate -U

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset, DatasetDict, load_metric
import numpy as np
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(val)
dataset = DatasetDict({'train': train_dataset, 'validation': val_dataset})

tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBART")
model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/IndicBART").to(device)

def map_to_model_inputs(batch):
    inputs = tokenizer(batch['Article'], padding='max_length', truncation=True, max_length=1024)
    outputs = tokenizer(batch['Summary'], padding='max_length', truncation=True, max_length=100)
    batch['input_ids'] = inputs.input_ids
    batch['attention_mask'] = inputs.attention_mask
    batch['labels'] = outputs.input_ids
    batch['labels'] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch['labels']]
    return batch

tokenized_dataset = dataset.map(map_to_model_inputs, batched=True)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge = load_metric('rouge')
    bert_score = load_metric('bertscore')

    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=["rougeL", "rouge2"])
    bert_score_output = bert_score.compute(predictions=decoded_preds, references=decoded_labels, lang="hi")

    return {
        "rouge2": rouge_output['rouge2'].mid.fmeasure,
        "rougeL": rouge_output['rougeL'].mid.fmeasure,
        "bert_score": bert_score_output['f1'][0]
    }

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.005,
    save_total_limit=1,
    num_train_epochs=3,
    predict_with_generate=True,
    gradient_accumulation_steps = 16
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

model.save_pretrained('/content/drive/MyDrive/indicBART_finetuned_small')
tokenizer.save_pretrained('/content/drive/MyDrive/indicBART_finetuned_small')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/832 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.90M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/221 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


pytorch_model.bin:   0%|          | 0.00/976M [00:00<?, ?B/s]

Map:   0%|          | 0/7885 [00:00<?, ? examples/s]

Map:   0%|          | 0/1972 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Rouge2,Rougel,Bert Score
0,No log,2.211058,0.033776,0.11367,0.58517
1,No log,2.111071,0.038102,0.125792,0.632604
2,No log,2.083808,0.038499,0.126537,0.658022


  rouge = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.92k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Non-default generation parameters: {'forced_eos_token_id': 2}


('/content/drive/MyDrive/indicBART_finetuned_small/tokenizer_config.json',
 '/content/drive/MyDrive/indicBART_finetuned_small/special_tokens_map.json',
 '/content/drive/MyDrive/indicBART_finetuned_small/spiece.model',
 '/content/drive/MyDrive/indicBART_finetuned_small/added_tokens.json',
 '/content/drive/MyDrive/indicBART_finetuned_small/tokenizer.json')

In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model='/content/drive/MyDrive/indicBART_finetuned_small', tokenizer="/content/drive/MyDrive/indicBART_finetuned_small")

text = """मानसून सीजन में हुई भारी बारिश ने कई राज्यों में भारी तबाही मचाई है। हिमाचल प्रदेश में पिछले 40 दिनों में 187 लोगों की जान जा चुकी है, वहीं 34 लोग लापता हैं।
तेलंगाना में एक हफ्ते में 18 लोगों की मौत हो गई। करीब 12,000 लोगों को राहत शिविरों में पहुंचाया गया है।
उधर, शनिवार को जम्मू-कश्मीर के गांदरबल जिले के एक गांव में बादल फट गया। जिससे सात घर, एक मस्जिद और दो स्कूल क्षतिग्रस्त हो गए।
उत्तराखंड के चमोली में रविवार सुबह पहाड़ों से पत्थर गिरने के बाद बद्रीनाथ नेशनल हाईवे को बंद करना पड़ा। हालांकि, 2 घंटे बाद हाईवे ट्रैफिक के लिए खोल दिया गया।
अगले 24 घंटे कैसे रहेंगे…
इन राज्यों में तेज बारिश होगी: हिमाचल, महाराष्ट्र, उत्तराखंड, उत्तर प्रदेश, राजस्थान, मध्यप्रदेश, बिहार, झारखंड, गोवा, छत्तीसगढ़, ओडिशा, पश्चिम बंगाल, असम, अरुणाचल प्रदेश।
बारिश की संभावना नहीं: जम्मू-कश्मीर, पंजाब, हरियाणा, पश्चिमी राजस्थान, गुजरात, कर्नाटक, आंध्र प्रदेश, केरल, तमिलनाडु में बारिश की संभावना नहीं है।
अलग-अलग राज्यों से मानसून की तस्वीरें...
अन्य राज्यों में मौसम का हाल..."""
print(summarizer(text))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[{'summary_text': 'मानसून सीजन में हुई भारी बारिश ने कई राज्यों में भारी तबाही मचाई है। हिमाचल प्रदेश'}]


In [None]:
from transformers import pipeline
from datasets import load_metric
from bert_score import score as bert_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

summarizer = pipeline("summarization", model="/content/drive/MyDrive/indicBART_finetuned_small", tokenizer="/content/drive/MyDrive/indicBART_finetuned_small")

rouge = load_metric("rouge")


def compute_metrics(generated_summary, reference_summary):
    rouge_scores = rouge.compute(predictions=[generated_summary], references=[reference_summary])
    rouge_scores = {key: value.mid.fmeasure * 100 for key, value in rouge_scores.items()}
    _, _, bertscore_f1 = bert_score([generated_summary], [reference_summary], lang="hi", rescale_with_baseline=False)
    metrics = {k: round(v, 4) for k, v in rouge_scores.items()}
    metrics['bert_score_f1'] = round(bertscore_f1.mean().item() * 100, 4)
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([generated_summary, reference_summary])
    cosine_sim = cosine_similarity(vectors[0:1], vectors[1:])
    metrics['cosine_similarity'] = round(cosine_sim[0][0] * 100, 4)
    return metrics

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  rouge = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [None]:
for index, row in val.iterrows():
    try:
        print(f"Processing Article {index}: {str(row['Article'])[:50]}")

        generated_summaries = summarizer(row["Article"], max_length=75, min_length=30, length_penalty=2.0, num_beams=4)

        if len(generated_summaries) > 0:
            generated_summary = generated_summaries[0]['summary_text']
            metrics = compute_metrics(generated_summary, row["Summary"])

            print(f"Article {index}:")
            print(f"Generated Summary: {generated_summary}")
            print(f"Reference Summary: {row['Summary']}")
            print(f"Metrics: {metrics}")
            print("\n---\n")
        else:
            print(f"No summary generated for Article {index}.")

    except Exception as e:
        print(f"An error occurred at Article {index}: {e}")


Processing Article 567: सरकारी नौकरी की तलाश कर रहे बेरोजगारों के लिए अच्छ
Article 567:
Generated Summary: 留 Rajasthan Government Jobs Recruitment 2018: सरकारी नौकरी की तलाश कर रहे बेरोजगारों के लिए अच्छी खबर है। नेशनल टेस्टिंग एजेंसी ने देशभर में एकलव्य मॉडल रेजिडेंशियल स्कूल में 6 हजार से ज्यादा पदों पर वैकेंसी निकाली है। इसके तहत टीचिंग और नॉन टीचिंग पदों पर भर्तियां की जाएंगी। भर्ती प्रक्रिया
Reference Summary: Eklavya Model Residential Schools Vacancy Details Update; Government job vacancy teaching and non-teaching positions  भर्ती प्रक्रिया में शामिल होने के लिए उम्मीदवार एकलव्य मॉडल रेजिडेंशियल स्कूल की ऑफिशल वेबसाइट emrs.tribal.gov.in पर जाकर 18 अगस्त तक ऑनलाइन अप्लाई कर सकते हैं।
Metrics: {'rouge1': 7.6923, 'rouge2': 0.0, 'rougeL': 7.6923, 'rougeLsum': 7.6923, 'bert_score_f1': 68.5345, 'cosine_similarity': 18.9896}

---

Processing Article 9829: महाराष्ट्र के पालघर में एक कार ड्राइवर ट्रैफिक पुल
Article 9829:
Generated Summary: पालघर में एक कार ड्राइवर ट्रैफिक पुलिस कॉन्स्टेबल

KeyboardInterrupt: 

In [None]:
import numpy as np

rouge1_scores = []
rouge2_scores = []
rougeL_scores = []
bert_scores = []
cosine_similarities = []

test = val[:150]

for index, row in test.iterrows():
    try:
        generated_summaries = summarizer(row["Article"], max_length=80, min_length=30, length_penalty=2.0, num_beams=4)
        if generated_summaries:
            generated_summary = generated_summaries[0]['summary_text']
            metrics = compute_metrics(generated_summary, row["Summary"])

            rouge1_scores.append(metrics['rouge1'])
            rouge2_scores.append(metrics['rouge2'])
            rougeL_scores.append(metrics['rougeL'])
            bert_scores.append(metrics['bert_score_f1'])
            cosine_similarities.append(metrics['cosine_similarity'])
        else:
            print(f"Article {index} produced no summaries.")

    except Exception as e:
        print()

average_rouge1 = np.mean([score for score in rouge1_scores if score > 0])
average_rouge2 = np.mean([score for score in rouge2_scores if score > 0])
average_rougeL = np.mean([score for score in rougeL_scores if score > 0])
average_bert = np.mean([score for score in bert_scores if score > 0])
average_cosine_similarity = np.mean([score for score in cosine_similarities if score > 0])

print(f"Average ROUGE-1: {average_rouge1:.2f}")
print(f"Average ROUGE-2: {average_rouge2:.2f}")
print(f"Average ROUGE-L: {average_rougeL:.2f}")
print(f"Average BERT Score F1: {average_bert:.2f}")
print(f"Average Cosine Similarity: {average_cosine_similarity:.2f}")














































Average ROUGE-1: 28.82
Average ROUGE-2: 23.89
Average ROUGE-L: 27.46
Average BERT Score F1: 76.88
Average Cosine Similarity: 37.90
