<a href="https://colab.research.google.com/github/tzf101/190041101-CSE-4302/blob/master/utils_notebook/sr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading Libraries



### Libraries

In [1]:
# Mount Google Drive so that the dataset paths under /content/MyDrive used
# later in this notebook resolve.
from google.colab import drive
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted, so re-running this cell is safe — confirm against Colab docs.
drive.mount("/content/MyDrive", force_remount=True)

Mounted at /content/MyDrive


In [None]:
!pip install sentencepiece
!pip install rouge
!pip install sacrebleu
!pip install -U sentence-transformers
!pip install bert-score

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import sentencepiece as spm
import pandas as pd

import sacrebleu
from rouge import Rouge
from sacrebleu import corpus_bleu
from bert_score import score
from sentence_transformers import SentenceTransformer, util
from nltk.translate.bleu_score import sentence_bleu

### Loading Models

In [None]:
# Sentence-embedding model shared by the SBERT cosine-similarity metrics below.
# NOTE(review): constructing it presumably downloads the checkpoint on first
# use — confirm before running offline.
sbert_model = SentenceTransformer('l3cube-pune/bengali-sentence-bert-nli')

### Score calculation

In [None]:
def calculate_sbert_score(original, augmented):
    """Return the SBERT cosine similarity between two texts.

    Both texts are embedded with the module-level ``sbert_model`` and the
    cosine similarity of the two embeddings is returned as a plain Python
    number via ``.item()``.

    Args:
        original: the reference text.
        augmented: the text to compare against ``original``.

    Returns:
        The cosine similarity of the two embeddings.
    """
    original_embedding = sbert_model.encode(original)
    augmented_embedding = sbert_model.encode(augmented)
    similarity = util.pytorch_cos_sim(original_embedding, augmented_embedding)
    return similarity.item()

In [None]:
def calculate_scores(original, augmented):
    """Score a single augmented sentence against its original.

    Args:
        original: the reference sentence (one string).
        augmented: the augmented sentence (one string).

    Returns:
        A ``(bleu, bertscore_f1, sbert_score)`` tuple:
        the SacreBLEU score of ``augmented`` against ``original``, the
        BERTScore F1 for the pair (``lang="bn"``), and the SBERT cosine
        similarity from ``calculate_sbert_score``.
    """
    # Score the pair as whole sentences. Zipping the two strings would walk
    # them character by character and yield one meaningless score per character.
    bleu_score = sacrebleu.corpus_bleu([augmented], [[original]]).score

    # BERTScore takes lists of candidates and references; only F1 is reported.
    _, _, f1 = score([augmented], [original], lang="bn", rescale_with_baseline=True)

    # Reuse the shared helper instead of repeating the embedding logic here.
    sbert_score = calculate_sbert_score(original, augmented)

    return bleu_score, f1.item(), sbert_score

### SR (synonym replacement) function

In [None]:
!pip install bnlp

In [None]:
!pip install bnlp-toolkit

In [None]:
import random
from random import shuffle
import re

In [None]:
from bnlp import BengaliCorpus as corpus

In [None]:
from bnlp import BengaliWord2Vec

In [None]:
# Shared BengaliWord2Vec instance, created lazily by _load_word2vec().
_word2vec_model = None


def _load_word2vec():
    """Return the shared BengaliWord2Vec instance, creating it on first use."""
    global _word2vec_model
    if _word2vec_model is None:
        _word2vec_model = BengaliWord2Vec()
    return _word2vec_model


def get_synonyms(word):
    """Return up to 10 words most similar to ``word``, excluding ``word`` itself.

    Args:
        word: the word to look up.

    Returns:
        A list of distinct candidate words, in the order returned by
        ``get_most_similar_words`` (duplicates and ``word`` itself removed).

    Raises:
        Whatever ``get_most_similar_words`` raises for ``word``; the caller in
        ``SR.augment`` expects ``KeyError`` for out-of-vocabulary words.
    """
    # Reuse one model instance instead of constructing a new one per lookup.
    neighbours = _load_word2vec().get_most_similar_words(word, topn=10)

    synonyms = []
    seen = set()
    # The loop variable must not be called `word`: shadowing the parameter made
    # the "remove the query word" check compare against the last neighbour.
    for neighbour in neighbours:
        candidate = neighbour[0]  # each item is indexed as (word, ...)
        if candidate == word or candidate in seen:
            continue
        seen.add(candidate)
        synonyms.append(candidate)
    return synonyms

In [None]:
import random

class SR():
    """Synonym-replacement (SR) text augmenter.

    Replaces up to ``n`` distinct non-stopword words of a text with a randomly
    chosen entry from ``get_synonyms``.
    """

    def __init__(self, stopwords):
        """Store the collection of words that must never be replaced."""
        self.stopwords = stopwords

    def augment(self, text, n, debug=False):
        """Return ``text`` with up to ``n`` distinct words replaced by synonyms.

        Args:
            text: whitespace-separated input text.
            n: maximum number of distinct words to replace; ``n <= 0`` leaves
                the words unchanged.
            debug: when true, ``"(sr)"`` is appended to the output.

        Returns:
            The augmented text, with words joined by single spaces.
        """
        words = text.split()
        new_words = words.copy()

        # De-duplicate in first-seen order (not via set()) so that the shuffle
        # below is reproducible once `random` is seeded.
        candidates = list(dict.fromkeys(w for w in words if w not in self.stopwords))
        random.shuffle(candidates)

        num_replaced = 0
        for target in candidates:
            # Check the budget before replacing, so n <= 0 replaces nothing.
            if num_replaced >= n:
                break
            try:
                synonyms = get_synonyms(target)
            except KeyError:
                # Ignore words not present in the vocabulary
                continue
            if not synonyms:
                continue
            synonym = random.choice(list(synonyms))
            # Every occurrence of the chosen word is replaced, counted once.
            new_words = [synonym if word == target else word for word in new_words]
            num_replaced += 1

        output = ' '.join(new_words)
        if debug:
            output += "(sr)"
        return output


### Testing

In [None]:
# Build the augmenter from the stopword list exposed by BengaliCorpus; words in
# this set are never chosen for replacement.
stopwords = set(corpus.stopwords)
sr = SR(stopwords)  # module-level instance; apply_sr() relies on this name

# Smoke test on one sentence. The output varies between runs unless `random`
# is seeded, because augment() shuffles and samples with the `random` module.
text = "সঠিক তদন্ত করতে হবে। বিচারের আওতায় আনতে হবে যে এই কাজ টা করেছে।"
augmented_text = sr.augment(text, n=2)
print(augmented_text)

### Apply SR to the dataset

In [None]:
# Row-wise adapter so the SR augmenter can be used with DataFrame.apply(axis=1)
def apply_sr(row):
    """Return the synonym-replaced version of ``row['original_sentence']``.

    Uses the module-level ``sr`` augmenter with ``n=2``.
    """
    sentence = row['original_sentence']
    augmented = sr.augment(sentence, n=2)
    return augmented

In [None]:
def sr_and_evaluate_dataset(file_path, original_col_name, new_col_name):
    """Augment a CSV dataset with synonym replacement and score every row.

    Args:
        file_path: CSV path (or file-like object) passed to ``pd.read_csv``.
        original_col_name: column holding the source sentences; it is renamed
            to ``original_sentence`` in the result.
        new_col_name: name of the column that receives the augmented sentences.

    Returns:
        A DataFrame with the input columns plus ``new_col_name``,
        ``sacrebleu_score``, ``sbert_score``, ``bertscore_f1``, the ROUGE F1
        columns ``r1f1``/``r2f1``/``rlf1`` and a constant ``method`` column
        set to ``"sr2"``.

    Raises:
        KeyError: if the frame has no ``original_sentence`` column after the
            rename, i.e. ``original_col_name`` was not found.
    """
    # Load the dataset
    df = pd.read_csv(file_path)
    df = df.rename(columns={original_col_name: 'original_sentence'})
    if 'original_sentence' not in df.columns:
        raise KeyError(
            f"column {original_col_name!r} not found; available columns: {list(df.columns)}"
        )

    # Apply the text augmentation function
    df[new_col_name] = df.apply(apply_sr, axis=1)

    # The metric libraries iterate and index positionally, so hand them plain
    # lists rather than index-labelled Series.
    originals = df["original_sentence"].tolist()
    augmented = df[new_col_name].tolist()

    # Compute ROUGE scores (hypotheses first, references second)
    rouge = Rouge()
    rouge_scores = rouge.get_scores(augmented, originals)
    rouge_df = pd.DataFrame(
        [
            {
                'r1f1': item['rouge-1']['f'],
                'r2f1': item['rouge-2']['f'],
                'rlf1': item['rouge-l']['f'],
            }
            for item in rouge_scores
        ],
        index=df.index,  # keep rows aligned with df for the concat below
    )

    # Compute SacreBLEU scores, one sentence pair at a time
    df["sacrebleu_score"] = [
        sacrebleu.corpus_bleu([aug], [[orig]]).score
        for aug, orig in zip(augmented, originals)
    ]

    # SBERT Score with Cosine Similarity
    df["sbert_score"] = [
        calculate_sbert_score(orig, aug) for orig, aug in zip(originals, augmented)
    ]

    # BERTScore. The sentences are Bengali, so use lang="bn" (as
    # calculate_scores does) rather than the English setting.
    # NOTE(review): confirm a rescale baseline is available for lang="bn".
    _, _, f1 = score(augmented, originals, lang="bn", rescale_with_baseline=True)
    df["bertscore_f1"] = f1.tolist()

    # Combine the dataframes
    result_df = pd.concat([df, rouge_df], axis=1)
    result_df["method"] = "sr2"

    return result_df

# Running on the dataset

In [None]:
# Input CSV on the mounted Drive; the next cell reads its `sentence1` column.
file_path = '/content/MyDrive/MyDrive/Research/Thesis: BDA/Main/evaluation/(old)Youtube/Datasets/yt_sentiment_train_10.csv'

In [None]:
# Augment the `sentence1` column (renamed to `original_sentence` in the result)
# into `augmented_sentence` and attach the metric columns.
result_df = sr_and_evaluate_dataset(file_path, 'sentence1', 'augmented_sentence')

In [None]:
# Display the augmented and scored frame.
result_df

### Saving augmented dataset

In [None]:
# Write the augmented dataset to the same Drive folder as the input CSV;
# index=False leaves the DataFrame index out of the file.
result_df.to_csv('/content/MyDrive/MyDrive/Research/Thesis: BDA/Main/evaluation/(old)Youtube/Datasets/yt_sentiment_train_10_sr.csv', index=False)