# **Telugu To Gujarati**

In [None]:
# **1.2) IndicTrans2**
## Setup

# Please run the cells below to install the necessary dependencies.

%%capture
!git clone https://github.com/AI4Bharat/IndicTrans2.git

In [None]:
%%capture
%cd /content/IndicTrans2/huggingface_interface


## **Please restart session**

In [None]:
%%capture
!python3 -m pip install nltk sacremoses pandas regex mock transformers>=4.33.2 mosestokenizer
!python3 -c "import nltk; nltk.download('punkt')"
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece
!git clone https://github.com/VarunGumma/IndicTransTokenizer
%cd IndicTransTokenizer
!python3 -m pip install --editable ./
%cd ..

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer

BATCH_SIZE = 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
quantization = None
def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = IndicTransTokenizer(direction=direction)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    if qconfig == None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model


def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    translations = []
    for i in range(0, len(input_sentences), BATCH_SIZE):
        print(i)
        batch = input_sentences[i : i + BATCH_SIZE]

        # Preprocess the batch and extract entity mappings
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

        # Tokenize the batch and generate input encodings
        inputs = tokenizer(
            batch,
            src=True,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode the generated tokens into text
        generated_tokens = tokenizer.batch_decode(generated_tokens.detach().cpu().tolist(), src=False)

        # Postprocess the translations, including entity replacement
        translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)

        del inputs
        torch.cuda.empty_cache()

    return translations

In [None]:
import csv

# def process_csv(input_csv_path, output_csv_path):
    # Lists to store text and title from the input CSV
text_list = []
title_list = []

# Read data from the input CSV and populate the lists
with open('telugu_non_sar_test.csv', 'r', newline='', encoding='utf-8') as input_file:
  reader = csv.DictReader(input_file)
  for row in reader:
      text_list.append(row['text'])
      title_list.append(row['title'])

  



In [None]:
# process_csv('telugu_sarcastic_train.csv', 'output.csv')

In [None]:
print(len(text_list))
print(len(title_list))

69
69


In [None]:
### Indic to Indic Example


indic_indic_ckpt_dir = "ai4bharat/indictrans2-indic-indic-1B"  # ai4bharat/indictrans2-indic-indic-dist-320M
indic_indic_tokenizer, indic_indic_model = initialize_model_and_tokenizer(indic_indic_ckpt_dir, "indic-indic", quantization)

ip = IndicProcessor(inference=True)

tgt_lang, src_lang = "guj_Gujr", "tel_Telu"
tl_to_gu_text = batch_translate(text_list, src_lang, tgt_lang, indic_indic_model, indic_indic_tokenizer, ip)
tl_to_gu_title = batch_translate(title_list, src_lang, tgt_lang, indic_indic_model, indic_indic_tokenizer, ip)

# flush the models to free the GPU memory
del indic_indic_tokenizer, indic_indic_model



In [None]:
print(tl_to_gu_title[:5])

['માતા તમને કંઈક મોકલી રહી છે-શું તમે જાણો છો કે તે શું છે?', 'સડો વળાંકવા જેવો છે... કોને રોકવું?', 'જ્યારે વિલ ફેરેલ સફરજન વિશે મજાક કરી રહ્યો હતો ત્યારે શાકભાજી વિભાગમાં હાસ્યની લહેર હતી!', 'આંતરિક વ્યવસ્થાના નિરીક્ષકે જે રીતે વાંધો ઉઠાવ્યો હતો તે જ રીતે, કેટલીકવાર લગભગ માતાનો ભાગ બનવું, ખરેખર અગાઉ ક્યારેય ખોવાઈ ન ગયું હોવાનું સૂચવે છે.', '"ધ હેન્ડમેઇડ્સ ટેલ" નો નવો સીઝનઃ અતિવાદી નારીવાદના જોખમો પર ધ્યાન કેન્દ્રિત કરે છે.']


In [None]:

# Open a new text file for writing
with open('tel_guj_non_test.csv', 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.writer(output_file)
        # Write the header row
        writer.writerow(['text', 'title'])
        # Write the data from the lists
        for text, title in zip(tl_to_gu_text, tl_to_gu_title):
            writer.writerow([text, title])

In [None]:
import csv

def merge_csv(source_csv_path, target_csv_path):
    # Lists to store text and title from the source CSV
    text_list = []
    title_list = []

    # Read data from the source CSV and populate the lists
    with open(source_csv_path, 'r', newline='', encoding='utf-8') as source_file:
        reader = csv.DictReader(source_file)
        for row in reader:
            text_list.append(row['text'])
            title_list.append(row['title'])

    # Append the text and title to the target CSV
    with open(target_csv_path, 'a', newline='', encoding='utf-8') as target_file:
        writer = csv.DictWriter(target_file, fieldnames=['text', 'title'])
        # Check if the target CSV is empty; if so, write the header row
        if target_file.tell() == 0:
            writer.writeheader()
        # Write the data from the source CSV to the target CSV
        for text, title in zip(text_list, title_list):
            writer.writerow({'text': text, 'title': title})

# Example usage:
merge_csv('tel_guj_non_test.csv','gujarati_non_sar_test.csv')


# **Sarcasm Detection**

In [None]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")


In [None]:
l1=[]
l2=[]

In [None]:
import json

def classify_news_articles(articles):
    print("started")
    sarcastic_articles = []
    non_sarcastic_articles = []
    i=1
    for article in articles:
        print(i)
        i=i+1
        sequence_to_classify = article['text']
        candidate_labels = ['sarcastic', 'non-sarcastic']
        result = classifier(sequence_to_classify, candidate_labels)

        if result['scores'][0] > 0.57:  # If sarcastic score is greater than 0.5
            sarcastic_articles.append({'text': article['text'], 'title': article['title']})
            l1.append({'text': article['text'], 'title': article['title']})
        else:
            non_sarcastic_articles.append({'text': article['text'], 'title': article['title']})
            l2.append({'text': article['text'], 'title': article['title']})

    return sarcastic_articles, non_sarcastic_articles

In [None]:
import csv

# Function to read data from CSV file and convert it into a list of dictionaries
def read_csv(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            data.append({'text': row['text'], 'title': row['title']})
    return data

# **Split-data**

## **Filter non-sarcastic Data to match sarcsatic data fields**

In [1]:
import csv

def filter_csv(csv_file, filtered_csv_file, fields_to_keep):
    # Read data from CSV file
    with open(csv_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        fieldnames = reader.fieldnames

        # Filter out headers not in fields_to_keep
        filtered_fieldnames = [field for field in fieldnames if field in fields_to_keep]

        # Write filtered data to new CSV file
        with open(filtered_csv_file, 'w', newline='', encoding='utf-8') as fw:
            writer = csv.DictWriter(fw, fieldnames=filtered_fieldnames)
            writer.writeheader()
            for row in reader:
                filtered_row = {field: row[field] for field in filtered_fieldnames}
                writer.writerow(filtered_row)




In [None]:
# Example usage:
csv_file = 'gujarati_nonsarcastic.csv'
filtered_csv_file = 'gujarati_nonsarcastic_filter.csv'
fields_to_keep = ['title', 'text']  # Specify the fields you want to keep
filter_csv(csv_file, filtered_csv_file, fields_to_keep)

In [None]:
import csv
import random

def split_csv(csv_file, train_csv_file, test_csv_file, train_samples=293, test_samples=70):
    # Read data from CSV file
    with open(csv_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        data = list(reader)

    # Randomly shuffle the data
    # random.shuffle(data)

    # Split the data into train and test sets
    train_data = data[:train_samples]
    test_data = data[train_samples:train_samples+test_samples]

    # Write train data to CSV file
    with open(train_csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=reader.fieldnames)
        writer.writeheader()
        writer.writerows(train_data)

    # Write test data to CSV file
    with open(test_csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=reader.fieldnames)
        writer.writeheader()
        writer.writerows(test_data)




## **Split telugu data**

In [None]:
# Example usage:
csv_file = 'telugu_nonsarcastic_filter.csv'
train_csv_file = 'telugu_non_sar_train.csv'
test_csv_file = 'telugu_non_sar_test.csv'
split_csv(csv_file, train_csv_file, test_csv_file)

In [None]:
# Example usage:
csv_file = 'telugu_sarcastic.csv'
train_csv_file = 'telugu_sar_train.csv'
test_csv_file = 'telugu_sar_test.csv'
split_csv(csv_file, train_csv_file, test_csv_file)

## **Split Gujarati Data**

In [None]:
# Example usage:
csv_file = 'gujarati_nonsarcastic.csv'
train_csv_file = 'gujarati_non_sar_train.csv'
test_csv_file = 'gujarati_non_sar_test.csv'
split_csv(csv_file, train_csv_file, test_csv_file)

In [None]:
# Example usage:
csv_file = 'gujarati_sarcastic.csv'
train_csv_file = 'gujarati_sar_train.csv'
test_csv_file = 'gujarati_sar_test.csv'
split_csv(csv_file, train_csv_file, test_csv_file)

# **Combine 2 test csv files**

In [None]:
import csv

def combine_csv(csv_file1, csv_file2, combined_csv_file):
    # Read data from CSV file 1
    with open(csv_file1, 'r', encoding='utf-8') as f1:
        reader1 = csv.DictReader(f1)
        fieldnames = reader1.fieldnames
        data1 = list(reader1)

    # Read data from CSV file 2
    with open(csv_file2, 'r', encoding='utf-8') as f2:
        reader2 = csv.DictReader(f2)
        data2 = list(reader2)

    # Combine data from both CSV files
    combined_data = data1 + data2

    # Write combined data to new CSV file
    with open(combined_csv_file, 'w', newline='', encoding='utf-8') as fw:
        writer = csv.DictWriter(fw, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(combined_data)




## **Combine telugu csvs**

In [None]:
# Example usage:
csv_file1 = 'telugu_sarcastic_test.csv'
csv_file2 = 'telugu_non_sar_test.csv'
combined_csv_file = 'telugu_test.csv'
combine_csv(csv_file1, csv_file2, combined_csv_file)

## **Combine Gujarati csv**

In [None]:
# Example usage:
csv_file1 = 'gujarati_sar_test_m.csv'
csv_file2 = 'gujarati_non_sar_test_m.csv'
combined_csv_file = 'gujarati_test.csv'
combine_csv(csv_file1, csv_file2, combined_csv_file)

# **1) Telugu Dataset**

In [None]:
# Load data from CSV file
file_path = 'telugu_test.csv'  # Replace 'test.csv' with the path to your CSV file
articles = read_csv(file_path)

In [None]:
len(l1)

8

In [None]:
len(l2)

36

In [None]:
# Function to write data to CSV file
def write_csv(file_path, data):
    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['text', 'title'])
        writer.writeheader()
        for row in data:
            writer.writerow(row)

# Write sarcastic articles to CSV file
sarcastic_file_path = 'tl_sarcastic_articles_test.csv'
write_csv(sarcastic_file_path, l1)

# Write non-sarcastic articles to CSV file
non_sarcastic_file_path = 'tl_non_sarcastic_articles_test.csv'
write_csv(non_sarcastic_file_path, l2)

print("Sarcastic articles saved to:", sarcastic_file_path)
print("Non-Sarcastic articles saved to:", non_sarcastic_file_path)


Sarcastic articles saved to: tl_sarcastic_articles_test.csv
Non-Sarcastic articles saved to: tl_non_sarcastic_articles_test.csv


# **1) Gujarati Dataset**

In [None]:
# import pandas as pd
# Load data from CSV file
file_path = 'gujarati_test.csv'  # Replace 'test.csv' with the path to your CSV file
articles = read_csv(file_path)

In [None]:
len(articles)

280

In [None]:
articles[:5]

[{'text': 'અબુ ધાબીમાં અંડરવર્લ્ડ ડોન દાઉદ ઇબ્રાહિમના શાર્પ શૂટર રાશીદ માલબારીની ધરપકડ પછી ચોંકાવનારો ખુલાસો થયો છે. રાશીદ માલબારીએ છોટા શકીલના કહેવાથી શ્રીરામ સેનાના સંસ્થાપક પ્રમોદ મુથાલિક અને બીજેપી નેતા વરૂણ ગાંધીને મારવાનો પ્લાન બનાવ્યો હતો પરંતુ આ ષડયંત્ર પુરૂં થાય તે પહેલા જ તેના શૂટરની ધરપકડ કરી લેવામાં આવી હતી.\nરાશીદ વર્ષ 2014માં મેંગલુરૂ કોર્ટમાંથી નેપાળના રસ્તે ભારતમાંથી ફરાર થઇ ગયો હતો. અંડરવર્લ્ડનું નેપાળનું બધું કામ રાશીદ જ સંભાળે છે. બેંગકોકમાં વર્ષ 2000માં છોટા રાજન પર હુમલામાં રાશીદ પણ સામેલ હતો. હુમલામાં છોટા રાજનને ગોળી વાગી હતી પરંતુ તે ફરાર થઇ ગયો હતો. આ હુમલામાં છોટા રાજનનો નજીકનો માણસ રોહીત વર્મા માર્યો ગયો હતો. તે સમયે રાશીદે છોટા રાજન પર પણ ગોળી મારી હતી. તેની પર હત્યાના ઘણાં કેસ નોંધાયા છે. મેંગલુરૂ કોર્ટમાંથી ફરાર થયા બાદ પોલીસે તેની સામે લુકઆઉટ નોટિસ જાહેર કરી હતી. તેની સામે રેડ કોર્નર નોટિસ પણ જાહેર થઇ ચુકી છે.\nરાશીદ ડી ગેંગનો ભારતનો સૌથી મોટો માણસ માનવામાં આવે છે. તેણે છોટા રાજન પર હુમલા ઉપરાંત ક્વાલાલમ્પુરમાં છોટા રાજનના નજીકના માણસની હત્યામાં પણ તેનો હ

In [None]:
# Call the classify_news_articles function
sarcastic_articles, non_sarcastic_articles = classify_news_articles(articles)



In [None]:
len(l1)

60

In [None]:
len(l2)

146

In [None]:
# Function to write data to CSV file
def write_csv(file_path, data):
    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['text', 'title'])
        writer.writeheader()
        for row in data:
            writer.writerow(row)

# Write sarcastic articles to CSV file
sarcastic_file_path = 'gu_sarcastic_articles_test.csv'
write_csv(sarcastic_file_path, l1)

# Write non-sarcastic articles to CSV file
non_sarcastic_file_path = 'gu_non_sarcastic_articles_test.csv'
write_csv(non_sarcastic_file_path, l2)

print("Sarcastic articles saved to:", sarcastic_file_path)
print("Non-Sarcastic articles saved to:", non_sarcastic_file_path)


Sarcastic articles saved to: gu_sarcastic_articles_test.csv
Non-Sarcastic articles saved to: gu_non_sarcastic_articles_test.csv


# **Headline Genration**

In [2]:
!pip install -r requirements.txt

Collecting accelerate>=0.12.0 (from -r requirements.txt (line 1))
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=1.8.0 (from -r requirements.txt (line 2))
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting rouge-score (from -r requirements.txt (line 5))
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py7zr (from -r requirements.txt (line 7))
  Downloading py7zr-0.21.0-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate (from -r requirements.txt (line 9))
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [

In [3]:
!git clone https://github.com/huggingface/transformers.git

Cloning into 'transformers'...
remote: Enumerating objects: 197124, done.[K
remote: Counting objects: 100% (602/602), done.[K
remote: Compressing objects: 100% (276/276), done.[K
remote: Total 197124 (delta 331), reused 502 (delta 266), pack-reused 196522[K
Receiving objects: 100% (197124/197124), 209.22 MiB | 24.48 MiB/s, done.
Resolving deltas: 100% (140167/140167), done.
Updating files: 100% (4243/4243), done.


In [4]:
!pip install -q ./transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [14]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


## **Gujarati sarcastic model**

In [None]:
!python transformers/examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path google/mt5-small \
    --do_train True \
    --do_eval False \
    --do_predict True \
    --source_prefix "summarize: " \
    --train_file gujarati_sar_train.csv \
    --test_file gu_sarcastic_articles_test.csv \
    --text_column "text" \
    --summary_column "title" \
    --max_target_length 298 \
    --output_dir output_mt5_gu/ \
    --per_device_train_batch_size=4 \
    --num_train_epochs 3 \
    --logging_strategy "epoch" \
    --save_strategy "no" \
    --overwrite_output_dir True \
    --predict_with_generate $@ 2>&1>./hg_mt5_log.txt

2024-04-22 15:31:29.015700: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-22 15:31:29.015765: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-22 15:31:29.017749: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using custom data configuration default-b72e0611477824f5
Loading Dataset Infos from /usr/local/lib/python3.10/dist-packages/datasets/packaged_modules/csv
Generating dataset csv (/root/.cache/huggingface/datasets/csv/default-b72e0611477824f5/0.0.0/8d73bd761341cee405ddc715f0eebe400df876d7da154d3a2263a460648d6ba5)
Downloading and preparing dataset csv/default to /ro

In [None]:

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!cp -r gu_sarcastic_articles_test.csv /content/drive/My\ Drive/saved_trainer/

In [None]:
!cp -r output_mt5_gu /content/drive/My\ Drive/saved_trainer/


In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# Load fine-tuned model and tokenizer
model = MT5ForConditionalGeneration.from_pretrained("output_mt5_gu")
tokenizer = MT5Tokenizer.from_pretrained("output_mt5_gu")

# Define input text
input_text = "પુલવામાં હુમલા પર મંગળવારે પાકિસ્તાની પીએમ ઇમરાન ખાને ભારતને એડ્રેસ કર્યું. તેની પર પ્રખ્યાત રાઇટર જાવેદ અખ્તરે જવાબ આપ્યો છે. સાથે જ હુમલાની જવાબદારી ન લેવા પર પાકિસ્તાનની આલોચના કરી છે.\nએટલું જ નહીં, અખ્તરે આ ઘટનાને પણ શેર કરી છે જેમા તેમને પાકિસ્તાની ન્યૂઝ એન્કર પર નિશાન સાધ્યું હતું. અખ્તરે કહ્યું, ઇમરાને નો બોલ ફેંક્યો છે. દર વખત તે પુછે છે કે તમને કેમ લાગે છે કે આ અમે કર્યું છે.\n<>\n\nજાવેદ અખ્તરે એક ઘટના અંગે જણાવતા લખ્યું, મુંબઇ આતંકી હુમલા બાદ પાકિસ્તાનની એક ટીવી એન્કરે મને પૂછ્યું કે શુ તમે એવું સમજો છો કે આ પાકિસ્તાને કર્યું છે. આ તો કોઇપણ દેશ હોય શકે છે. મેં પણ કહ્યું કે ઠીક છે ચલો તમને 3 ઓપ્શન આપીશ. તમારે એકને પસંદ કરવાનું છે. બ્રાઝીલ, સ્વીડન અને પાકિસ્તાન.\nજણાવી દઇએ કે પાક. પીએમ ઇમરાન ખાને આશરે 6 મિનિટના લાંબા વીડિયોમાં આ વાતથી ઇન્કાર કર્યો છે કે ભારતમાં થયેલા હુમલામાં પાકિસ્તાનનો હાથ છે. જો પાકિસ્તાને હુમલો કર્યો છે તો તેના પુરાવા આપે. ઇમરાને ભારતને ધમકી પણ આપી હતી કે ભારત પાકિસ્તાન પર અટેક કરશે તો તે પણ તેનો જડબાતોડ જવાબ આપશે."

# Tokenize input text
inputs = tokenizer(input_text, return_tensors="pt")

# Generate output
outputs = model.generate(**inputs)

# Decode output tokens
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print output text
print("Output:", output_text)



Output: <extra_id_0> ભારતમાં થયેલા હુમલામાં પાકિસ્તાનને ભારતને


## **Gujarati non sarcastic model**


In [None]:
!python transformers/examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path google/mt5-small \
    --do_train True \
    --do_eval False \
    --do_predict True \
    --source_prefix "summarize: " \
    --train_file gujarati_non_sar_train_m.csv \
    --test_file gujarati_non_sarcastic_test_zeroM.csv \
    --text_column "text" \
    --summary_column "title" \
    --max_target_length 298 \
    --output_dir output_mt5_gu_non_sar_m/ \
    --per_device_train_batch_size=4 \
    --num_train_epochs 3 \
    --logging_strategy "epoch" \
    --save_strategy "no" \
    --overwrite_output_dir True \
    --predict_with_generate $@ 2>&1>./hg_mt5_log.txt

In [None]:

from google.colab import drive
drive.mount('/content/drive')

In [None]:
!cp -r output_mt5_gu_non_sar_m /content/drive/My\ Drive/saved_trainer/


In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# Load fine-tuned model and tokenizer
model = MT5ForConditionalGeneration.from_pretrained("output_mt5_gu_non_sar_m")
tokenizer = MT5Tokenizer.from_pretrained("output_mt5_gu_non_sar_m")

# Define input text
input_text = "કોરોના મહામારી સંકટ વચ્ચે દેશના લાખો કેન્દ્રીય કર્મચારીઓને કેન્દ્ર સરકારે ખુશખબર આપ્યા છે. સરકારે કર્મચારીઓને મળનારા વેરિએબલ મોંઘવારી ભથ્થામાં વધારાની જાહેરાત કરી છે. સરકારની આ જાહેરાતથી લગભગ દોઢ કરોડ કેન્દ્રીય કર્મચારીઓના ચહેરા પર હાસ્ય જોવા મળશે. તેનો ફાયદો કોન્ટ્રાક્ટ પર કામ કરતા કર્મચારીઓને પણ થશે. કર્મચારીઓના પગાર ઉપરાંત તેમના પ્રોવિડન્ડ ફંડ અને ગ્રેચ્યુઈટી ઉપર પણ આ નિર્ણયની અસર જોવા મળશે.\nબમણું થયું વેરિએબલ મોંઘવારી ભથ્થું\n\nકેન્દ્રીય શ્રમ અને રોજગાર મંત્રાલયે કેન્દ્રીય કર્મચારીઓ માટે વેરિએબલ મોંઘવારી ભથ્થામાં વધારો કર્યો છે. કર્મચારીઓને વેરિએબલ મોંઘવારી ભથ્થું હે પહેલા 105 રૂપિયા મહિના પ્રમાણે મળતું હતું તે હવે વધીને બમણું થયું છે. એટલે કે હવે 210 રૂપિયા દર મહિને મળશે.\nકેન્દ્રીય શ્રમ અને રોજગાર મંત્રાલયના આ નિર્ણયથી કેન્દ્ર સરકાર, રેલવે, ખાણ, ઓઈલ ફિલ્ડ્સ, પોર્ટ અને કેન્દ્ર સરકાર સંલગ્ન અન્ય કાર્યાલયોમાં કામ કરનારા લગભગ 1.5 કરોડ કર્મચારીઓને તેનો સીધો ફાયદો મળે તેવી આશા છે. કેન્દ્ર સરકારના જણાવ્યાં મુજબ વેરિએબલ મોંઘવારી ભથ્થામાં વધારાનો ફાયદો કોન્ટ્રાક્ટ અને હંગામી રીતે કાર્યરત કર્મચારીઓને પણ મળશે."

# Tokenize input text
inputs = tokenizer(input_text, return_tensors="pt")

# Generate output
outputs = model.generate(**inputs)

# Decode output tokens
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print output text
print("Output:", output_text)

# **Predict sarcastic data using sarcastic Model**

In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# Load fine-tuned model and tokenizer
model = MT5ForConditionalGeneration.from_pretrained("output_mt5_gu_sar_m")
tokenizer = MT5Tokenizer.from_pretrained("output_mt5_gu_sar_m")

In [None]:
import pandas as pd

# Load your CSV file
csv_file_path = "gujarati_sarcastic_test_zeroM.csv"  # Replace with the path to your CSV file
df = pd.read_csv(csv_file_path)

# Extract the "title" column
titles = df["title"]
text = df["text"]

# Save the titles to a text file
output_txt_file_path = "ground_truth_gu_sar_title.txt"  # Replace with the path to your output text file
with open(output_txt_file_path, "w", encoding="utf-8") as output_file:
    for title in titles:
        output_file.write(title + "\n")

output_txt_file_path = "ground_truth_gu_sar_text.txt"  # Replace with the path to your output text file
with open(output_txt_file_path, "w", encoding="utf-8") as output_file:
    for title in titles:
        output_file.write(title + "\n")

In [None]:


def read_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = [line.strip() for line in file.readlines()]
    return sentences

title_file = 'ground_truth_gu_sar_title.txt'
text_file = 'ground_truth_gu_sar_text.txt'

# Read sentences from files
title_sentences = read_sentences_from_file(title_file)
text_sentences = read_sentences_from_file(text_file)




In [None]:
# Define input text as an array of sentences
# input_texts = ["કોરોના મહામારી સંકટ", "મહામારી"]

# List to store generated outputs
output_texts = []

# Process each sentence to generate output
for input_text in text_sentences:
    # Tokenize input text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Generate output
    outputs = model.generate(**inputs)

    # Decode output tokens
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Append output to the list
    output_texts.append(output_text)

    # Print output text
    print("Input:", input_text)
    print("Output:", output_text)
    print()

# Write output texts to a text file
with open("generated_outputs.txt", "w", encoding="utf-8") as file:
    for output_text in output_texts:
        file.write(output_text + "\n")


# **Calculate scores**

In [None]:
import pandas as pd

# Load your CSV file
csv_file_path = "gujarati_sarcastic_test_zeroM.csv"  # Replace with the path to your CSV file
df = pd.read_csv(csv_file_path)

# Extract the "title" column
titles = df["title"]

# Save the titles to a text file
output_txt_file_path = "ground_truth_gu_sar.txt"  # Replace with the path to your output text file
with open(output_txt_file_path, "w", encoding="utf-8") as output_file:
    for title in titles:
        output_file.write(title + "\n")

In [None]:
!pip install rouge-score
!pip install rouge

In [None]:
# Open the text file for reading
with open('generated_outputs.txt', 'r', encoding='utf-8') as file:
    # Read all lines from the file
    lines = file.readlines()

# Modify each line to remove '<extra_id_0>'
modified_lines = [line.replace('<extra_id_0> ', '') for line in lines]

# Open the same text file for writing
with open('generated_outputs.txt', 'w', encoding='utf-8') as file:
    # Write the modified lines back to the file
    file.writelines(modified_lines)


In [None]:


def read_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = [line.strip() for line in file.readlines()]
    return sentences

generated_file = 'generated_outputs.txt'
reference_file = 'ground_truth_gu_sar.txt'

# Read sentences from files
generated_sentences = read_sentences_from_file(generated_file)
reference_sentences = read_sentences_from_file(reference_file)




In [None]:
from rouge_score import rouge_scorer
import nltk
from nltk.translate.bleu_score import corpus_bleu

In [None]:
# Bleu Score
bleu_score = corpus_bleu([[refer] for refer in reference_sentences], generated_sentences)
print("Blue Score", bleu_score)


In [None]:
#Rouge Score
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text


def calculate_ROUGE_score(ref_file, hyp_file):
    # Read reference and hypothesis text from files
    reference_text = read_text_file(ref_file)
    hypothesis_text = read_text_file(hyp_file)

    # Initialize ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL'], use_stemmer=True)
    # Calculate ROUGE scores
    scores = scorer.score(hypothesis_text, reference_text)
    return scores


rouge_scores = calculate_ROUGE_score(reference_file,generated_file)
print("Rouge Score",rouge_scores)

# **Predict non-sarcastic data using non-sarcastic Model**

In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# Load fine-tuned model and tokenizer
model = MT5ForConditionalGeneration.from_pretrained("output_mt5_gu_non_sar_m")
tokenizer = MT5Tokenizer.from_pretrained("output_mt5_gu_non_sar_m")

In [None]:
import pandas as pd

# Load your CSV file
csv_file_path = "gujarati_non_sarcastic_test_zeroM.csv"  # Replace with the path to your CSV file
df = pd.read_csv(csv_file_path)

# Extract the "title" column
titles = df["title"]
text = df["text"]

# Save the titles to a text file
output_txt_file_path = "ground_truth_gu_non_sar_title.txt"  # Replace with the path to your output text file
with open(output_txt_file_path, "w", encoding="utf-8") as output_file:
    for title in titles:
        output_file.write(title + "\n")

output_txt_file_path = "ground_truth_non_gu_sar_text.txt"  # Replace with the path to your output text file
with open(output_txt_file_path, "w", encoding="utf-8") as output_file:
    for title in titles:
        output_file.write(title + "\n")

In [None]:
def read_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = [line.strip() for line in file.readlines()]
    return sentences

title_file = 'ground_truth_gu_non_sar_title.txt'
text_file = 'ground_truth_non_gu_sar_text.txt'

# Read sentences from files
title_sentences = read_sentences_from_file(title_file)
text_sentences = read_sentences_from_file(text_file)




In [None]:
# Define input text as an array of sentences
# input_texts = ["કોરોના મહામારી સંકટ", "મહામારી"]

# List to store generated outputs
output_texts = []

# Process each sentence to generate output
for input_text in text_sentences:
    # Tokenize input text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Generate output
    outputs = model.generate(**inputs)

    # Decode output tokens
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Append output to the list
    output_texts.append(output_text)

    # Print output text
    print("Input:", input_text)
    print("Output:", output_text)
    print()

# Write output texts to a text file
with open("generated_outputs_non.txt", "w", encoding="utf-8") as file:
    for output_text in output_texts:
        file.write(output_text + "\n")


# **Calculate scores**

In [None]:
import pandas as pd

# Load your CSV file
csv_file_path = "gujarati_non_sarcastic_test_zeroM.csv"  # Replace with the path to your CSV file
df = pd.read_csv(csv_file_path)

# Extract the "title" column
titles = df["title"]

# Save the titles to a text file
output_txt_file_path = "ground_truth_gu_non_sar.txt"  # Replace with the path to your output text file
with open(output_txt_file_path, "w", encoding="utf-8") as output_file:
    for title in titles:
        output_file.write(title + "\n")

In [None]:
# Open the text file for reading
with open('generated_outputs_non.txt', 'r', encoding='utf-8') as file:
    # Read all lines from the file
    lines = file.readlines()

# Modify each line to remove '<extra_id_0>'
modified_lines = [line.replace('<extra_id_0> ', '') for line in lines]

# Open the same text file for writing
with open('generated_outputs_non.txt', 'w', encoding='utf-8') as file:
    # Write the modified lines back to the file
    file.writelines(modified_lines)


In [None]:


def read_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = [line.strip() for line in file.readlines()]
    return sentences

generated_file = 'generated_outputs_non.txt'
reference_file = 'ground_truth_gu_non_sar.txt'

# Read sentences from files
generated_sentences = read_sentences_from_file(generated_file)
reference_sentences = read_sentences_from_file(reference_file)




In [None]:
# Bleu Score
bleu_score = corpus_bleu([[refer] for refer in reference_sentences], generated_sentences)
print("Bleu Score", bleu_score)

In [None]:
#Rouge Score
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text


def calculate_ROUGE_score(ref_file, hyp_file):
    # Read reference and hypothesis text from files
    reference_text = read_text_file(ref_file)
    hypothesis_text = read_text_file(hyp_file)

    # Initialize ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL'], use_stemmer=True)
    # Calculate ROUGE scores
    scores = scorer.score(hypothesis_text, reference_text)
    return scores


rouge_scores = calculate_ROUGE_score(reference_file,generated_file)
print("Rouge Score",rouge_scores)

# **Predict sarcastic data using non-sarcastic Model**

In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# Load fine-tuned model and tokenizer
model = MT5ForConditionalGeneration.from_pretrained("output_mt5_gu_non_sar_m")
tokenizer = MT5Tokenizer.from_pretrained("output_mt5_gu_non_sar_m")

In [None]:
import pandas as pd

# Load your CSV file
csv_file_path = "gujarati_sarcastic_test_zeroM.csv"  # Replace with the path to your CSV file
df = pd.read_csv(csv_file_path)

# Extract the "title" column
titles = df["title"]
text = df["text"]

# Save the titles to a text file
output_txt_file_path = "ground_truth_gu_sar_title.txt"  # Replace with the path to your output text file
with open(output_txt_file_path, "w", encoding="utf-8") as output_file:
    for title in titles:
        output_file.write(title + "\n")

output_txt_file_path = "ground_truth_gu_sar_text.txt"  # Replace with the path to your output text file
with open(output_txt_file_path, "w", encoding="utf-8") as output_file:
    for title in titles:
        output_file.write(title + "\n")

In [None]:


def read_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = [line.strip() for line in file.readlines()]
    return sentences

title_file = 'ground_truth_gu_sar_title.txt'
text_file = 'ground_truth_gu_sar_text.txt'

# Read sentences from files
title_sentences = read_sentences_from_file(title_file)
text_sentences = read_sentences_from_file(text_file)




In [None]:
# Define input text as an array of sentences
# input_texts = ["કોરોના મહામારી સંકટ", "મહામારી"]

# List to store generated outputs
output_texts = []

# Process each sentence to generate output
for input_text in text_sentences:
    # Tokenize input text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Generate output
    outputs = model.generate(**inputs)

    # Decode output tokens
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Append output to the list
    output_texts.append(output_text)

    # Print output text
    print("Input:", input_text)
    print("Output:", output_text)
    print()

# Write output texts to a text file
with open("generated_outputs_non_sar.txt", "w", encoding="utf-8") as file:
    for output_text in output_texts:
        file.write(output_text + "\n")


# **Calculate scores**

In [None]:
import pandas as pd

# Load your CSV file
csv_file_path = "gujarati_sarcastic_test_zeroM.csv"  # Replace with the path to your CSV file
df = pd.read_csv(csv_file_path)

# Extract the "title" column
titles = df["title"]

# Save the titles to a text file
output_txt_file_path = "ground_truth_gu_sar.txt"  # Replace with the path to your output text file
with open(output_txt_file_path, "w", encoding="utf-8") as output_file:
    for title in titles:
        output_file.write(title + "\n")

In [None]:
# Open the text file for reading
with open('generated_outputs_non_sar.txt', 'r', encoding='utf-8') as file:
    # Read all lines from the file
    lines = file.readlines()

# Modify each line to remove '<extra_id_0>'
modified_lines = [line.replace('<extra_id_0> ', '') for line in lines]

# Open the same text file for writing
with open('generated_outputs_non_sar.txt', 'w', encoding='utf-8') as file:
    # Write the modified lines back to the file
    file.writelines(modified_lines)


In [None]:


def read_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = [line.strip() for line in file.readlines()]
    return sentences

generated_file = 'generated_outputs_non_sar.txt'
reference_file = 'ground_truth_gu_sar.txt'

# Read sentences from files
generated_sentences = read_sentences_from_file(generated_file)
reference_sentences = read_sentences_from_file(reference_file)




In [None]:
from rouge_score import rouge_scorer
import nltk
from nltk.translate.bleu_score import corpus_bleu

In [None]:
# Bleu Score
bleu_score = corpus_bleu([[refer] for refer in reference_sentences], generated_sentences)
print("Blue Score", bleu_score)

In [None]:
#Rouge Score
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text


def calculate_ROUGE_score(ref_file, hyp_file):
    # Read reference and hypothesis text from files
    reference_text = read_text_file(ref_file)
    hypothesis_text = read_text_file(hyp_file)

    # Initialize ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL'], use_stemmer=True)
    # Calculate ROUGE scores
    scores = scorer.score(hypothesis_text, reference_text)
    return scores


rouge_scores = calculate_ROUGE_score(reference_file,generated_file)
print("Rouge Score",rouge_scores)

## **Telugu sarcastic model on sarcastic articles**

In [4]:
!python transformers/examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path google/mt5-small \
    --do_train True \
    --do_eval False \
    --do_predict True \
    --source_prefix "summarize: " \
    --train_file telugu_sarcastic_train.csv \
    --test_file tl_sarcastic_articles_test.csv \
    --text_column "text" \
    --summary_column "title" \
    --max_target_length 298 \
    --output_dir output_mt5_tl/ \
    --per_device_train_batch_size=4 \
    --num_train_epochs 3 \
    --logging_strategy "epoch" \
    --save_strategy "no" \
    --overwrite_output_dir True \
    --predict_with_generate $@ 2>&1>./hg_mt5_log.txt

2024-04-23 14:34:51.277559: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-23 14:34:51.277604: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-23 14:34:51.278949: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-23 14:34:51.286639: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Using custom data configuration default-32c7b

In [5]:

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [6]:
!cp -r output_mt5_tl /content/drive/My\ Drive/saved_trainer/


In [7]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# Load fine-tuned model and tokenizer
model = MT5ForConditionalGeneration.from_pretrained("output_mt5_tl")
tokenizer = MT5Tokenizer.from_pretrained("output_mt5_tl")

# Define input text
input_text = "పోర్ట్‌ల్యాండ్, మైనే - ఈ ఉదయమే దీన్ని తపాలా కార్యాలయంలో వేశానని, నువ్వు అందుకోవాలని మీ అమ్మ శనివారం తెలియజేసింది, నివేదికలు ధృవీకరించాయి. 'నీకు ఏదో పంపుతున్నాను' అని మీ అమ్మ నుండి ఒక టెక్స్ట్ సందేశం వచ్చింది. 'ఏదో కొన్ని రోజుల్లో అందుకుంటావులే' అని తెలిపింది. 'నీకిది వచ్చాక నాకు చెప్పాలి' అని మీ అమ్మ రాసింది. ప్రెస్ టైమ్ లో, మీ అమ్మ 'నీకు కావాలంటే' అని ప్యాకేజ్ ట్రాకింగ్ నంబర్‌ను మీకు ఈమెయిల్ చేసింది."

# Tokenize input text
inputs = tokenizer(input_text, return_tensors="pt")

# Generate output
outputs = model.generate(**inputs)

# Decode output tokens
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print output text
print("Output:", output_text)



Output: <extra_id_0>. 'నీకు కావాలంటే' అని మీ అమ్మ రాసింది.


In [61]:
with open('tl_sarcastic_vs_sarcastic_mt5.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

cleaned_lines = [line.replace('<extra_id_0>', '') for line in lines]

with open('tl_sarcastic_vs_sarcastic_mt5_trained_cleaned.txt', 'w', encoding='utf-8') as file:
    file.writelines(cleaned_lines)


In [63]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('tl_sarcastic_articles_test.csv')

# Extract the 'title' column
titles = df['title']


In [64]:
from nltk.translate.bleu_score import corpus_bleu

# Example ground truth and model-generated outputs as lists of strings
ground_truth = cleaned_lines
model_output = titles

# Convert strings to lists of tokens
ground_truth_tokenized = [reference.split() for reference in ground_truth]
model_output_tokenized = [hypothesis.split() for hypothesis in model_output]

# Calculate the BLEU score
bleu_score = corpus_bleu([[reference] for reference in ground_truth_tokenized], model_output_tokenized)

print(f"BLEU score: {bleu_score}")


BLEU score: 0.0068536145655216886


In [68]:
model_output=list(model_output)

In [69]:
from rouge import Rouge

# Example ground truth and model-generated outputs as lists of strings
ground_truth = cleaned_lines

# Initialize the ROUGE scorer
rouge = Rouge()

# Calculate ROUGE scores
scores = rouge.get_scores(model_output, ground_truth, avg=True)

print(scores)


{'rouge-1': {'r': 0.09686167929875023, 'p': 0.07364541658019916, 'f': 0.07851776803725381}, 'rouge-2': {'r': 0.01920002300437083, 'p': 0.01739138478268913, 'f': 0.017370426123417017}, 'rouge-l': {'r': 0.09533188863852479, 'p': 0.07180506691376254, 'f': 0.07684919442932092}}


## **Telugu Non sarcastic model on sarcastic articles**

In [14]:
!python transformers/examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path google/mt5-small \
    --do_train True \
    --do_eval False \
    --do_predict True \
    --source_prefix "summarize: " \
    --train_file telugu_non_sarcastic_train.csv \
    --test_file tl_sarcastic_articles_test.csv \
    --text_column "text" \
    --summary_column "title" \
    --max_target_length 298 \
    --output_dir output_mt5_tl_nonsarcastic/ \
    --per_device_train_batch_size=4 \
    --num_train_epochs 3 \
    --logging_strategy "epoch" \
    --save_strategy "no" \
    --overwrite_output_dir True \
    --predict_with_generate $@ 2>&1>./hg_mt5_log.txt

2024-04-23 15:04:40.100029: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-23 15:04:40.100082: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-23 15:04:40.101575: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-23 15:04:40.109731: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Using custom data configuration default-b45e9

In [15]:

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
!cp -r output_mt5_tl_nonsarcastic /content/drive/My\ Drive/saved_non_sarcastictrainer/


In [17]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# Load fine-tuned model and tokenizer
model = MT5ForConditionalGeneration.from_pretrained("output_mt5_tl")
tokenizer = MT5Tokenizer.from_pretrained("output_mt5_tl")

# Define input text
input_text = "పోర్ట్‌ల్యాండ్, మైనే - ఈ ఉదయమే దీన్ని తపాలా కార్యాలయంలో వేశానని, నువ్వు అందుకోవాలని మీ అమ్మ శనివారం తెలియజేసింది, నివేదికలు ధృవీకరించాయి. 'నీకు ఏదో పంపుతున్నాను' అని మీ అమ్మ నుండి ఒక టెక్స్ట్ సందేశం వచ్చింది. 'ఏదో కొన్ని రోజుల్లో అందుకుంటావులే' అని తెలిపింది. 'నీకిది వచ్చాక నాకు చెప్పాలి' అని మీ అమ్మ రాసింది. ప్రెస్ టైమ్ లో, మీ అమ్మ 'నీకు కావాలంటే' అని ప్యాకేజ్ ట్రాకింగ్ నంబర్‌ను మీకు ఈమెయిల్ చేసింది."

# Tokenize input text
inputs = tokenizer(input_text, return_tensors="pt")

# Generate output
outputs = model.generate(**inputs)

# Decode output tokens
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print output text
print("Output:", output_text)



Output: <extra_id_0>. 'నీకు కావాలంటే' అని మీ అమ్మ రాసింది.


In [52]:
with open('tl_sarcastic_vs_non_sarcastic_mt5.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

cleaned_lines = [line.replace('<extra_id_0>', '') for line in lines]

with open('tl_sarcastic_vs_non_sarcastic_mt5_trained_cleaned.txt', 'w', encoding='utf-8') as file:
    file.writelines(cleaned_lines)


In [54]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('tl_sarcastic_articles_test.csv')

# Extract the 'title' column
titles = df['title']


In [55]:
from nltk.translate.bleu_score import corpus_bleu

# Example ground truth and model-generated outputs as lists of strings
ground_truth = cleaned_lines
model_output = titles

# Convert strings to lists of tokens
ground_truth_tokenized = [reference.split() for reference in ground_truth]
model_output_tokenized = [hypothesis.split() for hypothesis in model_output]

# Calculate the BLEU score
bleu_score = corpus_bleu([[reference] for reference in ground_truth_tokenized], model_output_tokenized)

print(f"BLEU score: {bleu_score}")


BLEU score: 0.013472341009816211


In [58]:
model_output=list(model_output)

In [60]:
from rouge import Rouge

# Example ground truth and model-generated outputs as lists of strings
ground_truth = ['this is a test', 'this is another test']
model_output = ['this is a test', 'this is a different test']

# Initialize the ROUGE scorer
rouge = Rouge()

# Calculate ROUGE scores
scores = rouge.get_scores(model_output, ground_truth, avg=True)

print(scores)


{'rouge-1': {'r': 0.875, 'p': 0.8, 'f': 0.8333333283641975}, 'rouge-2': {'r': 0.6666666666666666, 'p': 0.625, 'f': 0.6428571379081633}, 'rouge-l': {'r': 0.875, 'p': 0.8, 'f': 0.8333333283641975}}


## **Telugu Non sarcastic model on non sarcastic articles**

In [5]:
!python transformers/examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path google/mt5-small \
    --do_train True \
    --do_eval False \
    --do_predict True \
    --source_prefix "summarize: " \
    --train_file telugu_non_sarcastic_train.csv \
    --test_file tl_non_sarcastic_articles_test.csv \
    --text_column "text" \
    --summary_column "title" \
    --max_target_length 298 \
    --output_dir output_mt5_tl_nonsarcastic_and_ns/ \
    --per_device_train_batch_size=4 \
    --num_train_epochs 3 \
    --logging_strategy "epoch" \
    --save_strategy "no" \
    --overwrite_output_dir True \
    --predict_with_generate $@ 2>&1>./hg_mt5_log.txt

2024-04-23 15:42:10.532684: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-23 15:42:10.532738: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-23 15:42:10.534725: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-23 15:42:10.551569: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Using custom data configuration default-e96fb76b8fc0e

In [6]:

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [7]:
!cp -r output_mt5_tl_nonsarcastic_and_ns /content/drive/My\ Drive/saved_non_sarcastictrainer_and_ns/


In [8]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# Load fine-tuned model and tokenizer
model = MT5ForConditionalGeneration.from_pretrained("output_mt5_tl_nonsarcastic_and_ns")
tokenizer = MT5Tokenizer.from_pretrained("output_mt5_tl_nonsarcastic_and_ns")

# Define input text
input_text = "పోర్ట్‌ల్యాండ్, మైనే - ఈ ఉదయమే దీన్ని తపాలా కార్యాలయంలో వేశానని, నువ్వు అందుకోవాలని మీ అమ్మ శనివారం తెలియజేసింది, నివేదికలు ధృవీకరించాయి. 'నీకు ఏదో పంపుతున్నాను' అని మీ అమ్మ నుండి ఒక టెక్స్ట్ సందేశం వచ్చింది. 'ఏదో కొన్ని రోజుల్లో అందుకుంటావులే' అని తెలిపింది. 'నీకిది వచ్చాక నాకు చెప్పాలి' అని మీ అమ్మ రాసింది. ప్రెస్ టైమ్ లో, మీ అమ్మ 'నీకు కావాలంటే' అని ప్యాకేజ్ ట్రాకింగ్ నంబర్‌ను మీకు ఈమెయిల్ చేసింది."

# Tokenize input text
inputs = tokenizer(input_text, return_tensors="pt")

# Generate output
outputs = model.generate(**inputs)

# Decode output tokens
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print output text
print("Output:", output_text)



Output: <extra_id_0> కోసం పంపుతున్నాను.


In [70]:
with open('tl_non_sarcastic_vs_non_sarcastic_mt5.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

cleaned_lines = [line.replace('<extra_id_0>', '') for line in lines]

with open('tl_non_sarcastic_vs_non_sarcastic_mt5_trained_cleaned.txt', 'w', encoding='utf-8') as file:
    file.writelines(cleaned_lines)


In [71]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('tl_non_sarcastic_articles_test.csv')

# Extract the 'title' column
titles = list(df['title'])


In [72]:
from nltk.translate.bleu_score import corpus_bleu

# Example ground truth and model-generated outputs as lists of strings
ground_truth = cleaned_lines
model_output = titles

# Convert strings to lists of tokens
ground_truth_tokenized = [reference.split() for reference in ground_truth]
model_output_tokenized = [hypothesis.split() for hypothesis in model_output]

# Calculate the BLEU score
bleu_score = corpus_bleu([[reference] for reference in ground_truth_tokenized], model_output_tokenized)

print(f"BLEU score: {bleu_score}")


BLEU score: 0.007293432929988352


In [73]:
from rouge import Rouge

# Example ground truth and model-generated outputs as lists of strings
ground_truth = cleaned_lines
model_output = titles

# Initialize the ROUGE scorer
rouge = Rouge()

# Calculate ROUGE scores
scores = rouge.get_scores(model_output, ground_truth, avg=True)

print(scores)


{'rouge-1': {'r': 0.0947597442240299, 'p': 0.07618976261833404, 'f': 0.08030237757727998}, 'rouge-2': {'r': 0.019254792826221393, 'p': 0.014123277516134654, 'f': 0.015853166066138503}, 'rouge-l': {'r': 0.09404545850974419, 'p': 0.07539611182468325, 'f': 0.07955049787803187}}
