In [2]:
!pip install datasets




In [3]:
from datasets import load_dataset
# Load the dataset named "cfilt/iitb-english-hindi"; each row holds a
# "translation" dict with an "en" and a "hi" string (see the cells below).
dataset = load_dataset("cfilt/iitb-english-hindi")


In [4]:
# Display the available splits together with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [5]:
# Number of rows in the train split.
len(dataset['train'])

1659083

In [6]:
# Token limit applied when tokenizing (and truncating) sources and targets
# in preprocess_function().
max_length = 256

# Load the pretrained "Helsinki-NLP/opus-mt-en-hi" tokenizer and seq2seq model.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-hi")




In [7]:
# Sanity check: translate one English validation sentence with the model as
# loaded, before any fine-tuning.
article = dataset['validation'][2]['translation']['en']
inputs = tokenizer(article, return_tensors="pt")

# Reuse the notebook-wide `max_length` constant instead of repeating the
# literal 256, so the limit is defined in exactly one place.
translated_tokens = model.generate(**inputs, max_length=max_length)

# Last expression of the cell: the decoded translation is displayed.
tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]


'एमएनएपी शिक्षकों के राष्ट्रपति, राजस्वीवर ने इस पुरस्कार को पेश करने के द्वारा स्कूल की प्रतिष्ठा की.'

In [8]:
# Reference Hindi text of the same validation example (index 2), for comparison.
dataset['validation'][2]['translation']['hi']


'मनपा शिक्षक संघ के अध्यक्ष राजेश गवरे ने स्कूल को भेंट देकर सराहना की।'

In [9]:
def preprocess_function(examples):
  """Tokenize a batch of English/Hindi pairs for seq2seq fine-tuning.

  Args:
    examples: A batch as passed by `Dataset.map(batched=True)`;
      `examples["translation"]` is a list of dicts, each with an "en"
      (source) and a "hi" (target) string.

  Returns:
    The tokenizer output for the English sources, with the token ids of the
    Hindi targets stored under the "labels" key.
  """
  inputs = [ex["en"] for ex in examples["translation"]]
  targets = [ex["hi"] for ex in examples["translation"]]

  # The targets must be encoded as *target-language* text. Passing them via
  # `text_target=` does that and stores their ids under "labels"; calling
  # `tokenizer(targets)` directly would encode the Hindi text with the
  # source-side settings instead.
  model_inputs = tokenizer(
      inputs,
      text_target=targets,
      max_length=max_length,
      truncation=True,
  )

  return model_inputs


In [10]:
# Tokenize the validation and test splits with identical settings. All
# original columns are removed so only the tokenizer outputs remain.
tokenized_datasets_validation, tokenized_datasets_test = (
    dataset[split_name].map(
        preprocess_function,
        batched=True,
        batch_size=2,
        remove_columns=dataset[split_name].column_names,
    )
    for split_name in ("validation", "test")
)


In [11]:
from transformers import DataCollatorForSeq2Seq

# Batch collator built from the tokenizer and the model; it is handed to the
# trainer as `data_collator` further down.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [12]:
# Selectively freeze transformer layers before fine-tuning.
#
# All parameters are first made trainable. Then, in both the encoder and the
# decoder stack, every layer except the LAST `num_layers_to_freeze` ones is
# frozen. Despite its name, the value is therefore the number of trailing
# layers that stay trainable in each stack.
# NOTE(review): with the value 10, nothing is frozen unless a stack has more
# than 10 layers (check `len(model.model.encoder.layers)`) — confirm the
# intended value.
for parameter in model.parameters():
    parameter.requires_grad = True

num_layers_to_freeze = 10  # Adjust as needed

# Each stack is measured against its own length, so the decoder cut-off no
# longer depends on how many layers the encoder has.
for layers in (model.model.encoder.layers, model.model.decoder.layers):
    first_trainable_index = len(layers) - num_layers_to_freeze
    for layer_index, layer in enumerate(layers):
        if layer_index < first_trainable_index:
            for parameter in layer.parameters():
                parameter.requires_grad = False


In [13]:
!pip install evaluate



In [14]:
!pip install sacrebleu



In [15]:
import evaluate

# Metric object loaded under the name "sacrebleu"; compute_metrics() calls
# metric.compute() on it.
metric = evaluate.load("sacrebleu")

import numpy as np


def compute_metrics(eval_preds):
    """Decode predictions and references and score them with the loaded metric.

    Args:
        eval_preds: A `(predictions, labels)` pair of token-id arrays.
            `predictions` may itself be a tuple, in which case only its
            first element is used.

    Returns:
        dict: `{"bleu": score}` where `score` is the "score" entry of the
        metric result.
    """
    preds, labels = eval_preds
    # In case the model returns more than the predicted token ids
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore sentinel, not a vocabulary id, so it cannot be
    # decoded. It can appear in the predictions as well as in the labels
    # (e.g. when generated batches of different lengths are concatenated),
    # so both arrays are sanitized before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing; each prediction gets a one-element list
    # of references.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}


In [16]:
import torch
from transformers import Seq2SeqTrainingArguments

# Check if a GPU is available and define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

# Hyper-parameters for a short fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    "finetuned-nlp-en-hi",  # output directory (plain string; nothing to format)
    gradient_checkpointing=False,
    per_device_train_batch_size=16,
    learning_rate=1e-5,
    warmup_steps=2,
    max_steps=1000,  # takes precedence over num_train_epochs
    # Mixed precision needs a GPU; requesting it unconditionally makes this
    # cell fail on the CPU fallback chosen above.
    fp16=(device.type == "cuda"),
    optim='adafactor',
    per_device_eval_batch_size=16,
    metric_for_best_model="eval_bleu",
    predict_with_generate=True,  # evaluate on generated sequences
    push_to_hub=False,
)




In [17]:
from transformers import Seq2SeqTrainer

# Fine-tune on examples drawn from the *train* split. The test split has to
# stay unseen during training, otherwise any score later reported on it is
# contaminated. Only as many examples as the run consumes in one pass
# (max_steps x per-device batch size) are tokenized, because the train split
# is far larger than this short run can cover.
num_train_examples = min(
    len(dataset["train"]),
    training_args.max_steps * training_args.per_device_train_batch_size,
)
# Shuffle with a fixed seed so the subset is reproducible and not just the
# first rows of the corpus.
train_subset = dataset["train"].shuffle(seed=42).select(range(num_train_examples))
tokenized_datasets_train = train_subset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_subset.column_names,
)

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_validation,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


max_steps is given, it will override any value given in num_train_epochs


  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
!pip install gradio



In [None]:
import gradio as gr


def translate(text):
  """Translate `text` with the global `model` and `tokenizer`.

  Args:
    text: The string to translate (free-form input from the UI).

  Returns:
    The decoded translation of the first (only) generated sequence.
  """
  # The text comes straight from a text box, so truncate it to the
  # tokenizer's maximum length instead of letting an over-long input reach
  # the model.
  inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
  translated_tokens = model.generate(**inputs, max_length=256)
  results = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
  return results


# Build the Gradio UI: a two-line text box feeding translate(), with plain
# text output.
interface = gr.Interface(
    fn=translate,
    inputs=gr.Textbox(lines=2, placeholder='Text to translate'),
    outputs='text',
)

# Start serving the UI.
interface.launch()


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [None]:
# Save the fine-tuned model and tokenizer into "finetuned-nlp-en-hi" (the
# same directory name that was given to Seq2SeqTrainingArguments).
model.save_pretrained("finetuned-nlp-en-hi")
tokenizer.save_pretrained("finetuned-nlp-en-hi")

# Load the fine-tuned model and tokenizer back from that directory for
# inference. This rebinds the global `tokenizer` and `model` names, which
# translate() looks up at call time.
tokenizer = AutoTokenizer.from_pretrained("finetuned-nlp-en-hi")
model = AutoModelForSeq2SeqLM.from_pretrained("finetuned-nlp-en-hi")
model.to(device)

# Define the translation function
def translate(text):
    """Translate `text` with the global `model` and `tokenizer`.

    Args:
        text: The string to translate (free-form input from the UI).

    Returns:
        The decoded translation of the first (only) generated sequence.
    """
    # The text comes straight from a text box, so truncate it to the
    # tokenizer's maximum length instead of letting an over-long input reach
    # the model.
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    translated_tokens = model.generate(**inputs, max_length=256)
    results = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return results

# Build the Gradio UI: a two-line text box feeding translate(), with plain
# text output.
interface = gr.Interface(
    fn=translate,
    inputs=gr.Textbox(lines=2, placeholder='Text to translate'),
    outputs='text',
)

# Start serving the UI.
interface.launch()

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}


Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import gradio as gr
import torch
from transformers import Seq2SeqTrainingArguments

# Check if a GPU is available and define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned model and tokenizer for inference from the directory
# written by the save step. The model is moved to the device only after it
# has been loaded; moving it earlier fails on a fresh kernel because `model`
# does not exist yet.
tokenizer = AutoTokenizer.from_pretrained("finetuned-nlp-en-hi")
model = AutoModelForSeq2SeqLM.from_pretrained("finetuned-nlp-en-hi")
model.to(device)

# Define the translation function
def translate(text):
    """Translate `text` with the global `model` and `tokenizer`.

    Args:
        text: The string to translate (free-form input from the UI).

    Returns:
        The decoded translation of the first (only) generated sequence.
    """
    # The text comes straight from a text box, so truncate it to the
    # tokenizer's maximum length instead of letting an over-long input reach
    # the model.
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    translated_tokens = model.generate(**inputs, max_length=256)
    results = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return results

# Build the Gradio UI: a two-line text box feeding translate(), with plain
# text output.
interface = gr.Interface(
    fn=translate,
    inputs=gr.Textbox(lines=2, placeholder='Text to translate'),
    outputs='text',
)

# Start serving the UI.
interface.launch()



Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


