In [None]:
# Run this only if you get error below

# !pip install transfromers sentencepiece torch datasets

In [2]:
import pandas as pd
from datasets import Dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
import evaluate

In [3]:
df_english = pd.read_json('English.json')

In [4]:
df_hindi = pd.read_json('Hindi.json')

In [7]:
df_english['text']= df_english['instruction']+ " " + \
df_english['input'] + " " + df_english['output']

In [10]:
df_hindi['text'] = df_hindi['instruction'] + " " + df_hindi["input"] + " " + df_hindi['output']


In [11]:
df_english = df_english[['text']]
df_hindi = df_hindi[['text']]


In [12]:
df_english

Unnamed: 0,text
0,Which is a species of fish? Tope or Rope nan Tope
1,Why can camels survive for long without water?...
2,"Alice's parents have three daughters: Amy, Jes..."
3,When was Tomoaki Komorida born? Komorida was b...
4,If I have more pieces at the time of stalemate...
...,...
15008,How do i accept the change nan Embrace the cha...
15009,What is a laser and who created it? A laser is...
15010,What is the difference between a road bike and...
15011,How does GIS help in the real estate investmen...


In [13]:
df_hindi

Unnamed: 0,text
0,मछली की कौन सी प्रजाति है? टोपे या रस्सी nan ...
1,ऊँट बिना पानी के लम्बे समय तक जीवित क्यों रह ...
2,"ऐलिस के माता-पिता की तीन बेटियाँ हैं: एमी, जेस..."
3,टोमोआकी कोमोरिडा का जन्म कब हुआ था? कोमोरिडा ...
4,"यदि गतिरोध के समय मेरे पास अधिक टुकड़े हों, त..."
...,...
15008,मैं परिवर्तन को कैसे स्वीकार करूं? nan बदलाव...
15009,लेज़र क्या है और इसे किसने बनाया? लेज़र एक उप...
15010,रोड बाइक और माउंटेन बाइक में क्या अंतर है? na...
15011,जीआईएस रियल एस्टेट निवेश उद्योग में कैसे मदद क...


In [14]:
df_combined = pd.concat([df_english, df_hindi], axis=1)
df_combined.columns = ['english_text', 'hindi_text']

In [15]:
df_combined

Unnamed: 0,english_text,hindi_text
0,Which is a species of fish? Tope or Rope nan Tope,मछली की कौन सी प्रजाति है? टोपे या रस्सी nan ...
1,Why can camels survive for long without water?...,ऊँट बिना पानी के लम्बे समय तक जीवित क्यों रह ...
2,"Alice's parents have three daughters: Amy, Jes...","ऐलिस के माता-पिता की तीन बेटियाँ हैं: एमी, जेस..."
3,When was Tomoaki Komorida born? Komorida was b...,टोमोआकी कोमोरिडा का जन्म कब हुआ था? कोमोरिडा ...
4,If I have more pieces at the time of stalemate...,"यदि गतिरोध के समय मेरे पास अधिक टुकड़े हों, त..."
...,...,...
15008,How do i accept the change nan Embrace the cha...,मैं परिवर्तन को कैसे स्वीकार करूं? nan बदलाव...
15009,What is a laser and who created it? A laser is...,लेज़र क्या है और इसे किसने बनाया? लेज़र एक उप...
15010,What is the difference between a road bike and...,रोड बाइक और माउंटेन बाइक में क्या अंतर है? na...
15011,How does GIS help in the real estate investmen...,जीआईएस रियल एस्टेट निवेश उद्योग में कैसे मदद क...


In [16]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_combined)

In [17]:
dataset

Dataset({
    features: ['english_text', 'hindi_text'],
    num_rows: 15013
})

In [18]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"

In [19]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")


In [20]:
model

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [21]:
def preprocess_function_en_hi(examples):
    inputs = [f"translate English to Hindi: {text}" for text in examples["english_text"]]
    targets = examples["hindi_text"]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [22]:
def preprocess_function_hi_en(examples):
    inputs = [f"translate Hindi to English: {text}" for text in examples["hindi_text"]]
    targets = examples["english_text"]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [23]:
tokenized_dataset_en_hi = dataset.map(preprocess_function_en_hi, batched=True)
tokenized_dataset_hi_en = dataset.map(preprocess_function_hi_en, batched=True)

Map:   0%|          | 0/15013 [00:00<?, ? examples/s]

Map:   0%|          | 0/15013 [00:00<?, ? examples/s]

In [24]:
tokenized_dataset_en_hi

Dataset({
    features: ['english_text', 'hindi_text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 15013
})

In [25]:
tokenized_dataset_hi_en

Dataset({
    features: ['english_text', 'hindi_text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 15013
})

In [26]:
from datasets import concatenate_datasets
full_dataset = concatenate_datasets([tokenized_dataset_en_hi, tokenized_dataset_hi_en])


In [28]:
training_args=Seq2SeqTrainingArguments(

    output_dir='./subash_folder',
    evaluation_strategy="no",
    learning_rate= 0.01,
    per_device_train_batch_size=8,
    weight_decay = 0.01,
    num_train_epochs=1, # Increase this to 5-10 
    predict_with_generate=True,
    fp16= True,
    logging_dir="./mylogs"
)



In [30]:
trainer= Seq2SeqTrainer(
    model = model,
    args= training_args,
    train_dataset=full_dataset,
    tokenizer= tokenizer

)

  trainer= Seq2SeqTrainer(


In [33]:
trainer.train()

In [None]:
model.save_pretrained("./subash_model")
tokenizer.save_pretrained("./subash_model")

In [None]:
model.push_to_hub("subash/hindi_english_translator", use_auth_token="your hugging face token")

# Use this model

In [None]:
def translate(text, direction="en-hi"):
    if direction == "en-hi":
        prompt = f"translate English to Hindi: {text}"
    else:
        prompt = f"translate Hindi to English: {text}"

    model.to("cuda" if torch.cuda.is_available() else "cpu")  # Ensure model is on GPU
    model.eval()  # Set model to evaluation mode

    with torch.no_grad():
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding="max_length", max_length=128).to(model.device)
        output = model.generate(**inputs, max_length=100, num_beams=5)
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    return tokenizer.decode(output[0], skip_special_tokens=True)


In [None]:
translate("""Mr. Subash is working on machine learning system. He is 25 years old. He lives in Atlanta. """)

In [None]:
!ls ./mbart-finetuned-en-hi-hi-en/checkpoint-500