In [None]:
!pip install datasets
!pip install transformers
!pip install scikit-learn

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets

In [None]:
!pip install accelerate>=0.21.0

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from sklearn.model_selection import train_test_split
import os
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Loading the dataset
dataset_path = "/content/drive/MyDrive/train/project dataset.txt"


def _parse_parallel_lines(lines):
    """Parse ``hindi:english`` text lines into a list of row dicts.

    Args:
        lines: Iterable of raw text lines, each expected to contain exactly
            one ``:`` separating the Hindi text from the English text.

    Returns:
        A list of ``{'hindi': ..., 'english': ...}`` dicts with surrounding
        whitespace stripped from both fields. Blank lines are ignored; lines
        without exactly two ``:``-separated fields, or with an empty field,
        are skipped with a warning.
    """
    rows = []
    for line in lines:
        if not line.strip():
            # Blank lines carry no data and are not worth a warning.
            continue
        parts = line.split(':')
        if len(parts) != 2:  # Ensure there are exactly 2 fields
            logger.warning(f"Skipping line due to incorrect format: {line.strip()}")
            continue
        hindi, english = (part.strip() for part in parts)
        if not hindi or not english:
            # A pair with an empty side would train the model on nothing.
            logger.warning(f"Skipping line with an empty field: {line.strip()}")
            continue
        rows.append({'hindi': hindi, 'english': english})
    return rows


# Read the file line by line so malformed lines can be skipped individually.
# Errors are logged and re-raised instead of calling exit(): in a notebook,
# exit() shuts down the kernel and hides the traceback.
try:
    with open(dataset_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
except FileNotFoundError as fnf_error:
    logger.error(f"File not found: {fnf_error}")
    raise

# The first line is always skipped (treated as a header).
data = _parse_parallel_lines(lines[1:])
df = pd.DataFrame(data)
logger.info("Dataset successfully read.")

if df.empty:
    raise ValueError("The dataset is empty. Please check the dataset file.")

# Removing trailing whitespaces and special characters like '|', '.' from Hindi sentences
df['hindi'] = df['hindi'].str.strip('|. ')

# Splitting the dataset (80% train / 20% validation, fixed seed for reproducibility)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
logger.info("Dataset successfully split.")

# Debug information to confirm the split sizes
logger.info(f"Number of training samples: {len(train_df)}")
logger.info(f"Number of validation samples: {len(val_df)}")

# Load the pretrained checkpoint that will be fine-tuned on the Hindi/English pairs.
model_checkpoint = "Helsinki-NLP/opus-mt-hi-en"
logger.info("Loading tokenizer and model...")
# Tokenizer first, then the seq2seq model, both from the same checkpoint.
tokenizer, model = (
    auto_cls.from_pretrained(model_checkpoint)
    for auto_cls in (AutoTokenizer, AutoModelForSeq2SeqLM)
)
logger.info("Tokenizer and model loaded.")

# Tokenize datasets
def tokenize_function(examples, max_length=None):
    """Tokenize a batch of Hindi/English pairs for seq2seq training.

    Args:
        examples: Batch mapping with a ``"hindi"`` list (source sentences) and
            an ``"english"`` list (target sentences), as supplied by
            ``Dataset.map(..., batched=True)``.
        max_length: Length to pad/truncate both sides to. ``None`` (default)
            defers to the tokenizer's own maximum length.

    Returns:
        The source encoding (``input_ids``, ``attention_mask``) with an added
        ``"labels"`` entry holding the target token ids, where every padding
        position is replaced by ``-100``.
    """
    inputs = tokenizer(
        examples["hindi"], max_length=max_length, padding="max_length", truncation=True
    )
    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["english"],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    # Labels are padded to a fixed length; mask the padding with -100 so the
    # loss ignores it. Otherwise the loss is dominated by trivially
    # predictable pad tokens and looks far better than the translations are.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return inputs

def _build_split(frame, created_message, tokenized_message):
    """Convert a DataFrame into a tokenized ``Dataset``, logging each stage."""
    split = Dataset.from_pandas(frame)
    logger.info(created_message)
    split = split.map(tokenize_function, batched=True)
    logger.info(tokenized_message)
    return split


train_dataset = _build_split(
    train_df,
    "Training dataset created from DataFrame.",
    "Training dataset tokenized.",
)
val_dataset = _build_split(
    val_df,
    "Validation dataset created from DataFrame.",
    "Validation dataset tokenized.",
)

# Expose only the model inputs, as PyTorch tensors
logger.info("Setting dataset format for PyTorch tensors...")
tensor_columns = ["input_ids", "attention_mask", "labels"]
for tokenized_split in (train_dataset, val_dataset):
    tokenized_split.set_format(type="torch", columns=tensor_columns)
logger.info("Dataset format set.")

# Training configuration, collected in one mapping so the hyperparameters are
# easy to scan and tweak before being handed to Seq2SeqTrainingArguments.
training_config = {
    "output_dir": "./results",
    "evaluation_strategy": "epoch",  # evaluate once per epoch
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "weight_decay": 0.01,
    "save_total_limit": 3,  # keep at most 3 checkpoints on disk
    "num_train_epochs": 10,
    "predict_with_generate": True,
    "logging_dir": './logs',  # directory for storing logs
    "logging_steps": 100,  # log every 100 updates
    "save_strategy": 'epoch',  # save model checkpoints by epoch
    "logging_first_step": True,
}
training_args = Seq2SeqTrainingArguments(**training_config)
logger.info("Training arguments set.")

# Trainer setup: wires the model, arguments, both dataset splits and tokenizer together
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)
logger.info("Trainer set up.")

# Training the model
logger.info("Training the model...")
try:
    trainer.train()
except Exception as e:
    # Log and re-raise instead of calling exit(): in a notebook, exit() shuts
    # down the kernel, discarding all state and hiding the traceback.
    logger.error(f"Error during model training: {e}")
    raise
logger.info("Model training completed.")

# Save the fine-tuned model and tokenizer to the trained_model folder on Drive
project_folder = os.path.abspath("/content/drive/MyDrive/train/trained_model")
os.makedirs(project_folder, exist_ok=True)
model.save_pretrained(project_folder)
tokenizer.save_pretrained(project_folder)
logger.info(f"Model saved in {project_folder}.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/304M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Map:   0%|          | 0/294 [00:00<?, ? examples/s]



Map:   0%|          | 0/74 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,10.6096,0.081341
2,10.6096,0.070335
3,0.4774,0.066735
4,0.4774,0.064929
5,0.4774,0.063846
6,0.0582,0.063338
7,0.0582,0.062912
8,0.0582,0.062673
9,0.0474,0.06254


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}


Epoch,Training Loss,Validation Loss
1,10.6096,0.081341
2,10.6096,0.070335
3,0.4774,0.066735
4,0.4774,0.064929
5,0.4774,0.063846
6,0.0582,0.063338
7,0.0582,0.062912
8,0.0582,0.062673
9,0.0474,0.06254
10,0.0474,0.062511


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the tokenizer and model from the directory the training cell saved them to.
model_checkpoint = "/content/drive/MyDrive/train/trained_model"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_checkpoint),
    AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint),
)

# Function to generate translation based on input text
def generate_translation(input_text):
    """Translate one input string with the loaded model and return the text.

    The text is tokenized into PyTorch tensors, moved to the model's device,
    decoded with beam search, and the first returned sequence is converted
    back to a string with special tokens removed.

    Args:
        input_text: The sentence to translate.

    Returns:
        The decoded translation as a string.
    """
    encoded = tokenizer(input_text, return_tensors="pt")
    source_ids = encoded.input_ids.to(model.device)

    model.eval()  # inference mode

    # Beam search settings, including the length penalty
    beam_search_options = {
        "max_length": 150,
        "num_beams": 4,
        "length_penalty": 0.8,
        "early_stopping": True,
    }
    candidates = model.generate(source_ids, **beam_search_options)

    best_sequence = candidates[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# Get input from user and generate translation until the user enters 'q'
while True:
    # Strip surrounding whitespace so that e.g. "q " still quits
    user_input = input("Enter a Hindi sentence (press 'q' to quit): ").strip()
    if user_input.lower() == 'q':
        break
    if not user_input:
        # Nothing to translate; ask again instead of sending empty text to the model
        continue
    translated_text = generate_translation(user_input)
    print("Translated Text:", translated_text)
    print()



Enter a Hindi sentence (press 'q' to quit): दुःख में सब सुमिरन करें,सुख में कोई न करे,जो सुख में सुमिरन करे,दुःख कहे को होय?
Translated Text: the one who brings happiness in suffering says sorrow, doesn't anyone say it is true?

Enter a Hindi sentence (press 'q' to quit): तिनका कबहुँ ना निन्दिये,जो पाँवन तर होय,कबहुँ उड़ी आँखिन पड़े,तो पीर घनेरी होय.
Translated Text: Do not blame the chaff, which is lost in the eye, if it doesn't wear an eye, then the heart becomes thick.

Enter a Hindi sentence (press 'q' to quit): माला फेरत जुग भया,फिरा न मन का फेर,कर का मनका डार दे,मन का मनका फेर.
Translated Text: And your heart doesn't turn away, and your mind doesn't change, mind and mind.

Enter a Hindi sentence (press 'q' to quit): माया मरी न मन मरा,मर-मर गए शरीर,आशा तृष्णा न मरी,कह गए दास कबीर
Translated Text: cannot die if the heart is dead, the soul cannot die, if it doesn't die,

Enter a Hindi sentence (press 'q' to quit): q
