In [9]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load and clean the dataset
file_path = "C:\\Users\\prana\\Downloads\\Nouns_new_6.xlsx"
df = pd.read_excel(file_path)
df_cleaned = df[['Word', 'Base Word']].dropna()

# Extracting suffixes (optional, mostly for analysis)
df_cleaned['Suffix'] = df_cleaned.apply(lambda row: row['Word'].replace(row['Base Word'], ''), axis=1)

# Tokenize the words and base words
tokenizer = Tokenizer(char_level=True)  # Character-level tokenization
tokenizer.fit_on_texts(df_cleaned['Word'].tolist() + df_cleaned['Base Word'].tolist())

# Convert words and base words to sequences
word_sequences = tokenizer.texts_to_sequences(df_cleaned['Word'].tolist())
base_word_sequences = tokenizer.texts_to_sequences(df_cleaned['Base Word'].tolist())

# Pad sequences to the same length
max_seq_length = max(max([len(seq) for seq in word_sequences]), max([len(seq) for seq in base_word_sequences]))
word_sequences_padded = pad_sequences(word_sequences, maxlen=max_seq_length, padding='post')
base_word_sequences_padded = pad_sequences(base_word_sequences, maxlen=max_seq_length, padding='post')

# Convert base word sequences to categorical (one-hot encoding)
base_word_sequences_categorical = [to_categorical(seq, num_classes=len(tokenizer.word_index) + 1) for seq in base_word_sequences_padded]

# Convert to numpy arrays
X = np.array(word_sequences_padded)
y = np.array(base_word_sequences_categorical)



In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam

# Define the model with additional LSTM layer and dropout for regularization
model = Sequential()

# Embedding layer with increased dimensions
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_seq_length))

# First Bidirectional LSTM layer
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Dropout(0.2))

# Second Bidirectional LSTM layer
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.2))

# TimeDistributed layer with Dense output to match the one-hot encoded output
model.add(TimeDistributed(Dense(len(tokenizer.word_index) + 1, activation='softmax')))

# Compile the model with Adam optimizer
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()




Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 25, 128)           4992      
                                                                 
 bidirectional_1 (Bidirecti  (None, 25, 512)           788480    
 onal)                                                           
                                                                 
 dropout (Dropout)           (None, 25, 512)           0         
                                                                 
 bidirectional_2 (Bidirecti  (None, 25, 256)           656384    
 onal)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 25, 256)           0         
                                                                 
 time_distributed_2 (TimeDi  (None, 25, 39)           

In [11]:
from tensorflow.keras.callbacks import EarlyStopping

# Implement Early Stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with increased epochs and early stopping
history = model.fit(X, y, epochs=50, batch_size=64, validation_split=0.2, callbacks=[early_stopping])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50


In [15]:
# Evaluate the model
loss, accuracy = model.evaluate(X, y)
print(f"Model Loss: {loss}")
print(f"Model Accuracy: {accuracy}")

# Predict base words for new inputs
def predict_base_word(word):
    # Tokenize and pad the input word
    sequence = tokenizer.texts_to_sequences([word])
    padded_sequence = pad_sequences(sequence, maxlen=max_seq_length, padding='post')
    
    # Predict the base word sequence
    predicted_sequence = model.predict(padded_sequence)
    predicted_indices = np.argmax(predicted_sequence, axis=-1)
    
    # Convert indices back to characters
    predicted_base_word = ''.join([tokenizer.index_word[idx] for idx in predicted_indices[0] if idx > 0])
    
    return predicted_base_word

# Post-processing function to refine predictions
def post_process(predicted_base_word, original_word):
    if predicted_base_word in original_word:
        # Assuming the base word should be part of the original word
        return predicted_base_word
    # Additional heuristics can be applied here as needed
    return predicted_base_word

# Example prediction and post-processing
example_word = "marutām"
predicted_base_word = predict_base_word(example_word)
predicted_base_word = post_process(predicted_base_word, example_word)
print(f"Word: {example_word} -> Predicted Base Word: {predicted_base_word}")



Model Loss: 0.15966664254665375
Model Accuracy: 0.9589112401008606
Word: marutām -> Predicted Base Word: marut


In [17]:
# Function to predict the base word for all words in the dataset
def predict_all_base_words(df_cleaned):
    correct_predictions = 0
    total_predictions = len(df_cleaned)

    predicted_base_words = []

    for index, row in df_cleaned.iterrows():
        word = row['Word']
        actual_base_word = row['Base Word']

        # Predict the base word using the model
        predicted_base_word = predict_base_word(word)
        
        # Post-process the predicted base word
        predicted_base_word = post_process(predicted_base_word, word)
        predicted_base_words.append(predicted_base_word)
        
        # Check if the prediction is correct
        if predicted_base_word == actual_base_word:
            correct_predictions += 1

    # Calculate accuracy
    accuracy = correct_predictions / total_predictions
    return accuracy, predicted_base_words

# Compute accuracy
accuracy, predicted_base_words = predict_all_base_words(df_cleaned)
print(f"Model Accuracy on the Entire Dataset: {accuracy * 100:.2f}%")



Model Accuracy on the Entire Dataset: 49.45%


In [18]:
# Function to predict the base word for all words in the dataset and identify incorrect predictions
def get_incorrect_predictions(df_cleaned):
    incorrect_indices = []

    for index, row in df_cleaned.iterrows():
        word = row['Word']
        actual_base_word = row['Base Word']

        # Predict the base word using the model
        predicted_base_word = predict_base_word(word)
        
        # Post-process the predicted base word
        predicted_base_word = post_process(predicted_base_word, word)
        
        # If the prediction is incorrect, save the index
        if predicted_base_word != actual_base_word:
            incorrect_indices.append(index)

    return df_cleaned.iloc[incorrect_indices]

# Get the DataFrame with incorrect predictions
incorrect_predictions_df = get_incorrect_predictions(df_cleaned)




In [19]:
# Prepare the new dataset with only incorrect predictions
word_sequences_incorrect = tokenizer.texts_to_sequences(incorrect_predictions_df['Word'].tolist())
base_word_sequences_incorrect = tokenizer.texts_to_sequences(incorrect_predictions_df['Base Word'].tolist())

# Pad sequences to the same length as before
word_sequences_padded_incorrect = pad_sequences(word_sequences_incorrect, maxlen=max_seq_length, padding='post')
base_word_sequences_padded_incorrect = pad_sequences(base_word_sequences_incorrect, maxlen=max_seq_length, padding='post')

# Convert base word sequences to categorical (one-hot encoding)
base_word_sequences_categorical_incorrect = [to_categorical(seq, num_classes=len(tokenizer.word_index) + 1) for seq in base_word_sequences_padded_incorrect]

# Convert to numpy arrays
X_incorrect = np.array(word_sequences_padded_incorrect)
y_incorrect = np.array(base_word_sequences_categorical_incorrect)

# Retrain the model on the incorrect predictions
history_incorrect = model.fit(X_incorrect, y_incorrect, epochs=30, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model again on the original dataset
loss, accuracy = model.evaluate(X, y)
print(f"Retrained Model Loss: {loss}")
print(f"Retrained Model Accuracy: {accuracy * 100:.2f}%")


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Retrained Model Loss: 0.15333348512649536
Retrained Model Accuracy: 95.79%


In [20]:
# Recompute accuracy on the entire dataset after retraining
accuracy_after_retrain, _ = predict_all_base_words(df_cleaned)
print(f"Model Accuracy on the Entire Dataset After Retraining: {accuracy_after_retrain * 100:.2f}%")


Model Accuracy on the Entire Dataset After Retraining: 40.44%


In [1]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = "C:\\Users\\prana\\Downloads\\Nouns_new_7.xlsx"  # Replace with your file path
sanskrit_df = pd.read_excel(file_path)

# Preparing data for training
sanskrit_df['input_text'] = sanskrit_df['Word'] + ' ->'
sanskrit_df['target_text'] = sanskrit_df['Base Word']

# Split the data into training and testing sets
train_df, test_df = train_test_split(sanskrit_df, test_size=0.2, random_state=42)

# Custom dataset class
class SanskritDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=50):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        input_text = self.dataframe.iloc[idx]['input_text']
        target_text = self.dataframe.iloc[idx]['target_text']

        # Combine input and target for GPT-2
        combined_text = input_text + " " + target_text

        # Tokenize combined text
        encoding = self.tokenizer(
            combined_text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=self.max_length
        )

        # Extract input_ids and attention_mask
        input_ids = encoding.input_ids.squeeze()
        attention_mask = encoding.attention_mask.squeeze()

        # Create labels: shift the input_ids for the output labels
        labels = input_ids.clone()
        labels[:len(self.tokenizer(input_text).input_ids)] = -100  # We don't want to predict the input

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Load the GPT-2 model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add special tokens (for padding, if needed)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Set pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

# Create datasets
train_dataset = SanskritDataset(train_df, tokenizer)
test_dataset = SanskritDataset(test_df, tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_steps=10,
    save_total_limit=2,
    report_to='none',  # Use 'tensorboard' or 'wandb' if desired
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained(".\\sanskrit_gpt2_model")
tokenizer.save_pretrained(".\\sanskrit_gpt2_model")

##
##import os
##
### Mount Google Drive
##from google.colab import drive
##drive.mount('/content/drive')
##
### Create a folder in the root directory
##!mkdir -p "/content/drive/My Drive/My Folder"
##
##
##model.save_pretrained("/content/drive/My Drive/My Folder/sanskrit_gpt2_model")
##tokenizer.save_pretrained("/content/drive/My Drive/My Folder/sanskrit_gpt2_model")
##




  0%|          | 0/1851 [00:00<?, ?it/s]

{'loss': 35.0195, 'grad_norm': 156.89047241210938, 'learning_rate': 4.9729875742841706e-05, 'epoch': 0.02}
{'loss': 0.7677, 'grad_norm': 4.899957656860352, 'learning_rate': 4.9459751485683416e-05, 'epoch': 0.03}
{'loss': 0.4541, 'grad_norm': 2.7614669799804688, 'learning_rate': 4.9189627228525127e-05, 'epoch': 0.05}
{'loss': 0.2888, 'grad_norm': 1.875085711479187, 'learning_rate': 4.891950297136683e-05, 'epoch': 0.06}
{'loss': 0.2251, 'grad_norm': 5.558084011077881, 'learning_rate': 4.864937871420854e-05, 'epoch': 0.08}
{'loss': 0.1522, 'grad_norm': 5.325222969055176, 'learning_rate': 4.8379254457050244e-05, 'epoch': 0.1}
{'loss': 0.1944, 'grad_norm': 6.228538990020752, 'learning_rate': 4.8109130199891954e-05, 'epoch': 0.11}
{'loss': 0.1398, 'grad_norm': 3.5715432167053223, 'learning_rate': 4.783900594273366e-05, 'epoch': 0.13}
{'loss': 0.1945, 'grad_norm': 4.243593215942383, 'learning_rate': 4.756888168557537e-05, 'epoch': 0.15}
{'loss': 0.1076, 'grad_norm': 5.773304462432861, 'learni

  0%|          | 0/155 [00:00<?, ?it/s]

{'eval_loss': 0.01445954479277134, 'eval_runtime': 94.9908, 'eval_samples_per_second': 3.253, 'eval_steps_per_second': 1.632, 'epoch': 1.0}
{'loss': 0.0038, 'grad_norm': 0.06204421818256378, 'learning_rate': 3.3252296056185846e-05, 'epoch': 1.0}
{'loss': 0.0049, 'grad_norm': 0.008715310133993626, 'learning_rate': 3.2982171799027556e-05, 'epoch': 1.02}
{'loss': 0.0116, 'grad_norm': 0.11105351895093918, 'learning_rate': 3.271204754186926e-05, 'epoch': 1.04}
{'loss': 0.0126, 'grad_norm': 1.4220683574676514, 'learning_rate': 3.244192328471097e-05, 'epoch': 1.05}
{'loss': 0.0443, 'grad_norm': 1.5575040578842163, 'learning_rate': 3.2171799027552674e-05, 'epoch': 1.07}
{'loss': 0.014, 'grad_norm': 0.66691654920578, 'learning_rate': 3.1901674770394384e-05, 'epoch': 1.09}
{'loss': 0.017, 'grad_norm': 0.03361918777227402, 'learning_rate': 3.1631550513236094e-05, 'epoch': 1.1}
{'loss': 0.0022, 'grad_norm': 0.10000786930322647, 'learning_rate': 3.13614262560778e-05, 'epoch': 1.12}
{'loss': 0.0148,

  0%|          | 0/155 [00:00<?, ?it/s]

{'eval_loss': 0.0031532044522464275, 'eval_runtime': 94.3956, 'eval_samples_per_second': 3.273, 'eval_steps_per_second': 1.642, 'epoch': 2.0}
{'loss': 0.0039, 'grad_norm': 0.4649900496006012, 'learning_rate': 1.6504592112371693e-05, 'epoch': 2.01}
{'loss': 0.0001, 'grad_norm': 0.019292453303933144, 'learning_rate': 1.6234467855213397e-05, 'epoch': 2.03}
{'loss': 0.0073, 'grad_norm': 0.006920979358255863, 'learning_rate': 1.5964343598055107e-05, 'epoch': 2.04}
{'loss': 0.0002, 'grad_norm': 0.0810856819152832, 'learning_rate': 1.5694219340896814e-05, 'epoch': 2.06}
{'loss': 0.0042, 'grad_norm': 0.05008886754512787, 'learning_rate': 1.542409508373852e-05, 'epoch': 2.07}
{'loss': 0.0074, 'grad_norm': 0.0003976555017288774, 'learning_rate': 1.5153970826580228e-05, 'epoch': 2.09}
{'loss': 0.0056, 'grad_norm': 1.0576472282409668, 'learning_rate': 1.4883846569421936e-05, 'epoch': 2.11}
{'loss': 0.0053, 'grad_norm': 0.010647419840097427, 'learning_rate': 1.4613722312263642e-05, 'epoch': 2.12}
{

  0%|          | 0/155 [00:00<?, ?it/s]

{'eval_loss': 0.0015758578665554523, 'eval_runtime': 44.2854, 'eval_samples_per_second': 6.977, 'eval_steps_per_second': 3.5, 'epoch': 3.0}
{'train_runtime': 30185.018, 'train_samples_per_second': 0.123, 'train_steps_per_second': 0.061, 'train_loss': 0.21530871559879813, 'epoch': 3.0}


('.\\sanskrit_gpt2_model\\tokenizer_config.json',
 '.\\sanskrit_gpt2_model\\special_tokens_map.json',
 '.\\sanskrit_gpt2_model\\vocab.json',
 '.\\sanskrit_gpt2_model\\merges.txt',
 '.\\sanskrit_gpt2_model\\added_tokens.json')

In [2]:
# Function to predict the base word using the fine-tuned model
def predict_base_word_with_model(word):
    input_text = f"{word} ->"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    with torch.no_grad():
        outputs = model.generate(input_ids, max_length=20)
    predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    base_word = predicted_text.split("->")[-1].strip()
    return base_word

# Evaluate the model on the test set
def evaluate_model(test_df):
    predictions = []
    for _, row in test_df.iterrows():
        word = row['Word']
        true_base_word = row['Base Word']
        predicted_base_word = predict_base_word_with_model(word)
        predictions.append(predicted_base_word == true_base_word)

    accuracy = sum(predictions) / len(predictions)
    return accuracy

# Calculate the accuracy
accuracy = evaluate_model(test_df)
print(f"Model accuracy on the test set: {accuracy * 100:.2f}%")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask

Model accuracy on the test set: 98.06%


In [4]:
# Combine the training and test sets into one DataFrame
full_dataset_df = pd.concat([train_df, test_df]).reset_index(drop=True)

# Function to predict the base word using the fine-tuned model
def predict_base_word_with_model(word):
    input_text = f"{word} ->"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    with torch.no_grad():
        outputs = model.generate(input_ids, max_length=512)
    predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    base_word = predicted_text.split("->")[-1].strip()
    return base_word

# Evaluate the model on the entire dataset
def evaluate_model(full_dataset_df):
    predictions = []
    for _, row in full_dataset_df.iterrows():
        word = row['Word']
        true_base_word = row['Base Word']
        predicted_base_word = predict_base_word_with_model(word)
        predictions.append(predicted_base_word == true_base_word)

    accuracy = sum(predictions) / len(predictions)
    return accuracy

# Calculate the accuracy
accuracy = evaluate_model(full_dataset_df)
print(f"Model accuracy on the entire dataset: {accuracy * 100:.2f}%")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Model accuracy on the entire dataset: 98.83%


In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("sanskrit_gpt2_model")

# Load the model
model = GPT2LMHeadModel.from_pretrained("sanskrit_gpt2_model")

# Set the model to evaluation mode
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

In [4]:
import torch
def predict(input_text, model, tokenizer, max_length=50):
    # Prepare the input text
    input_text = input_text + ' ->'
    
    # Encode the input text
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Generate predictions
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode the output
    predicted_text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    return predicted_text.split('->')[-1].strip()

# Example usage
input_word = "lakshmanasya"
predicted_base_word = predict(input_word, model, tokenizer)
print(predicted_base_word)

lakshman


In [1]:
!rm -rf ~/.cache/huggingface
!pip install --upgrade transformers sentencepiece
# Step 1: Install Required Libraries
!pip install transformers pandas torch scikit-learn


'rm' is not recognized as an internal or external command,
operable program or batch file.










In [5]:

# Step 2: Import Necessary Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Step 3: Load and Preprocess the Data
# Load the Excel file into a pandas DataFrame
file_path = "C:\\Users\\prana\\Downloads\\Nouns_new_7.xlsx"  # Path to the uploaded dataset
data = pd.read_excel(file_path)

# Create input-output pairs with task-specific prompts
data['input_text'] = 'Predict the base word for: ' + data['Word']
data['target_text'] = data['Base Word']

# Split the data into training, validation, and test sets
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.66, random_state=42) # 0.66*0.3 = ~20% for testing

# Step 4: Define a Custom Dataset Class with Chunk Overlap
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=50, chunk_overlap=10):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.chunk_overlap = chunk_overlap

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        input_text = self.data.iloc[index]['input_text']
        target_text = self.data.iloc[index]['target_text']

        # Tokenize input and target texts
        inputs = self.tokenizer(
            input_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=False,  # No truncation
            return_tensors="pt"
        )
        targets = self.tokenizer(
            target_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=False,  # No truncation
            return_tensors="pt"
        )

        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        target_ids = targets['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': target_ids
        }
#step 5
try:
    tokenizer = T5Tokenizer.from_pretrained("google/byt5-small")
    model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Try clearing cache or checking your internet connection.")

# Determine maximum length based on the dataset
max_input_length = max(data['input_text'].apply(lambda x: len(tokenizer.encode(x))))
max_target_length = max(data['target_text'].apply(lambda x: len(tokenizer.encode(x))))
max_len = max(max_input_length, max_target_length) + 10  # Adding buffer

# Create Dataset Instances
train_dataset = TextDataset(train_data, tokenizer, max_len=max_len)
val_dataset = TextDataset(val_data, tokenizer, max_len=max_len)
test_dataset = TextDataset(test_data, tokenizer, max_len=max_len)

# Step 6: Set Up Training Arguments and Trainer
training_args = TrainingArguments(
    output_dir="./byt5-results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    save_steps=10_000,
    eval_steps=1_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Step 7: Train the Model
trainer.train()

# Step 8: Evaluate the Model on Test Set
print("Evaluating on Test Set:")
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(test_results)

# Step 9: Evaluate the Model on Complete Dataset
# Combine train, validation, and test data
complete_dataset = TextDataset(pd.concat([train_data, val_data, test_data]), tokenizer, max_len=max_len)

print("Evaluating on Complete Dataset:")
complete_results = trainer.evaluate(eval_dataset=complete_dataset)
print(complete_results)

# Step 10: Save the Fine-Tuned Model
model.save_pretrained(".//byt5-finetuned-model")
tokenizer.save_pretrained(".//byt5-finetuned-model")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ByT5Tokenizer'. 
The class this function is called from is 'T5Tokenizer'.


Error loading model: not a string
Try clearing cache or checking your internet connection.


NameError: name 'tokenizer' is not defined

In [7]:
# Step 1: Install Required Libraries
!pip install transformers pandas torch scikit-learn sentencepiece

# Step 2: Import Necessary Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Step 3: Load and Preprocess the Data
# Load the Excel file into a pandas DataFrame
file_path = "C:\\Users\\prana\\Downloads\\Nouns_new_7.xlsx"  # Path to the uploaded dataset
data = pd.read_excel(file_path)

# Create input-output pairs with task-specific prompts
data['input_text'] = 'Predict the base word for: ' + data['Word']
data['target_text'] = data['Base Word']

# Split the data into training, validation, and test sets
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.66, random_state=42)  # 0.66*0.3 = ~20% for testing

# Step 4: Define a Custom Dataset Class
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=50):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        input_text = self.data.iloc[index]['input_text']
        target_text = self.data.iloc[index]['target_text']

        # Tokenize input and target texts
        inputs = self.tokenizer(
            input_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=False,  # No truncation
            return_tensors="pt"
        )
        targets = self.tokenizer(
            target_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=False,  # No truncation
            return_tensors="pt"
        )

        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        target_ids = targets['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': target_ids
        }

# Step 5: Initialize Tokenizer and Model with Error Handling
try:
    tokenizer = T5Tokenizer.from_pretrained("google/byt5-small")
    model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
    print("Tokenizer and model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please check your internet connection and try clearing cache.")

# Ensure tokenizer is defined before proceeding
if 'tokenizer' not in globals():
    raise NameError("Tokenizer could not be initialized. Please resolve the error above before proceeding.")

# Determine maximum length based on the dataset
try:
    max_input_length = max(data['input_text'].apply(lambda x: len(tokenizer.encode(x))))
    max_target_length = max(data['target_text'].apply(lambda x: len(tokenizer.encode(x))))
    max_len = max(max_input_length, max_target_length) + 10  # Adding buffer
    print(f"Max length determined: {max_len}")
except Exception as e:
    print(f"Error during max length calculation: {e}")

# Create Dataset Instances
train_dataset = TextDataset(train_data, tokenizer, max_len=max_len)
val_dataset = TextDataset(val_data, tokenizer, max_len=max_len)
test_dataset = TextDataset(test_data, tokenizer, max_len=max_len)

# Step 6: Set Up Training Arguments and Trainer
training_args = TrainingArguments(
    output_dir="./byt5-results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    save_steps=10_000,
    eval_steps=1_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Step 7: Train the Model
trainer.train()

# Step 8: Evaluate the Model on Test Set
print("Evaluating on Test Set:")
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(test_results)

# Step 9: Evaluate the Model on Complete Dataset
# Combine train, validation, and test data
complete_dataset = TextDataset(pd.concat([train_data, val_data, test_data]), tokenizer, max_len=max_len)

print("Evaluating on Complete Dataset:")
complete_results = trainer.evaluate(eval_dataset=complete_dataset)
print(complete_results)

# Step 10: Save the Fine-Tuned Model
model.save_pretrained("./byt5-finetuned-model")
tokenizer.save_pretrained("./byt5-finetuned-model")




The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ByT5Tokenizer'. 
The class this function is called from is 'T5Tokenizer'.


Error loading model: not a string
Please check your internet connection and try clearing cache.


NameError: Tokenizer could not be initialized. Please resolve the error above before proceeding.

In [8]:
from transformers import T5ForConditionalGeneration
import torch

model = T5ForConditionalGeneration.from_pretrained('google/byt5-small')

input_ids = torch.tensor([list("Life is like a box of chocolates.".encode("utf-8"))]) + 3  # add 3 for special tokens
labels = torch.tensor([list("La vie est comme une boîte de chocolat.".encode("utf-8"))]) + 3  # add 3 for special tokens

loss = model(input_ids, labels=labels).loss # forward pass


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

: 

In [2]:
# Step 1: Install Required Libraries (Run this in your environment)
!pip install transformers pandas torch scikit-learn sentencepiece openpyxl

# Step 2: Import Necessary Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

# Step 3: Load and Preprocess the Data
# Load the Excel file into a pandas DataFrame
file_path = "C:\\Users\\prana\\Downloads\\Nouns_new_7.xlsx"  # Path to the uploaded dataset
try:
    data = pd.read_excel(file_path)
    print("Data loaded successfully.")
except Exception as e:
    print(f"Error loading data: {e}")
    raise SystemExit("Failed to load data. Exiting...")

# Create input-output pairs with task-specific prompts
try:
    data['input_text'] = 'Predict the base word for: ' + data['Word']
    data['target_text'] = data['Base Word']
    print("Input-output pairs created successfully.")
except KeyError as e:
    print(f"Error with DataFrame columns: {e}")
    raise SystemExit("Ensure the dataset has 'Word' and 'Base Word' columns. Exiting...")

# Split the data into training, validation, and test sets
try:
    train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
    val_data, test_data = train_test_split(temp_data, test_size=0.66, random_state=42)  # 0.66*0.3 = ~20% for testing
    print("Data split into train, validation, and test sets successfully.")
except Exception as e:
    print(f"Error splitting data: {e}")
    raise SystemExit("Failed to split data. Exiting...")

# Step 4: Define a Custom Dataset Class
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=50):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        input_text = self.data.iloc[index]['input_text']
        target_text = self.data.iloc[index]['target_text']

        # Tokenize input and target texts
        inputs = self.tokenizer(
            input_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,  # Enable truncation to handle long texts
            return_tensors="pt"
        )
        targets = self.tokenizer(
            target_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,  # Enable truncation to handle long texts
            return_tensors="pt"
        )

        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        target_ids = targets['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': target_ids
        }

# Step 5: Initialize Tokenizer and Model with Error Handling
try:
    tokenizer = T5Tokenizer.from_pretrained("google\\byt5-small")
    model = T5ForConditionalGeneration.from_pretrained("google\\byt5-small")
    print("Tokenizer and model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please check your internet connection and try clearing cache.")
    raise SystemExit("Failed to load model or tokenizer. Exiting...")

# Ensure tokenizer is defined before proceeding
if 'tokenizer' not in globals():
    raise NameError("Tokenizer could not be initialized. Please resolve the error above before proceeding.")

# Determine maximum length based on the dataset
try:
    max_input_length = max(data['input_text'].apply(lambda x: len(tokenizer.encode(x))))
    max_target_length = max(data['target_text'].apply(lambda x: len(tokenizer.encode(x))))
    max_len = max(max_input_length, max_target_length) + 10  # Adding buffer
    print(f"Max length determined: {max_len}")
except Exception as e:
    print(f"Error during max length calculation: {e}")
    raise SystemExit("Failed to calculate max length. Exiting...")

# Create Dataset Instances
train_dataset = TextDataset(train_data, tokenizer, max_len=max_len)
val_dataset = TextDataset(val_data, tokenizer, max_len=max_len)
test_dataset = TextDataset(test_data, tokenizer, max_len=max_len)

# Step 6: Set Up Training Arguments and Trainer
training_args = TrainingArguments(
    output_dir="./byt5-results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    save_steps=10_000,
    eval_steps=1_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Step 7: Train the Model
try:
    trainer.train()
    print("Training completed successfully.")
except Exception as e:
    print(f"Error during training: {e}")
    raise SystemExit("Failed during training. Exiting...")

# Step 8: Evaluate the Model on Test Set
try:
    print("Evaluating on Test Set:")
    test_results = trainer.evaluate(eval_dataset=test_dataset)
    print(test_results)
except Exception as e:
    print(f"Error during evaluation: {e}")
    raise SystemExit("Failed during evaluation on test set. Exiting...")

# Step 9: Evaluate the Model on Complete Dataset
# Combine train, validation, and test data
complete_dataset = TextDataset(pd.concat([train_data, val_data, test_data]), tokenizer, max_len=max_len)

try:
    print("Evaluating on Complete Dataset:")
    complete_results = trainer.evaluate(eval_dataset=complete_dataset)
    print(complete_results)
except Exception as e:
    print(f"Error during complete dataset evaluation: {e}")
    raise SystemExit("Failed during evaluation on complete dataset. Exiting...")

# Step 10: Save the Fine-Tuned Model
try:
    model.save_pretrained(".\\byt5-finetuned-model")
    tokenizer.save_pretrained(".\\byt5-finetuned-model")
    print("Model and tokenizer saved successfully.")
except Exception as e:
    print(f"Error saving model: {e}")
    raise SystemExit("Failed to save the model. Exiting...")


Data loaded successfully.
Input-output pairs created successfully.
Data split into train, validation, and test sets successfully.
Error loading model: Incorrect path_or_model_id: 'google\byt5-small'. Please provide either the path to a local folder or the repo_id of a model on the Hub.
Please check your internet connection and try clearing cache.


AttributeError: 'tuple' object has no attribute 'tb_frame'

In [3]:
# Step 1: Install Required Libraries (Run this in your environment)
# !pip install transformers pandas torch scikit-learn sentencepiece openpyxl

# Step 2: Import Necessary Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import ByT5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

# Step 3: Load and Preprocess the Data
# Load the Excel file into a pandas DataFrame
file_path = "C:\\Users\\prana\\Downloads\\Nouns_new_7.xlsx"  # Path to the uploaded dataset
try:
    data = pd.read_excel(file_path)
    print("Data loaded successfully.")
except Exception as e:
    print(f"Error loading data: {e}")
    raise SystemExit("Failed to load data. Exiting...")

# Create input-output pairs with task-specific prompts
try:
    data['input_text'] = 'Predict the base word for: ' + data['Word']
    data['target_text'] = data['Base Word']
    print("Input-output pairs created successfully.")
except KeyError as e:
    print(f"Error with DataFrame columns: {e}")
    raise SystemExit("Ensure the dataset has 'Word' and 'Base Word' columns. Exiting...")

# Split the data into training, validation, and test sets
try:
    train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)
    val_data, test_data = train_test_split(temp_data, test_size=0.66, random_state=42)  # 0.66*0.3 = ~20% for testing
    print("Data split into train, validation, and test sets successfully.")
except Exception as e:
    print(f"Error splitting data: {e}")
    raise SystemExit("Failed to split data. Exiting...")

# Step 4: Define a Custom Dataset Class
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=50):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        input_text = self.data.iloc[index]['input_text']
        target_text = self.data.iloc[index]['target_text']

        # Tokenize input and target texts
        inputs = self.tokenizer(
            input_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=False,  # Enable truncation to handle long texts
            return_tensors="pt"
        )
        targets = self.tokenizer(
            target_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=False,  # Enable truncation to handle long texts
            return_tensors="pt"
        )

        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        target_ids = targets['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': target_ids
        }

# Step 5: Initialize Tokenizer and Model with Error Handling
try:
    tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
    model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
    print("Tokenizer and model loaded successfully.")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    print("Please check your internet connection and try clearing cache.")
    raise SystemExit("Failed to load model or tokenizer. Exiting...")

# Ensure tokenizer is defined before proceeding
if 'tokenizer' not in globals():
    raise NameError("Tokenizer could not be initialized. Please resolve the error above before proceeding.")

# Determine maximum length based on the dataset
try:
    max_input_length = max(data['input_text'].apply(lambda x: len(tokenizer.encode(x))))
    max_target_length = max(data['target_text'].apply(lambda x: len(tokenizer.encode(x))))
    max_len = max(max_input_length, max_target_length) + 10  # Adding buffer
    print(f"Max length determined: {max_len}")
except Exception as e:
    print(f"Error during max length calculation: {e}")
    raise SystemExit("Failed to calculate max length. Exiting...")

# Create Dataset Instances
train_dataset = TextDataset(train_data, tokenizer, max_len=max_len)
val_dataset = TextDataset(val_data, tokenizer, max_len=max_len)
test_dataset = TextDataset(test_data, tokenizer, max_len=max_len)

# Step 6: Set Up Training Arguments and Trainer
training_args = TrainingArguments(
    output_dir="./byt5-results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    save_steps=10_000,
    eval_steps=1_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Step 7: Train the Model
try:
    trainer.train()
    print("Training completed successfully.")
except Exception as e:
    print(f"Error during training: {e}")
    raise SystemExit("Failed during training. Exiting...")

# Step 8: Evaluate the Model on Test Set
try:
    print("Evaluating on Test Set:")
    test_results = trainer.evaluate(eval_dataset=test_dataset)
    print(test_results)
except Exception as e:
    print(f"Error during evaluation: {e}")
    raise SystemExit("Failed during evaluation on test set. Exiting...")

# Step 9: Evaluate the Model on Complete Dataset
# Combine train, validation, and test data
complete_dataset = TextDataset(pd.concat([train_data, val_data, test_data]), tokenizer, max_len=max_len)

try:
    print("Evaluating on Complete Dataset:")
    complete_results = trainer.evaluate(eval_dataset=complete_dataset)
    print(complete_results)
except Exception as e:
    print(f"Error during complete dataset evaluation: {e}")
    raise SystemExit("Failed during evaluation on complete dataset. Exiting...")

# Step 10: Save the Fine-Tuned Model
try:
    model.save_pretrained("./byt5-finetuned-model")
    tokenizer.save_pretrained("./byt5-finetuned-model")
    print("Model and tokenizer saved successfully.")
except Exception as e:
    print(f"Error saving model: {e}")
    raise SystemExit("Failed to save the model. Exiting...")


Data loaded successfully.
Input-output pairs created successfully.
Data split into train, validation, and test sets successfully.




pytorch_model.bin:  56%|#####5    | 671M/1.20G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Tokenizer and model loaded successfully.
Max length determined: 68




  0%|          | 0/1350 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [5]:
# Step 1: Install Required Libraries (Run this in your environment)
# !pip install transformers pandas torch scikit-learn sentencepiece openpyxl

# Step 2: Import Necessary Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import ByT5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch

# Step 3: Load and Preprocess the Data
# Load the Excel file into a pandas DataFrame
file_path = "C:\\Users\\prana\\Downloads\\Nouns_new_7.xlsx"  # Path to the uploaded dataset
try:
    data = pd.read_excel(file_path)
    print("Data loaded successfully.")
except Exception as e:
    print(f"Error loading data: {e}")
    raise SystemExit("Failed to load data. Exiting...")

# Create input-output pairs with task-specific prompts
try:
    data['input_text'] = 'Predict the base word for the given word: ' + data['Word']
    data['target_text'] = data['Base Word']
    print("Input-output pairs created successfully.")
except KeyError as e:
    print(f"Error with DataFrame columns: {e}")
    raise SystemExit("Ensure the dataset has 'Word' and 'Base Word' columns. Exiting...")

# Split the data into training, validation, and test sets
try:
    train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)
    val_data, test_data = train_test_split(temp_data, test_size=0.66, random_state=42)  # 0.66*0.3 = ~20% for testing
    print("Data split into train, validation, and test sets successfully.")
except Exception as e:
    print(f"Error splitting data: {e}")
    raise SystemExit("Failed to split data. Exiting...")

# Step 4: Define a Custom Dataset Class with Chunk Overlap
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=512, chunk_overlap=50):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.chunk_overlap = chunk_overlap

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        input_text = self.data.iloc[index]['input_text']
        target_text = self.data.iloc[index]['target_text']

        # Tokenize input and target texts
        input_ids = self.tokenizer.encode(input_text, return_tensors="pt", padding=False)
        target_ids = self.tokenizer.encode(target_text, return_tensors="pt", padding=False)

        # Handle chunking and overlap
        if input_ids.shape[-1] > self.max_len:
            input_ids = input_ids.squeeze()
            target_ids = target_ids.squeeze()
            
            input_chunks = [
                input_ids[i : i + self.max_len]
                for i in range(0, len(input_ids), self.max_len - self.chunk_overlap)
            ]
            target_chunks = [
                target_ids[i : i + self.max_len]
                for i in range(0, len(target_ids), self.max_len - self.chunk_overlap)
            ]
            
            input_ids = input_chunks[0]
            target_ids = target_chunks[0]
        
        # Padding
        inputs = self.tokenizer.pad(
            {'input_ids': input_ids, 'attention_mask': torch.ones_like(input_ids)},
            max_length=self.max_len,
            padding='max_length'
        )
        
        targets = self.tokenizer.pad(
            {'input_ids': target_ids},
            max_length=self.max_len,
            padding='max_length'
        )

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': targets['input_ids'].squeeze()
        }

# Step 5: Initialize Tokenizer and Model with Error Handling
try:
    tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
    model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
    print("Tokenizer and model loaded successfully.")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    print("Please check your internet connection and try clearing cache.")
    raise SystemExit("Failed to load model or tokenizer. Exiting...")

# Ensure tokenizer is defined before proceeding
if 'tokenizer' not in globals():
    raise NameError("Tokenizer could not be initialized. Please resolve the error above before proceeding.")

# Determine maximum length based on the dataset
try:
    max_input_length = max(data['input_text'].apply(lambda x: len(tokenizer.encode(x))))
    max_target_length = max(data['target_text'].apply(lambda x: len(tokenizer.encode(x))))
    max_len = max(max_input_length, max_target_length) + 10  # Adding buffer
    print(f"Max length determined: {max_len}")
except Exception as e:
    print(f"Error during max length calculation: {e}")
    raise SystemExit("Failed to calculate max length. Exiting...")

# Create Dataset Instances
train_dataset = TextDataset(train_data, tokenizer, max_len=max_len)
val_dataset = TextDataset(val_data, tokenizer, max_len=max_len)
test_dataset = TextDataset(test_data, tokenizer, max_len=max_len)

# Step 6: Set Up Training Arguments and Trainer
training_args = TrainingArguments(
    output_dir="./byt5-results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    save_steps=10_000,
    eval_steps=1_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Step 7: Train the Model
try:
    trainer.train()
    print("Training completed successfully.")
except Exception as e:
    print(f"Error during training: {e}")
    raise SystemExit("Failed during training. Exiting...")

# Step 8: Evaluate the Model on Test Set
try:
    print("Evaluating on Test Set:")
    test_results = trainer.evaluate(eval_dataset=test_dataset)
    print(test_results)
except Exception as e:
    print(f"Error during evaluation: {e}")
    raise SystemExit("Failed during evaluation on test set. Exiting...")

# Step 9: Evaluate the Model on Complete Dataset
# Combine train, validation, and test data
complete_dataset = TextDataset(pd.concat([train_data, val_data, test_data]), tokenizer, max_len=max_len)

try:
    print("Evaluating on Complete Dataset:")
    complete_results = trainer.evaluate(eval_dataset=complete_dataset)
    print(complete_results)
except Exception as e:
    print(f"Error during complete dataset evaluation: {e}")
    raise SystemExit("Failed during evaluation on complete dataset. Exiting...")

# Step 10: Save the Fine-Tuned Model
try:
    model.save_pretrained("./byt5-finetuned-model")
    tokenizer.save_pretrained("./byt5-finetuned-model")
    print("Model and tokenizer saved successfully.")
except Exception as e:
    print(f"Error saving model: {e}")
    raise SystemExit("Failed to save the model. Exiting...")


Data loaded successfully.
Input-output pairs created successfully.
Data split into train, validation, and test sets successfully.




Tokenizer and model loaded successfully.
Max length determined: 83




  0%|          | 0/1545 [00:00<?, ?it/s]

{'loss': 76.0565, 'grad_norm': 1129.7034912109375, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.03}
{'loss': 73.5711, 'grad_norm': 792.9644775390625, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.06}
{'loss': 74.29, 'grad_norm': 2618.613037109375, 'learning_rate': 3e-06, 'epoch': 0.1}
{'loss': 73.0187, 'grad_norm': 3497.095703125, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.13}
{'loss': 73.1008, 'grad_norm': 3512.005859375, 'learning_rate': 5e-06, 'epoch': 0.16}
{'loss': 71.4155, 'grad_norm': 4286.779296875, 'learning_rate': 6e-06, 'epoch': 0.19}
{'loss': 70.3367, 'grad_norm': 460.0025939941406, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.23}
{'loss': 70.6074, 'grad_norm': 921.6121215820312, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.26}
{'loss': 70.3325, 'grad_norm': 1582.835205078125, 'learning_rate': 9e-06, 'epoch': 0.29}
{'loss': 68.8471, 'grad_norm': 3354.671630859375, 'learning_rate': 1e-05, 'epoch': 0.32}
{'loss': 67.4719, 'grad_norm': 700.4100

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.0015039113350212574, 'eval_runtime': 40.6575, 'eval_samples_per_second': 2.583, 'eval_steps_per_second': 0.664, 'epoch': 3.24}
{'loss': 0.0099, 'grad_norm': 0.2035534679889679, 'learning_rate': 2.5598086124401916e-05, 'epoch': 3.27}
{'loss': 0.0131, 'grad_norm': 0.2846313714981079, 'learning_rate': 2.5119617224880382e-05, 'epoch': 3.3}
{'loss': 0.0121, 'grad_norm': 0.7762671113014221, 'learning_rate': 2.4641148325358855e-05, 'epoch': 3.33}
{'loss': 0.0131, 'grad_norm': 0.2260688692331314, 'learning_rate': 2.4162679425837324e-05, 'epoch': 3.37}
{'loss': 0.0085, 'grad_norm': 0.5136345624923706, 'learning_rate': 2.368421052631579e-05, 'epoch': 3.4}
{'loss': 0.0065, 'grad_norm': 0.5745105147361755, 'learning_rate': 2.320574162679426e-05, 'epoch': 3.43}
{'loss': 0.0074, 'grad_norm': 0.305859237909317, 'learning_rate': 2.272727272727273e-05, 'epoch': 3.46}
{'loss': 0.0133, 'grad_norm': 0.7091602683067322, 'learning_rate': 2.2248803827751195e-05, 'epoch': 3.5}
{'loss': 0.0125,

AttributeError: 'tuple' object has no attribute 'tb_frame'