In [None]:
!pip install transformers datasets accelerate bitsandbytes
!pip install unsloth
!pip install peft
!pip install torch
!pip install datasets

In [None]:
import torch
from datasets import Dataset
from unsloth import FastLanguageModel
import pandas as pd
from datetime import datetime
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from huggingface_hub import notebook_login

# Report whether a CUDA device is visible and, if so, the name of device 0.
cuda_ok = torch.cuda.is_available()
print(f"CUDA Available: {cuda_ok}")
if cuda_ok:
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")

In [None]:
from google.colab import files
# Open the Colab upload dialog, then echo the name and byte size of every
# file that was received.
uploaded = files.upload()
for fn, payload in uploaded.items():
    size_in_bytes = len(payload)
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=size_in_bytes))

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

In [None]:
# Load the parallel corpus and clean it:
#   1. strip surrounding whitespace from column names and string cells,
#   2. treat cells that are empty after stripping as missing,
#   3. drop rows with any missing value and renumber the index.
def _clean_cell(value):
    """Strip a string cell; return None when nothing is left. Non-strings pass through."""
    if isinstance(value, str):
        # "" is falsy, so a whitespace-only cell becomes None and is removed by
        # dropna() below instead of surviving as an empty training example.
        return value.strip() or None
    return value


df = pd.read_excel("en-tel-colloquial.xlsx")
print(f"Dataset shape: {df.shape}")
df.columns = df.columns.str.strip()

# Fail early with a clear message: the prompt-building cell reads these columns.
_missing_columns = {"Human", "Assistant"} - set(df.columns)
if _missing_columns:
    raise KeyError(f"Missing required column(s): {sorted(_missing_columns)}")

# Build a new frame instead of mutating in place so the cell can be re-run;
# reset_index removes the gaps left by dropped rows.
df = df.map(_clean_cell).dropna().reset_index(drop=True)
print(f"Dataset shape: {df.shape}")
print(f"Dataset columns: {df.columns.tolist()}")
print(df.head())

In [None]:
def _format_example(row):
    """Render one dataframe row as a single '### Human / ### Assistant' training string."""
    source_sentence = row['Human']
    target_sentence = row['Assistant']
    return f"### Human:Translate to Telugu colloquial: {source_sentence}\n### Assistant: {target_sentence}"


# One prompt/response string per row, stored in the new 'text' column.
df['text'] = df.apply(_format_example, axis=1)

In [None]:
import re

_WHITESPACE_RUN = re.compile(r"\s+")


def normalize_text(text):
    """Lower-case `text`, collapse each whitespace run to one space, and trim the ends.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    single_spaced = _WHITESPACE_RUN.sub(" ", lowered)
    return single_spaced.strip()

# Normalise every assembled training string.
# NOTE(review): this runs on the full prompt, so the "### Human:" /
# "### Assistant:" markers are lower-cased too and the "\n" separating the two
# turns is collapsed into a single space. Confirm that this is the format the
# model is expected to see at inference time.
df['text'] = df['text'].map(normalize_text)

In [None]:
# Step 3: Convert the DataFrame into a Dataset.
# Only the 'text' column is kept. preserve_index=False stops a non-default
# pandas index (e.g. after rows were dropped) from leaking into the dataset as
# an extra '__index_level_0__' column.
dataset = Dataset.from_pandas(df[['text']], preserve_index=False)
dataset = dataset.shuffle(seed=42)
# train_test_split shuffles again by default; without a seed the train/validation
# membership changes on every run, so results are not comparable between runs.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Preprocessing function
# Maximum number of tokens per example. The name was referenced but never
# defined in this notebook; 256 is a starting point and should be tuned to the
# token-length distribution of the corpus.
MAX_LENGTH = 256
# Label value skipped by the cross-entropy loss in PyTorch / Transformers.
IGNORE_INDEX = -100


def preprocess_function(examples):
    """Tokenise a batch of examples and attach causal-LM labels.

    Args:
        examples: a batch mapping with a 'text' key holding a list of strings
            (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output with an extra 'labels' entry. Labels copy
        'input_ids' except at padded positions (attention mask 0), which are
        set to IGNORE_INDEX so that padding does not contribute to the loss.

    Relies on the module-level `tokenizer` and `MAX_LENGTH`.
    """
    model_inputs = tokenizer(
        examples['text'],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
    )
    input_ids = model_inputs['input_ids']
    attention_masks = model_inputs.get('attention_mask')
    if attention_masks is None:
        # No mask available: fall back to a plain copy, but never alias
        # input_ids so later edits to one cannot corrupt the other.
        model_inputs['labels'] = [list(ids) for ids in input_ids]
    else:
        model_inputs['labels'] = [
            [token if keep else IGNORE_INDEX for token, keep in zip(ids, mask)]
            for ids, mask in zip(input_ids, attention_masks)
        ]
    return model_inputs

In [None]:
# Hugging Face account (or organisation) that owns the target Hub repository.
# Replace the placeholder before running.
hugging_face_user_name = "xxxx"

In [None]:
from huggingface_hub import login
import os

# Never paste an access token into the notebook: it ends up in saved outputs,
# version control and shared copies. Read it from the HF_TOKEN environment
# variable instead; when that is unset, token=None makes `login` fall back to
# its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
import torch

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Identifiers of the quantised base checkpoint and of the previously
# fine-tuned LoRA adapter that training continues from.
BASE_MODEL_ID = "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit"
ADAPTER_ID = "sril32996/en-tel"

# Load base model again
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto"
)

# Load the previously fine-tuned adapter.
# is_trainable=True is required to keep training it: by default
# PeftModel.from_pretrained loads adapters frozen (inference mode), which
# leaves the model with no trainable parameters.
model = PeftModel.from_pretrained(base_model, ADAPTER_ID, is_trainable=True)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token


In [None]:
# Prepare the adapter-wrapped model for memory-efficient training.
# Make the embedding outputs require gradients; with frozen base weights this
# is what lets gradients flow back through checkpointed layers.
model.enable_input_require_grads()
# Trade compute for memory by recomputing activations in the backward pass.
model.gradient_checkpointing_enable()
# Print the trainable vs. total parameter counts as a sanity check that the
# adapter weights are actually trainable.
model.print_trainable_parameters()

In [None]:
def _tokenize_split(split):
    """Tokenise one dataset split and expose its model columns as PyTorch tensors."""
    tokenized = split.map(preprocess_function, remove_columns=['text'], batched=True)
    # Convert to PyTorch format
    tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return tokenized


tokenized_train = _tokenize_split(split_dataset['train'])
tokenized_val = _tokenize_split(split_dataset['test'])

In [None]:
import inspect

# Transformers renamed `evaluation_strategy` to `eval_strategy`, and newer
# releases reject the old keyword. The install cell does not pin a version, so
# pick whichever name the installed TrainingArguments accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Training configuration.
# Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
# = 8 * 8 = 64 sequences per optimizer step (per device).
# NOTE(review): warmup_steps=100 may exceed the total number of optimizer steps
# on a small corpus, in which case the peak learning rate is never reached —
# check against len(tokenized_train).
training_args = TrainingArguments(
    output_dir="./fine_tuned_llama",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=7,
    learning_rate=3e-4,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_steps=8,
    save_strategy="steps",
    save_steps=8,  # must stay a multiple of eval_steps for load_best_model_at_end
    save_total_limit=2,  # Prevent excessive checkpoints
    load_best_model_at_end=True,
    fp16=True,    # Enable fp16 precision
    bf16=False,
    push_to_hub=True,
    hub_model_id=f"{hugging_face_user_name}/en-tel",
    gradient_accumulation_steps=8,
    warmup_steps=100,
    report_to=["none"],
    optim="adamw_torch",
    dataloader_pin_memory=False,
    torch_compile=False,
    gradient_checkpointing=True,
    **{_eval_strategy_key: "steps"},
)


In [None]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val
)

# Start training
print("Starting training...")
trainer.train()

# Save and push to hub.
# The earlier version referenced `model_name`, `MODEL_NAME` and `hf_token`,
# none of which is defined in this notebook, so the cell crashed with a
# NameError only after the whole training run had finished. The destinations
# are now taken from `training_args`, and the Hub calls reuse the credentials
# stored by the login cell.
print("Saving model and pushing to Hugging Face Hub...")
trainer.save_model()  # writes the model to training_args.output_dir
# The Trainer was not given the tokenizer, so save it next to the model explicitly.
tokenizer.save_pretrained(training_args.output_dir)
trainer.push_to_hub()
tokenizer.push_to_hub(training_args.hub_model_id)

# Prepare model for inference
# NOTE(review): `model` was loaded through transformers + PEFT, not through
# FastLanguageModel.from_pretrained — confirm that Unsloth's for_inference
# supports such a model; otherwise `model.eval()` is the portable alternative.
print("Preparing model for inference...")
model = FastLanguageModel.for_inference(model)