In [None]:
# Mount Google Drive at /content/drive so the dataset, tokenizer and checkpoint
# paths under /content/drive/MyDrive used by the cells below can be read/written.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from transformers import AutoModelForCausalLM, GemmaConfig, AutoTokenizer, AutoModel, MistralConfig, MistralModel, MistralForCausalLM, LlamaConfig, LlamaForCausalLM
import torch
import torch.nn as nn
import torch.nn.init as init
import json
import pickle
import pandas as pd
import sys
from transformers import Trainer, TrainingArguments

In [None]:
from transformers import LlamaForCausalLM, AutoTokenizer, Trainer, TrainingArguments

# Directory of a saved checkpoint (presumably written by an earlier Trainer run
# into model1 — confirm). Both the tokenizer and the weights are read from it.
checkpoint_path = "/content/drive/MyDrive/NLP/model1/checkpoint-7500"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = LlamaForCausalLM.from_pretrained(checkpoint_path)

In [None]:
%pip install datasets

In [None]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from datasets import Dataset

# Read the corpus CSV and derive the training column: every "Input" value with
# the literal string "<eos>" appended.
# NOTE(review): assumes "<eos>" is the tokenizer's end-of-sequence token — confirm.
df = pd.read_csv("/content/drive/MyDrive/NLP/bengali_dataset_0.5.csv").assign(
    text=lambda frame: frame["Input"] + "<eos>"
)

dataset = Dataset.from_pandas(df)


def tokenize_function(examples):
    """Tokenize one batch of rows for language-model training.

    Runs the module-level ``tokenizer`` over the batch's "text" column,
    truncating and padding every sequence to exactly 512 tokens and
    requesting PyTorch tensors.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        The tokenizer's output for that batch.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
        "return_tensors": "pt",
    }
    return tokenizer(examples["text"], **encode_options)


# Padding to max_length raises when the tokenizer has no pad token, so fall
# back to EOS when none is configured. This is a no-op if a pad token is
# already set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the whole dataset in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Hold out 10% for evaluation. The seed is fixed because an unseeded split is
# reshuffled on every run: after a kernel restart, resuming from a checkpoint
# would otherwise train on rows that were previously held out.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# mlm=False selects causal-LM collation rather than masked-LM collation.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


In [None]:
# Training configuration for continuing the run stored at checkpoint_path.
training_args = TrainingArguments(
    # Checkpoint location and retention: save every 500 steps, keep the last 2.
    output_dir="/content/drive/MyDrive/NLP/model1",
    overwrite_output_dir=True,
    resume_from_checkpoint=checkpoint_path,
    save_steps=500,
    save_total_limit=2,
    # Optimisation schedule.
    num_train_epochs=10,
    per_device_train_batch_size=8,
    learning_rate=2e-4,
    # Log every 10 steps; no external experiment tracker.
    logging_steps=10,
    report_to="none",
)

# Wire the checkpoint model, the tokenized splits and the collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:

# Continue training from the saved checkpoint directory rather than starting
# over; the call's return value is shown as this cell's output.
trainer.train(resume_from_checkpoint=checkpoint_path)

In [None]:
# Load the tokenizer stored under tokenizer1. This rebinds `tokenizer`,
# replacing the one loaded from the checkpoint directory earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/NLP/tokenizer1")

In [None]:
# Show the tokenizer's vocabulary size, for comparison with the vocab_size
# given to the model config.
len(tokenizer.vocab)

In [None]:
# Architecture of the Llama-style model that is built from scratch below.
# NOTE(review): vocab_size is hard-coded — confirm it matches the tokenizer's
# vocabulary size before training.
config = LlamaConfig(
    vocab_size=32769,
    max_position_embeddings=512,
    hidden_size=512,
    intermediate_size=1024,
    num_hidden_layers=24,
    num_attention_heads=8,
    num_key_value_heads=2,
)
config

In [None]:
# Build the model from the config alone; no pretrained weights are loaded here.
model_mis = LlamaForCausalLM(config)

In [None]:
# Re-initialise every trainable tensor that has more than one dimension with
# Xavier-uniform values; parameters with one dimension or fewer are left as constructed.
for weight in model_mis.parameters():
    if not weight.requires_grad or weight.dim() <= 1:
        continue
    init.xavier_uniform_(weight.data)

In [None]:
# Total number of parameter elements in the model, printed in millions.
total_param = sum(tensor.numel() for tensor in model_mis.parameters())
print(total_param / 10**6)

In [None]:
# Persist the freshly initialised model and the tokenizer into the same directory.
initial_model_dir = "/content/drive/MyDrive/NLP/model1"
model_mis.save_pretrained(initial_model_dir)
tokenizer.save_pretrained(initial_model_dir)

In [None]:
%pip install datasets

In [None]:
from datasets import Dataset, DatasetDict
from datasets import load_dataset
import pandas as pd

In [None]:
%pip install -q bitsandbytes trl peft -U

In [None]:
# Training configuration for the from-scratch run.
training_args = TrainingArguments(
    # Checkpoint location and retention: save every 500 steps, keep the last 2.
    output_dir="/content/drive/MyDrive/NLP//model1",
    overwrite_output_dir=True,
    save_steps=500,
    save_total_limit=2,
    # Optimisation schedule and precision.
    do_train=True,
    num_train_epochs=50,
    per_device_train_batch_size=8,
    learning_rate=2e-3,
    bf16=True,
    # Log every 10 steps; no external experiment tracker.
    logging_steps=10,
    report_to="none",
)


In [None]:
# Build the Trainer for the from-scratch model, reusing the tokenized splits
# and the collator prepared earlier in the notebook.
trainer = Trainer(
    args=training_args,
    model=model_mis,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Reuse the EOS token as the padding token.
# NOTE(review): in file order this runs after the tokenization cell that pads
# to max_length and after the Trainer was built — confirm the intended
# execution order.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Run training while sending everything written to stdout into a log file on
# Drive.
#
# redirect_stdout puts back whatever sys.stdout was before the block, even if
# training raises, and the `with open(...)` closes the file in every case.
# Restoring to sys.__stdout__ instead would not bring back the notebook's own
# output stream, because the kernel replaces sys.stdout with its own object.
from contextlib import redirect_stdout

with open("/content/drive/MyDrive/NLP/training_log.txt", "w") as log_file:
    with redirect_stdout(log_file):
        trainer.train()

In [None]:
# Export the trained model and the tokenizer into one directory.
trained_model_dir = "/content/drive/MyDrive/NLP/trained_model1"
trainer.save_model(trained_model_dir)
tokenizer.save_pretrained(trained_model_dir)