In [31]:
# Mount Google Drive at /content/drive; the other cells read and write under
# /content/drive/MyDrive/NLP.
from google.colab import drive
drive.mount('/content/drive')

In [32]:
from transformers import AutoModelForCausalLM, GemmaConfig, AutoTokenizer, AutoModel, MistralConfig, MistralModel, MistralForCausalLM, LlamaConfig, LlamaForCausalLM
import torch
import torch.nn as nn
import torch.nn.init as init
import json
import pickle
import pandas as pd

### Code 1: Model configuration and initialization

In [33]:
# Load the tokenizer stored in the "tokenizer1" directory on Drive.
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/NLP/tokenizer1")

In [34]:
# Number of entries in the tokenizer vocabulary; the model's vocab_size must agree with it.
len(tokenizer.vocab)

32769

In [35]:
# Architecture of the from-scratch Llama-style decoder.
# vocab_size is read from the tokenizer instead of being hard-coded, so the
# embedding table cannot drift out of sync if the tokenizer is rebuilt.
# NOTE(review): bos/eos token ids are left at the LlamaConfig defaults — confirm
# they match the tokenizer's special-token ids.
config = LlamaConfig(hidden_size=512,
                     vocab_size=len(tokenizer.vocab),
                     num_attention_heads=8,
                     num_key_value_heads=2,
                     num_hidden_layers=24,
                     intermediate_size=1024,
                     max_position_embeddings=512)  # same length as max_length=512 used when tokenizing
config

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "max_position_embeddings": 512,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 8,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.2",
  "use_cache": true,
  "vocab_size": 32769
}

In [36]:
# Build the model from the config alone; no pretrained checkpoint is loaded here.
# NOTE(review): despite the name `model_mis`, this is a LlamaForCausalLM.
model_mis = LlamaForCausalLM(config)

In [37]:
# Re-initialise every trainable tensor that has more than one dimension with
# Xavier-uniform values; 1-D parameters are left as they are.
for _name, weight in model_mis.named_parameters():
    if not weight.requires_grad or weight.dim() <= 1:
        continue
    init.xavier_uniform_(weight.data)

In [38]:
# Report the model size in millions of parameters.
total_param = sum(weight.numel() for weight in model_mis.parameters())
print(total_param / 1_000_000)

In [39]:
# Save the model as it stands at this point, together with the tokenizer.
# NOTE(review): the TrainingArguments output_dir ("…/NLP//model1") appears to be
# this same directory — confirm that sharing it with checkpoints is intended.
model_mis.save_pretrained("/content/drive/MyDrive/NLP/model1")
tokenizer.save_pretrained("/content/drive/MyDrive/NLP/model1")

('/content/drive/MyDrive/NLP/model1/tokenizer_config.json',
 '/content/drive/MyDrive/NLP/model1/special_tokens_map.json',
 '/content/drive/MyDrive/NLP/model1/tokenizer.json')

### Code 2: Install and import `datasets`

In [40]:
# Install the Hugging Face `datasets` package into the kernel's environment.
# TODO(review): pin a version so the notebook is reproducible.
%pip install datasets

In [12]:
from datasets import Dataset, DatasetDict
from datasets import load_dataset
import pandas as pd

### Code 3: Data preparation, training, and inference

In [13]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from datasets import Dataset

# Tokenizer used for all text preprocessing in this section.
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/NLP/tokenizer1")

# Read the corpus and derive a "text" column: each "Input" string followed by
# the literal marker "<eos>".
df = pd.read_csv("/content/drive/MyDrive/NLP/bengali_dataset_0.5.csv").assign(
    text=lambda frame: frame["Input"] + "<eos>"
)

# Wrap the frame as a Hugging Face dataset.
dataset = Dataset.from_pandas(df)

# Tokenize with padding and truncation (labels are built later by the data collator)
def tokenize_function(examples):
    """Tokenize one batch of examples for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping whose "text" entry holds the strings to encode.

    Returns:
        The tokenizer's output for the batch, with every sequence padded or
        truncated to a length of 512 tokens.
    """
    # No return_tensors here: Dataset.map stores its columns in Arrow format,
    # so building torch tensors per batch only adds a conversion round-trip.
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split dataset: hold out 10% for evaluation. The fixed seed keeps the split
# identical across kernel restarts, so separate runs stay comparable.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# Data collator for batching.
# mlm=False selects causal (next-token) language modelling rather than masked-LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


Map:   0%|          | 0/429533 [00:00<?, ? examples/s]

In [14]:
from transformers import Trainer, TrainingArguments

In [15]:
# NOTE(review): bitsandbytes, trl and peft are not imported by any cell visible
# here — confirm this install is still needed, and pin versions if it is.
%pip install -q bitsandbytes trl peft -U

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [43]:
# Hyper-parameters and bookkeeping options handed to the Trainer.
# NOTE(review): apart from the doubled slash, output_dir is the directory the
# model was saved to right after initialisation, and overwrite_output_dir=True
# is set — confirm that reusing it for checkpoints is intended.
# NOTE(review): no evaluation schedule is configured here although an eval
# dataset is passed to the Trainer — confirm whether evaluation should run.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/NLP//model1",
    overwrite_output_dir=True,
    num_train_epochs=50,
    logging_steps=10,  # the recorded loss table advances in steps of 10
    learning_rate=2e-3,  # NOTE(review): the recorded loss is noisy — confirm this rate is intended
    bf16=True,
    do_train=True,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
)


In [44]:
# Trainer initialization
trainer = Trainer(
    model=model_mis,  # the LlamaForCausalLM built from `config`
    args=training_args,
    train_dataset=train_dataset,  # tokenized training split
    eval_dataset=eval_dataset,  # tokenized held-out split
    # `tokenizer=` is deprecated in Trainer.__init__ and raises a FutureWarning;
    # `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,  # handles batching
)

  trainer = Trainer(


In [45]:
# Fall back to EOS as the padding token only when the tokenizer has none.
# The dataset has already been padded with the tokenizer's existing pad token,
# and the language-modelling collator shares this tokenizer object; overwriting
# the pad token here would change which label positions get masked out of the
# loss (EOS targets instead of the real padding).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
import contextlib
import sys  # no longer used by this cell; kept in case later cells rely on it

# Capture whatever is written to stdout during training in a log file on Drive.
# The `with` blocks restore the previous sys.stdout and close the file even if
# training raises. Restoring via sys.__stdout__ instead would detach output from
# the notebook, because the kernel installs its own stdout stream.
with open("/content/drive/MyDrive/NLP/training_log.txt", "w") as log_file:
    with contextlib.redirect_stdout(log_file):
        trainer.train()

Step,Training Loss
10,4.7534
20,2.1434
30,1.5381
40,1.488
50,1.2265
60,1.4245
70,1.7364
80,1.3158
90,1.9234
100,1.9576


Step,Training Loss
10,4.7534
20,2.1434
30,1.5381
40,1.488
50,1.2265
60,1.4245
70,1.7364
80,1.3158
90,1.9234
100,1.9576


In [24]:
# Bengali prompt used to probe the trained model.
custom_input = "উন্নয়নে সিংড়া রাতে প্রায় ৫১লাখ টাকা ব্যয়ে নাটোর-বড়া মহাসড়কের শেরকোল হইতে"
# NOTE(review): this dict is replaced in the next cell before anything reads it.
input_dict = {'text': [custom_input]}

In [25]:
# Encode the prompt to token ids; the dict carries a single sequence under
# 'input_ids' and nothing else.
input_dict = {'input_ids': [tokenizer.encode(custom_input)]}
input_dict

{'input_ids': [[5795,
   3754,
   368,
   1704,
   1353,
   12449,
   136,
   48,
   1162,
   10228,
   14563,
   3815,
   368,
   11542,
   10387,
   47,
   629,
   15920]]}

In [26]:
# Wrap the encoded prompt as a one-row dataset so it can be passed to trainer.predict.
custom_dataset = Dataset.from_dict(input_dict)

In [27]:
# Run the prompt through the Trainer's prediction loop.
predictions = trainer.predict(custom_dataset)

Step,Training Loss
10,1.3935
20,1.5892
30,1.2435
40,1.2769
50,1.0415
60,1.1495
70,1.4351
80,1.1343
90,1.7107
100,1.7572


In [28]:
# Raw prediction array for the prompt.
# NOTE(review): despite the variable name, these presumably come from a single
# forward pass over the prompt rather than autoregressive generation — confirm
# whether model.generate was intended.
generated_outputs = predictions.predictions  # This will be logits

In [29]:
# Take the highest-scoring index along axis 2 at every position
# (presumably logits shaped (batch, sequence, vocab) — TODO confirm).
output_ids = torch.tensor(generated_outputs).argmax(dim=2)

In [30]:
# Decode the predicted ids of the first sequence back into text.
tokenizer.decode(output_ids[0])

',,, ও1   ব্যবসা করে  ও  <unk> করে ও'

In [None]:
# Display the raw predicted token ids.
output_ids

tensor([[118, 118,  22,  22, 118,  22, 118, 118, 118,  22, 118,  22,  22,  22,
          22,  22,  22,   1]])

In [None]:
# Persist the Trainer's model and the tokenizer side by side in "trained_model1".
trainer.save_model("/content/drive/MyDrive/NLP/trained_model1")
tokenizer.save_pretrained("/content/drive/MyDrive/NLP/trained_model1")

('/content/drive/MyDrive/NLP/trained_model1/tokenizer_config.json',
 '/content/drive/MyDrive/NLP/trained_model1/special_tokens_map.json',
 '/content/drive/MyDrive/NLP/trained_model1/tokenizer.json')