In [46]:
from transformers import AutoModelForCausalLM, GemmaConfig, AutoTokenizer, AutoModel, MistralConfig, MistralModel, MistralForCausalLM, LlamaConfig, LlamaForCausalLM
import torch
import torch.nn as nn
import torch.nn.init as init
import json
import pickle
import pandas as pd

### Code 1

In [47]:
tokenizer = AutoTokenizer.from_pretrained("test")

In [48]:
len(tokenizer.vocab)

32769

In [49]:
config = LlamaConfig(hidden_size=256,
                     vocab_size=len(tokenizer.vocab),
                     num_attention_heads=4,
                     num_key_value_heads=2,
                     num_hidden_layers=12,
                     intermediate_size=688,
                     max_position_embeddings=64)
config

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 688,
  "max_position_embeddings": 64,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "num_key_value_heads": 2,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.2",
  "use_cache": true,
  "vocab_size": 32769
}

In [50]:
model_mis = LlamaForCausalLM(config)

In [51]:
for i,j in model_mis.named_parameters():
  if j.requires_grad and len(j.size()) > 1:
    init.xavier_uniform_(j.data)

In [52]:
total_param=0
for i,j in model_mis.named_parameters():
    total_param += j.numel()
print(total_param/(10**6))

25.484032


In [53]:
model_mis.save_pretrained("model1")
tokenizer.save_pretrained("model1")

('model1/tokenizer_config.json',
 'model1/special_tokens_map.json',
 'model1/tokenizer.json')

### Code 2

In [54]:
from datasets import Dataset, DatasetDict
from datasets import load_dataset
import pandas as pd

In [109]:
df = pd.read_csv("data_list.csv")

In [108]:
df.head()

Unnamed: 0,Input,text
0,তোমার সাথে শেষ বার আবার বাধবো খেলা ঘড়আবার কাকচ...,তোমার সাথে শেষ বার আবার বাধবো খেলা ঘড়আবার কাকচ...
1,"আলো আধারের মাঝে, আমার দম বন্ধমন খুজছে আলোর উৎস...","আলো আধারের মাঝে, আমার দম বন্ধমন খুজছে আলোর উৎস..."
2,বাংলাদেশ জাতীয় তথ্য বাতায়নরাজশাহী বিভাগ---সি...,বাংলাদেশ জাতীয় তথ্য বাতায়নরাজশাহী বিভাগ---সি...
3,মেনু নির্বাচন করুনপ্রথম পাতাআমাদের সম্পর্কেঅফি...,মেনু নির্বাচন করুনপ্রথম পাতাআমাদের সম্পর্কেঅফি...
4,'এশীয় অবকাঠামো বিনিয়োগ ব্যাংকের প্রতিষ্ঠাতা-সদ...,'এশীয় অবকাঠামো বিনিয়োগ ব্যাংকের প্রতিষ্ঠাতা-সদ...


In [110]:
df = df.iloc[:5000, :]
df

Unnamed: 0,Input
0,তোমার সাথে শেষ বার আবার বাধবো খেলা ঘড়আবার কাকচ...
1,"আলো আধারের মাঝে, আমার দম বন্ধমন খুজছে আলোর উৎস..."
2,বাংলাদেশ জাতীয় তথ্য বাতায়নরাজশাহী বিভাগ---সি...
3,মেনু নির্বাচন করুনপ্রথম পাতাআমাদের সম্পর্কেঅফি...
4,'এশীয় অবকাঠামো বিনিয়োগ ব্যাংকের প্রতিষ্ঠাতা-সদ...
...,...
4995,বাংলাদেশআন্তর্জাতিককমিউনিটিখেলাধুলাবিনোদনলেখাল...
4996,বিনোদনরণবীরের প্রশংসা করলেন আমির খানবলিউডের নত...
4997,বাংলাদেশআন্তর্জাতিককমিউনিটিখেলাধুলাবিনোদনলেখাল...
4998,বিশিষ্ট ইসলামী চিন্তাবিদ ও মিডিয়া ব্যক্তিত্ব এ...


In [111]:
df["text"] = df["Input"] + "<eos>"

In [112]:
tokenizer = AutoTokenizer.from_pretrained("test")

In [113]:
data_list = df["text"].to_list()

In [114]:
type(data_list)

list

In [115]:
input_ids = tokenizer(df["text"].to_list())["input_ids"]

In [152]:
token_list = []
for i in input_ids:
  token_list.extend(i)

In [147]:
len(token_list)

682004

In [120]:
df = pd.DataFrame(columns=["input_ids"])
df

Unnamed: 0,input_ids


In [148]:
print(len(token_list))

682004


In [153]:
context_len = 128    ## Taking less because I have less data
token_batch = []
for i in input_ids:
  token_batch.append(token_list[:context_len])
  token_list = token_list[context_len:]

In [154]:
len(token_batch[-1])

128

In [155]:
df["input_ids"] = token_batch
df

Unnamed: 0,input_ids
0,"[2520, 524, 875, 919, 1341, 14746, 1047, 2117,..."
1,"[8494, 1898, 472, 1556, 416, 2819, 14425, 7285..."
2,"[1464, 194, 74, 8475, 25527, 1, 1, 1, 1, 1, 24..."
3,"[4639, 4637, 4758, 4560, 1027, 4748, 4731, 304..."
4,"[1, 19665, 2128, 3961, 279, 6249, 4632, 6725, ..."
...,...
4995,"[3642, 4127, 2239, 626, 1160, 2383, 841, 264, ..."
4996,"[167, 1112, 125, 5403, 26765, 17985, 125, 1401..."
4997,"[5215, 27145, 217, 279, 29105, 3151, 15481, 37..."
4998,"[1564, 16021, 184, 27226, 27384, 163, 19532, 1..."


In [156]:
attn_mask = [[1]*128]*len(df)

In [157]:
df["attention_mask"] = attn_mask
df['labels'] = df['input_ids']

In [158]:
df.head()

Unnamed: 0,input_ids,attention_mask,labels
0,"[2520, 524, 875, 919, 1341, 14746, 1047, 2117,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2520, 524, 875, 919, 1341, 14746, 1047, 2117,..."
1,"[8494, 1898, 472, 1556, 416, 2819, 14425, 7285...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[8494, 1898, 472, 1556, 416, 2819, 14425, 7285..."
2,"[1464, 194, 74, 8475, 25527, 1, 1, 1, 1, 1, 24...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1464, 194, 74, 8475, 25527, 1, 1, 1, 1, 1, 24..."
3,"[4639, 4637, 4758, 4560, 1027, 4748, 4731, 304...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4639, 4637, 4758, 4560, 1027, 4748, 4731, 304..."
4,"[1, 19665, 2128, 3961, 279, 6249, 4632, 6725, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 19665, 2128, 3961, 279, 6249, 4632, 6725, ..."


In [159]:
hf_dataset = Dataset.from_pandas(df)
hf_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5000
})

In [160]:
split_dataset = hf_dataset.train_test_split(test_size=0.1)  # Adjust test_size as needed

train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

In [161]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 4500
})

In [163]:
train_dataset.to_parquet("train.parquet")
eval_dataset.to_parquet("test.parquet")

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1542000

### Code 3

In [164]:
from transformers import Trainer, TrainingArguments

In [32]:
%pip install -q bitsandbytes trl peft -U

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [168]:
training_args = TrainingArguments(
    output_dir="./model1",
    overwrite_output_dir=True,
    num_train_epochs=1000,
    logging_steps=1,
    learning_rate=2e-3,
    bf16=False,
    do_train=True,
    per_device_train_batch_size=2,
    save_steps=20,
    save_total_limit=2,
    report_to="none",
)

In [169]:
trainer = Trainer(
    model=model_mis,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [170]:
tokenizer.pad_token = tokenizer.eos_token

In [171]:
trainer.train()

  0%|          | 0/2250000 [00:00<?, ?it/s]

{'loss': 10.3049, 'grad_norm': 2.339616298675537, 'learning_rate': 0.001999999111111111, 'epoch': 0.0}
{'loss': 10.3303, 'grad_norm': 1.983117938041687, 'learning_rate': 0.001999998222222222, 'epoch': 0.0}
{'loss': 10.3209, 'grad_norm': 4.155365943908691, 'learning_rate': 0.0019999973333333335, 'epoch': 0.0}
{'loss': 10.2785, 'grad_norm': 2.144869804382324, 'learning_rate': 0.0019999964444444443, 'epoch': 0.0}
{'loss': 10.1231, 'grad_norm': 5.248201370239258, 'learning_rate': 0.0019999955555555556, 'epoch': 0.0}
{'loss': 9.7937, 'grad_norm': 3.4602513313293457, 'learning_rate': 0.0019999946666666665, 'epoch': 0.0}
{'loss': 10.0758, 'grad_norm': 1.8965991735458374, 'learning_rate': 0.0019999937777777778, 'epoch': 0.0}
{'loss': 10.0378, 'grad_norm': 1.3583906888961792, 'learning_rate': 0.001999992888888889, 'epoch': 0.0}
{'loss': 8.8679, 'grad_norm': 2.532754898071289, 'learning_rate': 0.001999992, 'epoch': 0.0}
{'loss': 9.5998, 'grad_norm': 1.1671078205108643, 'learning_rate': 0.0019999

KeyboardInterrupt: 

In [172]:
custom_input = "উন্নয়নে সিংড়া রাতে প্রায় ৫১লাখ টাকা ব্যয়ে নাটোর-বড়া মহাসড়কের শেরকোল হইতে"
input_dict = {'text': [custom_input]}

In [173]:
input_dict = {'input_ids': [tokenizer.encode(custom_input)]}
input_dict

{'input_ids': [[5795,
   3754,
   368,
   1704,
   1353,
   12449,
   136,
   48,
   1162,
   10228,
   14563,
   3815,
   368,
   11542,
   10387,
   47,
   629,
   15920]]}

In [174]:
custom_dataset = Dataset.from_dict(input_dict)

In [175]:
predictions = trainer.predict(custom_dataset)

  0%|          | 0/1 [00:00<?, ?it/s]

In [176]:
generated_outputs = predictions.predictions  # This will be logits

In [177]:
output_ids = torch.argmax(torch.tensor(generated_outputs), dim=2)

In [178]:
tokenizer.decode(output_ids[0])

' সাথেহাল<unk> একটি<unk> যুদ্ধক্ষেত্র ও করে ও বাংলা<unk>,<unk> করে...বিস্তারিত<unk>। ও'

In [179]:
output_ids

tensor([[  118,  3098,     1,   375,     1, 18918,   166,   217,   166,   516,
             1,    22,     1,   217,  2236,     1,    31,   166]])

In [180]:
trainer.save_model("trained_model")
tokenizer.save_pretrained("trained_model")

('trained_model/tokenizer_config.json',
 'trained_model/special_tokens_map.json',
 'trained_model/tokenizer.json')