In [1]:
# !pip install accelerate -U -q
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
import math
import json
import torch
import pickle
import pandas as pd
import torch.nn as nn
import torch.nn.init as init
# from transformers import AutoModelForCausalLM, GemmaConfig, AutoModel, MistralConfig, MistralModel, MistralForCausalLM
from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


### Code 1

In [3]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [4]:
tokenizer = AutoTokenizer.from_pretrained("./Tokenizer3")

In [5]:
tokenizer.pad_token_id = tokenizer.eos_token_id
len(tokenizer.vocab)

128001

In [6]:
config = LlamaConfig(hidden_size=256,
                     vocab_size=128000,
                     num_attention_heads=8,
                     num_key_value_heads=8,
                     num_hidden_layers=16,
                     intermediate_size=512,
                     max_position_embeddings=2048)
config

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 32,
  "hidden_act": "silu",
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 8,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.2",
  "use_cache": true,
  "vocab_size": 128000
}

In [7]:
model_mis = LlamaForCausalLM(config)

In [8]:
for i,j in model_mis.named_parameters():
  if j.requires_grad and len(j.size()) > 1:
    init.xavier_uniform_(j.data)

In [9]:
total_param=0
for i,j in model_mis.named_parameters():
    total_param += j.numel()
print(total_param/(10**6))

76.030208


In [10]:
model_mis.save_pretrained("./marathi_model_large_100epoch")
tokenizer.save_pretrained("./marathi_model_large_100epoch")

('./marathi_model_large_100epoch/tokenizer_config.json',
 './marathi_model_large_100epoch/special_tokens_map.json',
 './marathi_model_large_100epoch/tokenizer.json')

### Code 2

In [11]:
# !pip install datasets -q

In [12]:
# import pandas as pd
# from datasets import load_dataset
# from datasets import Dataset, DatasetDict

In [13]:
# df = pd.read_csv("extracted_data.csv")

In [14]:
# df.head()

In [15]:
# df = df.iloc[:2000, :]
# df

In [16]:
# df["text"] = df["text"].str.strip() + "<end_of_sen>"

In [17]:
# tokenizer = AutoTokenizer.from_pretrained("Tokenizer3")

In [18]:
# df.iloc[0, -1]

In [19]:
# input_ids = tokenizer(df["text"].to_list()[:])["input_ids"]

In [20]:
# token_list = []
# for i in input_ids:
#   token_list.extend(i)

In [21]:
# token_list = token_list + [3] * 180
# len(token_list)

In [22]:
# df = pd.DataFrame(columns=["input_ids"])
# df.shape

In [23]:
# # import numpy as np
# # from tqdm import tqdm

# # context_len = 256      ## Taking less because I have less data
# # token_batch = []
# # with tqdm(total=len(input_ids)) as pbar:
# #   for i in input_ids: 
# #     token_batch.append(token_list[:context_len])
# #     token_list = token_list[context_len:]
# #     pbar.update(1)
# # token_list.extend([3, 3])
# # token_list = np.array(token_list)
# # token_batch = np.array_split(token_list, len(token_list) // context_len + 1)
# import numpy as np
# from tqdm import tqdm

# context_len = 256  # Taking less because I have less data
# token_batch = []

# # Calculate the number of batches
# num_batches = (len(token_list) + context_len - 1) // context_len

# with tqdm(total=num_batches) as pbar:
#     for i in range(num_batches):
#         token_batch.append(token_list[i * context_len:(i + 1) * context_len])
#         pbar.update(1)

# # Convert to numpy array if needed
# # token_batch = np.array(token_batch)

In [24]:
# len(token_batch), len(token_batch[-1])

In [25]:
# df["input_ids"] = token_batch
# df

In [26]:
# attn_mask = [[1]*context_len]*len(df)

In [27]:
# df["attention_mask"] = attn_mask
# df['labels'] = df['input_ids']

In [28]:
# df.head()

In [29]:
# hf_dataset = Dataset.from_pandas(df)
# hf_dataset

In [30]:
# split_dataset = hf_dataset.train_test_split(test_size=0.1)  # Adjust test_size as needed

# train_dataset = split_dataset['train']
# eval_dataset = split_dataset['test']

In [31]:
# train_dataset

In [32]:
# train_dataset.to_parquet("marathi_dataset_token_train_large.parquet")
# eval_dataset.to_parquet("marathi_dataset_token_test_large.parquet")

### Code 3

In [33]:
from transformers import Trainer, TrainingArguments, TrainerCallback

In [34]:
#!pip install -q bitsandbytes trl peft -U

In [35]:
from datasets import load_dataset
# dataset = load_dataset('parquet', data_files=
#  {'train': './marathi_dataset_token_train_large.parquet',
#   'test': './marathi_dataset_token_test_large.parquet'})
dataset = load_dataset('parquet', data_files=
 {'train': './marathi_dataset_token_train.parquet',
  'test': './marathi_dataset_token_test.parquet'})

In [36]:
import wandb
# 6d0aee422e6256561f2b4ef7fd7ddb3df99f44fb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmukul-potta[0m ([33mmukul-potta-indian-institute-of-technology-gandhinagar[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [37]:
#!pip install -q bitsandbytes trl peft -U
import os
# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]="marathi_nlp"

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="checkpoint"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="all"

In [38]:
training_args = TrainingArguments(
    output_dir="./marathi_model_large_100epoch",
    overwrite_output_dir=True,
    num_train_epochs=100,
    eval_strategy="steps",
    save_strategy='epoch',
    # logging_steps=2393,
    logging_steps=10,
    # eval_steps=2393,
    eval_steps=11,
    learning_rate=2e-3,
    bf16=True,
    do_train=True,
    do_eval=True,
    # per_device_train_batch_size=32,
    per_device_train_batch_size=16,
    save_steps=5000,
    save_total_limit=2,
    # report_to="wandb",
    run_name='final_MCB_5epoch',
    report_to="none",
)

In [39]:
per_log_epoch = []
epoch_perplex = []

per_log_eval = []
per_log_eval_step = []
eval_perplex = []

In [40]:
epoch_cur = 1

In [41]:
from transformers.trainer_callback import TrainerControl, TrainerState
from transformers.training_args import TrainingArguments


class PerplexityCallback(TrainerCallback):

    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        global epoch_cur
        if state.log_history:  
            for entry in reversed(state.log_history):
                if "eval_loss" in entry:
                    # print(entry)
                    if state.epoch == epoch_cur:
                        epoch_cur += 1
                        eval_loss = entry["eval_loss"]
                        perplexity = math.exp(eval_loss)
                        per_log_epoch.append(state.epoch)
                        epoch_perplex.append(perplexity)
                        print(f"Epoch {state.epoch}")
                        print(f"Perplexity = {perplexity:.4f}")
                        break
                    else:
                        continue

    def on_evaluate(self, args, state, control, model=None, **kwargs):
        if state.log_history:  
            for entry in reversed(state.log_history):
                if "eval_loss" in entry:
                    # print(entry)
                    eval_loss = entry["eval_loss"]
                    per_log_eval.append(eval_loss)
                    perplexity = math.exp(eval_loss)
                    step = entry["step"]
                    per_log_eval_step.append(step)
                    eval_perplex.append(perplexity)
                    # print(f"Step {step}")
                    # print(f"Perplexity = {perplexity:.4f}")
                break

In [42]:
train_data = dataset['train']
train_data

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1800
})

In [43]:
eval_data = dataset['test']
eval_data

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 200
})

In [44]:
trainer = Trainer(
    model=model_mis,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    processing_class=tokenizer,
    callbacks=[PerplexityCallback()],
)

In [45]:
training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=11,
eval_strategy=steps,
eval_use_gather_object=Fal

In [46]:
tokenizer.pad_token = tokenizer.eos_token

In [47]:
trainer.train()

Step,Training Loss,Validation Loss
11,10.8329,9.953817
22,9.5849,9.720651
33,9.6626,9.722943
44,9.4785,9.588073
55,9.4676,9.479609
66,9.4202,9.434042
77,9.3672,9.345419
88,9.3204,9.325193
99,9.2736,9.305307
110,9.2119,9.271715


Epoch 1.0
Perplexity = 10632.9736
Epoch 2.0
Perplexity = 11380.1874
Epoch 3.0
Perplexity = 11751.0111
Epoch 4.0
Perplexity = 11726.0468
Epoch 5.0
Perplexity = 14034.4680
Epoch 6.0
Perplexity = 16839.8036
Epoch 7.0
Perplexity = 24078.3050
Epoch 8.0
Perplexity = 37244.6573
Epoch 9.0
Perplexity = 71605.5866
Epoch 10.0
Perplexity = 139767.2052
Epoch 11.0
Perplexity = 267824.1736
Epoch 12.0
Perplexity = 489995.2210
Epoch 13.0
Perplexity = 849138.4609
Epoch 14.0
Perplexity = 1493328.9823
Epoch 15.0
Perplexity = 2280075.2606
Epoch 16.0
Perplexity = 3793168.1760
Epoch 17.0
Perplexity = 6278252.2511
Epoch 18.0
Perplexity = 8800593.6943
Epoch 19.0
Perplexity = 13102270.9397
Epoch 20.0
Perplexity = 17684313.3443
Epoch 21.0
Perplexity = 23430724.1232
Epoch 22.0
Perplexity = 32675603.3413
Epoch 23.0
Perplexity = 39583434.9714
Epoch 24.0
Perplexity = 51037620.6156
Epoch 25.0
Perplexity = 68687140.4503
Epoch 26.0
Perplexity = 83355928.6995
Epoch 27.0
Perplexity = 100294954.3461
Epoch 28.0
Perplexity 

TrainOutput(global_step=11300, training_loss=0.85618089201242, metrics={'train_runtime': 2876.2874, 'train_samples_per_second': 62.581, 'train_steps_per_second': 3.929, 'total_flos': 2990283816960000.0, 'train_loss': 0.85618089201242, 'epoch': 100.0})

In [48]:
perplex_df = pd.DataFrame({'Steps': per_log_eval_step, 'evalloss': per_log_eval, 'Perplexity': eval_perplex})
perplex_df.to_csv("perplexity_100Epoch.csv", index=False)

In [49]:
epochs_df = pd.DataFrame({'Epoch': per_log_epoch, 'Perplexity': epoch_perplex})
epochs_df.to_csv("Epoch_Matrix.csv", index=False)

In [50]:
trainer.save_model("./trained_model_large_100epoch")
tokenizer.save_pretrained("./trained_model_large_100epoch")

('./trained_model_large_100epoch/tokenizer_config.json',
 './trained_model_large_100epoch/special_tokens_map.json',
 './trained_model_large_100epoch/tokenizer.json')