In [None]:
import os
import torch 
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, get_cosine_schedule_with_warmup
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import pandas as pd 

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with a token read from the
# environment, so that no credential is stored in the notebook itself.
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
if not hf_token:
    print("Warning: HUGGINGFACE_TOKEN environment variable not set")
else:
    login(hf_token)

In [3]:
# 8-bit bitsandbytes quantisation; set use_quant_config to False to load the
# checkpoint without a quantisation config.
use_quant_config = True
quant_config = BitsAndBytesConfig(load_in_8bit=True)

# Device that batches are moved to by the loss/accuracy helpers.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Other checkpoints (currently disabled):
# model_id="meta-llama/Llama-3.2-3B-Instruct"
# model_id='meta-llama/Llama-3.2-1B'
model_id = 'google/gemma-3-1b-it'

# Right-side padding keeps the real tokens at the start of every sequence.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='right')
llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation='eager',
    quantization_config=quant_config if use_quant_config else None,
)

In [4]:
# Display the loaded model's module tree (layer types and shapes) as the cell output.
llm_model

Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 1152, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear8bitLt(in_features=1152, out_features=1024, bias=False)
          (k_proj): Linear8bitLt(in_features=1152, out_features=256, bias=False)
          (v_proj): Linear8bitLt(in_features=1152, out_features=256, bias=False)
          (o_proj): Linear8bitLt(in_features=1024, out_features=1152, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear8bitLt(in_features=1152, out_features=6912, bias=False)
          (up_proj): Linear8bitLt(in_features=1152, out_features=6912, bias=False)
          (down_proj): Linear8bitLt(in_features=6912, out_features=1152, bias=False)
          (act_fn): PytorchGELUTanh()
        )
  

In [5]:
# Start from a fully frozen model; selected tensors are re-enabled below.
llm_model.requires_grad_(False)

# Last decoder layer: only floating-point tensors can be trained, so the int8
# (quantised) weights are reported and left frozen.
for name, param in llm_model.model.layers[-1].named_parameters():
    if not param.is_floating_point():
        print(f'skipping-:{name}  type{param.dtype} ')
        continue
    print(f"available for grad {name} and it's type {param.dtype}")
    param.requires_grad = True

# The final norm and the output head are trained as well.
for module in (llm_model.model.norm, llm_model.lm_head):
    for param in module.parameters():
        param.requires_grad = True

skipping-:self_attn.q_proj.weight  typetorch.int8 
skipping-:self_attn.k_proj.weight  typetorch.int8 
skipping-:self_attn.v_proj.weight  typetorch.int8 
skipping-:self_attn.o_proj.weight  typetorch.int8 
available for grad self_attn.q_norm.weight and it's type torch.bfloat16
available for grad self_attn.k_norm.weight and it's type torch.bfloat16
skipping-:mlp.gate_proj.weight  typetorch.int8 
skipping-:mlp.up_proj.weight  typetorch.int8 
skipping-:mlp.down_proj.weight  typetorch.int8 
available for grad input_layernorm.weight and it's type torch.bfloat16
available for grad post_attention_layernorm.weight and it's type torch.bfloat16
available for grad pre_feedforward_layernorm.weight and it's type torch.bfloat16
available for grad post_feedforward_layernorm.weight and it's type torch.bfloat16


In [6]:
# text= 'what is protein'
# input_ids=tokenizer(text=text,return_tensors='pt')
# input_ids={k:v.to(device) for k,v in input_ids.items()}
# output=llm_model.generate(**input_ids,max_new_tokens=256)
# ans=tokenizer.decode(output[0],skip_special_tokens=True,stream=True)
# print('LLM REPLY-->',ans)

In [7]:
dum_dat={'Text':'do you think that these two are correlated in anyway. regardless now is the best time to “buy the dip” get in while ',
         'Datatype':'Objective'}

In [8]:
def prompt_generator(data:dict):
    """Render one record into the full training prompt, answer included.

    Args:
        data: mapping with a 'Text' entry (the content to classify) and a
            'Datatype' entry (its class label).

    Returns:
        The instruction / input / response prompt as a single string, ending
        with ``Labelled class: <Datatype>`` followed by the tokenizer's EOS
        token (read from the module-level ``tokenizer``).
    """
    # The prompt is assembled line by line so that the significant whitespace
    # (the leading " " line and the trailing space after "Noise") stays visible.
    prompt_lines = [
        " ",
        "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.",
        "",
        "### Instruction:",
        "You are an Ai classification system that Read and understand the given Text in input and analyze the Text then classify the Content Type .",
        "NO EXPLANATIONS is required . YOU must choose from One of following Classes:",
        "Labelled class: Noise ",
        "or Labelled class: Objective",
        "or Labelled class: Positive",
        "or Labelled class: Negative",
        "or Labelled class: QUESTION",
        "or Labelled class: Advertisement",
        "or Labelled class: Neutral sentiment",
        "or Labelled class: Miscellaneous",
        "Ensure Strictly that output is from above list.",
        "",
        "",
        "### Input:",
        f"{data['Text']}",
        "",
        "### Response:",
        f"Labelled class: {data['Datatype']}{tokenizer.eos_token}",
    ]
    return "\n".join(prompt_lines)

In [9]:
# print(prompt_generator(data=dum_dat))

In [10]:
# def prompt_generator(data:dict):
#     text=data['Text']
#     prompt=[]

#     prompt.append({
#         'role':'system',
#         'content':'''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
# ### Instruction:
# You are an Ai classification system that Read and understand the given Text in input and analyze the Text then classify the Content Type .
# NO EXPLANATIONS is required . YOU must choose from One of following Classes:
# Labelled class: Noise 
# or Labelled class: OBJECTIVE
# or Labelled class: POSITIVE
# or Labelled class: NEGATIVE
# or Labelled class: QUESTION
# or Labelled class: ADVERTISEMENT
# or Labelled class: NEUTRAL SENTIMENT
# or Labelled class: MISCELLANEOUS
# Ensure Strictly that output is from above list.'''
#     })
#     prompt.append({
#         'role':'user',
#         'content':f'### Input:\n{text}'
#     })

#     prompt.append({
#         'role':'assistant',
#         'content': 'Labelled class:'
#     })
#     return prompt

In [11]:
# print(prompt_generator(data=dum_dat)+Response_generator(data=dum_dat)+tokenizer.eos_token)

In [12]:
# Reuse the EOS token as the padding token; the dataset below pads every prompt to a fixed length.
tokenizer.pad_token = tokenizer.eos_token
class Instruction_datast(Dataset):
    """Tokenised instruction prompts with answer-only next-token labels.

    Every CSV row (columns ``Text`` and ``Datatype``) is rendered into a prompt
    by ``prompt_generator``, tokenised, and padded/truncated to one common
    length.  Indexing yields ``(input_ids, labels)``: the labels are the token
    ids shifted left by one position, with -100 everywhere except on the
    answer tokens (`` <Datatype><eos>``).

    Args:
        csv_file: anything ``pandas.read_csv`` accepts; must provide the
            ``Text`` and ``Datatype`` columns.
        tokenizer: object offering ``encode(text, ...)``, a batch ``__call__``
            returning ``input_ids`` / ``attention_mask`` tensors, and an
            ``eos_token`` attribute.
        Max_length: common sequence length; ``None`` selects the length of the
            longest encoded prompt.
    """

    def __init__(self,csv_file,tokenizer,Max_length):
        self.file=pd.read_csv(csv_file)
        self.data= self.file[['Text', 'Datatype']].to_dict(orient='records')

        all_response=self._full_response(data=self.data)
        # Unpadded encodings: used for the longest-length lookup and to locate
        # where each answer ends inside its prompt.
        self.encoded=[tokenizer.encode(response) for response in all_response]

        if Max_length is None:
            self.max_length=self._longest_length()
        else:
            self.max_length=Max_length

        encoded_text=tokenizer(all_response,return_tensors='pt',padding='max_length',truncation=True,max_length=self.max_length)
        encoded_id=encoded_text['input_ids']
        encoded_mask=encoded_text['attention_mask']

        # Next-token setup: inputs drop the last token, targets drop the first.
        self.input_ids=encoded_id[:,:-1]
        self.input_mask=encoded_mask[:, :-1]
        self.target_ids=encoded_id[: ,1:]
        self.target_mask=encoded_mask[:, 1:]

        answer_texts = [" "+item["Datatype"] + tokenizer.eos_token for item in self.data]
        self.answer_token_ids =[tokenizer.encode(ans, add_special_tokens=False) for ans in answer_texts]

        self.target_compare = self._answer_only_labels(
            targets=self.target_ids,
            target_mask=self.target_mask,
            encoded=self.encoded,
            answer_token_ids=self.answer_token_ids,
        )

    @staticmethod
    def _answer_only_labels(targets, target_mask, encoded, answer_token_ids):
        """Return a copy of ``targets`` with -100 on every non-answer position."""
        labels = targets.clone()
        for i in range(labels.shape[0]):
            row = labels[i]  # shape: [seq_len]
            answer_ids = torch.tensor(answer_token_ids[i])
            n_answer = answer_ids.shape[0]

            # The answer closes the unpadded prompt; the extra -1 accounts for
            # the targets being shifted left by one token.
            start = len(encoded[i]) - n_answer - 1
            end = start + n_answer

            if start >= 0 and torch.equal(row[start:end], answer_ids):
                answer_only = torch.full_like(row, -100)
                answer_only[start:end] = row[start:end]
                labels[i] = answer_only
            else:
                print('not found')
                # Fallback: the whole sequence stays supervised, but padding
                # (attention mask 0) must never be a loss target. Because the
                # pad token may equal EOS, the mask is used, not the token id.
                labels[i] = row.masked_fill(target_mask[i] == 0, -100)
        return labels

    def _longest_length(self):
        """Length of the longest unpadded encoding (0 when there are none)."""
        return max((len(encoding) for encoding in self.encoded),default=0)

    def _full_response(self,data:list[dict]):
        """Render every record into its prompt string via ``prompt_generator``."""
        return [prompt_generator(data=record) for record in data]

    def __len__(self):
        """Number of rows read from the CSV file."""
        return len(self.file)

    def __getitem__(self,index):
        """Return ``(input_ids, labels)`` for one sample; labels use -100 on ignored positions."""
        return self.input_ids[index],self.target_compare[index]




In [13]:
# Small hand-written batch of records.
# NOTE(review): this also rebinds dum_dat (a single dict in an earlier cell) to
# the same list object, so dum_dat changes type from here on.
dum_list = dum_dat = [
    dict(Text='what is a stock market', Datatype='Question'),
    dict(Text='nice too meet you', Datatype='positive'),
    dict(Text='stock goes up', Datatype='Question'),
]

In [14]:
# One dataset per split. Max_length=None makes each dataset pad to the length
# of its own longest prompt.
dataset_test = Instruction_datast('Vtwitter_test.csv', tokenizer, Max_length=None)
dataset_train = Instruction_datast('Vtwitter_train.csv', tokenizer, Max_length=None)
dataset_val = Instruction_datast('Vtwitter_validation.csv', tokenizer, Max_length=None)
# check.input_ids[1]

In [15]:
num_workers = 0
batch_size = 4

# Seed the global torch RNG before the loaders are used.
torch.manual_seed(123)

# Training batches are shuffled; drop_last discards a trailing partial batch.
train_loader = DataLoader(
    dataset_train,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

# Validation keeps the dataset order and every sample.
val_loader = DataLoader(
    dataset_val,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

# Fix: the test loader used to wrap dataset_train, so any "test" metric would
# have been computed on training data. It now reads the held-out test split.
test_loader = DataLoader(
    dataset_test,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [16]:
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    """Token-level accuracy of ``model`` over the labelled positions of a loader.

    Args:
        data_loader: iterable of ``(input_batch, target_batch)`` tensor pairs.
        model: callable whose result exposes ``.logits``; ``model.eval()`` is
            called first and the mode is not restored here.
        device: device the batches are moved to.
        num_batches: stop after this many batches; ``None`` uses all of them.

    Returns:
        correct / counted over all positions whose target is not -100, or NaN
        when no position was counted.
    """
    model.eval()
    n_correct, n_scored = 0, 0

    with torch.no_grad():
        for batch_idx, (input_batch, target_batch) in enumerate(data_loader):
            if num_batches is not None and batch_idx >= num_batches:
                break

            inputs = input_batch.to(device)
            labels = target_batch.to(device)

            # Most likely token at each position: [B, T, V] -> [B, T].
            predicted = model(inputs).logits.argmax(dim=-1)

            # Positions labelled -100 are excluded from the score.
            scored = labels.ne(-100)
            n_correct += (predicted.eq(labels) & scored).sum().item()
            n_scored += scored.sum().item()

    if n_scored > 0:
        return n_correct / n_scored
    return float('nan')


In [17]:
import torch.nn.functional as F

def calc_loss_batch(input_batch, target_batch, model, device):
    """Cross-entropy loss of ``model`` on one batch.

    Args:
        input_batch: input tensor handed to ``model`` after moving to ``device``.
        target_batch: label tensor; positions equal to -100 are ignored.
        model: callable whose result exposes ``.logits`` (flattened here as
            ``[B*T, V]`` against ``[B*T]`` labels).
        device: device both tensors are moved to.

    Returns:
        The scalar loss tensor. A warning is printed (no exception is raised)
        when the logits contain NaN or Inf values.
    """
    inputs = input_batch.to(device)
    labels = target_batch.to(device)
    logits = model(inputs).logits

    # Report numerical problems but keep going.
    for has_bad_values, warning in (
        (torch.isnan, "⚠️ logits contain NaN!"),
        (torch.isinf, "⚠️ logits contain Inf!"),
    ):
        if has_bad_values(logits).any():
            print(warning)

    vocab_size = logits.size(-1)
    return F.cross_entropy(
        logits.view(-1, vocab_size),
        labels.view(-1),
        ignore_index=-100,
    )

In [18]:
def calc_loss_accuracy(dataloader,device,model,num_batches=None):
    """Mean ``calc_loss_batch`` loss over the first batches of a loader.

    Despite the name, only the loss is computed here; accuracy comes from
    ``calc_accuracy_loader``.

    Args:
        dataloader: sized iterable of ``(input_batch, target_batch)`` pairs.
        device: device forwarded to ``calc_loss_batch``.
        model: model forwarded to ``calc_loss_batch``.
        num_batches: number of batches to average over; ``None`` uses them all
            and larger values are capped at ``len(dataloader)``.

    Returns:
        The average loss as a float, or NaN when there is nothing to average
        (empty loader or ``num_batches <= 0``). This matches what
        ``calc_accuracy_loader`` returns for an empty evaluation and avoids the
        ZeroDivisionError the plain division would raise.
    """
    if num_batches is None:
        num_batches = len(dataloader)
    else:
        num_batches = min(num_batches, len(dataloader))

    if num_batches <= 0:
        return float('nan')

    total_loss = 0.0
    for i, (input_batch, target_batch) in enumerate(dataloader):
        if i >= num_batches:
            break
        loss = calc_loss_batch(input_batch=input_batch, target_batch=target_batch, model=model, device=device)
        total_loss += loss.item()
    return total_loss / num_batches

def evaluate_model(model, train_loader, vali_loader, device, eval_iter):
    """Loss and accuracy on the training and validation loaders.

    The model is put in eval mode, measured without gradient tracking on at
    most ``eval_iter`` batches per loader, then put back in training mode.

    Returns:
        ``(train_loss, vali_loss, train_acc, val_acc)``.
    """
    loaders = (train_loader, vali_loader)

    model.eval()
    with torch.no_grad():
        # Losses first, then accuracies; the training loader is always visited first.
        losses = [calc_loss_accuracy(loader, device, model, eval_iter) for loader in loaders]
        accuracies = [calc_accuracy_loader(loader, model, device, eval_iter) for loader in loaders]
    model.train()

    return losses[0], losses[1], accuracies[0], accuracies[1]


In [19]:

def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter,scheduler):
    """Run a plain training loop with periodic evaluation.

    For every training batch: reset gradients, compute the loss with
    ``calc_loss_batch``, backpropagate, then step the optimizer and the
    scheduler. Every ``eval_freq`` steps (step 0 included) the model is scored
    with ``evaluate_model`` on at most ``eval_iter`` batches per loader and a
    progress line is printed.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: one entry per
        evaluation, the last list holding the running count of input elements
        processed so far.
    """
    history = {"train": [], "val": [], "tokens": []}
    tokens_processed = 0
    step = -1  # the first finished update becomes step 0 and triggers an evaluation

    for epoch_idx in range(num_epochs):
        model.train()

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()
            optimizer.step()
            scheduler.step()

            # numel() counts every element of the batch.
            tokens_processed += input_batch.numel()
            step += 1

            if step % eval_freq != 0:
                continue

            train_loss, val_loss, train_acc, val_acc = evaluate_model(
                model, train_loader, val_loader, device, eval_iter)
            history["train"].append(train_loss)
            history["val"].append(val_loss)
            history["tokens"].append(tokens_processed)

            print(f"Ep {epoch_idx+1} (Step {step:06d}): "
                  f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f} | "
                  f"Train acc {train_acc*100:.2f}%, Val acc {val_acc*100:.2f}%")

    return history["train"], history["val"], history["tokens"]

In [21]:
import time

# Wall-clock timer for the whole training run.
start_time = time.time()

torch.manual_seed(123)

# All model parameters are handed to AdamW, including the frozen ones.
# NOTE(review): presumably only parameters that receive gradients are updated — confirm.
optimizer = torch.optim.AdamW(llm_model.parameters(), lr=5e-5, weight_decay=0.1)

num_epochs = 1
# One scheduler step is taken per training batch, so the schedule spans every batch of every epoch.
total_steps = len(train_loader) * num_epochs
warmup_steps = int(0.1 * total_steps)  # 10% warmup
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Evaluate every 40 steps on at most 25 batches from each loader.
train_losses, val_losses, tokens_seen = train_model_simple(
    llm_model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=40, eval_iter=25,scheduler=scheduler
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 0.583, Val loss 0.869 | Train acc 79.52%, Val acc 72.20%
Ep 1 (Step 000040): Train loss 0.669, Val loss 0.826 | Train acc 74.63%, Val acc 74.63%
Ep 1 (Step 000080): Train loss 0.655, Val loss 0.820 | Train acc 82.76%, Val acc 74.63%
Ep 1 (Step 000120): Train loss 0.571, Val loss 0.871 | Train acc 77.56%, Val acc 73.17%
Ep 1 (Step 000160): Train loss 0.550, Val loss 0.828 | Train acc 80.68%, Val acc 73.66%
Ep 1 (Step 000200): Train loss 0.613, Val loss 0.811 | Train acc 82.13%, Val acc 75.12%
Ep 1 (Step 000240): Train loss 0.647, Val loss 0.885 | Train acc 80.68%, Val acc 74.15%
Ep 1 (Step 000280): Train loss 0.620, Val loss 0.799 | Train acc 78.57%, Val acc 76.59%
Ep 1 (Step 000320): Train loss 0.582, Val loss 0.802 | Train acc 79.61%, Val acc 75.12%
Ep 1 (Step 000360): Train loss 0.620, Val loss 0.847 | Train acc 83.25%, Val acc 75.12%
Ep 1 (Step 000400): Train loss 0.523, Val loss 0.818 | Train acc 83.98%, Val acc 73.66%
Ep 1 (Step 000440): Train loss 0