## Phi3.5 Sentiment Finetune

For HearSay Project

In [1]:
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import os

# Enable CUDA optimization
os.environ["CUDA_LAUNCH_BLOCKING"] = "0"
torch.backends.cudnn.benchmark = True

# === Load model + tokenizer ===
max_seq_length = 1024
model_name = "microsoft/Phi-3.5-mini-instruct"

# Setup quantization config (replacing load_in_4bit)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,  # Double quantization for better performance
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

model = prepare_model_for_kbit_training(model)

# Setup LoRA configuration
lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
)

# Apply LoRA
model = get_peft_model(model, lora_config)

# Enable gradient checkpointing for memory efficiency
if hasattr(model, "enable_gradient_checkpointing"):
    model.enable_gradient_checkpointing()

tokenizer




`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaTokenizerFast(name_or_path='microsoft/Phi-3.5-mini-instruct', vocab_size=32000, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '<|endoftext|>', 'unk_token': '<unk>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=False),
	32000: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<|assistant|>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=True),
	32002: AddedToken("<|placeholder1|>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=

In [2]:
# Convert to HF dataset
#dataset = Dataset.from_list(examples)
import json
from datasets import Dataset
from datasets import load_dataset

dataset = load_dataset("json", data_files="finetunedabsa_edit.jsonl")["train"]
#dataset = dataset[:300]
#dataset = dataset.select(range(5000))
dataset[:2]

{'messages': [[{'role': 'system',
    'content': 'You are an AI assistant that extracts keywords from customer reviews. For each review, identify keywords and assign a sentiment score as a float between -1 and 1. Respond ONLY with valid JSON using this format:\n\n{"keywords": [{"review_id": "<id>", "keyword": "<word>", "sentiment": "<float>"}, ...]}'},
   {'role': 'user',
    'content': '1.) The quality of the fabric used in this dress is impeccable , but the design is outdated .'},
   {'role': 'assistant',
    'content': '{"keywords": [{"review_id": "1", "keyword": "fabric", "sentiment": "1"}, {"review_id": "1", "keyword": "impeccable", "sentiment": "1"}, {"review_id": "1", "keyword": "design", "sentiment": "-1"}, {"review_id": "1", "keyword": "outdated", "sentiment": "-1"}]}'}],
  [{'role': 'system',
    'content': 'You are an AI assistant that extracts keywords from customer reviews. For each review, identify keywords and assign a sentiment score as a float between -1 and 1. Respond

In [3]:
def extract_user_assistant_content(example):
    """
    Extract only the user query and assistant response from the messages field.
    
    Args:
        example: A single data example from the dataset
        
    Returns:
        dict: A dictionary with user_query and assistant_response fields
    """
    user_content = None
    assistant_content = None
    
    for message in example['messages']:
        if message['role'] == 'user':
            user_content = message['content']
        elif message['role'] == 'assistant':
            assistant_content = message['content']
    
    return {
        'text': user_content,
        'labels': assistant_content
    }

# Apply the transformation to each example in the dataset
simplified_dataset = dataset.map(extract_user_assistant_content)

# Keep only the columns we need
simplified_dataset = simplified_dataset.remove_columns(['messages'])

simplified_dataset[0:2]

{'text': ['1.) The quality of the fabric used in this dress is impeccable , but the design is outdated .',
  '2.) I was excited to purchase these shoes , but they were too narrow and uncomfortable to wear for extended periods of time .'],
 'labels': ['{"keywords": [{"review_id": "1", "keyword": "fabric", "sentiment": "1"}, {"review_id": "1", "keyword": "impeccable", "sentiment": "1"}, {"review_id": "1", "keyword": "design", "sentiment": "-1"}, {"review_id": "1", "keyword": "outdated", "sentiment": "-1"}]}',
  '{"keywords": [{"review_id": "2", "keyword": "shoes", "sentiment": "-1"}, {"review_id": "2", "keyword": "narrow", "sentiment": "-1"}, {"review_id": "2", "keyword": "shoes", "sentiment": "-1"}, {"review_id": "2", "keyword": "uncomfortable", "sentiment": "-1"}]}']}

In [4]:
from datasets import Dataset
import pandas as pd

# Convert our dataset format
def prepare_dataset(dataset):
    all_examples = []
    for text, labels in zip(dataset["text"], dataset["labels"]):
        # Create the complete example
        example = {
            "text": f"<|user|>{text}<|end|><|assistant|>{labels}<|endoftext|>"
        }
        all_examples.append(example)
    
    # Convert to pandas then to Dataset
    df = pd.DataFrame(all_examples)
    return Dataset.from_pandas(df)

# Create a new properly formatted dataset
formatted_dataset = prepare_dataset(simplified_dataset)


# === Trainer setup ===
def formatting_func(example):
    return example["text"]

trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,
    formatting_func=formatting_func,
    #max_seq_length=max_seq_length,
    #tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    ),
    args=TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        num_train_epochs=1,
        #max_steps=10,
        learning_rate=2e-5, #2e-5 for final run.
        logging_steps=2,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="phi3.5-model",
        report_to="none",
    ),
)

# === Train ===
trainer.train()

Applying formatting function to train dataset:   0%|          | 0/39693 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/39693 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/39693 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/39693 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/39693 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
You are not running the flash-attention implementation, expect numerical differences.


Step,Training Loss
2,1.7135
4,1.8164
6,1.6059
8,1.7608
10,1.623
12,1.6294
14,1.9284
16,1.7238
18,1.6414
20,1.7241


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


KeyboardInterrupt: 

In [None]:
# === Inference function ===

def generate_text(model, tokenizer, prompt, max_new_tokens=127, temperature=0.7, top_p=0.9):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    try:
        # First try to use the standard generate method with streamer disabled
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=(temperature > 0),
                pad_token_id=tokenizer.pad_token_id,
                use_cache=True,
                streamer=None
            )
        
        # Get only the generated text (not the prompt)
        generated_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        
    except AttributeError:
        # Fallback to manual generation if the standard method fails
        input_ids = inputs.input_ids
        
        # Manual token-by-token generation loop
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Get logits for next token
                outputs = model(input_ids=input_ids)
                next_token_logits = outputs.logits[:, -1, :]
                
                if temperature > 0:
                    # Apply temperature
                    next_token_logits = next_token_logits / temperature
                    
                    # Apply top-p (nucleus) sampling
                    if top_p < 1.0:
                        sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                        cumulative_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
                        
                        # Remove tokens with cumulative probability above the threshold
                        sorted_indices_to_remove = cumulative_probs > top_p
                        # Shift the indices to the right to keep also the first token above the threshold
                        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                        sorted_indices_to_remove[..., 0] = 0
                        
                        # Create a mask for indices to remove
                        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
                        next_token_logits[indices_to_remove] = -float('Inf')
                    
                    # Sample from the filtered distribution
                    probs = torch.nn.functional.softmax(next_token_logits, dim=-1)
                    next_token = torch.multinomial(probs, num_samples=1)
                else:
                    # Greedy decoding
                    next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
                
                # Append next token to input_ids
                input_ids = torch.cat([input_ids, next_token], dim=1)
                
                # Stop if we predict EOS token
                if next_token.item() == tokenizer.eos_token_id:
                    break
        
        # Get only the generated text (not the prompt)
        input_len = inputs.input_ids.shape[1]
        generated_text = tokenizer.decode(input_ids[0][input_len:], skip_special_tokens=True)
    
    return generated_text



# === Tests ===
print(generate_text(model=model, tokenizer=tokenizer, prompt="1) The design is the dress is outdated ."))
print("=====")
print(generate_text(model=model, tokenizer=tokenizer, prompt="2) I was really disappointed with the lack of cleanliness at the hospital . The floors were dirty ."))
print("=====")
print(generate_text(model=model, tokenizer=tokenizer, prompt="3) Tell me a joke."))

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from safetensors.torch import save_file
import subprocess
import os

def save_model_safetensors(model, output_path):
    """
    Save the model in safetensors format.
    
    Args:
        model: The model to save
        output_path: Path to save safetensors files
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)
    
    # Get state dict
    state_dict = model.state_dict()
    
    # Save in safetensors format
    for name, tensor in state_dict.items():
        # Split large models into parts if needed
        save_file({name: tensor}, f"{output_path}/{name.replace('.', '_')}.safetensors")
    
    print(f"Model saved in safetensors format at {output_path}")


save_path = "phi3.5-model-finetuned"

# Merge LoRA weights into base model
model = model.merge_and_unload()

# Save as hf format (as a .safetensors)
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

# Save per safetensor layer
safetensors_path = f"{save_path}-safetensors"
save_model_safetensors(model, safetensors_path)