In [None]:
# Mount Google Drive inside the Colab VM; the adapter directory and CSV files
# referenced by later cells live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
try:
  import unsloth
  from unsloth import FastLanguageModel
except:
  # must change runtime to nvidia GPU
  !pip install unsloth -q
  import unsloth
  from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer
from datasets import load_dataset # To load the validation data again if needed

# --- Configuration ---
# Identifier of the base checkpoint the LoRA adapter is loaded onto.
BASE_MODEL_NAME = "unsloth/gemma-3-4b-it"

# Google Drive directory holding the saved adapter files.
LORA_ADAPTER_DIR = (
    "/content/drive/MyDrive/PROJECTS/Customer Feedback ML Engineer Project"
    "/Gemma 3 Parameters/unsloth_gemma3_finetuned_adapters_csv"
    "/final_lora_adapter"
)

# Loader settings forwarded to FastLanguageModel.from_pretrained.
MAX_SEQ_LENGTH = 2048  # passed as max_seq_length
LOAD_IN_4BIT = True  # passed as load_in_4bit

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.4/46.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.5/203.5 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.0/129.0 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.5/31.5 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.2/865.2 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# --- Load Model and Tokenizer with Adapter ---
# Step 1: load the base checkpoint. The tokenizer returned here is the one
# shipped with BASE_MODEL_NAME, not one saved next to the adapter. If the chat
# template turns out to be missing, it can be re-applied with:
#   tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")
base_load_options = dict(
    model_name=BASE_MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=LOAD_IN_4BIT,
)
print("Loading base model and tokenizer...")
model, tokenizer = FastLanguageModel.from_pretrained(**base_load_options)
print("Base model loaded.")

# Step 2: load the LoRA adapter on top of the base model. LORA_ADAPTER_DIR must
# be the directory that contains adapter_config.json etc.
# NOTE(review): no explicit merge call is made in this cell, so presumably the
# adapter stays attached as separate layers rather than being merged - confirm.
print(f"Loading LoRA adapter from: {LORA_ADAPTER_DIR}")
model.load_adapter(LORA_ADAPTER_DIR)
print("LoRA adapter loaded.")

# Step 3 (optional safety net): register a pad token when the tokenizer has
# none, and grow the embedding matrix to cover the new vocabulary entry.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.add_special_tokens({"pad_token": "<pad>"})
    model.resize_token_embeddings(len(tokenizer))
    print("Added pad token to tokenizer.")

Loading base model and tokenizer...
==((====))==  Unsloth 2025.4.3: Fast Gemma3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/4.56G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

Base model loaded.
Loading LoRA adapter from: /content/drive/MyDrive/PROJECTS/Customer Feedback ML Engineer Project/Gemma 3 Parameters/unsloth_gemma3_finetuned_adapters_csv/final_lora_adapter
LoRA adapter loaded.


In [None]:
# --- Prepare for Inference ---
# This cell only reports whether a CUDA device is visible; it does not move
# the model anywhere.
cuda_status_message = (
    "CUDA is available. The model should utilize it automatically."
    if torch.cuda.is_available()
    else "WARNING: CUDA is not available. Inference might be slower."
)
print(cuda_status_message)

# Optional streamer handed to generate() so text is emitted token by token;
# configured to skip the prompt and special tokens.
streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

CUDA is available. The model should utilize it automatically.


In [None]:
import pandas as pd

# Load the formatted train/test CSVs from Drive. Per the rendered preview, each
# frame has an "Unnamed: 0" column and a "text" column holding a chat
# transcript in Gemma turn markup.
df_train = pd.read_csv("/content/drive/MyDrive/PROJECTS/Customer Feedback ML Engineer Project/train_df_formatted.csv")
df_test = pd.read_csv("/content/drive/MyDrive/PROJECTS/Customer Feedback ML Engineer Project/test_df_formatted.csv")

# A notebook cell renders only its final expression, so a bare
# `df_train.head()` in the middle of the cell is silently discarded.
# Show the training preview explicitly, then let the test preview be the
# cell's displayed result.
display(df_train.head())
df_test.head()

Unnamed: 0,text
0,<start_of_turn>user\nI can't talk with a huma...
1,<start_of_turn>user\nI have got to locate hte ...
2,"<start_of_turn>user\nI cannot pay, help me to ..."
3,<start_of_turn>user\nI want help speaking to c...
4,<start_of_turn>user\nI try to see th accepted ...


In [None]:
# Show the full transcript stored in the "text" column at row position 10.
df_test["text"].iloc[10]

'<start_of_turn>user\nI don\'t know how to delete my platinum account<end_of_turn>\n<start_of_turn>model\nThank you for getting in touch to us regarding the deletion of your {{Account Category}} account. I completely understand that you may be uncertain about the process, and I\'m here to guide you through it. To delete your {{Account Category}} account, you can follow these steps:\n\n1. Log in to your account on our website.\nIf you don\'t remember your login details, you can use the "Forgot Password" option to recover your account access.\n\n2. Once you are logged in, navigate to the account settings or profile section.\n\n3. Look for the option to manage your account or subscriptions. It may be labeled as "Delete Account," "Cancel Subscription," or something similar.\n\n4. Click on the delete/cancel option and follow any additional prompts or confirmation steps that are presented.\n\n5. If you encounter any difficulties or have further questions during the process, please don\'t hes

In [None]:
# --- Example Inference ---

# Option 1: hand-written prompt.
# Gemma 3 expects: <start_of_turn>user\nPROMPT<end_of_turn>\n<start_of_turn>model\n
# That markup is not written by hand here; apply_chat_template builds it from
# the role/content messages below.

# Your test prompt
prompt_text = "I don't know how to delete my platinum account"

user_turn = dict(role="user", content=prompt_text)

# Single-turn conversation; put earlier turns before `user_turn` to test
# multi-turn context.
messages = [user_turn]

In [None]:
# Render the conversation through the tokenizer's chat template.
#   tokenize=False             -> return the formatted string, not token ids
#   add_generation_prompt=True -> end the string with '<start_of_turn>model\n'
#                                 so the model continues with its own turn
template_options = {"tokenize": False, "add_generation_prompt": True}
formatted_prompt = tokenizer.apply_chat_template(messages, **template_options)

# Echo the exact string that will be tokenized, framed by banner lines.
print(
    "\n--- Formatted Prompt ---",
    formatted_prompt,
    "------------------------\n",
    sep="\n",
)


--- Formatted Prompt ---
<bos><start_of_turn>user
I don't know how to delete my platinum account<end_of_turn>
<start_of_turn>model

------------------------



In [None]:
# Tokenize the formatted prompt and move the tensors to the GPU when present.
# The chat template has already written the special tokens into the string
# (the printed prompt starts with <bos>), so tokenization must not add them a
# second time; otherwise the model sees a duplicated <bos> at the start.
inputs = tokenizer(
    [formatted_prompt],
    return_tensors="pt",
    add_special_tokens=False,
).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Generate the response; the streamer prints text as it is produced, between
# the two banner lines.
print("--- Generated Response ---")
outputs = model.generate(
    **inputs,
    max_new_tokens=150,  # Adjust as needed
    use_cache=True,      # Recommended for faster generation
    # temperature / top_p / top_k only take effect when sampling is enabled.
    # Request it explicitly instead of depending on whatever the checkpoint's
    # generation config happens to default to.
    do_sample=True,
    temperature=0.8,
    top_p=0.95,
    top_k=64,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    streamer=streamer,  # Pass the streamer here
)
print("------------------------\n")


--- Generated Response ---
I understand your confusion and uncertainty regarding the process of deleting your {{Account Category}} account. Don't worry, I'm here to guide you through each step to ensure a smooth and hassle-free experience. To delete your {{Account Category}} account, you can follow these simple instructions:

1. Log in to your account on our website or mobile app.
2. Navigate to the account settings or profile section.
3. Look for the option to manage or delete your account.
4. Follow the prompts to confirm your decision and complete the deletion process.

If you encounter any difficulties or have any further questions, please don't hesitate to reach out to our customer support team. They are available {{Customer Support Hours}}
------------------------



In [None]:
# --- Option 2: Get prompt from your validation set ---
# Rebuilds the validation split from the training CSV, picks one example,
# separates its prompt from its reference answer, and generates a response for
# the same prompt so the two can be compared by eye.


def split_prompt_and_response(example_text):
    """Split a formatted transcript into (prompt_section, reference_response).

    The text is split on the model-turn marker. The prompt section is
    everything before the first marker with the marker re-appended, so the
    model is asked to continue from the start of its own turn. The reference
    response is the segment right after the first marker with every
    "<end_of_turn>" removed and surrounding whitespace stripped, or the
    placeholder "[NO RESPONSE IN DATA]" when the marker never occurs.

    Intended for simple single-exchange transcripts; multi-turn text is not
    handled specially.
    """
    model_turn_marker = "<start_of_turn>model\n"
    parts = example_text.split(model_turn_marker)
    prompt_section = parts[0] + model_turn_marker  # Include the final model marker
    if len(parts) > 1:
        reference_response = parts[1].replace("<end_of_turn>", "").strip()
    else:
        reference_response = "[NO RESPONSE IN DATA]"
    return prompt_section, reference_response


print("\n--- Evaluating on Validation Data Example ---")
try:
    # Reload the dataset if needed (adjust path and split logic if necessary)
    full_dataset_val = load_dataset("csv", data_files="/content/drive/MyDrive/PROJECTS/Customer Feedback ML Engineer Project/train_df_formatted.csv", split="train")
    split_dataset_val = full_dataset_val.train_test_split(test_size=0.1, seed=42)  # Use same seed
    valid_dataset_eval = split_dataset_val["test"]

    # Get a sample from the validation set
    sample_index = 10  # Choose an index
    validation_example_text = valid_dataset_eval[sample_index]['text']
    print(f"Validation Example (Index {sample_index}):\n{validation_example_text}")

    user_prompt_section, actual_response = split_prompt_and_response(validation_example_text)

    print(f"\nExtracted Prompt Section:\n{user_prompt_section}")
    print(f"\nActual Response in Data:\n{actual_response}")

    # Tokenize the extracted prompt. The dataset text carries no <bos>, so the
    # tokenizer's default special-token handling is kept here.
    inputs_val = tokenizer([user_prompt_section], return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")

    # Size the generation budget from the reference answer measured in tokens.
    # A whitespace word count under-estimates the token count and cuts the
    # generated answer short. The count includes whatever special tokens the
    # tokenizer adds, which only makes the budget slightly more generous.
    reference_token_count = tokenizer([actual_response], return_tensors="pt")["input_ids"].shape[1]

    print("\n--- Generated Response for Validation Prompt ---")
    outputs_val = model.generate(
        **inputs_val,
        max_new_tokens=reference_token_count + 20,  # reference length plus a margin
        use_cache=True,
        # Sampling parameters are only honoured when sampling is enabled.
        do_sample=True,
        temperature=0.8, top_p=0.95, top_k=64,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        streamer=streamer,
    )
    print("--------------------------------------------\n")

except Exception as e:
    # Best-effort demo cell: report the problem instead of aborting the notebook.
    print(f"Could not load or process validation data for evaluation: {e}")


--- Evaluating on Validation Data Example ---


Generating train split: 0 examples [00:00, ? examples/s]

Validation Example (Index 10):
<start_of_turn>user
can uhelp me to use the {{Account Category}} profile<end_of_turn>
<start_of_turn>model
Unquestionably! I appreciate your interest in utilizing our {{Account Category}} account, which unlocks a plethora of additional features and benefits. Allow me to guide you through the process:

1. **Account Login:** Begin by signing in to your existing account.
2. **Access Account Settings:** Once you're logged in, navigate to your account settings or profile section.
3. **Upgrade Option:** Look for an option labeled "Upgrade" or "Switch Account Type" within these sections.
4. **{{Account Category}} Selection:** Choose the "{{Account Category}}" account type from the available options.
5. **Confirmation:** Follow the on-screen prompts to confirm your selection and complete the upgrade.

If you encounter any difficulties or have further inquiries about the {{Account Category}} account, please don't hesitate to reach out. I'm here to assist you every