<a href="https://colab.research.google.com/github/Nielia2002/sri-lanka-hotel-assistant-llm/blob/main/hotel_assistant_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1) Install required libraries
!pip install --quiet transformers datasets accelerate peft

# 2) Imports
import pandas as pd
from datasets import Dataset
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model

# 3) Load csv file of hotel data and drop duplicates
df = pd.read_csv("ProcessedHotels.csv") \
       .drop_duplicates(subset=["name","address","description"])
print(f"Loaded {len(df)} unique records")

# 4) Build augmented QA examples in-memory
rows = []
for _, r in df.iterrows():
    name      = r["name"]
    addr      = r["address"]
    all_amens = r["all_amenities"]
    top5      = ", ".join(all_amens.split(",")[:5])
    email     = r["email"]
    phone     = r["phone"]
    price     = r["pricerange"]
    rating    = r["rating"]
    cls       = r["hotelclass"]
    website   = r["website"]

    rows += [
        {"prompt": f"List all amenities at {name} in {addr}.",
         "response": all_amens},
        {"prompt": f"What are the top 5 amenities at {name} in {addr}?",
         "response": top5},
        {"prompt": f"Give me the email and phone for {name}.",
         "response": f"{email}, {phone}"},
        {"prompt": f"What is the price range for {name}?",
         "response": price},
        {"prompt": f"What is the guest rating and class for {name}?",
         "response": f"{rating} stars, class {cls}"},
        {"prompt": f"Where can I find more info online about {name}?",
         "response": website},
        {"prompt": f"Summarize {name}: location, price, rating, and key amenities.",
         "response": (
             f"{name} is at {addr}. Price: {price}. Rating: {rating} stars. "
             f"Key amenities: {top5}."
         )}
    ]

aug_df = pd.DataFrame(rows)
print(f"Generated {len(aug_df)} QA examples")

# 5) Convert to HF Dataset and split 90/10
ds = Dataset.from_pandas(aug_df)
splits = ds.train_test_split(test_size=0.1, seed=42)
train_ds, eval_ds = splits["train"], splits["test"]
print(f"Train size: {len(train_ds)}, Eval size: {len(eval_ds)}")

# 6) Load tokenizer & model with LoRA
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small").to(device)
lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q","v"],
    lora_dropout=0.05,
    bias="none"
)
model = get_peft_model(base_model, lora_cfg)

# 7) Preprocessing function
def preprocess(batch):
    return tokenizer(
        batch["prompt"],
        text_target=batch["response"],
        max_length=128,
        padding="max_length",
        truncation=True
    )

# Apply tokenization: drop only 'prompt' and 'response'
train_ds = train_ds.map(
    preprocess,
    batched=True,
    remove_columns=["prompt", "response"]
)
eval_ds = eval_ds.map(
    preprocess,
    batched=True,
    remove_columns=["prompt", "response"]
)

# 8) Training arguments
training_args = TrainingArguments(
    output_dir="hotel-ft-final",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    learning_rate=3e-4,
    fp16=True,
    logging_steps=100,
    save_strategy="no",
    report_to="none"
)

# 9) Trainer & train
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator
)
trainer.train()

# 10) Save the fine‐tuned model & tokenizer
model.save_pretrained("hotel-ft-final")
tokenizer.save_pretrained("hotel-ft-final")
print("fine-tuning complete! Artifacts in ./hotel-ft-final/")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/6186 [00:00<?, ? examples/s]

Map:   0%|          | 0/688 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,0.0
200,0.0
300,0.0
400,0.0
500,0.0
600,0.0
700,0.0
800,0.0
900,0.0
1000,0.0


fine-tuning complete! Artifacts in ./hotel-ft-final/


In [4]:
# Reload your fine-tuned model & tokenizer for direct testing (no RAG)
from transformers import pipeline
import torch

device = 0 if torch.cuda.is_available() else -1
generator = pipeline(
    "text2text-generation",
    model="hotel-ft-final",
    tokenizer="hotel-ft-final",
    device=device,
    max_new_tokens=60,
    no_repeat_ngram_size=3,
    early_stopping=True
)

# 20 direct test prompts (🔸 bullets)
prompts = [
    "🔸 List the top 5 amenities at Ivy Lane Colombo.",
    "🔸 What is the price range and rating of Hotel Eurolanka?",
    "🔸 Are pets allowed at Breeze Apartment?",
    "🔸 What languages are spoken by staff at Ivy Lane Colombo?",
    "🔸 Is airport transportation available at Steps Backpackers Hostel?",
    "🔸 Provide a brief overview of Breeze Apartment’s location advantages."
]

# Run each prompt and display the response
for prompt in prompts:
    print(prompt)
    response = generator(prompt, do_sample=False)[0]["generated_text"].strip()
    print("→", response, "\n")


Device set to use cuda:0


🔸 List the top 5 amenities at Ivy Lane Colombo.
→ Ivy Lane Colombo is a 5-star hotel located in the heart of the city. 

🔸 What is the price range and rating of Hotel Eurolanka?
→ rated 3 out of 5 

🔸 Are pets allowed at Breeze Apartment?
→ No pets are allowed at Breeze Apartments. 

🔸 What languages are spoken by staff at Ivy Lane Colombo?
→ English Language 

🔸 Is airport transportation available at Steps Backpackers Hostel?
→ is available at Steps Backpackers Hostel 

🔸 Provide a brief overview of Breeze Apartment’s location advantages.
→ Breeze Apartments offers a wide range of amenities, including a 24-hour front desk, 24-hour room service, and a fully equipped kitchen. 

