In [20]:
# 1) Install required libraries
!pip install --quiet transformers datasets accelerate peft

# 2) Imports
import pandas as pd
from datasets import Dataset
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model

# 3) Load csv file of hotel data and drop duplicates
df = pd.read_csv("ProcessedHotels.csv") \
       .drop_duplicates(subset=["name","address","description"])
print(f"Loaded {len(df)} unique records")

# 4) Build augmented QA examples in-memory
rows = []
for _, r in df.iterrows():
    name      = r["name"]
    addr      = r["address"]
    all_amens = r["all_amenities"]
    top5      = ", ".join(all_amens.split(",")[:5])
    email     = r["email"]
    phone     = r["phone"]
    price     = r["pricerange"]
    rating    = r["rating"]
    cls       = r["hotelclass"]
    website   = r["website"]

    rows += [
        {"prompt": f"List all amenities at {name} in {addr}.",
         "response": all_amens},
        {"prompt": f"What are the top 5 amenities at {name} in {addr}?",
         "response": top5},
        {"prompt": f"Give me the email and phone for {name}.",
         "response": f"{email}, {phone}"},
        {"prompt": f"What is the price range for {name}?",
         "response": price},
        {"prompt": f"What is the guest rating and class for {name}?",
         "response": f"{rating} stars, class {cls}"},
        {"prompt": f"Where can I find more info online about {name}?",
         "response": website},
        {"prompt": f"Summarize {name}: location, price, rating, and key amenities.",
         "response": (
             f"{name} is at {addr}. Price: {price}. Rating: {rating} stars. "
             f"Key amenities: {top5}."
         )}
    ]

aug_df = pd.DataFrame(rows)
print(f"Generated {len(aug_df)} QA examples")

# 5) Convert to HF Dataset and split 90/10
ds = Dataset.from_pandas(aug_df)
splits = ds.train_test_split(test_size=0.1, seed=42)
train_ds, eval_ds = splits["train"], splits["test"]
print(f"Train size: {len(train_ds)}, Eval size: {len(eval_ds)}")

# 6) Load tokenizer & model with LoRA
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small").to(device)
lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q","v"],
    lora_dropout=0.05,
    bias="none"
)
model = get_peft_model(base_model, lora_cfg)

# 7) Preprocessing function
def preprocess(batch):
    return tokenizer(
        batch["prompt"],
        text_target=batch["response"],
        max_length=128,
        padding="max_length",
        truncation=True
    )

# Apply tokenization: drop only 'prompt' and 'response'
train_ds = train_ds.map(
    preprocess,
    batched=True,
    remove_columns=["prompt", "response"]
)
eval_ds = eval_ds.map(
    preprocess,
    batched=True,
    remove_columns=["prompt", "response"]
)

# 8) Training arguments
training_args = TrainingArguments(
    output_dir="hotel-ft-final",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    learning_rate=3e-4,
    fp16=True,
    logging_steps=100,
    save_strategy="no",
    report_to="none"
)

# 9) Trainer & train
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator
)
trainer.train()

# 10) Save the fine‐tuned model & tokenizer
model.save_pretrained("hotel-ft-final")
tokenizer.save_pretrained("hotel-ft-final")
print("fine-tuning complete! Artifacts in ./hotel-ft-final/")


Loaded 982 unique records
Generated 6874 QA examples
Train size: 6186, Eval size: 688


Map:   0%|          | 0/6186 [00:00<?, ? examples/s]

Map:   0%|          | 0/688 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
100,0.0
200,0.0
300,0.0
400,0.0
500,0.0
600,0.0
700,0.0
800,0.0
900,0.0
1000,0.0


fine-tuning complete! Artifacts in ./hotel-ft-final/


In [27]:
# 1) Load csv file and the model
import pandas as pd
from transformers import pipeline
import torch

df = pd.read_csv("ProcessedHotels.csv").drop_duplicates(
    subset=["name","address","description"]
)

device = 0 if torch.cuda.is_available() else -1
generator = pipeline(
    "text2text-generation",
    model="hotel-ft-final",
    tokenizer="hotel-ft-final",
    device=device,
    max_new_tokens=60,
    no_repeat_ngram_size=3,
    early_stopping=True
)

# 2) Refined helper to ask only first five amenities
def ask_hotel(hotel_name, question):
    rec = df[df.name == hotel_name].iloc[0]
    context = "\n".join([
        f"- Name: {rec.name}",
        f"- Address: {rec.address}",
        f"- Amenities: {rec.all_amenities}",
        f"- Email: {rec.email}",
        f"- Phone: {rec.phone}",
        f"- Price range: {rec.pricerange}",
        f"- Rating: {rec.rating} stars",
        f"- Website: {rec.website or 'N/A'}",
    ])
    prompt = (
        f"Here is the hotel information:\n{context}\n\n"
        f"Question: {question}\nAnswer:"
    )
    return generator(prompt, do_sample=False)[0]["generated_text"].strip()

# 3) Test only top-5 amenities with an explicit instruction
print("🔸 Top 5 amenities at Rockwell Colombo:")
print(ask_hotel(
    "Rockwell Colombo",
    "From the Amenities list above, list only the first five items as bullet points."
), "\n")

# 4) Verify other fields stay correct
print("🔸 Email & phone for Ivy Lane Colombo:")
print(ask_hotel("Ivy Lane Colombo", "What is the email and phone number?"), "\n")

print("🔸 Website for Rockwell Colombo:")
print(ask_hotel("Rockwell Colombo", "What is the website URL?"))


Device set to use cuda:0


🔸 Top 5 amenities at Rockwell Colombo:
Breakfast included, Dry Cleaning, Non-smoking rooms, Laundry Service, Concierge, Air conditioning, Multilingual Staff, Minibar, Safe, Flat-screen TV, Private Balcony, Bath / Shower, Bottled Water, Complimentary Instant Coffee 

🔸 Email & phone for Ivy Lane Colombo:
+94 11 2 575733 

🔸 Website for Rockwell Colombo:
www.rockwellcolombo.com
