In [1]:
import pandas as pd

df=pd.read_csv('/content/data.csv')
df.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0


In [2]:
df.shape

(6810, 12)

In [3]:
import pandas as pd

df=pd.read_csv('/content/data.csv')
df.head()

# Drop rows with missing values and make a clean copy
df = df.dropna(subset=["title", "authors", "categories", "description", "published_year"]).copy()

# Drop duplicates
df = df.drop_duplicates(subset="title").copy()

# Clean text fields
def clean(text):
    return str(text).replace("\n", " ").strip()

df["description"] = df["description"].apply(clean)
df["authors"] = df["authors"].str.replace(";", " & ", regex=False).apply(clean)
df["categories"] = df["categories"].apply(clean)

import random
import json

def create_instruction(category):
    templates = [
        f"Can you recommend a good {category.lower()} book?",
        f"I'm interested in {category.lower()}. Any book suggestions?",
        f"Suggest a {category.lower()} book worth reading."
    ]
    return random.choice(templates)

instruction_data = []

for _, row in df.iterrows():
    title = row["title"]
    author = row["authors"]
    category = row["categories"]
    year = int(row["published_year"])
    description = row["description"]

    prompt = create_instruction(category)
    response = f"You might enjoy *{title}* by {author}, published in {year}. {description}"

    sample = {
        "instruction": prompt,
        "input": "",
        "output": response
    }

    instruction_data.append(sample)

with open("clean_data.jsonl", "w", encoding="utf-8") as f:
    for ex in instruction_data:
        f.write(json.dumps(ex, ensure_ascii=False) + "\n")

print(f"✅ Cleaned and saved {len(instruction_data)} samples for LoRA training.")


✅ Cleaned and saved 6076 samples for LoRA training.


In [4]:
!pip install -q --no-cache-dir peft==0.8.2 bitsandbytes accelerate
!pip install transformers==4.28.0

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m236.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m156.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m148.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m164.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m206.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m216.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m232.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
import json
from datasets import Dataset

# Load your jsonl file manually
with open("/content/clean_data.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

# Create HF Dataset from list of dicts
dataset = Dataset.from_list(data)
dataset = dataset.select(range(1000))

print(dataset)

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 1000
})


In [6]:
from transformers import AutoTokenizer

model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)  # <-- Must be here BEFORE preprocess_function

def preprocess_function(examples):
    inputs = [f"{instr} {inp}".strip() for instr, inp in zip(examples["instruction"], examples["input"])]
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    labels = tokenizer(
        examples["output"],
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForSeq2SeqLM

base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v", "k", "o"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)  # Now 'model' is defined

training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-bookbuddy-lora-checkpoints",
    per_device_train_batch_size=2,
    max_steps=500,
    learning_rate=5e-4,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="no",
    # gradient_checkpointing=True,
    save_total_limit=2,
    fp16=False,  # disable fp16 on CPU
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
model.save_pretrained("flan-bookbuddy-adapter")

tokenizer.save_pretrained("flan-bookbuddy-adapter")




config.json: 0.00B [00:00, ?B/s]



model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


trainable params: 1,769,472 || all params: 249,347,328 || trainable%: 0.7096414524241463


Step,Training Loss
10,17.6254
20,17.119
30,14.7681
40,8.1933
50,5.0158
60,4.5399
70,4.3326
80,4.2348
90,4.0453
100,3.8997




('flan-bookbuddy-adapter/tokenizer_config.json',
 'flan-bookbuddy-adapter/special_tokens_map.json',
 'flan-bookbuddy-adapter/spiece.model',
 'flan-bookbuddy-adapter/added_tokens.json',
 'flan-bookbuddy-adapter/tokenizer.json')

In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
import torch

model_name = "google/flan-t5-base"
adapter_folder = "flan-bookbuddy-adapter"

# Load base model
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
model = PeftModel.from_pretrained(base_model, adapter_folder).eval()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(adapter_folder)

# Allowed prefixes for filtering user input
ALLOWED_PREFIXES = [
    "suggest", "recommend", "find", "give", "book", "novel", "short story",
    "what book", "books about", "book on", "good book"
]

def is_valid_book_prompt(prompt: str) -> bool:
    prompt_lower = prompt.lower()
    return any(prefix in prompt_lower for prefix in ALLOWED_PREFIXES)

# Basic word filter for toxicity
BANNED_WORDS = ["white people", "black people", "racist", "jews", "nazi", "suicide", "rape"]

def contains_toxicity(text: str) -> bool:
    return any(word in text.lower() for word in BANNED_WORDS)

# Master generation function with safeguards
def generate_response(prompt: str) -> str:
    if not is_valid_book_prompt(prompt):
        return "❌ Please ask a question related to book recommendations."

    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id
        )

    generated = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    if contains_toxicity(generated):
        return "⚠️ The model refused to answer this prompt due to policy restrictions."

    return generated

# Local test (comment this out in Space)
print(generate_response("Can you recommend a good detective book with a strong female lead?"))


You might enjoy *The Greatest Mystery Ever?* by John B. White, published in 1999. The story of a group of people who go to an unidentified man in the world's most dangerous city, with a group of men in search of a new lover. They are to go on a trip to a strange place, a strange world, a mysterious city, and a strange young woman who has a mysterious past. The book is
