In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_classic.schema import Document

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
from dotenv import load_dotenv
import pandas as pd

In [None]:
# Load variables from a local .env file into the process environment.
# NOTE(review): no cell visible here reads an environment variable directly;
# presumably this is kept for API-key based providers (GoogleGenerativeAIEmbeddings
# is imported above but unused in the visible cells) — confirm it is still needed.
load_dotenv()

In [None]:
# Load the book table. Later cells read its 'tagged_description', 'isbn13'
# and 'title' columns.
books= pd.read_csv("clean_book.csv")

In [None]:
# Dump the tagged descriptions as plain text, one record per line.
# Written with plain file I/O rather than Series.to_csv: the CSV writer quotes
# any field containing a comma or a double quote (wrapping it in "..." and
# doubling inner quotes) and emits "" for missing values, which would leave
# CSV artefacts inside the text that later gets embedded.
with open('tagged_description.txt', 'w', encoding='utf-8', newline='\n') as fh:
    for tagged in books['tagged_description'].dropna():
        # Flatten embedded line breaks so each record stays on exactly one line;
        # downstream parsing treats every non-empty line as one document.
        fh.write(" ".join(str(tagged).splitlines()) + "\n")

In [None]:
# Read the text file back as LangChain document(s); split_lines_to_documents
# below joins their page_content and re-splits it line by line.
raw_doc = TextLoader('tagged_description.txt',encoding="utf-8").load()

In [None]:
def modify_isbn(isbn):
    """Convert an ISBN-like string to an integer by dropping non-digit characters.

    Hyphens, quotes, spaces and any other non-decimal characters are ignored.
    Because the result is an ``int``, leading zeros are not preserved.

    Args:
        isbn: String containing the ISBN, possibly with separators or stray
            punctuation around the digits.

    Returns:
        The decimal digits of ``isbn`` concatenated and parsed as an ``int``.

    Raises:
        ValueError: If ``isbn`` contains no decimal digits.
    """
    # str.isdecimal (unlike str.isdigit) rejects characters such as superscript
    # digits, which int() cannot parse.
    isbn_digits = ''.join(ch for ch in isbn if ch.isdecimal())
    if not isbn_digits:
        raise ValueError(f"no digits found in ISBN value: {isbn!r}")
    return int(isbn_digits)

In [None]:
def split_lines_to_documents(raw_doc):
    """Build one ``Document`` per non-empty line of the loaded text.

    The ``page_content`` of every item in ``raw_doc`` is joined and re-split by
    line. For each line, the first space-separated token is treated as an ISBN:
    if digits can be extracted from it, they are stored as ``metadata["isbn"]``
    and the rest of the line becomes the document text. Otherwise the whole
    line is kept as the text, so no words are lost.

    NOTE(review): this function does not undo CSV quoting. If the input file
    was produced by a CSV writer, literal quote characters stay in the text.

    Args:
        raw_doc: Sequence of loaded documents (objects with ``page_content``
            and optionally ``metadata``). ``None`` or an empty sequence yields
            an empty list.

    Returns:
        A list of ``Document`` objects. Metadata carries ``"source"`` (copied
        from the first input document, when present) and ``"isbn"`` (when one
        was found on the line).
    """
    def _extract_isbn(isbn_str):
        """Return the digits of ``isbn_str`` as an int, or None if unavailable."""
        try:
            return modify_isbn(isbn_str)
        except Exception:
            # Deliberate best-effort: modify_isbn may be undefined in a fresh
            # kernel (NameError) or may reject the token; fall back to the
            # local extraction below instead of aborting the whole run.
            pass
        digits = ''.join(ch for ch in isbn_str if ch.isdigit())
        try:
            return int(digits) if digits else None
        except ValueError:
            # isdigit() accepts a few characters int() cannot parse.
            return None

    docs = raw_doc or []
    full_text = "\n".join(getattr(d, "page_content", "") for d in docs)
    lines = [ln.strip() for ln in full_text.splitlines() if ln.strip()]

    source = None
    if docs and isinstance(docs, (list, tuple)) and getattr(docs[0], "metadata", None):
        source = docs[0].metadata.get("source")

    line_docs = []
    for line in lines:
        head, sep, rest = line.partition(" ")
        isbn_val = _extract_isbn(head) if sep else None
        # Only strip the first token when it really was an ISBN; otherwise the
        # token is ordinary text and must stay part of the content.
        desc = rest.strip() if isbn_val is not None else line

        md = {}
        if source:
            md["source"] = source
        if isbn_val is not None:
            md["isbn"] = isbn_val

        line_docs.append(Document(page_content=desc, metadata=md))

    return line_docs

# Build the per-line documents that are later embedded into the vector store.
texts = split_lines_to_documents(raw_doc)

# Quick sanity check on how many documents were produced.
print(f"Created {len(texts)} documents (one per non-empty line).")

In [None]:
# Preview only the first few documents; displaying the whole list floods the
# cell output and bloats the saved notebook.
texts[:5]

In [None]:
# Name of the bi-encoder (sentence-embedding) checkpoint.
BI_ENCODER_MODEL_NAME = 'all-MiniLM-L6-v2'
# NOTE(review): emb_model1 is not referenced by any other cell visible here;
# the vector store is built from a separate embeddings object. Confirm whether
# this direct SentenceTransformer instance is still needed.
emb_model1 = SentenceTransformer(BI_ENCODER_MODEL_NAME)

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
# LangChain embeddings wrapper used to build and reload the FAISS index.
# NOTE(review): this appears to name the same checkpoint as BI_ENCODER_MODEL_NAME
# (with the hub organisation prefix) — confirm, and consider a single constant.
embs=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Embed every per-line document and index the vectors in an in-memory FAISS store.
bi_vector_store = FAISS.from_documents(documents=texts, embedding=embs)

In [None]:
# Sample queries written as full book blurbs.
# NOTE(review): `query` is not used by any cell visible here; `query1` is used
# for the search against the reloaded index at the end of the notebook.
query="A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst the world has to offer. At its heart is a tale of the sacred bonds between fathers and sons, pitch-perfect in style and story, set to dazzle critics and readers alike."
query1="The Chronometer of Clouds is a thrilling middle-grade adventure about family legacy, environmental stewardship, and the difficult choices we face when given the power to change the very air we breathe."

In [None]:
# Short free-text query used for the retrieve-then-rerank cells below.
query2 = "a book to teach children about nature"

In [None]:
# First stage: over-fetch nearest neighbours from the bi-encoder index so the
# cross-encoder has a pool of candidates to re-rank.
CANDIDATE_COUNT = 50
candidates = bi_vector_store.similarity_search(query=query2, k=CANDIDATE_COUNT)

In [None]:
# Show the text of every first-stage candidate, in the order the search returned them.
for doc in candidates:
    print(doc.page_content)

In [None]:
from sentence_transformers import CrossEncoder

# Second stage: a cross-encoder that scores (query, passage) pairs; used below
# to re-rank the first-stage candidates.
CROSS_ENCODER_MODEL_NAME = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
reranker_model = CrossEncoder(CROSS_ENCODER_MODEL_NAME)


In [None]:
# One [query, passage] pair per candidate, in candidate order, so the returned
# scores line up index-for-index with `candidates`.
input_pairs = [[query2, doc.page_content] for doc in candidates]

relevance_scores = reranker_model.predict(input_pairs)

In [None]:
# Pair every candidate with its cross-encoder score and order best-first.
scored_candidates = sorted(
    ({'doc': doc, 'score': score} for doc, score in zip(candidates, relevance_scores)),
    key=lambda item: item['score'],
    reverse=True,
)

# Keep only the strongest matches after re-ranking.
FINAL_TOP_N = 10
top_scored = scored_candidates[:FINAL_TOP_N]
final_recommendations = [item['doc'] for item in top_scored]

print(f"\n--- Final Top {FINAL_TOP_N} Recommendations ---")
for rank, item in enumerate(top_scored, start=1):
    score = item['score']
    isbn = item['doc'].metadata.get('isbn')

    # Resolve the ISBN to a plain title string; printing the filtered Series
    # itself would show its index/dtype repr instead of the title.
    # NOTE(review): assumes books['isbn13'] holds integers comparable to the
    # int ISBN stored in the document metadata — confirm the column dtype.
    name = 'No Title Available'
    if isbn is not None:
        matching_titles = books.loc[books['isbn13'] == isbn, 'title']
        if not matching_titles.empty:
            name = matching_titles.iloc[0]

    isbn_label = isbn if isbn is not None else 'No ISBN Available'
    print(f"{rank}. {isbn_label} ,{name} (Score: {score:.4f})")

In [None]:
# Persist the index to the "vector_store" folder so it can be reloaded without re-embedding.
bi_vector_store.save_local(folder_path="vector_store")

In [None]:
# Reload the saved index, passing the same embeddings object that was used to build it.
# allow_dangerous_deserialization=True opts in to loading pickled data.
# NOTE(review): acceptable only because the folder was written by this notebook;
# never point this at a directory from an untrusted source.
local_store=FAISS.load_local(
    "vector_store",
    embs,
    allow_dangerous_deserialization=True
)

In [None]:
# Query the reloaded index with the same candidate pool size as the first-stage
# search above, instead of repeating the literal 50.
retrieved = local_store.similarity_search(query1, k=CANDIDATE_COUNT)