In [1]:
from langchain_core.documents import Document

# Read the tag descriptions: one Document per line of the file, with only the
# trailing newline removed.
with open("tag_description.txt", encoding="utf-8") as handle:
    docs = [Document(page_content=raw.rstrip("\n")) for raw in handle]

# Trim surrounding whitespace, skipping documents whose content is empty.
texts = [doc.page_content.strip() for doc in docs if doc.page_content]

# Keep the first occurrence of every non-empty text. dict keys preserve
# insertion order, so the original line order survives the de-duplication.
dedup_texts = list(dict.fromkeys(text for text in texts if text))
seen = set(dedup_texts)

print(f"Original: {len(docs)} | After clean+dedup: {len(dedup_texts)}")


Original: 5197 | After clean+dedup: 5197


In [4]:
# Spot-check one cleaned line: it is still wrapped in double quotes and starts
# with the ISBN-13, which is the layout parse_isbn_line handles below.
dedup_texts[100]

'"9780060099459 It\'s Christmas Eve, and Amelia Bedelia has to get the Rogers house ready for a visit from Aunt Myra. But with Amelia Bedelia in charge, this Christmas will be one of the funniest ever!"'

In [5]:
import re

def parse_isbn_line(line: str) -> "tuple[str | None, str]":
    """Split a raw line into its leading ISBN-13 and the remaining text.

    Surrounding whitespace and any wrapping double quotes are removed first;
    the first whitespace-delimited token is then taken as the ISBN.

    Args:
        line: Raw line, e.g. ``'"9780060099459 Some description"'``.

    Returns:
        ``(isbn, text)``. ``text`` is ``""`` when nothing follows the ISBN.
        A line that is empty after cleaning yields ``(None, "")``.

    Raises:
        ValueError: If the first token is not exactly 13 ASCII digits.
    """
    line = line.strip().strip('"')   # strip spaces + optional wrapping quotes
    parts = line.split(maxsplit=1)
    if not parts:
        return None, ""

    isbn = parts[0]
    # [0-9] rather than \d: for str patterns \d also matches non-ASCII Unicode
    # digits, which the digits-only ([^0-9]) key normalisation used later in
    # this notebook would strip away, leaving a key that can never match.
    if not re.fullmatch(r"[0-9]{13}", isbn):
        raise ValueError(f"Line does not start with a valid ISBN-13: {line}")

    text = parts[1].strip() if len(parts) > 1 else ""
    return isbn, text


In [6]:
# Parse every de-duplicated line lazily and keep only the ones that carry
# description text after the ISBN.
parsed_pairs = (parse_isbn_line(raw) for raw in dedup_texts)
records = [{"isbn": code, "text": desc} for code, desc in parsed_pairs if desc]

print(f"Parsed {len(records)} rows with text; {sum(1 for r in records if r['isbn'])} have ISBNs")


Parsed 5197 rows with text; 5197 have ISBNs


In [7]:
from langchain_community.embeddings import HuggingFaceEmbeddings

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Embedding wrapper used for indexing and querying. The model is loaded on the
# CPU; encoding requests normalised vectors in batches of 64.
emb = HuggingFaceEmbeddings(
    model_name=MODEL_NAME,
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True, batch_size=64),
)


  emb = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [33]:
# Peek at the first parsed record: a dict with the 'isbn' and 'text' keys.
records[0]

{'isbn': '9780002005883',
 'text': 'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of 

In [32]:
# pandas is imported here because this is the first cell (in notebook order)
# that uses it; relying on the import in a later cell breaks Restart & Run All.
import pandas as pd

# One row per parsed record. Naming the columns keeps the schema stable even
# if `records` happens to be empty.
records_df = pd.DataFrame(records, columns=["isbn", "text"])

In [30]:
import pandas as pd
# Book-level table: identifiers, bibliographic fields and the emotion columns
# that get joined onto the parsed records further down.
BOOKS_CSV = 'books_data_classified_with_emotions.csv'
books = pd.read_csv(BOOKS_CSV)
books.columns

Index(['isbn13', 'isbn10', 'title', 'authors', 'categories', 'thumbnail',
       'description', 'published_year', 'average_rating', 'num_pages',
       'ratings_count', 'title_subtitle', 'tag_description',
       'simple_categories', 'anger', 'disgust', 'fear', 'joy', 'neutral',
       'sadness', 'surprise'],
      dtype='object')

In [36]:
# 1) Normalize both keys to comparable strings (digits only)
def norm_isbn(s: "pd.Series") -> "pd.Series":
    """Return the ISBN column as digit-only strings.

    Args:
        s: Series of ISBNs stored as strings, integers or floats.

    Returns:
        A ``string``-dtype Series with every non-digit character removed.
        Missing values stay missing.
    """
    # An integer column containing missing values is held as float by pandas.
    # Stringifying it directly yields "9780002005883.0", and stripping the
    # non-digits would then leave a spurious trailing "0". Go through the
    # nullable integer dtype first so the text form has no decimal part.
    if s.dtype.kind == "f":
        s = s.round().astype("Int64")
    return (s.astype("string")                # robust to ints/NaN
             .str.replace(r"[^0-9]", "", regex=True))

# 2) Build zero-padded, 13-character join keys on both sides.
#    Start from the two parsed columns only, so re-running this cell never
#    merges onto an already enriched frame (which would produce duplicated
#    *_x / *_y columns).
records_keyed = records_df[["isbn", "text"]].assign(
    isbn_key=lambda frame: norm_isbn(frame["isbn"]).str.zfill(13)
)
books["isbn13_key"] = norm_isbn(books["isbn13"]).str.zfill(13)

cols = ['isbn13','authors','thumbnail','published_year',
        'title_subtitle','simple_categories','anger','disgust',
        'fear','joy','neutral','sadness','surprise']

# 3) Left-join the book attributes onto the records.
#    The lookup side is reduced to one row per key so that a duplicated ISBN
#    in `books` cannot multiply record rows (each row later becomes one
#    indexed document).
book_lookup = books[cols + ["isbn13_key"]].drop_duplicates(subset="isbn13_key")
records_df = (
    records_keyed
    .merge(book_lookup, left_on="isbn_key", right_on="isbn13_key", how="left")
    # 4) Cleanup: the helper key columns are only needed for the join.
    .drop(columns=["isbn13_key", "isbn_key"])
)

assert len(records_df) == len(records_keyed), "left join must keep exactly one row per record"


In [39]:
# `isbn13` from the books table duplicates the parsed `isbn` column, so drop it.
# Reassigning with errors="ignore" (instead of an in-place drop) keeps this cell
# re-runnable: a second execution no longer raises KeyError.
records_df = records_df.drop(columns=["isbn13"], errors="ignore")


In [41]:
# Inspect the first two enriched rows to confirm the book columns were joined.
records_df.head(2)


Unnamed: 0,isbn,text,authors,thumbnail,published_year,title_subtitle,simple_categories,anger,disgust,fear,joy,neutral,sadness,surprise
0,9780002005883,A NOVEL THAT READERS and critics have been eag...,Marilynne Robinson,http://books.google.com/books/content?id=KQZCP...,2004.0,Gilead,Fiction,0.064134,0.273591,0.928168,0.932798,0.646216,0.967158,0.729602
1,9780002261982,A new 'Christie for Christmas' -- a full-lengt...,Charles Osborne;Agatha Christie,http://books.google.com/books/content?id=gA5GP...,2000.0,Spider's Web: A Novel,Fiction,0.612619,0.348284,0.942528,0.704422,0.88794,0.11169,0.252546


In [42]:
# Convert the enriched frame to a list of per-row dicts (column name -> value).
records_dict = records_df.to_dict(orient="records")

In [44]:
# List the columns left after the merge and the removal of `isbn13`.
records_df.columns

Index(['isbn', 'text', 'authors', 'thumbnail', 'published_year',
       'title_subtitle', 'simple_categories', 'anger', 'disgust', 'fear',
       'joy', 'neutral', 'sadness', 'surprise'],
      dtype='object')

In [43]:
# Peek at the first per-row dict produced from the enriched frame.
records_dict[0]

{'isbn': '9780002005883',
 'text': 'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of 

In [None]:
from dotenv import load_dotenv
import os

# Load settings from a local .env file into the process environment.
# NOTE(review): no visible cell reads os.environ afterwards, while the Qdrant
# cell uses Qdrant_URL / Qdrant_API_KEY / Qdrant_name, none of which is
# assigned in a visible cell. Presumably they were meant to be populated from
# the environment loaded here; confirm and define them explicitly.
load_dotenv()

True

In [None]:
# Show only a small sample: displaying the whole list would write every parsed
# record into the notebook output.
records[:3]

In [45]:
from langchain_core.documents import Document

import pandas as pd  # needed for the missing-value check below

# Build one Document per enriched record: the description becomes the page
# content and every other column becomes metadata.
# - to_dict(orient="records") replaces iterrows(), avoiding the construction
#   of an intermediate Series for every row.
# - Records without a match in the left join carry NaN in the book columns;
#   those entries are left out of the metadata instead of being stored as NaN
#   (NaN is not a valid JSON value for a vector-store payload).
docs = [
    Document(
        page_content=row["text"],
        metadata={
            key: value
            for key, value in row.items()
            if key != "text" and pd.notna(value)
        },
    )
    for row in records_df.to_dict(orient="records")
]


In [53]:
# Inspect the first Document: metadata holds the book columns, page_content the text.
docs[0]

Document(metadata={'isbn': '9780002005883', 'authors': 'Marilynne Robinson', 'thumbnail': 'http://books.google.com/books/content?id=KQZCPgAACAAJ&printsec=frontcover&img=1&zoom=1&source=gbs_api', 'published_year': 2004.0, 'title_subtitle': 'Gilead', 'simple_categories': 'Fiction', 'anger': 0.0641336366534233, 'disgust': 0.2735911309719085, 'fear': 0.9281682968139648, 'joy': 0.932798147201538, 'neutral': 0.6462159156799316, 'sadness': 0.967157542705536, 'surprise': 0.7296021580696106}, page_content='A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift betw

In [63]:
from qdrant_client import QdrantClient
from langchain_community.vectorstores import Qdrant


import uuid

# NOTE(review): Qdrant_URL, Qdrant_API_KEY and Qdrant_name are not assigned in
# any cell visible in this notebook. Presumably they came from a removed cell;
# define them (for example from the environment) before this cell so that
# Restart & Run All works.

# Guard against hidden state: a later cell rebinds `docs` to a handful of
# search hits, so re-running this cell afterwards would index those hits
# instead of the full corpus.
assert len(docs) == len(records_df), (
    "`docs` no longer holds one Document per record; rebuild it before indexing"
)

# Deterministic point ids derived from ISBN + text: re-running this cell then
# overwrites the existing points instead of appending a second copy of every
# document under fresh random ids.
point_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_URL, f"{doc.metadata.get('isbn')}:{doc.page_content}"))
    for doc in docs
]

vectorstore = Qdrant.from_documents(
    documents=docs,
    embedding=emb,
    url=Qdrant_URL,
    api_key=Qdrant_API_KEY,
    collection_name=Qdrant_name,
    ids=point_ids,
)


In [64]:
# Smoke-test retrieval with a free-text query and print the ISBN plus the
# first 100 characters of each of the five hits.
# NOTE: this rebinds `docs` from the indexed corpus to the search hits, and
# the next cell displays `docs`. Rebuild the corpus before re-running the
# indexing cell.
query = "preacher in Iowa, multi-generational"
docs = vectorstore.similarity_search(query, k=5)
for hit in docs:
    print(hit.metadata.get("isbn"), hit.page_content[:100])


9781581345285 A prayerful pastor and a skeptical reporter compare notes and find themselves fighting a plot to sub
9780374299194 The author describes growing up in a family of all boys in Webster Groves, Missouri, reflecting on s
9780883689448 Recapture God's glory with twelve compelling spiritual biographies of some of the most powerful mini
9780310252191 Tested strategies for pastors and churches that want to be somewhere else in the postmodern world an
9780787981297 Churches have tried all kinds of ways to attract new and younger members - revised vision statements


In [65]:
# Display the raw search hits. At this point `docs` holds the results of the
# similarity search above, not the full indexed corpus.
docs

[Document(metadata={'isbn': '9781581345285', 'authors': 'Frank E. Peretti', 'thumbnail': 'http://books.google.com/books/content?id=eFBOvBvKqE4C&printsec=frontcover&img=1&zoom=1&source=gbs_api', 'published_year': 2003.0, 'title_subtitle': 'This Present Darkness', 'simple_categories': 'Fiction', 'anger': 0.0641336366534233, 'disgust': 0.1040067374706268, 'fear': 0.9730560183525084, 'joy': 0.069264568388462, 'neutral': 0.549476683139801, 'sadness': 0.1116901934146881, 'surprise': 0.0787654668092727, '_id': 'c80c50ba-b229-4c01-aa9f-73f52f97c035', '_collection_name': 'Book-Recommendar'}, page_content='A prayerful pastor and a skeptical reporter compare notes and find themselves fighting a plot to subjugate the human race. A gripping look into the invisible spiritual warfare around us and the power of prayer.'),
 Document(metadata={'isbn': '9780374299194', 'authors': 'Jonathan Franzen', 'thumbnail': 'http://books.google.com/books/content?id=Z2vOAwAAQBAJ&printsec=frontcover&img=1&zoom=1&sourc