In [1]:
import os
import requests
import psycopg2
from dotenv import load_dotenv

load_dotenv()

db_host = os.getenv("DB_HOST")
db_user = os.getenv("DB_USER")
db_password = os.getenv("DB_PASSWORD")
db_port = os.getenv("DB_PORT")
db_name = os.getenv("DB_NAME")

# Connecting to database
conn = psycopg2.connect(
    dbname = db_name,
    user = db_user,
    password = db_password,
    host = db_host,
    port = db_port
)

cursor = conn.cursor()

In [24]:
# Creating the table if it doesn't already exist
cursor.execute("CREATE TABLE IF NOT EXISTS books (book_id TEXT PRIMARY KEY, description TEXT)")
conn.commit()

In [25]:
# Function to fetch data from the Open Library API and storing it in a database
def fetch_books(subject, limit=1000):
    url = f"https://openlibrary.org/subjects/{subject}.json"
    params = {
        'limit': 100,  
        'offset': 0    
    }

    while limit > 0:
        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            break
        
        data = response.json()
        works = data.get('works', [])

        for book in works:
            id = book.get('availability', {}).get('identifier', '')
            if not id:
                continue
            try:
                description = f"The title of the book is {book.get('title', '')} and was writen by {book.get('authors', {})[0].get('name', [])}. It was initially publish in the year {book.get('first_publish_year')}. Genres are {', '.join(book.get('subject', []))}."
                cursor.execute("INSERT INTO books (book_id, description) VALUES (%s, %s) ON CONFLICT (book_id) DO NOTHING", (id, description))
            except:
                pass
        
        conn.commit()

        params['offset'] += 100
        limit -= 100

In [26]:
# Fetching the data
fetch_books('fiction', 5000)
fetch_books('nonfiction', 5000)

In [27]:
# Table for storing chunked data
cursor.execute("CREATE TABLE IF NOT EXISTS chunks (chunk_id UUID PRIMARY KEY, b_id TEXT REFERENCES books(book_id), text TEXT, metadata JSONB, embeddings FLOAT8[])")
conn.commit()

In [28]:
import re
import uuid
from transformers import AutoTokenizer
import json as json

# Function to tranform text data into manageable chunks for use in models
def book_chunker(conn, 
                model_name, 
                chunk_size = 1024, 
                chunk_overlap = 0,  
                separator = ' ', 
                secondary_chunking_regex = r'\S+?[\.,;!?]'):
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    books = {}

    with conn.cursor() as cursor:
        cursor.execute("SELECT * FROM books")
        rows = cursor.fetchall()

    for id, desc in rows:
        all_chunks = {}
        words = desc.split(separator)
        current_chunk = ""
        chunks = []
        for word in words:
            new_chunk = current_chunk + (separator if current_chunk else '') + word
            if len(tokenizer.tokenize(new_chunk)) <= chunk_size:
                current_chunk = new_chunk
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = word
        
        if current_chunk:
            chunks.append(current_chunk)

        refined_chunks = []
        for chunk in chunks:
            if len(tokenizer.tokenize(chunk)) > chunk_size:
                sub_chunks = re.split(secondary_chunking_regex, chunk)
                sub_chunk_accum = ""
                for sub_chunk in sub_chunks:
                    if sub_chunk_accum and len(tokenizer.tokenize(sub_chunk_accum + sub_chunk + ' ')) > chunk_size:
                        refined_chunks.append(sub_chunk_accum.strip())
                        sub_chunk_accum = sub_chunk
                    else:
                        sub_chunk_accum += (sub_chunk + ' ')
                if sub_chunk_accum:
                    refined_chunks.append(sub_chunk_accum.strip())
            else:
                refined_chunks.append(chunk)

        final_chunks = []
        if chunk_overlap > 0 and len(refined_chunks) > 1:
            for i in range(len(refined_chunks) - 1):
                final_chunks.append(refined_chunks[i])
                overlap_start = max(0, len(refined_chunks[i]) - chunk_overlap)
                overlap_end = min(chunk_overlap, len(refined_chunks[i + 1]))
                overlap_chunk = refined_chunks[i][overlap_start:] + ' ' + refined_chunks[i + 1][:overlap_end]
                final_chunks.append(overlap_chunk)
            final_chunks.append(refined_chunks[-1])
        else:
            final_chunks = refined_chunks

        for chunk in final_chunks:
            chunk_id = str(uuid.uuid4())
            all_chunks[chunk_id] = {"text": chunk, "metadata": {"book_id": id}}

        books[id] = all_chunks

    with conn.cursor() as cursor:
        for book_id, chunks in books.items():
            for chunk_id, chunk_data in chunks.items():
                cursor.execute("INSERT INTO chunks (chunk_id, b_id, text, metadata) VALUES (%s, %s, %s, %s) ON CONFLICT (chunk_id) DO NOTHING", (chunk_id, book_id, chunk_data["text"], json.dumps(chunk_data["metadata"])))
        conn.commit()

In [29]:
book_chunker(conn, model_name='BAAI/bge-small-en-v1.5', chunk_size = 256, chunk_overlap = 25)

In [None]:
# Saving the tokeniser and model locally
from transformers import AutoModel, AutoTokenizer
import torch
import os

model_name = "BAAI/bge-small-en-v1.5"

#  Loads the tokenizer associated with the specified model. The tokenizer is responsible for converting text into tokens that the model can understand.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Loads the pre-trained model itself, which will be used to generate embeddings from the tokenized text.
model = AutoModel.from_pretrained(model_name)

tokenizer_save_path = "model/tokenizer"
model_save_path = "model/embedding"

os.makedirs(tokenizer_save_path, exist_ok=True)
os.makedirs(model_save_path, exist_ok=True)

tokenizer.save_pretrained(tokenizer_save_path)
model.save_pretrained(model_save_path)

In [9]:
# Function to generate embeddings for a given text using the loaded model and tokenizer
def compute_embeddings(text):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)
    model = AutoModel.from_pretrained(model_save_path)

    inputs = tokenizer(text, return_tensors = "pt", padding = True, truncation = True)

    # Temporarily disables gradient calculation which reduces memory usage and speeds up computation
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim = 1).squeeze()
    
    return embeddings.tolist()

In [46]:
# Function that retrieves book descriptions and computes their embeddings
def create_vector_store():
    cursor.execute("SELECT chunk_id, text FROM chunks")
    rows = cursor.fetchall()

    for chunk_id, text in rows:
        embedding = compute_embeddings(text)
        cursor.execute("UPDATE chunks SET embeddings = %s WHERE chunk_id = %s", (embedding, chunk_id))
        conn.commit()

In [47]:
create_vector_store()

In [4]:
import numpy as np

# Function that computes embeddings of the query string, computes cosine similarity, and return top_k matches based on the scores
def compute_matches(query_str, top_k):
    query_str_embedding = np.array(compute_embeddings(query_str))
    scores = []

    cursor.execute("SELECT b_id, chunk_id, embeddings FROM chunks WHERE embeddings IS NOT NULL")
    rows = cursor.fetchall()

    for book_id, chunk_id, chunk_embedding in rows:
        chunk_embedding_array = np.array(chunk_embedding)

        # Normalizing embeddings to unit vectors for cosine similarity calculation
        norm_query = np.linalg.norm(query_str_embedding)
        norm_chunk = np.linalg.norm(chunk_embedding_array)

        if norm_query == 0 or norm_chunk == 0:
            score = 0
        else:
            score = np.dot(chunk_embedding_array, query_str_embedding) / (norm_query * norm_chunk)

        scores.append((book_id, chunk_id, score))

    sorted_scores = sorted(scores, key = lambda item: item[2], reverse = True)[:top_k]
    top_results = [(book_id, chunk_id, score) for (book_id, chunk_id, score) in sorted_scores]

    return top_results


In [59]:
matches = compute_matches("Mystery novels with strong female leads", 5)

print("Top Matches: ")
for book_id, chunk_id, score in matches:
    print(f"Book ID: {book_id}, Chunk ID: {chunk_id}, Similarity Score: {score:.4f}")

    cursor.execute("SELECT text FROM chunks WHERE chunk_id = %s", (chunk_id,))
    matching_text = cursor.fetchone()[0]
    print("Matching text: ", matching_text)

Top Matches: 
Book ID: smertelnyiiadrom00saye, Chunk ID: 5c85bff1-c693-4b4d-9357-4c11b8b14db8, Similarity Score: 0.6613
Matching text:  The title of the book is Strong Poison and was writen by Dorothy L. Sayers. It was initially publish in the year 1930. Genres are Women detectives, English Detective and mystery stories, Translations into Russian, Harriet Vane (Fictitious character), Private investigators, Lord Wimsey, Peter (Fictitious character), Fiction, Detective and mystery stories, Large type books, Apologetics, Dogma, Time, Fiction, mystery & detective, general, Wimsey, peter, lord (fictitious character), fiction, Fiction, mystery & detective, traditional, England, fiction.
Book ID: lhotelbertram0000chri, Chunk ID: 35d7f6c5-8243-47cd-a55b-fdd79ef04afa, Similarity Score: 0.6429
Matching text:  The title of the book is At Bertram's Hotel and was writen by Agatha Christie. It was initially publish in the year 1965. Genres are Fiction, Jane Marple (Fictitious character), Women detec

In [5]:
from llama_cpp import Llama
import sys

def stream_and_buffer(prompt, llm, max_tokens = 3000, echo = True, stream = True):

    formatted_prompt = f"Q: {prompt} A: "

    res = llm(formatted_prompt, max_tokens = max_tokens, echo = echo, stream = stream)

    buffer = ""

    for message in res:
        chunk = message['choices'][0]['text']
        buffer += chunk

        words = buffer.split(' ')
        for word in words[:-1]:
            sys.stdout.write(word + ' ')
            sys.stdout.flush()

        buffer = words[-1]

    if buffer:
        sys.stdout.write(buffer)
        sys.stdout.flush()

In [6]:
# Function to construct the prompt
def construct_prompt(system_prompt, retrieved_data, user_query):
    prompt = f"""{system_prompt}

    Here is the user's query:
    {user_query}

    Here is the retrieved context:
    {retrieved_data}

    """
    return prompt

In [7]:
# Function to retrieve data of the top matches
def retrieve_data(matches):
    for match in matches:
        book_id = match[0]
        chunk_id = match[1]

        cursor.execute("SELECT text FROM chunks WHERE b_id = %s AND chunk_id = %s", (book_id, chunk_id))
        data = cursor.fetchall()

    return data

In [11]:
system_prompt = """
You are a knowledgeable and helpful book recommendation system. Your task is to provide book recommendations and insights based on the context provided. You should rely solely on the information given in the context to generate your responses. Do not include any information that is not present in the context and don't mention 'context' in your answer. Focus on providing relevant and accurate recommendations or answers according to the book descriptions and details provided.
"""

user_query = "Suggest some must-read non-fiction book from the last decade"

matches = compute_matches(user_query, 5)
retrieved_data = retrieve_data(matches)
prompt = construct_prompt(system_prompt, retrieved_data, user_query)

model_path = "model/mistral-7b-instruct-v0.2.Q3_K_L.gguf"

llm = Llama(model_path=model_path, n_gpu_layers=1)

stream_and_buffer(prompt, llm, echo = False, max_tokens=300)

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from d:\CDriveFolders\Downloads\mistral-7b-instruct-v0.2.Q3_K_L.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: 


     Based on the information provided, I cannot directly recommend a must-read non-fiction book from the last decade since the context only mentions the book "How Not to Act Old" by Pamela Redmond Satran, which is a humorous take on aging. To provide a more accurate recommendation, I would need more information about the user's interests, preferences, or specific topics they are interested in. However, some highly acclaimed non-fiction books from the last decade include:

1. "Sapiens: A Brief History of Humankind" by Yuval Noah Harari (2014) - An engaging and thought-provoking exploration of human history, evolution, and the future.

2. "Thinking, Fast and Slow" by Daniel Kahneman (2011) - A Pulitzer Prize-winning exploration of the human mind and the way we make decisions.

3. "Between the World and Me" by Ta-Nehisi Coates (2015) - An eloquent and powerful exploration of race and identity in America.

4. "Moonwalking with Einstein: The Art and Science of Learning Memory" by Joshua F


llama_print_timings:        load time =   46686.98 ms
llama_print_timings:      sample time =      29.84 ms /   264 runs   (    0.11 ms per token,  8848.67 tokens per second)
llama_print_timings: prompt eval time =   46686.69 ms /   248 tokens (  188.25 ms per token,     5.31 tokens per second)
llama_print_timings:        eval time =   95796.43 ms /   263 runs   (  364.24 ms per token,     2.75 tokens per second)
llama_print_timings:       total time =  143190.91 ms /   511 tokens


(201