# Real Book Recommendation Gameplan

In [1]:
from langchain_community.llms import Ollama # newer version of this import from langchain_community.chat_models import ChatOllama idk if it changes anything though
from langchain_core.prompts import ChatPromptTemplate # Prompt templates convert raw user input to better input to the LLM (guide reponse)
from langchain_core.output_parsers import StrOutputParser # original model output is a message but this function parses it to a string (easier to work with)
from langchain_core.messages import HumanMessage, AIMessage # Used to frame history to LLM and retriever
import pandas as pd
import json
import time
import requests
import os
from filelock import FileLock, Timeout

In [None]:
# First LLM: asks a local Ollama model for book recommendations (titles only),
# returned as JSON of the form {"book_titles": [...]}.

model = "llama3.1" # mistral, llama2, kdl_copilot_llama3, llama3, llama2:13b
llm = Ollama(model=model, format='json') # how to structure LLM output: https://python.langchain.com/v0.2/docs/how_to/structured_output/

# A backslash at the very end of a line joins that line with the next one inside
# the string. The "Include at least 7" line used to end in backslash + space,
# which is not a continuation: it put a literal backslash into the prompt and is
# an invalid escape sequence. It now ends plainly, keeping the blank line after it.
# The doubled braces {{ }} in the examples stand for literal braces in the template.
system = """Your are a helpful librarian AI assistant. You provide relevant book recommendations based on the context of the users input. \
Output your response in json, structured with only the book title. \
Include at least 7 relevant  books.

Here are some examples of proper json output based on an example prompts:

example_user: I like mystery book, what are some recommendations like that if I just read Louise Penny?
example_assistant: {{"book_titles": [
    "A Rule Against Murder",
    "The Word is Murder",
    "Bury Your Dead"]}}

example_user: What books should I read after 1984 by George Orwell??
example_assistant: {{"book_titles": [
    "Fahrenheit 451",
    "Animal Farm",
    "Brave new world",
    "The Handmaid's Tale",
    "Lord of the Flies",
    "A Clockwork Orange"]}}
"""

prompt = ChatPromptTemplate.from_messages([  # LLM unreliable for ISBN number but does good for book titles
    ("system", system),
    ("user", "{input}")
])

# prompt -> model -> plain string
output_parser = StrOutputParser()
chain = prompt | llm | output_parser

orig_prompt = "I liked the mistborn series by sanderson, what are some books like it?"

# output1 is the raw JSON string produced by the model
output1 = chain.invoke({"input": orig_prompt})

print(output1)

In [3]:
# Converting the JSON output of the first LLM into a Python list
import json

# Parse the raw JSON string in output1 into a Python object
# (json.loads raises json.JSONDecodeError if the string is not valid JSON)
data = json.loads(output1)

# Extract the list of book titles from the 'book_titles' property
book_titles = data['book_titles']

# Example output
#book_titles = ['The Silent Companions', 'The dangers of mr.bigbe', 'The Word is Murder', 'The 7 1/2 Deaths of Evelyn Hardcastle', 'The Devil in the Marshalsea', 'The Blind Barber']

In [None]:
# Hard-coded sample list of titles. Running this cell rebinds book_titles, replacing whatever was parsed from the LLM output.
book_titles = ['The Silent Companions', 'The dangers of mr.bigbe', 'The Word is Murder', 'The 7 1/2 Deaths of Evelyn Hardcastle', 'The Devil in the Marshalsea', 'The Blind Barber']

In [None]:
# Now that we have the book titles as a Python list, look each title up in the Google Books API to get the author of each book (needed to match the book title to a title in the KDL database)
# ISBN is not used for matching: it is not a direct match in our comparison dataset, because each book title would have to be associated with its various ISBN versions

import time
import requests

# Initialize the dictionary that will store all API output (book title -> author last name)
master_dict = {}

def _lookup_last_name(title):
    """Return the last name of the first author Google Books lists for *title*, or None."""
    # Passing the query through `params` lets requests URL-encode the title, so
    # characters such as '&', '#' or '+' cannot corrupt the query string.
    response = requests.get(
        "https://www.googleapis.com/books/v1/volumes",
        params={"q": title, "maxResults": 1},
        timeout=10,
    )
    response.raise_for_status()
    data = response.json()

    # A search with no hits has no "items"; some volumes have no "authors".
    items = data.get("items") or []
    if not items:
        return None
    authors = items[0].get("volumeInfo", {}).get("authors") or []
    if not authors:
        return None

    # Use the final word of the name: works for single-word names and does not
    # return a middle name/initial for names with more than two parts.
    name_parts = authors[0].split()
    return name_parts[-1] if name_parts else None


def get_author(titles):
    """Look up the author of each title with the Google Books API.

    For every title the first search result is requested and the last word of
    its first listed author is stored in the module-level ``master_dict`` under
    that title. Titles whose request fails, or whose result has no author, are
    reported (failures only) and skipped instead of aborting the whole run.

    Args:
        titles: Iterable of book title strings.

    Returns:
        The module-level ``master_dict`` (title -> author last name), updated
        in place with the titles that could be resolved.
    """
    for title in titles:
        try:
            last_name = _lookup_last_name(title)
        except (requests.RequestException, ValueError) as err:
            print(f"Author lookup failed for {title!r}: {err}")
            last_name = None

        if last_name:
            master_dict[title] = last_name

        # Pause between requests to stay gentle on the API.
        time.sleep(.5)
    return master_dict

# Run the lookup for the current book_titles; get_author fills and returns the master_dict defined above
master_dict = get_author(book_titles)

In [None]:
# Checking out the dictionary (book title -> author last name); as the last expression of the cell it is shown as the cell output
master_dict # values() & keys()

In [7]:
# Loading in the KDL database we're going to compare our LLM recommendation output against.
# The relative path is built with os.path.join so it resolves on any operating system,
# not only where '\' is the path separator.
monster = pd.read_excel(os.path.join('Book db', 'kdl_report_edited.xlsx'))

In [8]:
# Manual check: find the rows of the KDL database whose title contains a given string
key = "The Sea of Tranquility"

key_lower = key.lower() # Converts titles from both API and KDL database to lowercase, easier search when finding observational rows
monster_title = monster['title'].str.lower()

# regex=False: treat the title as a literal substring (titles may contain '(', '?', '.', ...).
# na=False: rows with a missing title count as "no match"; otherwise the mask contains NaN and .loc raises.
row_indices = monster.loc[monster_title.str.contains(key_lower, regex=False, na=False)].index.tolist()

In [6]:
def match_recs(master_dict):
    """Match recommended titles against the KDL database (``monster``).

    A database row matches when its lower-cased title contains the
    lower-cased recommended title as a literal substring AND its lower-cased
    author contains the lower-cased author last name.

    Args:
        master_dict: Mapping of book title -> author last name.

    Returns:
        A DataFrame with columns "Title", "Author" and "index", one row per
        matching database row; "index" is the row's index label in
        ``monster``. Titles without any title+author match are omitted.
    """
    columns = ["Title", "Author", "index"]

    # Lower-case both lookup columns once instead of once per title / per row.
    titles_lower = monster['title'].str.lower()
    authors_lower = monster['Author'].str.lower()

    matches = []
    for title, last_name in master_dict.items():
        # regex=False: titles are plain text, not patterns ('(', '?', '.' stay literal).
        # na=False: rows with a missing title are simply non-matches.
        title_hits = titles_lower.str.contains(title.lower(), regex=False, na=False)
        wanted_author = last_name.lower()

        for idx in monster.index[title_hits.to_numpy()].tolist():
            db_author = authors_lower.loc[idx]
            # A missing author is NaN (not a string) and can never match.
            if isinstance(db_author, str) and wanted_author in db_author:
                matches.append({"Title": title, "Author": last_name, "index": idx})

    # The previous "blank index" fallback sat behind `row_indices is not None`,
    # which is always true for a list, so it never ran. It is left out rather
    # than enabled because the cells that consume this frame pass every
    # "index" value straight to monster.iloc.
    return pd.DataFrame(matches, columns=columns)

# Match the title -> author pairs collected from the API against the KDL database
book_recs = match_recs(master_dict)

In [None]:
# Show the matched recommendations as the cell output
book_recs

In [None]:
# Checking the index output of match_recs function ensuring it leads to the book it says in the db
list(book_recs["index"])  # NOTE(review): the result is neither assigned nor printed, and it is not the last expression of the cell — presumably nothing is displayed; confirm

row_data = monster.iloc[50115]  # hard-coded row position used for a manual spot check
print(row_data)
author_value = row_data['summary']  # NOTE: despite its name, this variable holds the row's 'summary' field
#print("Author:", author_value)

In [8]:
# Drops all rows except for the first occurrence of every title (modifies book_recs in place)
book_recs.drop_duplicates(subset='Title', keep='first', inplace=True)

In [9]:
# Taking the 'index' column from book_recs (a pandas Series) for iterating
index_list = book_recs['index']

# Initializing an empty dictionary to hold the title and summary of each matched book from the KDL db
llm_data = {}

for index in index_list:
    # NOTE(review): .iloc is positional, while match_recs stores index labels; these agree only if monster has a default 0..n-1 index — confirm
    row_data = monster.iloc[index] # Finds the specific row for each index variable saving all info 
    #print(row_data, "\n")
    summary_value = row_data['summary'] # Keep in mind this summary variable from Sheri's dataset is truncated 
    title_value = row_data['title']
    temp_dict = {title_value: summary_value} # updates temp dict into llm_data for each book in our index_list
    llm_data.update(temp_dict)

# Second LLM 

In [None]:
# For books that are in KDL's database (AKA have an index present in the book_recs df) this LLM recommends them in relation to the original question!
# This cell rebinds model, llm, prompt, output_parser and chain from the first LLM cell.

model = "llama3" # mistral, llama2, kdl_copilot_llama3, llama3, llama2:13b
llm = Ollama(model=model, temperature=0)  # NOTE(review): temperature=0 is presumably meant to make the answer as repeatable as possible — confirm

system2 = """Your are a helpful librarian AI assistant. You know a lot about books and have great recommendation advice . \
Given a python dictionary of book titles and their associated summary as extra context, you can provide me with a summary of why those books are great recommendations in relation to the original prompt. \
If a book in the python dictionary doesn't at all match the themes of the other books in the original prompt or python dictionary ignore that specific book in your output,
"""

prompt = ChatPromptTemplate.from_messages([
    ("system", system2), 
    ("user", "{input}")
])

output_parser = StrOutputParser()
chain = prompt | llm | output_parser # If our kdl db had a link to bibliocommons that would be sweet to add in here for easy staff access! 

# The title -> summary dictionary and the original question are formatted into the user message
output2 = chain.invoke({"input": f"Given the follow python dictionary of book titles and associated summaries: {llm_data}, provide a summary of why those books are great reccomendations in relation to the original prompt: {orig_prompt}."}) 

print(output2) 

# Saving output into a JSON file

In [11]:
# Record pairing the original prompt with the second LLM's answer, to be appended to the JSON log.
# NOTE: this name shadows the built-in `dict` for the rest of the session; it is kept because a later cell passes `dict` to resave_json.
dict = {"prompt": orig_prompt, "output": output2}

In [None]:
# Location of the JSON file that accumulates prompt/output records, and of the lock file used while updating it
save_folder = r"C:\Users\Ryan\Coding Projects\KDL Project\AI PT\LLM book rec\outputs"
file_path = os.path.join(save_folder, 'LLM_Recs_db.json')
lock_file_path = file_path + '.lock'

# Ensure the save folder exists
os.makedirs(save_folder, exist_ok=True)

def resave_json(dict):
    """Append one record to the JSON recommendations log at ``file_path``.

    The whole read-modify-write cycle runs while holding a file lock on
    ``lock_file_path``. If the lock cannot be acquired within 10 seconds a
    message is printed and nothing is written.

    Args:
        dict: JSON-serializable record to append. The parameter name is kept
            for backward compatibility; it shadows the builtin only inside
            this function.
    """
    lock = FileLock(lock_file_path, timeout=10)

    try:
        with lock:
            # A missing or zero-byte file is treated as an empty log instead of
            # crashing in json.load.
            rec_db = []
            if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
                with open(file_path, 'r', encoding='utf-8') as f:
                    rec_db = json.load(f)

            # Append the new entry
            rec_db.append(dict)

            # Write to a temporary file first and swap it in, so a failure while
            # serializing cannot leave a truncated log behind.
            tmp_path = file_path + '.tmp'
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(rec_db, f, indent=4)
            os.replace(tmp_path, file_path)

            print(f"Results have been saved to '{file_path}'")

    except Timeout:
        print("Another process is currently accessing the file. Please try again later.")

# Append this run's prompt/output record to the JSON log
resave_json(dict)

# Function testing from functions file

In [None]:
# The star import brings every public name of functions_book_rec into the notebook namespace.
# NOTE(review): presumably this rebinds get_author / match_recs defined earlier in this notebook — confirm against that module
from functions_book_rec import *

orig_prompt = "Recommend some quintessential science fiction books"

# input_llm is not defined in this notebook; presumably it comes from functions_book_rec
output1 = input_llm(orig_prompt)

In [None]:
# convert_json is not defined in this notebook; presumably it comes from functions_book_rec and turns the LLM output into a list of titles
book_titles = convert_json(output1)

In [None]:
# Look up an author for every recommended title
master_dict = get_author(book_titles)

In [None]:
# Match the title/author pairs against the KDL database
book_recs = match_recs(master_dict)

In [None]:
# Same call as in the previous cell, repeated here
book_recs = match_recs(master_dict)

# Drops all rows except for the first occurrence of every title (modifies book_recs in place)
book_recs.drop_duplicates(subset='Title', keep='first', inplace=True)

# Taking the 'index' column from book_recs (a pandas Series) for iterating
index_list = book_recs['index']

# Initializing an empty dictionary to hold the title and summary of each matched book from the KDL db
llm_data = {}

for index in index_list:
    # NOTE(review): monster is used here as the KDL DataFrame loaded in an earlier cell (unless the star import rebinds it) — confirm
    row_data = monster.iloc[index] # Finds the specific row for each index variable saving all info 
    #print(row_data, "\n")
    summary_value = row_data['summary'] # Keep in mind this summary variable from Sheri's dataset is truncated 
    title_value = row_data['title']
    temp_dict = {title_value: summary_value} # updates temp dict into llm_data for each book in our index_list
    llm_data.update(temp_dict)

In [None]:
# output_llm is not defined in this notebook; presumably it comes from functions_book_rec and wraps the second LLM step
output2 = output_llm(llm_data, orig_prompt)
print(output2)