In [None]:
!pip install langchain_community langchain_chroma langchain_openai

Collecting langchain_community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-1.0.0-py3-none-any.whl.metadata (1.9 kB)
Collecting langchain_openai
  Downloading langchain_openai-1.0.3-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-core<2.0.0,>=1.0.1 (from langchain_community)
  Downloading langchain_core-1.0.5-py3-none-any.whl.metadata (3.6 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain_community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain_community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting chromadb<2.0.0,>=1.0.20 (from langchain_chroma)
  Downloading chromadb-1.3.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.me

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document

In [None]:
from typing import List

In [None]:
# Read API credentials at runtime via google.colab.userdata instead of
# hardcoding them in the notebook.
# NOTE(review): this import only works inside Google Colab — confirm whether an
# os.environ fallback is wanted for local runs.
from google.colab import userdata
# Passed to OpenAIEmbeddings when the vector store is built.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
# NOTE(review): not referenced in the visible part of this notebook — presumably
# used further down; confirm or remove.
HUGGING_FACE_ACCESS_TOKEN = userdata.get('HUGGING_FACE_ACCESS_TOKEN')

In [None]:
# Load the cleaned book metadata from the current working directory.
# Later cells rely on the columns isbn13, authors, title and tagged_description.
books = pd.read_csv('books_cleaned.csv')
# Preview the first rows to confirm the file was parsed as expected.
books.head()

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine..."
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...
4,9780006280934,6280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le..."


In [None]:
# Write one tagged description ("<isbn13> <description>") per line as plain
# UTF-8 text; the file is later loaded and split on "\n" into one chunk per book.
#
# This intentionally does not go through DataFrame.to_csv(sep="\n"): the CSV
# writer quotes any field that contains a double quote (wrapping it in quotes
# and doubling the inner ones), and that markup would end up inside the text
# that gets embedded.
tagged_lines = (
    books['tagged_description']
    .fillna("")        # missing descriptions become blank lines
    .astype(str)
    # Collapse embedded line breaks so every book stays on exactly one line.
    .str.replace(r"\s*[\r\n]+\s*", " ", regex=True)
)
with open("tagged_description.txt", "w", encoding="utf-8") as txt_file:
    txt_file.write("\n".join(tagged_lines) + "\n")

In [None]:
# Load the exported descriptions file with LangChain's TextLoader.
# The result is a list of Document objects (it is passed to split_documents next).
# NOTE(review): presumably a single Document holding the whole file — confirm.
raw_documents = TextLoader("tagged_description.txt",encoding='utf-8').load()

In [None]:
import logging

# chunk_size=1 with separator="\n" is deliberate: no two lines can fit into a
# single chunk, so every line of the file (one book description) becomes its
# own Document.
text_splitter = CharacterTextSplitter(chunk_overlap = 0, chunk_size = 1, separator="\n")

# Because every line is longer than chunk_size, the splitter would log an
# "oversized chunk" warning per description and flood the cell output with
# thousands of lines. Silence those warnings only while splitting, then
# restore the logger's previous level.
_splitter_logger = logging.getLogger("langchain_text_splitters")
_previous_level = _splitter_logger.level
_splitter_logger.setLevel(logging.ERROR)
try:
    documents = text_splitter.split_documents(raw_documents)
finally:
    _splitter_logger.setLevel(_previous_level)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [None]:
# Sanity check: the first chunk should be a single book description whose
# text starts with its ISBN-13.
documents[0]

Document(metadata={'source': 'tagged_description.txt'}, page_content='9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gi

In [None]:
# Embed every description chunk with OpenAI embeddings and index them in a
# Chroma vector store used for similarity search in the cells that follow.
# NOTE(review): no persist_directory is passed, so the index is presumably
# in-memory only and every re-run re-embeds all documents via the OpenAI API
# (billable) — confirm and consider persisting the store.
db_books = Chroma.from_documents(documents, embedding=OpenAIEmbeddings(api_key = OPENAI_API_KEY))

In [None]:
# Smoke-test the vector store: fetch the 5 chunks closest to a free-text query.
query = "A book about authoritarianism, consciousness and tyranny"
docs = db_books.similarity_search(query, k = 5)
# Display the raw Document objects; each page_content starts with the ISBN-13.
docs

[Document(id='0c5b8f45-a1c5-457c-8ab2-d31a77d7c6b9', metadata={'source': 'tagged_description.txt'}, page_content='"9780452284234 Portrays a terrifying vision of life in the future when a totalitarian government, considered a ""Negative Utopia,"" watches over all citizens and directs all activities, becoming more powerful as time goes by."'),
 Document(id='d0d8dc4e-3c31-402a-8808-9e6d8cf286bc', metadata={'source': 'tagged_description.txt'}, page_content='9780965020596 A bookburner official in a future fascist state finds out books are a vital part of a culture he never knew. He clandestinely pursues reading, until he is betrayed.'),
 Document(id='a62dc199-34f6-4ea8-94b6-a584377ee970', metadata={'source': 'tagged_description.txt'}, page_content='9780552999953 The author presents a terrifying vision of a world out of control, ruled by drug lords and their criminal organizations and encouraging an environment of lawlessness and chaos.'),
 Document(id='c700e488-fc67-4cec-aa44-e0716b8f4eda',

In [None]:
def get_isbn13_from_doc(docs: List[Document]) -> List[int]:
  """Extract the leading ISBN-13 of each retrieved document.

  Every document's ``page_content`` is expected to look like
  ``"<isbn13> <description>"``; the first whitespace-separated token is taken
  as the ISBN. Double quotes and commas are removed from that token because
  the descriptions file can carry CSV quoting around a line.

  Args:
    docs: Documents whose ``page_content`` starts with an ISBN-13.

  Returns:
    The ISBNs as ints, in the same order as ``docs``. Documents that are
    empty/whitespace-only, or whose first token is not a number, are skipped
    so one malformed chunk does not abort the whole lookup.
  """
  isbns = []
  for doc in docs:
    tokens = doc.page_content.split()
    if not tokens:
      # Empty or whitespace-only chunk: nothing to parse.
      continue
    # Strip CSV quoting artifacts from the ISBN token before converting.
    isbn = tokens[0].replace(",", "").replace('"', "")
    try:
      isbns.append(int(isbn))
    except ValueError:
      # The chunk does not start with an ISBN; ignore it.
      continue
  return isbns


In [None]:
def retrieve_semantic_recommendations(query: str, k: int = 10)-> pd.DataFrame:
  """Return the books whose descriptions are semantically closest to a query.

  Args:
    query: Free-text description of the kind of book wanted.
    k: Number of description chunks to retrieve from the vector store.

  Returns:
    DataFrame with the columns ``authors`` and ``title`` and a fresh
    0..n-1 index. Rows follow the order in which ``similarity_search``
    returned the matching descriptions, rather than the row order of
    ``books``.
  """
  docs = db_books.similarity_search(query, k = k )
  isbns = get_isbn13_from_doc(docs)

  # Position of each ISBN in the search results; the first occurrence wins
  # if the same ISBN is retrieved more than once.
  rank_by_isbn = {}
  for position, isbn in enumerate(isbns):
    rank_by_isbn.setdefault(isbn, position)

  matches = books[books["isbn13"].isin(list(rank_by_isbn))]
  # A plain isin() filter keeps the row order of `books`, which discards the
  # similarity ranking; re-sort the matches by their retrieval position.
  ranked = matches.sort_values(
      "isbn13",
      key=lambda column: column.map(rank_by_isbn),
      kind="stable",
  )
  return ranked[["authors","title"]].reset_index(drop=True)

In [None]:
# Example: recommendations for a query about dystopian / totalitarian themes
# (uses the default k=10).
recommendations = retrieve_semantic_recommendations("A book about authoritarianism, consciousness and tyranny")
recommendations

Unnamed: 0,authors,title
0,Chinua Achebe;Sandra Widner,Things Fall Apart
1,William T. Vollmann,You Bright and Risen Angels
2,Reinaldo Arenas;Andrew Hurley,The Assault
3,George Orwell,Animal Farm and 1984
4,Bret Easton Ellis,Glamorama
5,Joy Kogawa,Obasan
6,George Orwell,1984
7,Ben Elton,High Society
8,Hannah Arendt,The Origins of Totalitarianism
9,Ray Bradbury,Fahrenheit 451


In [None]:
# Example: a plot-style query. Note that this overwrites `recommendations`
# from the previous example.
recommendations = retrieve_semantic_recommendations("a whaling expedition to find the greatest white whale")
recommendations

Unnamed: 0,authors,title
0,Christopher Moore,Fluke
1,Michael Crichton,Eaters of the Dead
2,Patrick Robinson,U.S.S. Seawolf
3,Nathaniel Philbrick,In the Heart of the Sea
4,Brian Hall,I Should be Extremely Happy in Your Company
5,Herman Melville;Geraldine McCaughrean;Victor G...,"Moby Dick, Or, The White Whale"
6,Dan Simmons,The Terror
7,Clive Cussler;Jack B. Du Brul,Dark Watch
8,Robert Whitaker,The Mapmaker's Wife
9,Stephen E. Ambrose,Undaunted Courage
