In [1]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
import tiktoken
from pinecone import Pinecone, ServerlessSpec
from langchain.vectorstores import Pinecone as LangChainPinecone
from tqdm.auto import tqdm
from uuid import uuid4
import re



  from tqdm.autonotebook import tqdm


In [2]:
## Get env variables
# Get the current working directory
current_directory = os.getcwd()
# Construct the path to the .env file in the parent directory
env_path = os.path.join(current_directory, '..', '.env')
# Load the environment variables from the .env file
load_dotenv(dotenv_path=env_path)
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT')


In [4]:
current_dir = os.path.abspath(os.getcwd())
relative_path = "../data/"
filename = "Masterworks282Jean-MichelBasquiatPolloFrito.pdf"
file_path = os.path.join(current_dir, relative_path, filename)
print(current_dir)
print(file_path)

/Users/franciscomeyo/Documents/Masterworks/llm_testing/llm_testing
/Users/franciscomeyo/Documents/Masterworks/llm_testing/llm_testing/../data/Masterworks282Jean-MichelBasquiatPolloFrito.pdf


In [5]:
def extract_page_data(file_path):
    # Initialize the PyMuPDFLoader
    loader = PyMuPDFLoader(file_path)
    # Load the documents from the specified file_path
    docs = loader.load()

    data = []
    for doc in docs:
        tmp_dict = {}
        tmp_dict['text'] = doc.page_content.replace('\n', ' ')
        tmp_dict['page'] = doc.metadata['page']
        tmp_dict['title'] = doc.metadata['title']
        data.append(tmp_dict)

    return data

In [7]:
data = extract_page_data(file_path)
for entry in data:
    print(f"Page: {entry['page']}")

    print(f"Title: {entry['title']}")
    # Split text by period or newline characters and print each sentence on a new line
    lines = entry['text'].replace(' ● ', '\n● ').split('. ')
    for line in lines:
        print(line)
    print("\n" + "-"*50 + "\n") 

Page: 0
Title: Masterworks 282, LLC, Jean-Michel Basquiat, Pollo Frito - Q2 2024 Internal Appraisal Report
Fair Market Value Appraisal: Internal Use Client Masterworks 282, LLC Intended User Masterworks Administrative Services, LLC on behalf of the above-named Client
Intended Use To determine fair market value as outlined in the Scope of Work
Effective Date of Appraisal Report June 30, 2024 Issuance Date of Appraisal July 17, 2024 

--------------------------------------------------

Page: 1
Title: Masterworks 282, LLC, Jean-Michel Basquiat, Pollo Frito - Q2 2024 Internal Appraisal Report
TABLE OF CONTENTS Certification 2 About Masterworks 3 Scope of Work 3 Definition of Value 4 Method of Research 4 Method of Examination 4 Assignment Conditions 4 Approach to Value 5 Opinion of Value 5 Global Art Market Overview 6 Subject Artwork 9 Fair Market Value 9 Artist Background 9 Valuation Narrative 10 Comparable Sales 11 Sources of Data 12 Statements and Disclosures 13 Appraisal Terminology and

In [8]:
# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)
  
# tokenizer setup
tiktoken.encoding_for_model('gpt-3.5-turbo')
tokenizer = tiktoken.get_encoding('cl100k_base')
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)



In [16]:
extracted_text = data[11]['text']
lines = extracted_text.replace(' ● ', '\n● ').split('. ')
for line in lines:
    print(line)
print("\n" + "-"*50 + "\n") 


the artist created that year, specifically, on Pollo Frito, it is written three times on the bottom left quadrant
Basquiat’s iconic crown can be spotted on the top left corner of the canvas
The crown is used frequently across the Artist’s work and is thought to signify the duality of his self-image, both as a king of the world and the human suffering insinuated by the crown of thorns.  Since the acquisition of the Painting, there have been three sales of similar paintings at auction, all from the same seminal year as the subject work
Most recently, “Untitled (Elmar)” (1982), 68 x 93.13 inches, which is approximately the same size as the subject work but “fresh to the market” (i.e
has not previously been offered/sold through auction, whereas the subject work was sold through auction in November 2018), sold at Phillips New York for $46,479,000; although it lacks the abundance of text observed within the subject work, “Untitled (Elmar)” has an arguably more commercial bright turquoise, ye

In [37]:
page = '''The artist created that year, specifically, on Pollo Frito, it is written three times on the bottom left quadrant
Basquiat’s iconic crown can be spotted on the top left corner of the canvas
The crown is used frequently across the Artist’s work and is thought to signify the duality of his self-image, both as a king of the world and the human suffering insinuated by the crown of thorns.  
'''

In [38]:
chunk = text_splitter.split_text(page)
chunk

['The artist created that year, specifically, on',
 'specifically, on Pollo Frito, it is',
 'it is written three times on the bottom left quadrant',
 'Basquiat’s iconic crown can be spotted',
 'can be spotted on the top left corner of the',
 'corner of the canvas',
 'The crown is used frequently across the Artist’s',
 'the Artist’s work and is thought to signify the',
 'to signify the duality of his self-image,',
 'self-image, both as a king of the world',
 'of the world and the human suffering insinuated',
 'insinuated by the crown of',
 'the crown of thorns.']

In [9]:
# embeddings setup using OpenAI
model_name = 'text-embedding-ada-002'
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY
)

  warn_deprecated(


In [61]:
embeds = embed.embed_documents(chunk)
embeds

[[-0.02562267345013173,
  -0.024313775838810947,
  -0.0032986877706984766,
  -0.016751253409287094,
  -0.013366626677221508,
  5.4227552580094964e-05,
  -0.033740485831811225,
  -0.017848612257307432,
  -0.013307131923912247,
  0.013326963508348668,
  0.03580299296494244,
  0.01939549074451055,
  -0.006861799461183828,
  0.004058906656501072,
  -0.013426122361853414,
  0.01059678651326612,
  0.025860654326014063,
  0.02285944474564446,
  0.0227272323200897,
  -0.01917073092491916,
  -0.039134732677876295,
  0.011879242943327652,
  0.013802926191435983,
  0.008302909963720692,
  -0.003067316957300292,
  0.01123140373449009,
  -0.0011626387447795357,
  -0.03347606098070171,
  0.005946231999809299,
  0.018800537623482064,
  0.003118549016089034,
  -0.011396668800772216,
  -0.022634683063407778,
  -0.01665870415260517,
  -0.05365160522224987,
  -0.010147265942760697,
  -0.034480872434685415,
  -0.012586576905474477,
  0.01751808212474318,
  0.01088104294881206,
  0.030064994930022168,
  0.

In [10]:
from pinecone.exceptions import PineconeApiException 

# Create an instance of the Pinecone class
pc = Pinecone(api_key=PINECONE_API_KEY)

# Prepare the index name
index_name = (
    data[0]['title']
    .lower()                              # Convert to lowercase
    .replace(' ', '-')                    # Replace spaces with hyphens
)

# Remove all characters except lowercase letters, numbers, and hyphens
index_name = re.sub(r'[^a-z0-9\-]', '', index_name)[:45]

# List current indexes to ensure the check is accurate
current_indexes = pc.list_indexes()

if index_name not in current_indexes:
    try:
        pc.create_index(
            name=index_name,
            dimension=1536, # 1536 dim of text-embedding-ada-002
            metric="cosine", # Replace with your model metric
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            ) 
        )
    except PineconeApiException as e:
        if "ALREADY_EXISTS" in str(e):
            print(f"Index '{index_name}' already exists.")
        else:
            raise
else:
    print(f"Index '{index_name}' already exists.")

# Access the index using the Index class
index = pc.Index(index_name)


Index 'masterworks-282-llc-jean-michel-basquiat-poll' already exists.


In [None]:
def process_data_in_batches(data, text_splitter, embed, index, batch_limit=100):
    texts = []
    metadatas = []

    for i, record in enumerate(tqdm(data)):
        # first get metadata fields for this record
        metadata = {
            'title': record['title'],
            'page': record['page'],
        }
        # now we create chunks from the record text
        record_texts = text_splitter.split_text(record['text'])
        # create individual metadata dicts for each chunk
        record_metadatas = [{
            "chunk": j, "text": text, **metadata
        } for j, text in enumerate(record_texts)]
        # append these to current batches
        texts.extend(record_texts)
        metadatas.extend(record_metadatas)
        # if we have reached the batch_limit we can add texts
        if len(texts) >= batch_limit:
            ids = [str(uuid4()) for _ in range(len(texts))]
            embeds = embed.embed_documents(texts)
            index.upsert(vectors=zip(ids, embeds, metadatas))
            texts = []
            metadatas = []

    if len(texts) > 0:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embed.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas))
        
process_data_in_batches(data, text_splitter, embed, index, batch_limit=100)

In [11]:
# retrieving vectorstore from pinecone
text_field = "text"
# switch back to normal index for langchain
index = pc.Index(index_name)
vectorstore = LangChainPinecone(
    index=index,
    embedding=embed.embed_query,  # The function or object to generate embeddings
    text_key=text_field
)

  warn_deprecated(


In [20]:
# query retrieval
# completion llm
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

question = "What are Pollo Frito's comparable sales? List them and include: name, price and date of sale for each one"
response = qa(question)
answer=response['result']
print(answer)

The comparable sales to Jean-Michel Basquiat's "Pollo Frito" (1982) are as follows:

1. "Untitled (Elmar)" (1982) - Sold for $46,479,000 at Phillips New York. Date of sale not specified.
2. "The Italian Version of Popeye Has No Pork In His Diet" (1982) - Sold for $32,035,000 at Christie's New York. Date of sale not specified.
3. "Self-Portrait As A Heel (Part Two)" (1982) - Sold for $42,000,000 at Sotheby’s New York on November 16, 2023.


In [22]:
import pprint

In [23]:
question = "Write up the main differences between Pollo Frito and its comps"
response = qa(question)
answer=response['result']
pprint.pprint(answer)

('The main differences between Pollo Frito (1982) by Jean-Michel Basquiat and '
 'its comparable sales are as follows:\n'
 '\n'
 '1. **Text Usage**: Pollo Frito features text in varying sizes repeated '
 "across the painting, demonstrating Basquiat's play-on-words approach. In "
 'contrast, "Untitled (Elmar)" lacks the abundance of text observed in Pollo '
 'Frito but has a more commercial bright turquoise, yellow, red, and green '
 'palette.\n'
 '\n'
 "2. **Iconography**: Pollo Frito includes Basquiat's iconic crown on the top "
 'left corner, symbolizing the duality of his self-image. "Untitled (Elmar)" '
 'features the iconic image of a full figure beneath a crown of thorns.\n'
 '\n'
 '3. **Size**: Pollo Frito measures 60 x 120.5 inches, while "Untitled '
 '(Elmar)" is slightly larger at 68 x 93.13 inches. "The Italian Version of '
 'Popeye Has No Pork In His Diet" is much smaller at 60 x 60 inches, and '
 '"Self-Portrait As A Heel (Part Two)" is larger at 96 x 61.5 inches.\n'
 '\n'