In [55]:
import os
from io import BytesIO

import chromadb
import requests
from dotenv import load_dotenv
from IPython.display import Markdown, display
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from openai import OpenAI
from pypdf import PdfReader

In [2]:
# Load variables from a local .env file into the process environment so that
# the API key can be read with os.getenv() later on.
load_dotenv()

True

In [3]:
# OpenAI client, authenticated with the key read from the environment.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Chat model passed to the completion request that generates the final answer.
model = "gpt-3.5-turbo"

In [4]:
def read_pdf(url, timeout=30):
    """Download a PDF over HTTP and open it with pypdf.

    Args:
        url: Address of the PDF document to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A ``PdfReader`` backed by an in-memory copy of the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of handing an HTML error page
    # to the PDF parser, which would surface as a confusing parse failure.
    response.raise_for_status()
    return PdfReader(BytesIO(response.content))

In [5]:
# Source document: the Avengers: Endgame screenplay, hosted as a PDF.
url = "https://script-pdf.s3-us-west-2.amazonaws.com/avengers-endgame-script-pdf.pdf"
# Download it and keep the resulting reader for the cells below.
pdf_reader = read_pdf(url)

In [6]:
# Number of pages in the downloaded PDF.
len(pdf_reader.pages)

149

In [7]:
# Peek at the second page (index 1) to check that text extraction works.
print(pdf_reader.pages[1].extract_text())

              
              AVENGERS: ENDGAME
   Adapted Screenplay
Written by Christopher Markus and Stephen McFeely


In [8]:
# Collect (page number, page text) pairs for the screenplay body only:
# the first two pages and the final page are left out.
texts = []
for page in pdf_reader.pages[2:-1]:
    # page_number is zero-based; store the 1-based number a reader would expect.
    human_page_number = page.page_number + 1
    texts.append((human_page_number, page.extract_text().strip()))
texts[:3]

[(3,
  "EXT. BARTON HOME - DAY\nCLOSE ON: A HOUSE-ARREST ANKLE BRACELET.\nCLINT BARTON (O.S.)\nOkay, you see where you’re going? \nLet’s work on how to get there.\nPan up to find...CLINT BARTON, with his daughter, LILA, \ncoaching her as she notches an arrow in her bow.\nCLINT BARTON(CONT'D)\nOkay, good...tip down...bow arm \nout...three fingers-\nLILA BARTON\nWhy three?\nCLINT BARTON\n‘Cause two’s not enough and four’s \ntoo much-\nLAURA BARTON (O.S.)\nYou guys want mustard or mayo, or \nboth? \nCLINT TURNS.  IN THE FIELD BEHIND THEM, his wife, LAURA \nBARTON sets up a picnic as COOPER and NATHANIEL play soccer.  \nLILA BARTON\nWho puts mayo on a hot dog?\nCLINT BARTON\nWe’ll both have mustard, hon!\n(to Lila)\nOkay.  Draw back, deep breath...\nShe lets loose.  THUD!  HER ARROW HITS NEAR THE BULLS-EYE. \nCLINT BARTON(CONT'D)\nGood job, Hawkeye.  Go get your \narrow. \nLAURA BARTON (O.S.)\nEnough murder practice!  Soup’s on!\nCLINT BARTON\nOne sec, babe.  Be right there! \nWe’re gonna 

In [35]:
# Token-based splitter: chunk limits are expressed in tokens of the named
# sentence-transformers model rather than in characters.
# NOTE(review): presumably chosen to match the embedding model Chroma applies to
# the collection by default — confirm before changing either one.
splitter = SentenceTransformersTokenTextSplitter(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    chunk_overlap=25,  # tokens repeated at the start of the following chunk
    tokens_per_chunk=256,  # upper bound on the size of a single chunk
)

In [10]:
# Break every page into token-bounded chunks, tagging each chunk with the page
# it came from. The splitter returns text re-assembled from tokens, which is
# why the chunks come back lower-cased with spaces around punctuation.
split_texts = [
    (page_num, chunk)
    for page_num, text in texts
    for chunk in splitter.split_text(text)
]

In [11]:
# Inspect the first four (page number, chunk) pairs.
split_texts[:4]

[(3,
  "ext. barton home - day close on : a house - arrest ankle bracelet. clint barton ( o. s. ) okay, you see where you ’ re going? let ’ s work on how to get there. pan up to find... clint barton, with his daughter, lila, coaching her as she notches an arrow in her bow. clint barton ( cont'd ) okay, good... tip down... bow arm out... three fingers - lila barton why three? clint barton ‘ cause two ’ s not enough and four ’ s too much - laura barton ( o. s. ) you guys want mustard or mayo, or both? clint turns. in the field behind them, his wife, laura barton sets up a picnic as cooper and nathaniel play soccer. lila barton who puts mayo on a hot dog? clint barton we ’ ll both have mustard, hon! ( to lila ) okay. draw back, deep breath... she lets loose. thud! her arrow hits near the bulls - eye. clint barton ( cont'd ) good job, hawkeye. go get your arrow. laura barton ( o. s. ) enough murder practice! soup ’ s on! clint barton one sec,"),
 (3,
  'your arrow. laura barton ( o. s. ) e

In [12]:
# Print the result
for index, (page_number, chunk) in enumerate(split_texts[:5], start=1):
    display(
        Markdown(
            f"<u>**Page No: {page_number}**, **Chunk: {index}**</u><br><br>{chunk}<br>"
        )
    )

<u>**Page No: 3**, **Chunk: 1**</u><br><br>ext. barton home - day close on : a house - arrest ankle bracelet. clint barton ( o. s. ) okay, you see where you ’ re going? let ’ s work on how to get there. pan up to find... clint barton, with his daughter, lila, coaching her as she notches an arrow in her bow. clint barton ( cont'd ) okay, good... tip down... bow arm out... three fingers - lila barton why three? clint barton ‘ cause two ’ s not enough and four ’ s too much - laura barton ( o. s. ) you guys want mustard or mayo, or both? clint turns. in the field behind them, his wife, laura barton sets up a picnic as cooper and nathaniel play soccer. lila barton who puts mayo on a hot dog? clint barton we ’ ll both have mustard, hon! ( to lila ) okay. draw back, deep breath... she lets loose. thud! her arrow hits near the bulls - eye. clint barton ( cont'd ) good job, hawkeye. go get your arrow. laura barton ( o. s. ) enough murder practice! soup ’ s on! clint barton one sec,<br>

<u>**Page No: 3**, **Chunk: 2**</u><br><br>your arrow. laura barton ( o. s. ) enough murder practice! soup ’ s on! clint barton one sec, babe. be right there! we ’ re gonna kill some hot dogs. we ’ re hungry. but when he turns back... lila is gone. the bow and arrow lie at his feet. he stares.<br>

<u>**Page No: 4**, **Chunk: 3**</u><br><br>2clint barton ( cont'd ) lila? babe, did you see lila - he turns to his wife, but she ’ s gone. so are the boys. the soccer ball rolls to a stop near the picnic blanket. no one can be seen for a hundred yards in all directions. clint barton ( cont'd ) guys? guys, come on... clint walks toward the field, dread growing. clint barton ( cont'd ) laura? clint breaks into a panicked run. clint barton ( cont'd ) laura! clint melts down, spinning around, frantic... marvel flip ext. space - night blackness. title : “ twenty - two days later. ” soon, the benatar tumbles past, adrift... int. benatar, galley - night tony stark and nebula play table - top football. she wins. tony ( offering his hand ) good sport. have fun? nebula studies his hand, confused. finally she shakes. nebula it was fun. int. benatar, flight deck - night tony, thin and haggard, kneels in front of his broken iron man helmet. he hits a switch. a light blinks. 2<br>

<u>**Page No: 5**, **Chunk: 4**</u><br><br>3tony ’ s recording : he stares directly into the “ camera. ” tony this thing on? hey, ms. potts. pep. if you find this recording, don ’ t post it on social media. it ’ s going to be a real tearjerker. ext. space - night the benatar sparks, adrift. tony ( o. s. ) i don ’ t know if you ’ re ever going to see these. i don ’ t even know if you ’ re still... god, i hope so. today ’ s day twenty - one, no... twenty - two. int. benatar, galley - flashback nebula lasers shut tony ’ s infected wound. tony ( o. s. ) you know, if it wasn ’ t for the existential terror of staring into the literal void of space, i ’ d say i ’ m feeling a little better today. infection ’ s run its course, thanks to the blue meanie back there. you ’ d like her. she ’ s very practical. and only a tiny bit sadistic. int. benatar, galley - flashback tony and nebula mend the empty fuel cells under the floor. tony ( o. s. ) the<br>

<u>**Page No: 5**, **Chunk: 5**</u><br><br>, galley - flashback tony and nebula mend the empty fuel cells under the floor. tony ( o. s. ) the fuel cells were cracked during battle, but we figured out a way to reverse the ion charge. bought ourselves about 48 hours of flight time. problem is that was about... 49 hours ago. ext. space - night from high overhead, we watch the benatar drift in space. 3<br>

In [13]:
# Report how many chunks the splitter produced across all pages.
print(f"Number of chunks: {len(split_texts)}")

Number of chunks: 233


In [14]:
# Open a Chroma client that persists its data under the local "db" path.
chroma_db = chromadb.PersistentClient(path="db")

# Show which collections already exist in that store.
chroma_db.list_collections()

[]

In [16]:
# Fetch the "endgame" collection, creating it first if it does not exist yet.
collection = chroma_db.get_or_create_collection("endgame")

In [17]:
# Unpack the (page, chunk) pairs into the three parallel lists the collection
# expects: chunk text, per-chunk metadata, and a positional id for each chunk.
documents, metadatas, ids = [], [], []
for position, (page_num, chunk) in enumerate(split_texts):
    documents.append(chunk)
    metadatas.append({"page": str(page_num)})
    ids.append(f"id_{position}")

In [18]:
# Spot-check the first three chunk texts.
documents[:3]

["ext. barton home - day close on : a house - arrest ankle bracelet. clint barton ( o. s. ) okay, you see where you ’ re going? let ’ s work on how to get there. pan up to find... clint barton, with his daughter, lila, coaching her as she notches an arrow in her bow. clint barton ( cont'd ) okay, good... tip down... bow arm out... three fingers - lila barton why three? clint barton ‘ cause two ’ s not enough and four ’ s too much - laura barton ( o. s. ) you guys want mustard or mayo, or both? clint turns. in the field behind them, his wife, laura barton sets up a picnic as cooper and nathaniel play soccer. lila barton who puts mayo on a hot dog? clint barton we ’ ll both have mustard, hon! ( to lila ) okay. draw back, deep breath... she lets loose. thud! her arrow hits near the bulls - eye. clint barton ( cont'd ) good job, hawkeye. go get your arrow. laura barton ( o. s. ) enough murder practice! soup ’ s on! clint barton one sec,",
 'your arrow. laura barton ( o. s. ) enough murder 

In [19]:
# Spot-check the matching metadata (source page, stored as a string).
metadatas[:3]

[{'page': '3'}, {'page': '3'}, {'page': '4'}]

In [20]:
# Spot-check the matching ids.
ids[:3]

['id_0', 'id_1', 'id_2']

In [21]:
# Upsert rather than add: the ids are deterministic ("id_0", "id_1", ...) and the
# store is persistent, so re-running the notebook must overwrite the existing
# entries instead of colliding with them or leaving stale chunk text behind.
collection.upsert(documents=documents, metadatas=metadatas, ids=ids)

In [22]:
# Confirm how many items the collection holds after loading the chunks.
print(f"Number of items in collection: {collection.count()}")

Number of items in collection: 233


In [46]:
# Question to answer from the screenplay; reused for retrieval and for the prompt.
query = "What does Sam Wilson tell Captain America when he enters the battlefield?"

In [47]:
# Example query
results = collection.query(
    query_texts=[query],
    n_results=3,
)

In [48]:
print("Query Results:")
# Show each retrieved chunk with its source page while collecting the chunk
# texts; they are joined into one context string, each followed by a blank line.
context_parts = []
hits = zip(results["documents"][0], results["metadatas"][0])
for rank, (passage, meta) in enumerate(hits, start=1):
    display(Markdown(f"**Result {rank} (Page {meta['page']})**<br>{passage}"))
    context_parts.append(passage + "\n\n")
context = "".join(context_parts)

Query Results:


**Result 1 (Page 125)**<br>... and starts walking toward him. one man against thousands. all alone. but just then, steve ’ s com crackles. he strains to hear. the com crackles again. we can make out a muffled voice. muffled voice ( on com ) - ap -... - comin -... - ot - steve shakes his head, trying to clear it. slowly, sound returns... and the words ring clear : sam wilson ( o. s. ) cap. on your left. pull back... to see a portal opening in the distance. out of it flies... sam wilson. 123

**Result 2 (Page 146)**<br>144move around him to find sam watching from a few yards away. sam wilson hey, cap. the man turns and we now see : steve rogers, age 112. old steve hi, sam. sam wilson something go wrong... or something go right? old steve i thought, after everything that ’ s happened... maybe i should try out some of that life tony always told me to get. sam wilson how was it? steve looks at the river, 112 years of emotion in his face. old steve it was beautiful. sam wilson i ’ m happy for you. really. old steve thanks. sam wilson only thing bumming me out is now i live in a world without captain america. steve pulls up a case from beside him, giving it to sam. old steve that ’ s kind of why i ’ m here. sam unzips it, revealing... a brand new shield. sam wilson where did you even get this? old steve i had a little time to travel. ( beat ) try it on. 144

**Result 3 (Page 132)**<br>... the cannons swiveling up toward the clouds. sam wilson flies overhead. sam wilson what the hell is this? tony ’ s hud : tony flies overhead, searching upward. tony friday, what are they firing at? friday ( o. s. ) something just entered the upper atmosphere. 130

In [56]:
# Prompt for the answer-generation step: the system message restricts the model
# to the retrieved passages, and the user message carries the question plus
# those passages.
messages = [
    {
        "role": "system",
        # Adjacent string literals are concatenated as-is, so the first one must
        # end with a space to keep the two sentences from running together.
        "content": "You are a helpful expert on the movie 'Avengers Endgame'. Your users are asking questions about information contained in attached information. "
        "You will be shown the user's question, and the relevant information. Answer the user's question using only this information, in the format of Question, Answer and Proof",
    },
    {"role": "user", "content": f"Question: {query}. \n Information: {context}"},
]

# Ask the chat model for an answer grounded in the retrieved context.
response = client.chat.completions.create(
    model=model,
    messages=messages,
)
# Keep only the text of the first (and only requested) completion.
content = response.choices[0].message.content

In [57]:
# Show the model's answer.
print(content)

Question: What does Sam Wilson tell Captain America when he enters the battlefield?

Answer: Sam Wilson tells Captain America, "Cap, on your left."

Proof: The screenplay describes the moment when Sam Wilson (Falcon) flies in to the battlefield and says, "Cap, on your left." This signifies the start of the epic battle scene in 'Avengers Endgame'.
