# Star Wars Expert

In [1]:
from langchain_openai import ChatOpenAI#, OpenAIEmbeddings # No need to pay for using embeddings as well when have free alternatives

# Data
from langchain_community.document_loaders import DirectoryLoader, TextLoader, WebBaseLoader
# from langchain_chroma import Chroma # The documentation uses this one, but it is extremely recent, and the same functionality is available in langchain_community and langchain (which imports community)
from langchain_community.vectorstores import Chroma # This has documentation on-hover, while the indirect import through non-community does not
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings # The free alternative (also the default in docs, with model_name = 'all-MiniLM-L6-v2')
from langchain.text_splitter import RecursiveCharacterTextSplitter#, TextSplitter # Recursive to better keep related bits contiguous (also recommended in docs: https://python.langchain.com/docs/modules/data_connection/document_transformers/)

# Chains
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.tools.retriever import create_retriever_tool
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

# Agents
from langchain import hub
from langchain.agents import create_tool_calling_agent, AgentExecutor

# To manually create inputs to test pipelines
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.documents import Document

# # Custom retriever
# from langchain_core.callbacks import CallbackManagerForRetrieverRun
# from langchain_core.documents import Document
# from langchain_core.retrievers import BaseRetriever

import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

import os
import shutil
import re

import dotenv
dotenv.load_dotenv()

True

## Data Loaders
NOTE: when `REGENERATE_DATABASE` is `True`, running the cell below deletes the existing database directory before adding the data to a new database, since content is duplicated otherwise

### Film Scripts

In [2]:
# Comparison of vector dbs: https://zackproser.com/blog/vector-databases-compared
#   Opinion: Milvus (more features, bigger community, higher performance(?), fully free, no enterprise plans) > Weaviate > Chroma
#   However Milvus and Weaviate both require a separate instance to be up and running
#   (The documentation uses FAISS, but it seems unnecessarily limited in comparison)
#   Hence Chroma - https://python.langchain.com/docs/integrations/vectorstores/chroma/

# Separately, no need to pay for OpenAIEmbeddings; additionally, all-MiniLM-L6-v2 is default in docs

REGENERATE_DATABASE = False # Set to True to delete the persisted database and rebuild it from the script files

# Built with os.path.join instead of a hard-coded backslash: unchanged on Windows, and a proper
# sub-directory of 'scripts' (rather than a folder literally named 'scripts\db') elsewhere
db_dir = os.path.join('scripts', 'db')
db_exists = os.path.exists(db_dir)

# A single embedding model instance serves both the "load" and the "rebuild" paths
script_embeddings = SentenceTransformerEmbeddings(model_name = 'all-MiniLM-L6-v2')

if db_exists and not REGENERATE_DATABASE:
    # Reuse what was persisted by a previous run
    script_db = Chroma(embedding_function = script_embeddings, persist_directory = db_dir)
else:
    if db_exists:
        print('Deleting the previous database and creating a new one (because otherwise content is duplicated in the db every time this block is run)')
        shutil.rmtree(db_dir)

    scripts = DirectoryLoader('scripts', glob = '**/[!.]*.txt', loader_cls = TextLoader).load()
    for s in scripts: s.page_content = re.sub(r'\t+|[ ]{2,}', '', s.page_content) # Tabs and runs of spaces are only there to centre the text: noise

    script_chunks = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200).split_documents(scripts)
        # Why not some overlap for extra context just in case?

    script_db = Chroma.from_documents(script_chunks, script_embeddings, persist_directory = db_dir)

# Guard the mean against an empty database (e.g. no script files found) instead of dividing by zero
script_chunk_texts = script_db.get()['documents']
script_mean_length = sum(len(text) for text in script_chunk_texts) / len(script_chunk_texts) if script_chunk_texts else 0
print(f'The script database contains {len(script_db)} chunks, with mean length of {script_mean_length:.0f} characters')


The script database contains 1251 chunks, with mean length of 895 characters


In [3]:
# Query testing: look at the ten script chunks closest to a keyword-style query

script_test_query = 'Luke father reveal fight'
res = script_db.similarity_search(script_test_query, k = 10)

# for r in res: print(r.page_content)
res

[Document(page_content="LUKE\nNo, my father didn't fight in the \nwars. He was a navigator on a spice \nfreighter.\n\nBEN\nThat's what your uncle told you. He \ndidn't hold with your father's ideals. \nThought he should have stayed here \nand not gotten involved.\n\nLUKE\nYou fought in the Clone Wars?\n\nBEN\nYes, I was once a Jedi Knight the \nsame as your father.\n\nLUKE\nI wish I'd known him.\n\nBEN\nHe was the best star-pilot in the \ngalaxy, and a cunning warrior. I \nunderstand you've become quite a \ngood pilot yourself. And he was a \ngood friend. Which reminds me...\n\nBen gets up and goes to a chest where he rummages around.\nAs Luke finishes repairing Threepio and starts to fit the \nrestraining bolt back on, Threepio looks at him nervously.\nLuke thinks about the bolt for a moment then puts it on the \ntable. Ben shuffles up and presents Luke with a short handle \nwith several electronic gadgets attached to it.", metadata={'source': 'scripts\\Episode IV - A New Hope.txt'}),

### Wookieepedia Articles

In [6]:
REGENERATE_WOOKIEEPEDIA_DATABASE = False # Set to True to delete the persisted database and rebuild it from the saved pages

# Built with os.path.join instead of a hard-coded backslash: unchanged on Windows, and a proper
# sub-directory of 'wookieepedia' (rather than a folder literally named 'wookieepedia\db') elsewhere
db_dir = os.path.join('wookieepedia', 'db')
db_exists = os.path.exists(db_dir)

# A single embedding model instance serves both the "load" and the "rebuild" paths
woo_embeddings = SentenceTransformerEmbeddings(model_name = 'all-MiniLM-L6-v2')

if db_exists and not REGENERATE_WOOKIEEPEDIA_DATABASE:
    # Reuse what was persisted by a previous run
    woo_db = Chroma(embedding_function = woo_embeddings, persist_directory = db_dir)
else:
    if db_exists:
        print('Deleting the previous database and creating a new one (because otherwise content is duplicated in the db every time this block is run)')
        shutil.rmtree(db_dir)

    pages = DirectoryLoader('wookieepedia', glob = '**/[!.]*.txt', loader_cls = TextLoader).load()
    for s in pages: s.page_content = re.sub(r'\t+|[ ]{2,}', '', s.page_content) # Tabs and runs of spaces are only there to centre the text: noise

    page_chunks = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200).split_documents(pages)
        # Why not some overlap for extra context just in case?

    woo_db = Chroma.from_documents(page_chunks, woo_embeddings, persist_directory = db_dir)

# Fetch the stored chunks once and reuse them for both statistics below
woo_contents = woo_db.get()

# Guard the mean against an empty database (e.g. no page files found) instead of dividing by zero
woo_chunk_texts = woo_contents['documents']
woo_mean_length = sum(len(text) for text in woo_chunk_texts) / len(woo_chunk_texts) if woo_chunk_texts else 0
print(f'The Wookieepedia database contains {len(woo_db)} chunks, with mean length of {woo_mean_length:.0f} characters')

print('Current source pages in Wookieepedia db:')
source_pages = {md.get('source') for md in woo_contents['metadatas']}
print(len(source_pages))
source_pages


The Wookieepedia database contains 10615 chunks, with mean length of 736 characters
Current source pages in Wookieepedia db:
395


{'wookieepedia\\1995_Topps_Star_Wars_Widevision.txt',
 'wookieepedia\\1995_Topps_Star_Wars___The_Empire_Strikes_Back_Widevision.txt',
 'wookieepedia\\1996_Topps_Star_Wars___Return_of_the_Jedi_Widevision.txt',
 'wookieepedia\\2_systems_control_droid.txt',
 'wookieepedia\\501st_Legion.txt',
 'wookieepedia\\Aayla_Secura.txt',
 'wookieepedia\\Abandoned_sarlacc_pit.txt',
 'wookieepedia\\Acclamator-class_transgalactic_military_assault_ship.txt',
 'wookieepedia\\Ackbar_(comic).txt',
 'wookieepedia\\Age_of_Resistance_-_General_Hux_1.txt',
 'wookieepedia\\Alderaan.txt',
 'wookieepedia\\Alderaan_Cruiser.txt',
 'wookieepedia\\Amee.txt',
 'wookieepedia\\Amidalans.txt',
 'wookieepedia\\Anakin_Skywalker.txt',
 'wookieepedia\\Ankanksha_Sahu.txt',
 'wookieepedia\\Area_D-512.txt',
 'wookieepedia\\Ask_Aak.txt',
 'wookieepedia\\Assembly.txt',
 'wookieepedia\\Asteroid.txt',
 'wookieepedia\\Asteroid_Belt_Gas_Refinery.txt',
 'wookieepedia\\Attack_on_Fondor.txt',
 'wookieepedia\\Avenger.txt',
 'wookieepedia\

In [7]:
# Query testing: look at the ten Wookieepedia chunks closest to a keyword-style query

woo_test_query = 'Luke father reveal fight'
res = woo_db.similarity_search(woo_test_query, k = 10)

# for r in res: print(r.page_content)
res

[Document(page_content='Vader telling Luke that he is his father\nVader beckoned Luke to join him in the dark side, but Luke vehemently refused. Vader then told Luke that Kenobi had hidden from him the truth about his father, Anakin Skywalker, before revealing that he was, in fact, his father. Luke refused to believe the truth, but Vader continued to tempt his weakened son, offering Luke the chance to destroy the Emperor and "bring order to the galaxy,"[13] just as he tried to do with Padmé on Mustafar.[11] He even pleaded with his son to come with him.[13] In Vader\'s ideal world, his son would have taken his hand, accepting him as his father.[384] However, Skywalker instead chose to throw himself into the shaft, possibly facing death to avoid joining Vader.[13]', metadata={'source': 'wookieepedia\\Anakin_Skywalker.txt'}),
 Document(page_content="into murdering his father so they could rule the galaxy together.[4] While trying to steer Bridger, he displayed himself in his public perso

In [130]:
# Functions for possible interactive Wookieepedia querying and storing in the db

def first_wookieepedia_result(query: str) -> str:
    '''Get the url of the first result when searching Wookieepedia for a query
    (best for simple names as queries, ideally generated by the llm for something like
    "Produce an input consisting of the name of the most important element in the query so that its article can be looked up")

    Args:
        query: Free-text search terms, e.g. 'Darth Plagueis'.

    Returns:
        The href of the first link with class 'unified-search__result__link' on the results page.

    Raises:
        requests.RequestException: If the request fails, times out or comes back with an HTTP error status.
        LookupError: If the results page contains no result link.
    '''
    search_results = requests.get(
        'https://starwars.fandom.com/wiki/Special:Search',
        params = {'query': query}, # Let requests URL-encode the whole query (&, #, ?, non-ASCII...) instead of only swapping spaces for '+'
        timeout = 10 # Do not block the notebook indefinitely if the site does not answer
    )
    search_results.raise_for_status() # Do not try to parse an error page as if it were a results page

    soup = BeautifulSoup(search_results.content, 'html.parser')
    first_res = soup.find('a', class_ = 'unified-search__result__link')

    # soup.find returns None when nothing matches; fail with an explicit message rather than "'NoneType' object is not subscriptable"
    if first_res is None: raise LookupError(f'No Wookieepedia search result found for {query!r}')

    return first_res['href']

# first_wookieepedia_result('Darth Plagueis')


def get_new_wookieepedia_chunks(query: str, previous_sources: set[str]) -> list[Document]:
    '''Load the first Wookieepedia search result for query and split its text into chunks.

    Returns an empty list, without loading anything, when the url of that result is already in previous_sources.
    '''
    url = first_wookieepedia_result(query)
    if url in previous_sources:
        return []

    # Only one url passed in => only one Document out; no need to assert
    doc = WebBaseLoader(url).load()[0]

    # There probably is a very long preamble before the real content; if the separator does not
    # split the page into exactly two parts then ignore it and proceed with the full document
    preamble_separator = '\n\n\n\n\n\n\n\n\n\n\n\n\n\n \xa0 \xa0'
    parts = doc.page_content.split(preamble_separator)
    if len(parts) == 2:
        trimmed = parts[1]
    else:
        trimmed = doc.page_content

    # Collapse runs of newlines/tabs into a single newline
    doc.page_content = re.sub(r'[\n\t]{2,}', '\n', trimmed)

    splitter = RecursiveCharacterTextSplitter(chunk_size = 800, chunk_overlap = 100)
    return splitter.split_documents([doc])

# get_new_wookieepedia_chunks('Darth Plagueis', set())


def get_wookieepedia_context(original_query: str, simple_query: str, wdb: Chroma) -> list[Document]:
    '''Add the Wookieepedia page found for simple_query to wdb (if new), then search wdb for original_query.

    Args:
        original_query: The question used for the similarity search.
        simple_query: The short query used to look up a page on Wookieepedia.
        wdb: The vector store that is both extended with the new chunks and searched.

    Returns:
        The 10 chunks in wdb closest to original_query, or an empty list if fetching or storing the new page failed.
    '''
    import warnings # Local import: only needed to report a failure

    try:
        known_sources = {md.get('source') for md in wdb.get()['metadatas']}
        new_chunks = get_new_wookieepedia_chunks(simple_query, previous_sources = known_sources)
        if new_chunks: wdb.add_documents(new_chunks)
    except Exception as exc:
        # Still best-effort (no context rather than a crash), but unlike a bare `except:` this lets
        # KeyboardInterrupt/SystemExit through and reports what went wrong
        warnings.warn(f'Could not fetch or store Wookieepedia content for {simple_query!r}: {exc!r}')
        return []

    return wdb.similarity_search(original_query, k = 10)

# Example run: look up 'Darth Plagueis' on Wookieepedia, store any new chunks in woo_db, then search woo_db with the full question
get_wookieepedia_context('Do you know the Tragedy of Darth Plagueis the Wise?', 'Darth Plagueis', woo_db)

## Chains
Standard chains: https://python.langchain.com/docs/modules/chains/#lcel-chains

In [8]:
# Chat model shared by the chains and the agent defined below
llm = ChatOpenAI(model = 'gpt-3.5-turbo-0125', temperature = 0)

# llm.invoke('What do you know about Star Wars?')

### Non-agent version

In [9]:
# System instructions for answering; {context} is the slot for the retrieved documents
document_prompt_system_text = '''
You are very knowledgeable about Star Wars and your job is to answer questions about its plot, characters, etc.
Use the context below to produce your answers with as much detail as possible.
If you do not know an answer, say so; do not make up information not in the context.

<context>
{context}
</context>
'''

# Same three messages as before, spelled out with the explicit template classes
document_prompt_messages = [
    SystemMessagePromptTemplate.from_template(document_prompt_system_text),
    MessagesPlaceholder('chat_history', optional = True),
    HumanMessagePromptTemplate.from_template('{input}')
]
document_prompt = ChatPromptTemplate.from_messages(document_prompt_messages)

document_chain = create_stuff_documents_chain(llm, document_prompt)


# document_prompt.format_messages(context = 'You are an expert in Star Wars lore', input = 'Are you knowledgeable about Star Wars?')
# document_chain.invoke(dict(context = [Document(page_content = 'You are an expert in Star Wars lore')], input = 'Are you knowledgeable about Star Wars?'))


# basic_chain = document_prompt | llm | StrOutputParser() # To extract just the message
# basic_chain.invoke(dict(context = 'You are an expert of Star Wars lore', input = 'Are you knowledgeable about Star Wars?'))

In [10]:
# Instruction appended after the conversation, asking the llm for a keyword query suited to the script database
script_query_instruction = '''Given the above conversation, generate a search query to look up relevant information in a database containing the full scripts from the Star Wars films (i.e. just dialogue and brief scene descriptions).
     The query need not be a proper sentence, but a list of keywords likely to be in dialogue or scene descriptions'''

script_retriever_prompt = ChatPromptTemplate.from_messages([
    MessagesPlaceholder('chat_history'),
    HumanMessagePromptTemplate.from_template('{input}'),
    HumanMessagePromptTemplate.from_template(script_query_instruction)
])

# Essentially just: prompt | llm | StrOutputParser() | retriever
script_retriever = script_db.as_retriever()
script_retriever_chain = create_history_aware_retriever(llm, script_retriever, script_retriever_prompt)


# script_retriever_prompt.format_messages(
#     chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
#     input = 'Do you know the tragedy of Darth Plagueis the Wise?'
# )

# script_retriever_chain.invoke(dict(
#     chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
#     input = 'Luke cloud city'
# ))


In [11]:
# Instruction appended after the conversation, asking the llm for a short query suited to the wiki database
woo_query_instruction = 'Given the above conversation, generate a search query to find a relevant page in the Star Wars fandom wiki; the query should be something simple, such as the name of a character, place, event, item, etc.'

woo_retriever_prompt = ChatPromptTemplate.from_messages([
    MessagesPlaceholder('chat_history'),
    HumanMessagePromptTemplate.from_template('{input}'),
    HumanMessagePromptTemplate.from_template(woo_query_instruction)
])

# Essentially just: prompt | llm | StrOutputParser() | retriever
woo_retriever = woo_db.as_retriever()
woo_retriever_chain = create_history_aware_retriever(llm, woo_retriever, woo_retriever_prompt)


# woo_retriever_prompt.format_messages(
#     chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
#     input = 'Do you know the tragedy of Darth Plagueis the Wise?'
# )

# woo_retriever_chain.invoke(dict(
#     chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
#     input = 'Do you know the tragedy of Darth Plagueis the Wise?'
# ))


In [13]:
# Retrieval followed by answering; the commented line is the same pipeline over the film scripts instead of the wiki
# full_chain = create_retrieval_chain(script_retriever_chain, document_chain)
full_chain = create_retrieval_chain(retriever = woo_retriever_chain, combine_docs_chain = document_chain)

rescue_question = "Who participates in Han's rescue from Jabba? And where is the palace?"
full_chain.invoke({
    # 'chat_history': [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
    'input': rescue_question
})

{'input': "Who participates in Han's rescue from Jabba? And where is the palace?",
 'context': [Document(page_content="After the smuggler Han Solo failed to repay him for lost cargo, Jabba placed a high price on his head. Solo was eventually delivered to him by one of his bounty hunters, Boba Fett, as a gift from Darth Vader. However, this capture brought him to the attention of Jedi Knight Luke Skywalker, who sought to rescue his friend from Jabba's imprisonment. As he attempted to execute the Jedi and his allies in the Great Pit of Carkoon, Jabba was choked to death by Leia Organa. With the Hutts unable to decide who would inherit Jabba's criminal ventures, many of his slaves, including the Niktos, were free, and his palace was occupied by his former Majordomo Bib Fortuna, who took his place as Daimyo of Tatooine until Fett killed and usurped him. Fett sought to rebuild Jabba's criminal empire in his own image, intending to rule with respect rather than the fear that the Hutt instill

### Agent version

In [15]:
# Could use Tavily as a generic search engine for a retriever agent as in the docs, but want more specific (if limited) capabilities here

# Tool over the film script database, returning 4 chunks per query
script_tool_description = '''Search the Star Wars film scripts. This tool should be the first choice for Star Wars related questions.
    Queries passed to this tool should be lists of keywords likely to be in dialogue or scene descriptions, and should not include film titles.'''

script_tool = create_retriever_tool(
    script_db.as_retriever(search_kwargs = {'k': 4}),
    name = 'search_film_scripts',
    description = script_tool_description
)

# Tool over the Wookieepedia database, returning 4 chunks per query
woo_tool_description = 'Search the Star Wars fandom wiki. This tool should be the first choice for Star Wars related questions.'
    # This tool should be used for queries about details of a particular character, location, event, weapon, etc., and the query should be something simple, such as the name of a character, place, event, item, etc.'''

woo_tool = create_retriever_tool(
    woo_db.as_retriever(search_kwargs = {'k': 4}),
    name = 'search_wookieepedia',
    description = woo_tool_description
)

tools = [script_tool, woo_tool]

In [17]:
# Turns a (possibly long) question, plus optional chat history, into a short Wookieepedia search query
simplify_query_prompt = ChatPromptTemplate.from_messages([
    # This instruction is a system message placed *before* the conversation, so it must refer to the
    # conversation that follows (the "above" wording only fits the retriever prompts, where the instruction comes last)
    ('system', 'Given the following conversation, generate a search query to find a relevant page in the Star Wars fandom wiki; the query should be something simple, at most 4 words, such as the name of a character, place, event, item, etc.'),
    MessagesPlaceholder('chat_history', optional = True), # Using this form since not clear how to have optional = True in the tuple form
    ('human', '{query}')
])

simplify_query_chain = simplify_query_prompt | llm | StrOutputParser() # To extract just the message

# simplify_query_chain.invoke(dict(context = 'You are an expert of Star Wars lore', query = 'Do you know the tragedy of Darth Plagueis the Wise?'))

'Darth Plagueis Wise'

In [19]:
# Agent - https://python.langchain.com/docs/modules/agents/
#   The agent design pattern is both simpler and better than manual chains since it can make its own choice between tools

# System instructions given to the agent
agent_system_text = '''
You are a helpful agent who is very knowledgeable about Star Wars and your job is to answer questions about its plot, characters, etc.
Use the context provided in the exchanges to come to produce your answers with as much detail as possible.
If you do not know an answer, say so; do not make up information.
'''

agent_prompt_messages = [
    SystemMessagePromptTemplate.from_template(agent_system_text),
    MessagesPlaceholder('chat_history', optional = True), # Using this form since not clear how to have optional = True in the tuple form
    HumanMessagePromptTemplate.from_template('{input}'),
    ('placeholder', '{agent_scratchpad}') # Slot for the agent's intermediate steps (its tool calls and their results)
]
agent_prompt = ChatPromptTemplate.from_messages(agent_prompt_messages)

agent = create_tool_calling_agent(llm = llm, tools = tools, prompt = agent_prompt)
agent_executor = AgentExecutor(agent = agent, tools = tools, verbose = True)


# agent_prompt.format_messages(
#     chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
#     input = 'Do you know the tragedy of Darth Plagueis the Wise?'
# )

# Example run with a short prior exchange as chat history
example_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')]
agent_executor.invoke({
    'chat_history': example_history,
    'input': 'Do you know the tragedy of Darth Plagueis the Wise?'
})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `search_film_scripts` with `{'query': 'tragedy of Darth Plagueis the Wise'}`


[0m[36;1m[1;3mIn a few minutes the entire passageway is ablaze with 
laserfire. The deadly bolts ricochet in wild random patterns 
creating huge explosions. Stormtroopers scatter and duck 
behind storage lockers. Laserbolts hit several Rebel soldiers 
who scream and stagger through the smoke, holding shattered 
arms and faces.

An explosion hits near the robots.

THREEPIO
I should have known better than to 
trust the logic of a half-sized 
thermocapsulary dehousing assister...

Artoo counters with an angry rebuttal as the battle rages 
around the two hapless robots.

EXT. TATOOINE - DESERT WASTELAND - DAY

A death-white wasteland stretches from horizon to horizon. 
The tremendous heat of two huge twin suns settle on a lone 
figure, Luke Skywalker, a farm boy with heroic aspirations 
who looks much younger than his eighteen years. His 

{'chat_history': [HumanMessage(content='Are you knowledgeable about Star Wars?'),
  AIMessage(content='Very')],
 'input': 'Do you know the tragedy of Darth Plagueis the Wise?',
 'output': 'The dialogue from the Star Wars film scripts does not directly mention the tragedy of Darth Plagueis the Wise. However, in "Star Wars: Episode III - Revenge of the Sith," Chancellor Palpatine tells Anakin Skywalker the story of Darth Plagueis the Wise. According to Palpatine, Darth Plagueis was a Dark Lord of the Sith who was so powerful and wise that he could influence the midi-chlorians to create life and prevent death. The tragedy of Darth Plagueis the Wise is that he was eventually betrayed and killed by his own apprentice, who was seeking to gain his power. This story plays a significant role in Anakin\'s fall to the dark side as he becomes intrigued by the idea of cheating death, which ultimately leads him to become Darth Vader.'}