# Star Wars Expert

In [1]:
from langchain_openai import ChatOpenAI#, OpenAIEmbeddings # No need to pay for using embeddings as well when have free alternatives
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings # The free alternative (also the default in docs, with model_name = 'all-MiniLM-L6-v2')
from langchain.text_splitter import RecursiveCharacterTextSplitter#, TextSplitter # Recursive to better keep related bits contiguous (also recommended in docs: https://python.langchain.com/docs/modules/data_connection/document_transformers/)

from langchain_community.document_loaders import DirectoryLoader, TextLoader, WebBaseLoader
# from langchain_chroma import Chroma # The documentation uses this one, but it is extremely recent, and the same functionality is available in langchain_community and langchain (which imports community)
from langchain_community.vectorstores import Chroma # This has documentation on-hover, while the indirect import through non-community does not

from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain_core.output_parsers import StrOutputParser

# To manually test pipelines
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.documents import Document

import os
import shutil
import re

# For Wookieepedia search
import requests
from bs4 import BeautifulSoup

import dotenv
dotenv.load_dotenv()

True

## Data Loaders
NOTE: when `REGENERATE_DATABASE` is `True`, running the cell below deletes the existing database directory before adding the data to a fresh database, since the content would otherwise be duplicated on every run.

### Film Scripts

In [2]:
# Comparison of vector dbs: https://zackproser.com/blog/vector-databases-compared
#   Opinion: Milvus (more features, bigger community, higher performance(?), fully free, no enterprise plans) > Weaviate > Chroma
#   However Milvus and Weaviate both require a separate instance to be up and running
#   (The documentation uses FAISS, but it seems unnecessarily limited in comparison)
#   Hence Chroma - https://python.langchain.com/docs/integrations/vectorstores/chroma/

# Separately, no need to pay for OpenAIEmbeddings; additionally, all-MiniLM-L6-v2 is default in docs

REGENERATE_DATABASE = False

# os.path.join yields the same 'scripts\db' on Windows, but also a valid path on POSIX systems
db_dir = os.path.join('scripts', 'db')
db_exists = os.path.exists(db_dir)

# Built once and shared by both the "load" and the "create" branch below
script_embeddings = SentenceTransformerEmbeddings(model_name = 'all-MiniLM-L6-v2')

if db_exists and not REGENERATE_DATABASE:
    # Reuse the database persisted by a previous run
    script_db = Chroma(embedding_function = script_embeddings, persist_directory = db_dir)
else:
    if db_exists:
        print('Deleting the previous database and creating a new one (because otherwise content is duplicated in the db every time this block is run)')
        shutil.rmtree(db_dir)

    scripts = DirectoryLoader('scripts', glob = '**/[!.]*.txt', loader_cls = TextLoader).load()
    # Strip tabs and runs of spaces: the scripts use them to centre text, which is just noise here
    for script in scripts: script.page_content = re.sub(r'\t+|[ ]{2,}', '', script.page_content)

    # Why not some overlap for extra context just in case?
    # Separately, chunk size of 500 seems small, while 1000 seems big upon manual inspection
    script_chunks = RecursiveCharacterTextSplitter(chunk_size = 800, chunk_overlap = 100).split_documents(scripts)

    script_db = Chroma.from_documents(script_chunks, script_embeddings, persist_directory = db_dir)

# Guard the mean against an empty database (e.g. no scripts matched the glob)
n_chunks = len(script_db)
mean_chunk_length = (sum(len(chunk) for chunk in script_db.get()["documents"]) / n_chunks) if n_chunks else 0
print(f'The script database contains {n_chunks} chunks, with mean length of {mean_chunk_length:.0f} characters')


The script database contains 1485 chunks, with mean length of 697 characters


In [3]:
# Query testing: fetch the ten chunks of the script database closest to a sample query

res = script_db.similarity_search(query = 'Luke father reveal', k = 10)

# for r in res: print(r.page_content)
res

[Document(page_content="LUKE (with sadness)\nI found out Darth Vader was my father.\n\nBEN\nTo be a Jedi, Luke, you must confront and then go beyond the dark side \n- the side your father couldn't get past. Impatience is the easiest \ndoor - for you, like your father. Only, your father was seduced by what \nhe found on the other side of the door, and you have held firm. You're \nno longer so reckless now, Luke. You are strong and patient. And now, \nyou must face Darth Vader again!\n\nLUKE\nI can't kill my own father.\n\nBEN\nThen the Emperor has already won. You were our only hope.\n\nLUKE\nYoda spoke of another.\n\nBEN\nThe other he spoke of is your twin sister.\n\nLUKE\nBut I have no sister.", metadata={'source': 'scripts\\Episode VI - Return of the Jedi.txt'}),
 Document(page_content="LUKE\nI can't do it, Artoo. I can't go on alone.\n\nBEN (OS)\nYoda will always be with you.\n\nLuke looks up to see the shimmering image of BEN KENOBI.\n\nLUKE\nObi-Wan! Why didn't you tell me?\n\nThe

### Wookieepedia Articles

In [33]:
REGENERATE_WOOKIEEPEDIA_DATABASE = False

db_dir = r'wookieepedia_db'
db_exists = os.path.exists(db_dir)

# Wipe the persisted database only when explicitly asked to and when there is something to wipe
if db_exists and REGENERATE_WOOKIEEPEDIA_DATABASE:
    print('Deleting the previous database and creating a new one (because otherwise content is duplicated in the db every time this block is run)')
    shutil.rmtree(db_dir)

# Opens the persisted database, or starts an empty one in db_dir
woo_db = Chroma(
    persist_directory = db_dir,
    embedding_function = SentenceTransformerEmbeddings(model_name = 'all-MiniLM-L6-v2'),
)

print('Current source pages in Wookieepedia db:')
{md.get('source') for md in woo_db.get()['metadatas']}


Current source pages in Wookieepedia db:


{'https://starwars.fandom.com/wiki/Darth_Plagueis'}

In [22]:

def first_wookieepedia_result(query: str, timeout: float = 10.0) -> str:
    '''Get the url of the first result when searching Wookieepedia for a query
    (best for simple names as queries, ideally generated by the llm for something like
    "Produce a query consisting of the name of the most important element in the query so that its article can be looked up")

    Args:
        query: Free-text search terms; they are URL-encoded before being sent.
        timeout: Seconds to wait for the search page before giving up.

    Returns:
        The href of the first search result link on the results page.

    Raises:
        requests.RequestException: if the request fails, times out or returns an error status.
        LookupError: if the results page contains no result link.
    '''
    search_results = requests.get(
        'https://starwars.fandom.com/wiki/Special:Search',
        params = {'query': query}, # Let requests encode the value: spaces become '+', and '&', '#', '?' etc. are escaped
        timeout = timeout,         # Never hang the notebook on an unresponsive server
    )
    search_results.raise_for_status()

    soup = BeautifulSoup(search_results.content, 'html.parser')
    first_res = soup.find('a', class_ = 'unified-search__result__link')

    # soup.find returns None when nothing matches; fail with a readable message instead of a TypeError
    if first_res is None or not first_res.get('href'):
        raise LookupError(f'No Wookieepedia search result found for {query!r}')

    return first_res['href']

# first_wookieepedia_result('Darth Plagueis')


def get_new_wookieepedia_chunks(query: str, previous_sources: set[str]) -> list[Document]:
    '''Load the first Wookieepedia search result for query and split its text into chunks.

    Returns an empty list when the url of that result is already in previous_sources,
    so that the same page is not loaded and chunked a second time.
    '''
    url = first_wookieepedia_result(query)
    if url in previous_sources:
        return []

    # Only one url passed in => only one Document out; no need to assert
    article = WebBaseLoader(url).load()[0]

    # There probably is a very long preamble before the real content, separated from it by this gap.
    # Drop the preamble only when the gap occurs exactly once; otherwise proceed with the full document
    preamble_gap = '\n\n\n\n\n\n\n\n\n\n\n\n\n\n \xa0 \xa0'
    parts = article.page_content.split(preamble_gap)
    if len(parts) == 2:
        body = parts[1]
    else:
        body = article.page_content

    # Collapse every run of two or more newlines/tabs into a single newline
    article.page_content = re.sub(r'[\n\t]{2,}', '\n', body)

    splitter = RecursiveCharacterTextSplitter(chunk_size = 800, chunk_overlap = 100)
    return splitter.split_documents([article])

# get_new_wookieepedia_chunks('Darth Plagueis', set())

In [30]:
def get_wookieepedia_context(original_query: str, simple_query: str, wdb: Chroma) -> list[Document]:
    '''Return up to 10 chunks from wdb that are most similar to original_query.

    Before searching, the first Wookieepedia result for simple_query is chunked and added to wdb,
    unless its url is already among the sources stored in wdb.

    Args:
        original_query: The full question, used for the similarity search.
        simple_query: Short search terms (e.g. a name) used to look up the article.
        wdb: The vector store that accumulates article chunks.

    Returns:
        The matching chunks, or an empty list if looking up or storing the article failed (best-effort).
    '''
    try:
        known_sources = {md.get('source') for md in wdb.get()['metadatas']}
        new_chunks = get_new_wookieepedia_chunks(simple_query, previous_sources = known_sources)
        if new_chunks: wdb.add_documents(new_chunks)
    except Exception as err: # Not a bare except: KeyboardInterrupt / SystemExit must still propagate
        # Stay best-effort (no context rather than a crash), but make the failure visible
        print(f'Wookieepedia lookup for {simple_query!r} failed ({type(err).__name__}: {err}); returning no context')
        return []

    return wdb.similarity_search(original_query, k = 10)

# Manual check: the full question drives the similarity search, the short name drives the article lookup
get_wookieepedia_context(
    original_query = 'Do you know the Tragedy of Darth Plagueis the Wise?',
    simple_query = 'Darth Plagueis',
    wdb = woo_db,
)

[Document(page_content='Plagueis\' death served as an example to the apprentice who betrayed and killed him; Sidious regarded his master\'s trust in him as a mistake, which he vowed to never make. During the Clone Wars, Sidious was aware that his second apprentice, Darth Tyranus, plotted to usurp his title of Sith Master by killing him. Sensing that Tyranus\' betrayal was imminent, Sidious successfully conspired to have his student killed by his intended replacement, the Jedi Knight and prophesied Chosen One Anakin Skywalker.[4]\nThe Tragedy of Darth Plagueis the Wise[]\n"Did you ever hear the Tragedy of Darth Plagueis the Wise?"\n―Sheev Palpatine, to Anakin Skywalker[3]\n     Anakin Skywalker learned about the late Darth Plagueis as recounted by Sheev Palpatine.', metadata={'description': "Darth Plagueis (pronounced /'pleɪɡ.əs/) was a Force-sensitive male Muun Dark Lord of the Sith and the Sith Master of Darth Sidious. Plagueis lusted for immortality, believing the secret laid in scie

## Core Chain
Standard chains: https://python.langchain.com/docs/modules/chains/#lcel-chains

In [11]:
# Chat model used by the chains in this notebook; temperature 0 to keep answers as repeatable as possible
llm = ChatOpenAI(
    model = 'gpt-3.5-turbo-0125',
    temperature = 0,
)

# llm.invoke('What do you know about Star Wars?')

In [8]:
# System instructions; {context} is filled with the documents handed to the chain
system_text = '''
You are very knowledgeable about Star Wars and your job is to answer questions about its plot and characters.
Use the context below to produce your answers with as much detail as possible.
If you do not know an answer, say so; do not make up information not in the context.

<context>
{context}
</context>
'''

# The user question variable is named `input` (not `query`) because LangChain's create_retrieval_chain and
# create_history_aware_retriever read the question from the 'input' key of the invocation dict;
# using the same name here lets a single dict(input = ...) drive a whole retrieval pipeline

# document_prompt = ChatPromptTemplate(
#     input_variables = ['context', 'chat_history', 'input'],
#     messages = [ # vv all arguments (i.e. only prompt in this case) are required to be named; rare use of this language feature
#         SystemMessagePromptTemplate(prompt = PromptTemplate(input_variables = ['context'], template = system_text)),
#         MessagesPlaceholder(variable_name = 'chat_history', optional = True),
#         HumanMessagePromptTemplate( prompt = PromptTemplate(input_variables = ['input'],   template = '{input}'))
#     ] # separately, could use ChatPromptTemplate.from_messages for generic roles, but the above core ones are good
# )

# Same as above but more concise
document_prompt = ChatPromptTemplate.from_messages([
    ('system', system_text),
    MessagesPlaceholder(variable_name = 'chat_history', optional = True), # Optional: the chain also works without history
    ('user', '{input}')
])

# Stuffs the documents passed as `context` into the prompt and sends it to the llm
document_chain = create_stuff_documents_chain(llm, document_prompt)


# document_prompt.format_messages(context = 'You are an expert in Star Wars lore', input = 'Are you knowledgeable about Star Wars?')
# document_chain.invoke(dict(context = [Document(page_content = 'You are an expert in Star Wars lore')], input = 'Are you knowledgeable about Star Wars?'))


# basic_chain = document_prompt | llm | StrOutputParser() # To extract just the message
# basic_chain.invoke(dict(context = 'You are an expert of Star Wars lore', input = 'Are you knowledgeable about Star Wars?'))

[SystemMessage(content='\nYou are very knowledgeable about Star Wars and your job is to answer questions about its plot and characters.\nUse the context below to produce your answers with as much detail as possible.\nIf you do not know an answer, say so; do not make up information not in the context.\n\n<context>\nYou are an expert of Star Wars lore\n</context>\n'),
 HumanMessage(content='Are you knowledgeable about Star Wars?')]

In [19]:
# create_history_aware_retriever raises a ValueError unless the prompt has an `input` variable, and it
# forwards inputs['input'] straight to the retriever when there is no chat history;
# hence the user question must be called `input` here (it cannot be `query`)

# retriever_prompt = ChatPromptTemplate(
#     input_variables = ['chat_history', 'input'],
#     messages = [
#         MessagesPlaceholder(variable_name = 'chat_history'),
#         HumanMessagePromptTemplate(prompt = PromptTemplate(input_variables = ['input'], template = '{input}')),
#         HumanMessagePromptTemplate(prompt = PromptTemplate(input_variables = [], template = 'Given the above conversation, generate a search query to look up information relevant to the conversation'))
#     ]
# )

# Same as above but more concise
retriever_prompt = ChatPromptTemplate.from_messages([
    MessagesPlaceholder(variable_name = 'chat_history'),
    ('user', '{input}'),
    ('user', 'Given the above conversation, generate a search query to look up information relevant to the conversation')
])

# With chat history: the llm rewrites the conversation into a search query for the script database;
# without chat history: the question is passed to the script database unchanged
retriever_chain = create_history_aware_retriever(llm, script_db.as_retriever(), retriever_prompt)


# retriever_prompt.format_messages(
#     chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
#     input = 'Do you know the tragedy of Darth Plagueis the Wise?'
# )

# retriever_chain.invoke(dict(
#     chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
#     input = 'Do you know the tragedy of Darth Plagueis the Wise?'
# ))


[HumanMessage(content='Are you knowledgeable about Star Wars?'),
 AIMessage(content='Very'),
 HumanMessage(content='Do you know the tragedy of Darth Plagueis the Wise?'),
 HumanMessage(content='Given the above conversation, generate a search query to look up information relevant to the conversation')]

In [None]:
# Retrieval feeds the answering step: the documents found by retriever_chain become the context of document_chain
full_chain = create_retrieval_chain(
    retriever = retriever_chain,
    combine_docs_chain = document_chain,
)

# NOTE(review): the same invocation dict is handed to both sub-chains, so it must contain every variable
# their prompts reference; the history-aware retriever reads the question from the 'input' key (LangChain convention)

# question = 'Do you know the tragedy of Darth Plagueis the Wise?'
# full_chain.invoke(dict(
#     # chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
#     input = question, # Read by the history-aware retriever
#     query = question, # Only needed while a prompt still references a {query} variable
# ))