## Load reviews

In [29]:
import pandas as pd
# Load the reviews table that was previously saved to disk as a pickle.
# NOTE(review): unpickling can execute arbitrary code -- only load a pickle
# file that you created yourself or otherwise trust.
reviews_df = pd.read_pickle('reviews_df.pkl')
# Bare last expression so the notebook renders the frame as the cell output.
reviews_df

Unnamed: 0,url,review,comments
0,https://www.jammersreviews.com/st-ds9/s1/emiss...,"""Emissary"" | Star Trek: DS9 | Jammer's Reviews...",
1,https://www.jammersreviews.com/st-ds9/s1/prolo...,"""Past Prologue"" | Star Trek: DS9 | Jammer's Re...",
2,https://www.jammersreviews.com/st-ds9/s1/alone...,"""A Man Alone"" | Star Trek: DS9 | Jammer's Revi...",
3,https://www.jammersreviews.com/st-ds9/s1/babel...,"""Babel"" | Star Trek: DS9 | Jammer's Reviews\n\...",
4,https://www.jammersreviews.com/st-ds9/s1/capti...,"""Captive Pursuit"" | Star Trek: DS9 | Jammer's ...",
...,...,...,...
171,https://www.jammersreviews.com/st-ds9/s7/tacki...,"""Tacking into the Wind"" | Star Trek: DS9 | Jam...",
172,https://www.jammersreviews.com/st-ds9/s7/extre...,"""Extreme Measures"" | Star Trek: DS9 | Jammer's...",
173,https://www.jammersreviews.com/st-ds9/s7/dogs.php,"""The Dogs of War"" | Star Trek: DS9 | Jammer's ...",
174,https://www.jammersreviews.com/st-ds9/s7/leave...,"""What You Leave Behind"" | Star Trek: DS9 | Jam...",


from pandas import option_context

# Temporarily remove the column-width limit (restored when the `with` block
# exits) so the first rows are shown with their full, untruncated text.
with option_context('display.max_colwidth', None):
    display(reviews_df.head())

## Initial attempt to resolve the "request too big" error: keep only the rows
## whose URL contains /s6/ or /s7/ (presumably to shrink the upload).
## The actual solution was to reduce the chunk size, per
## https://community.pinecone.io/t/i-am-getting-this-weird-error-does-anybody-know-why-this-is-happening-and-how-to-solve-it/3702
## The attempt is kept below for reference only; it is wrapped in a string
## literal, so none of it executes.

"""
import re

def filter(val):
    res = re.search(r'/s[6-7]/', val)
    if res:
        return True
    else:
        return False
reviews_df = reviews_df[reviews_df['url'].apply(filter)]
"""

## Create documents

In [31]:
from groq import Groq

from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain_pinecone import PineconeVectorStore
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Token-based splitter that cuts each review into overlapping chunks.
text_splitter = TokenTextSplitter(
    # Chunk size was cut in half per
    # https://community.pinecone.io/t/i-am-getting-this-weird-error-does-anybody-know-why-this-is-happening-and-how-to-solve-it/3702
    chunk_size=250,    # original note: 500 tokens is the max
    chunk_overlap=20,  # tokens shared by neighbouring chunks, to reduce the chance of cutting connected text (e.g. mid-sentence)
)

# One Document per chunk. Every chunk is prefixed with the URL of the episode
# review it came from, so that text travels with the chunk.
documents = [
    Document(
        page_content=f"Episode URL: {episode_url}\n\n" + piece,
        metadata={"source": "local"},
    )
    for episode_url, review_body in zip(reviews_df['url'], reviews_df['review'])
    for piece in text_splitter.split_text(review_body)
]

print('# Transcription Chunks: ', len(documents))

# Transcription Chunks:  24331


## Create embeddings and populate vector store

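Prior to running this cell:
- get a Pinecone API key from pinecone.io
- `export PINECONE_API_KEY=<pinecone key>` in the local environment
- create an index named "ds9-documents" with dimension 384 at pinecone.io (the name must match `pinecone_index_name` in the cell below; 384 is the embedding size of the all-MiniLM-L6-v2 model)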

In [32]:
# Embedding model handed to the vector store.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
pinecone_index_name = "ds9-documents"

# from_documents() embeds and uploads every chunk each time this cell runs, so
# a Restart & Run All repeats the whole (slow) upload and, unless explicit ids
# are supplied, is expected to add the same chunks to the index again
# (NOTE(review): confirm against the installed langchain_pinecone version).
# Leave POPULATE_INDEX = True for the first run; set it to False afterwards to
# attach to the vectors that are already stored in the index.
POPULATE_INDEX = True

if POPULATE_INDEX:
    docsearch = PineconeVectorStore.from_documents(documents, embedding_function, index_name=pinecone_index_name)
else:
    docsearch = PineconeVectorStore.from_existing_index(index_name=pinecone_index_name, embedding=embedding_function)

## Let's try some queries

In [33]:
def transcript_chat_completion(client, transcript, user_question, model="llama3-8b-8192"):
    """Answer ``user_question`` from ``transcript`` with a chat model and print the reply.

    Args:
        client: Object exposing ``chat.completions.create(messages=..., model=...)``;
            the notebook passes a ``Groq`` client.
        transcript: Text inserted into the system prompt as the material to
            answer from.
        user_question: Question sent as the user message.
        model: Model identifier forwarded to the API. Defaults to the value
            that used to be hard-coded, so existing three-argument calls
            behave exactly as before.

    Returns:
        None. The content of the first returned choice is printed.
    """
    # The transcript is passed as a format *argument*, so braces inside the
    # transcript text itself are inserted literally.
    system_prompt = '''Use this transcript or transcripts to answer any user questions, citing specific quotes:

                {transcript}
                '''.format(transcript=transcript)

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_question},
        ],
        model=model,
    )

    print(chat_completion.choices[0].message.content)

In [34]:
import os

# Groq API client used by the query cells below; the key is read from the
# GROQ_API_KEY environment variable (None is passed if it is not set).
client = Groq(api_key=os.environ.get('GROQ_API_KEY'))
#model = 'llama-3.3-70b-versatile'

In [35]:
user_question = "Based on the reviews, what are the reviewer's favorite episodes?"
delimiter =  '\n\n------------------------------------------------------\n\n'
num_docs = 3
# Ask the vector store for exactly num_docs matches instead of fetching its
# default number of hits and slicing the list afterwards.
relevant_docs = docsearch.similarity_search(user_question, k=num_docs)
# Join the retrieved chunks into one context string, separated by a visible rule.
relevant_transcripts = delimiter.join(doc.page_content for doc in relevant_docs)
transcript_chat_completion(client, relevant_transcripts, user_question)

According to the reviews, the reviewer's favorite episodes are:

* "Far Beyond the Stars" (Season 6, Episode 13) - Rated 4 out of 4
* "In the Pale Moonlight" (Season 6, Episode 19) - Rated 4 out of 4

Additionally, the reviewer mentions that "Duet" (Season 1, Episode 19) is one of their favorite episodes of the whole series. However, they only provide a link to their review and do not give a specific rating for this episode.


In [36]:
user_question = "Based on the reviews, what are the reviewer's favorite character in the series?"
delimiter =  '\n\n------------------------------------------------------\n\n'
num_docs = 3
# Ask the vector store for exactly num_docs matches instead of fetching its
# default number of hits and slicing the list afterwards.
relevant_docs = docsearch.similarity_search(user_question, k=num_docs)
# Join the retrieved chunks into one context string, separated by a visible rule.
relevant_transcripts = delimiter.join(doc.page_content for doc in relevant_docs)
transcript_chat_completion(client, relevant_transcripts, user_question)

Based on the reviews, it appears that the reviewers' favorite character in the series is Odo. One of the reviewers mentions Odo and Quark as a relationship they enjoy watching, and another reviewer lists Odo as a favorite character alongside Garak, Damar, and Vic. The reviewers also praise the character development of Odo and the other secondary characters, which suggests that they have a high opinion of the character.
