## Set up environment

In [1]:
from dotenv import load_dotenv

# Read the .env file one directory above the notebook so the API keys it
# defines are available through the process environment in later cells.
load_dotenv(dotenv_path='../.env')

True

## Load reviews

In [2]:
import pandas as pd

# Load the previously saved reviews frame from the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load a pickle
# file that you produced yourself or otherwise trust.
reviews_df = pd.read_pickle(filepath_or_buffer='reviews_df.pkl')
reviews_df

Unnamed: 0,url,review,comments
0,https://www.jammersreviews.com/st-ds9/s1/emiss...,"""Emissary"" | Star Trek: DS9 | Jammer's Reviews...",
1,https://www.jammersreviews.com/st-ds9/s1/prolo...,"""Past Prologue"" | Star Trek: DS9 | Jammer's Re...",
2,https://www.jammersreviews.com/st-ds9/s1/alone...,"""A Man Alone"" | Star Trek: DS9 | Jammer's Revi...",
3,https://www.jammersreviews.com/st-ds9/s1/babel...,"""Babel"" | Star Trek: DS9 | Jammer's Reviews\n\...",
4,https://www.jammersreviews.com/st-ds9/s1/capti...,"""Captive Pursuit"" | Star Trek: DS9 | Jammer's ...",
...,...,...,...
171,https://www.jammersreviews.com/st-ds9/s7/tacki...,"""Tacking into the Wind"" | Star Trek: DS9 | Jam...",
172,https://www.jammersreviews.com/st-ds9/s7/extre...,"""Extreme Measures"" | Star Trek: DS9 | Jammer's...",
173,https://www.jammersreviews.com/st-ds9/s7/dogs.php,"""The Dogs of War"" | Star Trek: DS9 | Jammer's ...",
174,https://www.jammersreviews.com/st-ds9/s7/leave...,"""What You Leave Behind"" | Star Trek: DS9 | Jam...",


In [3]:
# from pandas import option_context

#with option_context('display.max_colwidth', None):
#    display(reviews_df.head())

In [4]:
## initial attempt to resolve request too big error was to limit the number of downloads
## to seasons 6 and 7.
## The proper solution was to reduce chunk size as per
## https://community.pinecone.io/t/i-am-getting-this-weird-error-does-anybody-know-why-this-is-happening-and-how-to-solve-it/3702

#"""
#import re
#
#def filter(val):
#    res = re.search(r'/s[6-7]/', val)
#    if res:
#        return True
#    else:
#        return False
#reviews_df = reviews_df[reviews_df['url'].apply(filter)]
#"""

## Create documents

In [5]:
from groq import Groq

from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain_pinecone import PineconeVectorStore
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Token-based splitter used to cut every review into small, slightly
# overlapping pieces before they are embedded.
text_splitter = TokenTextSplitter(
    # cutting this in half per https://community.pinecone.io/t/i-am-getting-this-weird-error-does-anybody-know-why-this-is-happening-and-how-to-solve-it/3702
    chunk_size=250,  # 500 tokens is the max
    # Overlap of N tokens between chunks (to reduce chance of cutting out
    # relevant connected text like middle of sentence)
    chunk_overlap=20,
)

# One Document per chunk; each chunk is prefixed with the URL of the episode
# review it came from so that the source travels with the text.
documents = []
for episode_url, review_text in zip(reviews_df['url'], reviews_df['review']):
    # The header is the same for every chunk of a review, so build it once.
    header = f"Episode URL: {episode_url}\n\n"
    documents.extend(
        Document(page_content=header + chunk, metadata={"source": "local"})
        for chunk in text_splitter.split_text(review_text)
    )

print('# Transcription Chunks: ', len(documents))

# Transcription Chunks:  24331


## Create embeddings and populate vector store

Prior to running this:
- get a Pinecone API key from pinecone.io
- add `PINECONE_API_KEY=<pinecone key>` to the `.env` file
- create an index named "ds9-documents" with dimension 384 at pinecone.io

In [6]:
# Sentence-transformer model used to embed every chunk; the Pinecone index
# named below must already exist (see the setup notes above this cell).
# NOTE(review): re-running this cell presumably embeds and uploads every
# document again -- confirm before re-executing against a populated index.
pinecone_index_name = "ds9-documents"
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
docsearch = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=embedding_function,
    index_name=pinecone_index_name,
)

  embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


## Let's try some queries

In [7]:
def transcript_chat_completion(client, transcript, user_question, model="llama3-8b-8192"):
    """Ask a chat model to answer a question from the supplied text and print the reply.

    Args:
        client: Object exposing ``chat.completions.create(messages=..., model=...)``;
            the notebook passes a ``Groq`` client.
        transcript: Context text inserted verbatim into the system prompt.
        user_question: Question sent as the user message.
        model: Model identifier forwarded to the API. Defaults to the value that
            was previously hard-coded, so existing three-argument calls behave
            exactly as before.

    Returns:
        None. The content of the first returned choice is printed.
    """
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                # str.format only interprets braces in the template itself, so
                # braces inside `transcript` are inserted untouched.
                "content": '''Use this transcript or transcripts to answer any user questions, citing specific quotes:

                {transcript}
                '''.format(transcript=transcript)
            },
            {
                "role": "user",
                "content": user_question,
            }
        ],
        model=model,
    )

    print(chat_completion.choices[0].message.content)

In [8]:
import os

# The Groq key is read from the process environment under GROQ_API_KEY;
# a missing variable yields None here rather than raising.
client = Groq(api_key=os.environ.get('GROQ_API_KEY'))
#model = 'llama-3.3-70b-versatile'

In [9]:
# Query settings: the question, how many retrieved chunks to hand to the
# model, and the separator placed between those chunks in the prompt.
user_question = "Based on the reviews, what are the reviewer's favorite episodes?"
num_docs = 3
delimiter =  '\n\n------------------------------------------------------\n\n'

# Retrieve the chunks most similar to the question, keep the first `num_docs`
# of them, and let the chat model answer from that context.
relevent_docs = docsearch.similarity_search(user_question)
relevant_transcripts = delimiter.join(
    doc.page_content for doc in relevent_docs[:num_docs]
)
transcript_chat_completion(client, relevant_transcripts, user_question)

According to the reviews, the reviewer's favorite episodes are:

1. "Far Beyond the Stars" - rated 4 out of 5
2. "In the Pale Moonlight" - rated 4 out of 5
3. "The Sound of Her Voice" - rated 3.5 out of 5

These episodes are highlighted as standouts by the reviewer, who mentions that "Far Beyond the Stars" is their favorite episode of the season.


In [10]:
# Query settings: the question, how many retrieved chunks to hand to the
# model, and the separator placed between those chunks in the prompt.
user_question = "Based on the reviews, what are the reviewer's favorite character in the series?"
num_docs = 3
delimiter =  '\n\n------------------------------------------------------\n\n'

# Retrieve the chunks most similar to the question, keep the first `num_docs`
# of them, and let the chat model answer from that context.
relevent_docs = docsearch.similarity_search(user_question)
relevant_transcripts = delimiter.join(
    doc.page_content for doc in relevent_docs[:num_docs]
)
transcript_chat_completion(client, relevant_transcripts, user_question)

Based on the reviews, it seems that Rom is a well-liked character. The reviewer "denchik" praises denchik's performance as Rom, calling him "wholly admirable" and saying that he is one of the greatest disappointments that the writers turned him into a parody of himself, making him one of the most annoying characters on the show.

It is also worth noting that Rom is not explicitly mentioned as a favorite character by other reviewers, but the fact that denchik praises Rom's performance so highly suggests that Rom may be a well-liked character in the series.
