In [1]:
# Smoke test: confirm the cell executes and stdout is wired up.
print("Ok")

Ok


In [8]:
import os
from dotenv import load_dotenv

from langchain_groq import ChatGroq
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import CSVLoader
import langsmith

# ---------- Load Env ----------
# Read variables from a local .env file (if any) into the process environment,
# then pick up the Groq API key; the value is None when the variable is unset.
load_dotenv()
groq_api_key = os.environ.get("GROQ_API_KEY")

# ---------- STEP 1: Load CSV ----------
file_path = "sentiment-analysis.csv"

# Columns handed to the loader as per-document metadata.
_metadata_columns = [
    "Sentiment",
    "Source",
    "Date/Time",
    "User ID",
    "Location",
    "Confidence Score",
]

# NOTE(review): `source_column` appears to control the document's `source`
# metadata rather than which column becomes the content -- confirm that using
# the "Text" column here is intended.
loader = CSVLoader(
    file_path=file_path,
    source_column="Text",
    metadata_columns=_metadata_columns,
)
data = loader.load()

# ---------- STEP 2: Build FAISS DB for RAG ----------
def prepare_csv(
    docs,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    model_name: str = "BAAI/bge-large-en-v1.5",
):
    """Split documents into chunks, embed them, and index them in FAISS.

    Args:
        docs: Iterable of documents to index (e.g. the list returned by
            ``CSVLoader.load()``).
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.
        model_name: FastEmbed model name used to embed the chunks.

    Returns:
        The FAISS vector store built from the embedded chunks.

    Raises:
        ValueError: If ``docs`` is empty or splitting produces no chunks.
            Failing here gives a clear message instead of an obscure error
            from inside the index construction.
    """
    # Materialize once so generators can be both checked and consumed.
    docs = list(docs)
    if not docs:
        raise ValueError("prepare_csv() requires at least one document to index")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents(docs)
    if not chunks:
        raise ValueError("prepare_csv() produced no chunks from the given documents")

    embeddings = FastEmbedEmbeddings(model_name=model_name)
    return FAISS.from_documents(chunks, embeddings)

# Build the vector store once at module level; rag_tool() reads this global
# on every call instead of re-indexing the CSV.
db_csv = prepare_csv(data)

# ---------- STEP 3: Define RAG Function ----------
def rag_tool(query: str, k: int = 5):
    """Answer a question about customer sentiment from retrieved reviews.

    Retrieves the ``k`` most similar chunks from the module-level ``db_csv``
    store, places their text in a prompt, and asks the Groq chat model to
    answer ``query`` from that context.

    NOTE(review): only ``page_content`` is sent to the model. Columns the
    loader routed to metadata (e.g. ``Sentiment``, ``Source``) are therefore
    not visible to it, and only ``k`` chunks are ever considered, so
    dataset-wide questions about those fields cannot be answered from this
    context -- confirm this is intended.

    Args:
        query: The user's question. Must contain non-whitespace text.
        k: Number of chunks to retrieve for the prompt context.

    Returns:
        The ``content`` of the model's response, or the fixed
        "not enough information" sentence when nothing was retrieved.

    Raises:
        ValueError: If ``query`` is not a non-blank string or ``k`` < 1.
    """
    # Reject unusable input before spending an embedding + LLM call on it.
    if not isinstance(query, str) or not query.strip():
        raise ValueError("query must be a non-empty string")
    if k < 1:
        raise ValueError("k must be a positive integer")

    output_retrieval = db_csv.similarity_search(query, k=k)
    if not output_retrieval:
        # No context to ground an answer: reply the way the prompt would
        # instruct the model to, without calling it.
        return "I don't have enough information to answer that question."

    # Extract just the page content without metadata for the prompt
    retrieved_texts = [doc.page_content for doc in output_retrieval]
    # Use a separator to distinguish between different documents
    context_for_llm = "\n---\n".join(retrieved_texts)

    prompt = f"""
    You are an AI assistant tasked with answering questions about customer sentiment.
    Based on the following reviews, answer the user's question.
    
    Reviews context:
    {context_for_llm}

    Question:
    {query}

    Guidelines for your answer:
    - Synthesize the information from the reviews to provide a concise, natural-sounding answer.
    - Do not mention or include any of the metadata like 'Sentiment', 'Source', 'Date', etc.
    - If the reviews don't contain the answer, say "I don't have enough information to answer that question."
    """

    # temperature=0 is requested to keep answers as repeatable as possible.
    llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0, api_key=groq_api_key)
    response = llm.invoke(prompt)
    return response.content

In [12]:
# ---------- STEP 4: Direct Usage ----------
# Demo: run one sample question through the RAG pipeline and print the reply.
if __name__ == "__main__":
    query = "Which sources have the highest proportion of positive reviews?"
    answer = rag_tool(query)
    print(f"Answer: {answer}")

Answer: Based on the reviews, it appears that Goodreads and IMDb have the most positive reviews. However, to determine which sources have the highest proportion of positive reviews, we need to consider the total number of reviews for each source.

Unfortunately, the provided reviews do not contain information about the total number of reviews for each source. Therefore, I don't have enough information to accurately determine which sources have the highest proportion of positive reviews.
