In [16]:
import json
import os
import re
from typing import Any, List

import faiss
import numpy as np
import pandas as pd
from dateutil import parser
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import Field
from sentence_transformers import SentenceTransformer

from langchain.chains import LLMChain
from langchain.chains import RetrievalQA
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chat_models import ChatOpenAI
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS

from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever

# RAG

## 1. Load Data

Load the daily summary data.

In [2]:
# Read the daily summary table from disk, then coerce its 'date' column to
# pandas datetimes so it can be compared against parsed timestamps.
unified_df = pd.read_parquet("unified_dataset.parquet")
unified_df = unified_df.assign(date=pd.to_datetime(unified_df["date"]))

Load the FAISS index and metadata from disk.

In [3]:
# Embedding models (stated to be the same model the index was built with).
# They are needed to embed incoming queries, not to recompute stored vectors.
#   - embedding_model: LangChain wrapper handed to the FAISS vector store below.
#   - semantic_model:  plain SentenceTransformer handle for the same model name.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# Load the prebuilt FAISS index from disk
faiss_index = faiss.read_index("faiss_index.index")

# Load the per-vector metadata. The encoding is pinned to UTF-8 so the file is
# decoded the same way on every platform instead of using the locale default.
with open("faiss_metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)

# Wrap each metadata record in a LangChain Document. Position i in the metadata
# list is mapped to docstore id str(i).
# NOTE(review): this assumes metadata order matches the FAISS vector order — confirm
# against the code that built the index.
docs = []
docstore_dict = {}
index_to_docstore_id = {}

for i, md in enumerate(metadata):
    # Records may carry their text under either key; fall back to empty text.
    text = md.get("text_chunk") or md.get("content") or ""
    doc = Document(page_content=text, metadata=md)
    doc_id = str(i)
    docs.append(doc)
    docstore_dict[doc_id] = doc
    index_to_docstore_id[i] = doc_id

# Build the in-memory docstore backing the vector store
docstore = InMemoryDocstore(docstore_dict)

# Reassemble the LangChain FAISS vector store around the preloaded index
faiss_store = FAISS(
    index=faiss_index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
    embedding_function=embedding_model
)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


## 2. Create a Retriever

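Build a custom retriever that, given a user query, returns a structured metrics summary for the date mentioned in the question (if any) together with the most relevant document chunks found by semantic search over the FAISS index.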

In [4]:
def get_structured_summary_for_date(df, target_date_str):
    """
    Build a plain-text summary of the structured metrics recorded for one date.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'date' column plus the metric columns read below
        (stock_open, stock_close, stock_volume, num_reviews,
        avg_playtime_hours, percent_positive, num_reddit_posts,
        avg_reddit_score, num_reddit_comments, num_news_articles).
    target_date_str : str
        A date in any form `pd.to_datetime` can parse.

    Returns
    -------
    str
        - "Unable to parse date from input." if the input is not a usable date.
        - "No structured metrics found for <date>." if no row matches.
        - Otherwise a multi-line summary built from the first matching row.
    """
    try:
        target_date = pd.to_datetime(target_date_str).date()
    except (ValueError, TypeError, OverflowError, AttributeError):
        # Unparseable text, None (pd.to_datetime returns None) or non-scalar input.
        return "Unable to parse date from input."

    # NaN/NaT inputs parse "successfully" to NaT; treat them as unparseable too.
    if pd.isna(target_date):
        return "Unable to parse date from input."

    # Compare on the calendar day so rows stamped with a time of day still match.
    row_days = pd.to_datetime(df['date']).dt.normalize()
    matched_row = df[row_days == pd.Timestamp(target_date)]

    if matched_row.empty:
        return f"No structured metrics found for {target_date}."

    # If several rows share the date, only the first one is reported.
    row = matched_row.iloc[0]
    summary = f"Structured Metrics on {target_date}:\n"
    summary += f" - Stock: Open = {row['stock_open']}, Close = {row['stock_close']}, Volume = {row['stock_volume']}\n"
    # percent_positive is multiplied by 100 before being shown as a percentage
    summary += f" - Reviews: {row['num_reviews']} reviews, Avg Playtime = {row['avg_playtime_hours']:.2f} hrs, % Positive = {row['percent_positive']*100:.1f}%\n"
    summary += f" - Reddit: {row['num_reddit_posts']} posts (avg score = {row['avg_reddit_score']}), {row['num_reddit_comments']} comments\n"
    summary += f" - News: {row['num_news_articles']} articles\n"
    return summary


def semantic_search(query, k=5):
    """
    Run a nearest-neighbour search over the FAISS index for a text query.

    Steps:
      - Embed the query with the module-level `semantic_model`
      - Search the module-level `faiss_index` for the k nearest vectors
      - Pair each hit with its record from the module-level `metadata` list

    Parameters
    ----------
    query : str
        Text to embed and search for.
    k : int, default 5
        Maximum number of neighbours to request.

    Returns
    -------
    list[dict]
        One dict per hit, in the order the index returned them: a copy of the
        metadata record plus a "distance" key holding the index distance as a
        Python float. May contain fewer than k entries.
    """
    # Shape the embedding as a single-row float32 matrix for the index search.
    query_embedding = semantic_model.encode(query).astype("float32").reshape(1, -1)

    distances, indices = faiss_index.search(query_embedding, k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        # FAISS pads the result with index -1 when it finds fewer than k
        # neighbours; indexing metadata with -1 would return the last record.
        if idx < 0:
            continue
        # Copy so the per-query distance is not written into the shared metadata.
        record = dict(metadata[idx])
        record["distance"] = float(dist)
        results.append(record)

    return results


def extract_date_from_question(question):
    """
    Extract the first parseable date mentioned in a user question.

    Two textual shapes are recognised by the regex: ISO-style `YYYY-MM-DD`
    and `<word> D, YYYY` (e.g. "April 02, 2025"). Candidates are tried in the
    order they appear; the first one dateutil can parse wins.

    Parameters
    ----------
    question : str
        Free-text question. Empty or missing input yields None.

    Returns
    -------
    str or None
        The date as `YYYY-MM-DD`, or None when no candidate parses.
    """
    if not question:
        return None

    date_candidates = re.findall(r"\b\d{4}-\d{2}-\d{2}\b|\b\w+ \d{1,2}, \d{4}\b", question)
    for candidate in date_candidates:
        try:
            return str(parser.parse(candidate).date())
        except (ValueError, OverflowError):
            # Looked like a date but is not one (e.g. month 13); try the next match.
            continue
    return None


In [5]:
class CombinedRetriever(BaseRetriever):
    """
    Retriever that returns one structured-metrics document followed by the
    top-k semantic search hits for a query.

    The first document summarises the row of `df` for the date mentioned in
    the query; when the query contains no date, the most recent date in `df`
    is summarised instead. The remaining documents wrap the records returned
    by `semantic_search`.

    Retrieval is implemented through the `_get_relevant_documents` hook rather
    than by overriding the public `get_relevant_documents` method.
    """

    # Structured DataFrame; typed as Any (not the builtin `any` function).
    df: Any = Field(...)
    # Number of neighbours requested from semantic search.
    k: int = Field(default=5)

    def _latest_structured_summary(self) -> str:
        """Summarise the most recent date present in `df`."""
        # NOTE(review): the original called `get_latest_structured_summary`, which is
        # not defined in the visible cells; the fallback is computed here instead.
        if self.df is None or len(self.df) == 0:
            return "No structured metrics available."
        latest = pd.to_datetime(self.df['date']).max()
        if pd.isna(latest):
            return "No structured metrics available."
        return get_structured_summary_for_date(self.df, str(latest.date()))

    def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
        """
        Return the structured summary document followed by the semantic hits.

        `run_manager` is accepted for the retriever callback machinery and is
        not used here.
        """
        # Structured part: summary for the date in the question, else the latest one.
        date_str = extract_date_from_question(query)
        if date_str:
            structured_summary = get_structured_summary_for_date(self.df, date_str)
        else:
            structured_summary = self._latest_structured_summary()
        structured_doc = Document(
            page_content=structured_summary,
            metadata={"source": "structured_summary"}
        )

        # Unstructured part: nearest chunks from the FAISS index. Text is read
        # from "text_chunk", then "content", then falls back to empty text.
        faiss_results = semantic_search(query, self.k)
        unstructured_docs = [
            Document(
                page_content=r.get("text_chunk") or r.get("content") or "",
                metadata=r,
            )
            for r in faiss_results
        ]

        return [structured_doc] + unstructured_docs

    async def _aget_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
        """Async retrieval is intentionally unsupported for now."""
        raise NotImplementedError("Async not implemented yet")

  class CombinedRetriever(BaseRetriever):
  class CombinedRetriever(BaseRetriever):
  warn(


## 3. Define the Prompt Template
Build a prompt template that instructs the LLM to use the retrieved context to answer the question. You can customize this prompt to include guidelines, a fixed format, or even further instructions (such as including metadata details).

In [6]:
from langchain.prompts import PromptTemplate

# Prompt text with two placeholders: {context} receives the retrieved
# documents and {question} receives the user's query.
_rag_template_text = """
You are a helpful Ubisoft data assistant. 
Use the following CONTEXT to answer the question below. 
If you do not find sufficient information from CONTEXT, say you don't know.

CONTEXT:
{context}

Question: {question}

Answer:"""

rag_prompt = PromptTemplate(
    template=_rag_template_text,
    input_variables=["context", "question"],
)

## 4. Initialise the LLM
Set up a large language model that will generate answers based on the context provided.

In [None]:
# Populate the process environment from a local .env file, then read the key.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Chat model used for answer generation; temperature 0 keeps sampling
# randomness to a minimum.
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)

## 5. Assemble the RetrievalQA Chain
Use LangChain’s built-in RetrievalQA chain (or a custom chain) to combine retrieval and generation. This chain will take a user query, retrieve relevant documents, inject them into the prompt, and then pass the prompt to the LLM for the final answer.

In [18]:
# Instantiate the custom retriever over the structured table, asking for the
# 5 nearest chunks per query.
combined_retriever = CombinedRetriever(k=5, df=unified_df)

# Wire the retriever and the LLM into a RetrievalQA chain.
# "stuff" places all retrieved documents into a single prompt, which is fine
# as long as the combined context stays under the model's token limit.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=combined_retriever,
    chain_type_kwargs={"prompt": rag_prompt},
    return_source_documents=True,
)

In [19]:
query = "What was the stock volume and review sentiment on April 02, 2025?"
# Calling the chain object directly is deprecated; invoke() is the supported
# entry point and returns the same dict ("result" and "source_documents").
response = qa_chain.invoke({"query": query})

print("Answer:\n", response["result"])
print("\nSource Documents:\n")
for doc in response["source_documents"]:
    # The structured summary carries an explicit "source"; label anything
    # without one as unstructured. Only the first 200 characters are shown.
    source = doc.metadata.get('source', 'unstructured')
    print(f"- Source: {source} -> {doc.page_content[:200]}...\n")


Answer:
 On April 02, 2025, the stock volume was 524,163.0, and the review sentiment was 85.3% positive based on 143 reviews.

Source Documents:

- Source: structured_summary -> Structured Metrics on 2025-04-02:
 - Stock: Open = 10.694999694824219, Close = 10.704999923706055, Volume = 524163.0
 - Reviews: 143.0 reviews, Avg Playtime = 41.54 hrs, % Positive = 85.3%
 - Reddit: ...

- Source: steam_review -> so far i'm having a great time. not sure how much stock you should put into any bad reviews because barely half the people on steam have even finished the prologue as of 3/21. game is running great on...

- Source: steam_review -> ubisoft everyone would be losing their minds. what a good reminder for all of us to not put so much stock into pre-release reviews. they are trying to make money, too....

- Source: reddit_comment -> you think ubisoft’s stock was stellar before march the 20th and ac shadows crashed it?...

- Source: reddit_comment -> this is the most braindead take i’ve read