In [4]:
import os
from dotenv import load_dotenv
import re
import json
import faiss
import pandas as pd
from dateutil import parser
from sentence_transformers import SentenceTransformer
from langchain.chat_models import ChatOpenAI
from langchain.tools import tool
from langchain.agents import initialize_agent, AgentType

# RAG

## 1. Load Data & Model

In [14]:
# Load unified DataFrame (structured data)
unified_df = pd.read_parquet("unified_dataset.parquet")
unified_df['date'] = pd.to_datetime(unified_df['date'])

# Load FAISS index and metadata
faiss_index = faiss.read_index("faiss_index.index")
with open("faiss_metadata.json", "r") as f:
    metadata = json.load(f)

# Instantiate the SentenceTransformer for semantic search
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

## 2. Define Data Retrieval Functions

In [16]:
def get_structured_summary_for_date(df, target_date_str):
    """
    Return structured summary for a specific date if available.
    """
    try:
        target_date = pd.to_datetime(target_date_str).date()
    except Exception as e:
        return f"Unable to parse date from input: {e}"
    
    matched_row = df[df['date'] == pd.to_datetime(target_date)]
    if matched_row.empty:
        return f"No structured metrics found for {target_date}."
    
    row = matched_row.iloc[0]
    summary = f"Structured Metrics on {target_date}:\n"
    summary += f" - Stock: Open = {row['stock_open']}, Close = {row['stock_close']}, Volume = {row['stock_volume']}\n"
    summary += f" - Reviews: {row['num_reviews']} reviews, Avg Playtime = {row['avg_playtime_hours']:.2f} hrs\n"
    summary += f" - Reddit: {row['num_reddit_posts']} posts (avg score = {row['avg_reddit_score']}), {row['num_reddit_comments']} comments\n"
    summary += f" - News: {row['num_news_articles']} articles\n"
    return summary

def semantic_search(query, k=5):
    """
    Given a text query, this function:
      - Embeds the query using SentenceTransformer
      - Searches the FAISS index
      - Returns the top-k chunks along with distances and metadata
    """
    query_embedding = semantic_model.encode(query).astype("float32").reshape(1, -1)
    distances, indices = faiss_index.search(query_embedding, k)
    
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        record = metadata[idx]
        record["distance"] = float(dist)
        results.append(record)
    
    return results

def extract_date_from_question(question):
    """
    Extract the first valid date string from the user question using regex and dateutil.
    """
    date_candidates = re.findall(r"\b\d{4}-\d{2}-\d{2}\b|\b\w+ \d{1,2}, \d{4}\b", question)
    for d in date_candidates:
        try:
            parsed = parser.parse(d)
            return str(parsed.date())
        except Exception:
            continue
    return None



## 3. Define Tool Functions

In [None]:
@tool
def retrieve_structured_data(target_date: str) -> str:
    """
    Returns a structured data summary for the specified date.
    Input target_date should be in a format like 'YYYY-MM-DD'.
    """
    return get_structured_summary_for_date(unified_df, target_date)

@tool
def perform_semantic_search(query: str) -> str:
    """
    Returns a concatenation of the top text chunks from the unstructured data 
    that are relevant to the input query.
    """
    results = semantic_search(query, k=5)
    texts = [r.get("text_chunk", r.get("content", "")) for r in results]
    return "\n\n".join(texts)

@tool
def extract_date(query: str) -> str:
    """
    Extracts and returns the first valid date found in the query.
    """
    date_str = extract_date_from_question(query)
    if date_str:
        return date_str
    else:
        return "No valid date found in the query."

## 4. Initialise LLM and Agent

In [None]:
# Load OpenAI API key from .env
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Initialise the LLM
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY, 
    temperature=0.5,
    model_name="gpt-4o-mini"
)

# List of tools available to the agent
tools = [retrieve_structured_data, perform_semantic_search, extract_date]

# Initialise the agent using the Zero-Shot React Description method
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)

  llm = ChatOpenAI(
  agent = initialize_agent(


## 5. Example Queries

In [18]:
query = (
    "What was the stock volume on April 02, 2025 and how did unstructured sentiment look?"
)

# Run the agent
result = agent.run(query)
print("Agent's Answer:")
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo answer the question, I need to retrieve structured data for the stock volume on April 02, 2025. Additionally, I need to perform a semantic search to understand the unstructured sentiment around that date. 

Action: retrieve_structured_data  
Action Input: '2025-04-02'  [0m
Observation: [36;1m[1;3mStructured Metrics on 2025-04-02:
 - Stock: Open = 10.694999694824219, Close = 10.704999923706055, Volume = 524163.0
 - Reviews: 143.0 reviews, Avg Playtime = 41.54 hrs
 - Reddit: 271.0 posts (avg score = 32.313653136531364), 2287.0 comments
 - News: 1.0 articles
[0m
Thought:[32;1m[1;3mI have obtained the structured data for April 02, 2025, which indicates that the stock volume was 524,163. Additionally, I need to gather the unstructured sentiment data related to that date. 

Action: perform_semantic_search  
Action Input: "sentiment on April 02, 2025"  [0m
Observation: [33;1m[1;3m2025 goty

early 2025 goty contender.

on

In [9]:
query = (
    "How did Ubisoft's stock price and trading volume react in the week following the release "
    "of Assassin’s Creed Shadows on March 27, 2025?"
)

# Run the agent
result = agent.run(query)
print("Agent's Answer:")
print(result)

  result = agent.run(query)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to find the structured data related to Ubisoft's stock price and trading volume around the date of the release of Assassin’s Creed Shadows, which is March 27, 2025. I'll retrieve the structured data for that date and the week following it to analyze the changes. 

Action: retrieve_structured_data  
Action Input: '2025-03-27'  [0m
Observation: [36;1m[1;3mStructured Metrics on 2025-03-27:
 - Stock: Open = 13.0, Close = 12.920000076293945, Volume = 763231.0
 - Reviews: 324.0 reviews, Avg Playtime = 39.20 hrs, % Positive = 81.8%
 - Reddit: nan posts (avg score = nan), 22.0 comments
 - News: 1.0 articles
[0m
Thought:[32;1m[1;3mI have the structured data for Ubisoft's stock price and trading volume on the release date of March 27, 2025. Now, I need to retrieve the structured data for the week following this date to see how the stock price and trading volume changed.

Action: retrieve_structured_data  
Action Input: '20

In [15]:
query = (
    "What does the data suggest about investor sentiment toward Ubisoft compared to player sentiment on March 31, 2025?"
)

# Run the agent
result = agent.run(query)
print("Agent's Answer:")
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to gather structured data from March 31, 2025, to analyze investor sentiment toward Ubisoft and compare it with player sentiment. 
Action: retrieve_structured_data
Action Input: '2025-03-31'[0m

KeyError: 'percent_positive'

In [12]:
query = (
    "On which day in early April 2025 did Ubisoft experience the highest average playtime per user, "
    "and how did player sentiment compare on that day?"
)

# Run the agent
result = agent.run(query)
print("Agent's Answer:")
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to find structured data for early April 2025 to determine which day Ubisoft had the highest average playtime per user. I'll start by retrieving the structured data for that time frame.

Action: retrieve_structured_data  
Action Input: '2025-04-01'  [0m
Observation: [36;1m[1;3mStructured Metrics on 2025-04-01:
 - Stock: Open = 11.1899995803833, Close = 10.805000305175781, Volume = 1406544.0
 - Reviews: 186.0 reviews, Avg Playtime = 39.02 hrs, % Positive = 81.7%
 - Reddit: 75.0 posts (avg score = 86.36), 1536.0 comments
 - News: nan articles
[0m
Thought:[32;1m[1;3mI need to check the structured data for more days in early April 2025 to find the day with the highest average playtime per user. I'll continue retrieving structured data for the subsequent days.

Action: retrieve_structured_data  
Action Input: '2025-04-02'  [0m
Observation: [36;1m[1;3mStructured Metrics on 2025-04-02:
 - Stock: Open = 10.694999694824

In [13]:
query = (
    "What was the general mood on Reddit about Ubisoft between March 30 and April 2, 2025?"
    "Were there any particularly high-scoring or controversial posts?"
)

# Run the agent
result = agent.run(query)
print("Agent's Answer:")
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo answer the question about the general mood on Reddit regarding Ubisoft between March 30 and April 2, 2025, I will need to gather relevant information from unstructured data, as this will provide insights into user sentiments and any notable posts during that time period.

Action: perform_semantic_search  
Action Input: "Ubisoft Reddit sentiment March 30 to April 2, 2025"  [0m
Observation: [33;1m[1;3mgdc 2025: ubisoft’s complete schedule

ubisoft forward 2024 – how and where to watch

ubisoft everyone would be losing their minds. what a good reminder for all of us to not put so much stock into pre-release reviews. they are trying to make money, too.

it's ubisoft. you know this will be 50% off by the summer.

have weather, but since when’s that the main draw? ubisoft’s just slapping stuff together—ui and gameplay design’s legit gone downhill. post-unity, what’s even changed in how this series plays? it’s just the same gr