In [4]:
import os
from dotenv import load_dotenv
import re
import json
import faiss
import pandas as pd
from dateutil import parser
from sentence_transformers import SentenceTransformer
from langchain.chat_models import ChatOpenAI
from langchain.tools import tool
from langchain.agents import initialize_agent, AgentType

# RAG

## 1. Load Data & Model

In [14]:
# Load unified DataFrame (structured data)
unified_df = pd.read_parquet("unified_dataset.parquet")
unified_df['date'] = pd.to_datetime(unified_df['date'])

# Load FAISS index and metadata
faiss_index = faiss.read_index("faiss_index.index")
with open("faiss_metadata.json", "r") as f:
    metadata = json.load(f)

# Instantiate the SentenceTransformer for semantic search
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

## 2. Define Data Retrieval Functions

In [31]:
def get_structured_summary_for_date(df, target_date_str):
    """
    Return structured summary for a specific date if available.
    """
    try:
        target_date = pd.to_datetime(target_date_str).date()
    except Exception as e:
        return f"Unable to parse date from input: {e}"
    
    matched_row = df[df['date'] == pd.to_datetime(target_date)]
    if matched_row.empty:
        return f"No structured metrics found for {target_date}."
    
    row = matched_row.iloc[0]
    summary = f"Structured Metrics on {target_date}:\n"
    summary += f" - Stock: Open = {row['stock_open']}, Close = {row['stock_close']}, Volume = {row['stock_volume']}\n"
    summary += f" - Reviews: {row['num_reviews']} reviews, Avg Playtime = {row['avg_playtime_hours']:.2f} hrs\n"
    summary += f" - Reddit: {row['num_reddit_posts']} posts (avg score = {row['avg_reddit_score']}), {row['num_reddit_comments']} comments\n"
    summary += f" - News: {row['num_news_articles']} articles\n"
    return summary


def semantic_search(query, k=5):
    """
    Given a text query, this function:
      - Embeds the query using SentenceTransformer
      - Searches the FAISS index
      - Returns the top-k chunks along with distances and metadata
    """
    query_embedding = semantic_model.encode(query).astype("float32").reshape(1, -1)
    distances, indices = faiss_index.search(query_embedding, k)
    
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        record = metadata[idx]
        record["distance"] = float(dist)
        results.append(record)
    
    return results


def extract_date_from_question(question):
    """
    Extract the first valid date string from the user question using regex and dateutil.
    """
    date_candidates = re.findall(r"\b\d{4}-\d{2}-\d{2}\b|\b\w+ \d{1,2}, \d{4}\b", question)
    for d in date_candidates:
        try:
            parsed = parser.parse(d)
            return str(parsed.date())
        except Exception:
            continue
    return None


def summarize_unstructured_sentiment(records):
    """
    Given a list of metadata dicts from semantic_search, compute average sentiment scores.
    Uses polarity, subjectivity, objectivity, and VADER scores (neg, neu, pos, compound).
    """
    if not records:
        return "No sentiment records found."

    total_scores = {
        "polarity": 0.0,
        "subjectivity": 0.0,
        "objectivity": 0.0,
        "vader_neg": 0.0,
        "vader_neu": 0.0,
        "vader_pos": 0.0,
        "vader_compound": 0.0,
    }
    count = 0

    for r in records:
        try:
            for key in total_scores:
                total_scores[key] += float(r.get(key, 0.0))
            count += 1
        except Exception:
            continue

    if count == 0:
        return "No valid sentiment scores found."

    # Calculate averages
    avg_scores = {key: total / count for key, total in total_scores.items()}

    # Format the result
    summary = f"Aggregated Unstructured Sentiment (from {count} documents):\n"
    summary += f" - Polarity: {avg_scores['polarity']:.3f}\n"
    summary += f" - Subjectivity: {avg_scores['subjectivity']:.3f}\n"
    summary += f" - Objectivity: {avg_scores['objectivity']:.3f}\n"
    summary += f" - VADER Negative: {avg_scores['vader_neg']:.3f}\n"
    summary += f" - VADER Neutral: {avg_scores['vader_neu']:.3f}\n"
    summary += f" - VADER Positive: {avg_scores['vader_pos']:.3f}\n"
    summary += f" - VADER Compound: {avg_scores['vader_compound']:.3f}"

    return summary


def aggregate_structured_data_for_date_range(df, start_date_str, end_date_str):
    """
    Aggregate the structured data (from unified_df) for a date range.
    Computes averages for stock open/close, total volume, and other key metrics.
    
    Expected date format: 'YYYY-MM-DD'
    """
    try:
        start_date = pd.to_datetime(start_date_str).date()
        end_date = pd.to_datetime(end_date_str).date()
    except Exception as e:
        return f"Error parsing dates: {str(e)}"

    # Filter the DataFrame for the date range
    df_range = df[(df['date'] >= pd.to_datetime(start_date)) & (df['date'] <= pd.to_datetime(end_date))]
    if df_range.empty:
        return f"No structured data found between {start_date} and {end_date}."
    
    # Compute aggregates
    avg_open = df_range['stock_open'].mean()
    avg_close = df_range['stock_close'].mean()
    total_volume = df_range['stock_volume'].sum()
    avg_num_reviews = df_range['num_reviews'].mean()
    avg_playtime = df_range['avg_playtime_hours'].mean()
    avg_percent_positive = df_range['percent_positive'].mean() * 100
    total_reddit_posts = df_range['num_reddit_posts'].sum()
    total_reddit_comments = df_range['num_reddit_comments'].sum()
    total_news_articles = df_range['num_news_articles'].sum()

    # Build a summary string
    summary = f"Aggregated Structured Data from {start_date} to {end_date}:\n"
    summary += f" - Average Stock Open: {avg_open:.2f}, Average Stock Close: {avg_close:.2f}\n"
    summary += f" - Total Stock Volume: {total_volume:.0f}\n"
    summary += f" - Average Reviews: {avg_num_reviews:.1f} reviews, Average Playtime: {avg_playtime:.2f} hrs, % Positive: {avg_percent_positive:.1f}%\n"
    summary += f" - Total Reddit Posts: {total_reddit_posts}, Total Reddit Comments: {total_reddit_comments}\n"
    summary += f" - Total News Articles: {total_news_articles}"
    return summary


## 3. Define Tool Functions

In [33]:
@tool
def retrieve_structured_data(target_date: str) -> str:
    """
    Returns a structured data summary for the specified date.
    Input target_date should be in a format like 'YYYY-MM-DD'.
    """
    return get_structured_summary_for_date(unified_df, target_date)

@tool
def perform_semantic_search(query: str) -> str:
    """
    Returns a concatenation of the top text chunks from the unstructured data 
    that are relevant to the input query.
    """
    results = semantic_search(query, k=5)
    texts = [r.get("text_chunk", r.get("content", "")) for r in results]
    return "\n\n".join(texts)

@tool
def aggregate_sentiment(query: str) -> str:
    """
    Tool to aggregate sentiment scores from unstructured data based on the query.
    """
    records = semantic_search(query, k=5)
    return summarize_unstructured_sentiment(records)

@tool
def extract_date(query: str) -> str:
    """
    Extracts and returns the first valid date found in the query.
    """
    date_str = extract_date_from_question(query)
    if date_str:
        return date_str
    else:
        return "No valid date found in the query."

@tool
def aggregate_structured_data_range(date_range_query: str) -> str:
    """
    Tool to aggregate structured data over a date range.
    Expects input in the format: 'YYYY-MM-DD to YYYY-MM-DD'
    """
    try:
        parts = date_range_query.split("to")
        if len(parts) != 2:
            return "Invalid date range format. Please provide as 'YYYY-MM-DD to YYYY-MM-DD'."
        start_date = parts[0].strip()
        end_date = parts[1].strip()
        return aggregate_structured_data_for_date_range(unified_df, start_date, end_date)
    except Exception as e:
        return f"Error: {str(e)}"


## 4. Initialise LLM and Agent

In [34]:
# Load OpenAI API key from .env
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Initialise the LLM
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY, 
    temperature=0.5,
    model_name="gpt-4o-mini"
)

# List of tools available to the agent
tools = [
    retrieve_structured_data,
    perform_semantic_search,
    aggregate_sentiment,
    extract_date,
    aggregate_structured_data_range
]

# Initialise the agent using the Zero-Shot React Description method
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)

## 5. Example Queries

In [None]:
# Set the query
query = (
    "What was the stock volume on April 02, 2025 and how did the overall sentiment look?"
)

# Run the agent
result = agent.run(query)

# Print result
print("Agent's Answer:")
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo answer the question, I will first retrieve the structured data for April 02, 2025 to find the stock volume. Then, I will perform a semantic search to understand the unstructured sentiment related to that date.

Action: retrieve_structured_data  
Action Input: '2025-04-02'  [0m
Observation: [36;1m[1;3mStructured Metrics on 2025-04-02:
 - Stock: Open = 10.694999694824219, Close = 10.704999923706055, Volume = 524163.0
 - Reviews: 143.0 reviews, Avg Playtime = 41.54 hrs
 - Reddit: 271.0 posts (avg score = 32.313653136531364), 2287.0 comments
 - News: 1.0 articles
[0m
Thought:[32;1m[1;3mI have retrieved the structured data for April 02, 2025, which indicates that the stock volume was 524,163. Now, I will perform a semantic search to analyze the unstructured sentiment related to that date.

Action: perform_semantic_search  
Action Input: 'April 02, 2025 sentiment'  [0m
Observation: [33;1m[1;3m2025 goty

early 2025 goty 

In [None]:
query = (
    "How did Ubisoft's stock price and trading volume react in the week following the release "
    "of Assassin's Creed Shadows on March 27, 2025?"
)

# Run the agent
result = agent.run(query)
print("Agent's Answer:")
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to gather structured data about Ubisoft's stock price and trading volume around the date of March 27, 2025. This will help me understand how the stock reacted in the week following the game's release. 

Action: retrieve_structured_data  
Action Input: '2025-03-27'  [0m
Observation: [36;1m[1;3mStructured Metrics on 2025-03-27:
 - Stock: Open = 13.0, Close = 12.920000076293945, Volume = 763231.0
 - Reviews: 324.0 reviews, Avg Playtime = 39.20 hrs
 - Reddit: nan posts (avg score = nan), 22.0 comments
 - News: 1.0 articles
[0m
Thought:[32;1m[1;3mI have the stock price and trading volume for Ubisoft on the release date of Assassin’s Creed Shadows, but I need to gather data for the week following that date to see how the stock reacted. I will retrieve structured data for the subsequent days.

Action: retrieve_structured_data  
Action Input: '2025-03-28'  [0m
Observation: [36;1m[1;3mStructured Metrics on 2025-03-28:


In [None]:
query = (
    "What does the data suggest about investor sentiment toward Ubisoft compared to player sentiment on March 31, 2025?"
)

# Run the agent
result = agent.run(query)
print("Agent's Answer:")
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo compare investor sentiment toward Ubisoft with player sentiment on March 31, 2025, I will first retrieve structured data for that date. This will provide insights into both investor and player sentiment on that specific day.

Action: retrieve_structured_data  
Action Input: '2025-03-31'  [0m
Observation: [36;1m[1;3mStructured Metrics on 2025-03-31:
 - Stock: Open = 12.109999656677246, Close = 11.154999732971191, Volume = 2431051.0
 - Reviews: 227.0 reviews, Avg Playtime = 35.53 hrs
 - Reddit: 8.0 posts (avg score = 507.0), 456.0 comments
 - News: nan articles
[0m
Thought:[32;1m[1;3mI have retrieved the structured data for March 31, 2025. The data includes stock performance metrics, player reviews, Reddit activity, and news articles. However, there are no news articles available for that date.

- **Investor Sentiment**: 
  - Stock opened at approximately 12.11 and closed at approximately 11.15, indicating a decrease i

In [29]:
query = (
    "On which day in early April 2025 did Ubisoft experience the highest average playtime per user, "
    "and how did player sentiment compare on that day?"
)

# Run the agent
result = agent.run(query)
print("Agent's Answer:")
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo answer the question, I need to first identify the specific day in early April 2025 when Ubisoft had the highest average playtime per user. Then, I will need to assess player sentiment for that day to compare it with the average playtime. 

Since I don't have access to real-time data, I will start by retrieving structured data for early April 2025 to find the day with the highest average playtime.

Action: retrieve_structured_data  
Action Input: '2025-04-01'  # I will retrieve data starting from the first day of April 2025 to find the relevant information.  [0m
Observation: [36;1m[1;3mUnable to parse date from input: Unknown datetime string format, unable to parse: '2025-04-01'  # I will retrieve data starting from the first day of April 2025 to find the relevant information., at position 0[0m
Thought:[32;1m[1;3mIt seems that I need to retrieve data for a range of dates in early April 2025 rather than just a single d

In [30]:
query = (
    "What was the general mood on Reddit about Ubisoft between March 30 and April 2, 2025?"
    "Were there any particularly high-scoring or controversial posts?"
)

# Run the agent
result = agent.run(query)
print("Agent's Answer:")
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo determine the general mood about Ubisoft on Reddit between March 30 and April 2, 2025, I should first gather sentiment data for that specific timeframe. This will help assess the overall sentiment as well as identify any notable posts. 

Action: aggregate_sentiment  
Action Input: "Ubisoft between March 30 and April 2, 2025"  [0m
Observation: [38;5;200m[1;3mAggregated Unstructured Sentiment (from 5 documents):
 - Average Positive: 0.000
 - Average Negative: 0.000
 - Average Compound: 0.000[0m
Thought:[32;1m[1;3mThe sentiment analysis for Ubisoft on Reddit between March 30 and April 2, 2025, shows an average sentiment score of 0.000 across positive, negative, and compound measures. This indicates a neutral sentiment overall, suggesting that there were likely no strong feelings either way during that period.

Next, I should look for any particularly high-scoring or controversial posts that might stand out despite the o