# Retail Analytics RAG with Google Gemini

This notebook demonstrates the Retrieval-Augmented Generation (RAG) system for querying retail analytics gold tables using Google Gemini.

## 1. Setup and Configuration

In [None]:
# Install required packages into the environment of the running kernel.
import subprocess
import sys

packages = ['google-generativeai']
for package in packages:
    # Run pip through sys.executable so the install uses the same interpreter
    # that is executing this notebook; check_call raises if pip exits non-zero.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', package, '-q'])

print("✅ All packages installed")

In [None]:
# Import required libraries
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import sys

# Make the project's `src` package importable. The checkout location defaults
# to the original workspace path and can be overridden by setting the
# RETAIL_ANALYTICS_PROJECT_ROOT environment variable before running this cell.
PROJECT_ROOT = os.environ.get(
    'RETAIL_ANALYTICS_PROJECT_ROOT',
    '/Workspace/Users/mohammedthoufiq9360@gmail.com/Retail-And-Ecommerce-Analytics-Platform',
)
# Insert only once so re-running the cell does not stack duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.rag_service import RetailAnalyticsRAG
from src.paths import FACT_SALES_PATH, DIM_CUSTOMERS_PATH, DIM_PRODUCTS_PATH

print("✅ Libraries imported successfully")

In [None]:
# Set up Gemini API key
import getpass

# Option 1: Set from environment variable (recommended)
# os.environ['GEMINI_API_KEY'] = 'your-api-key-here'

# Option 2: Prompt for API key (for interactive sessions)
# Surrounding whitespace is stripped because pasted keys often carry a stray
# space or newline.
api_key = os.environ.get('GEMINI_API_KEY', '').strip()
if not api_key:
    api_key = getpass.getpass("Enter your Google Gemini API key: ").strip()

# Fail here, with a clear message, instead of reporting success for an empty
# key and letting a later cell fail on it.
if not api_key:
    raise ValueError(
        "No Gemini API key provided: set GEMINI_API_KEY or enter a key at the prompt."
    )

# Store the cleaned key in the process environment for later cells.
os.environ['GEMINI_API_KEY'] = api_key

print("✅ API key configured")

In [None]:
# Initialize RAG service
# RetailAnalyticsRAG comes from src.rag_service and is constructed with no arguments.
# NOTE(review): presumably it reads GEMINI_API_KEY from the environment — confirm.
rag = RetailAnalyticsRAG()
print("✅ RAG service initialized and ready for queries")

## 2. Quick Data Overview

In [None]:
# Check gold tables
# getOrCreate() reuses an already-running Spark session when one exists.
spark = SparkSession.builder.appName("DataExplore").getOrCreate()

# Load the three gold tables named by the constants imported from src.paths.
sales_df = spark.table(FACT_SALES_PATH)
customers_df = spark.table(DIM_CUSTOMERS_PATH)
products_df = spark.table(DIM_PRODUCTS_PATH)

# Each count() runs a Spark action; the `:,` format adds thousands separators.
print(f"📊 Sales records: {sales_df.count():,}")
print(f"👥 Customers: {customers_df.count():,}")
print(f"📦 Products: {products_df.count():,}")

print("\n🔍 Sales Table Schema:")
sales_df.printSchema()

## 3. RAG Query Examples

### Example 1: Sales Performance

In [None]:
# Free-form question about overall sales; no SQL is supplied for this call.
question = (
    "What is our overall sales performance? "
    "How much revenue have we generated and how many transactions have we completed?"
)
response = rag.query(question)
print(response)

### Example 2: Customer Insights

In [None]:
# Free-form question about the customer base; no SQL is supplied for this call.
question = (
    "Can you provide insights about our customer base? "
    "Where are most of our customers located and what are their demographics?"
)
response = rag.query(question)
print(response)

### Example 3: Product Analysis

In [None]:
# Free-form question about product categories; no SQL is supplied for this call.
question = (
    "What product categories do we offer "
    "and which ones are performing well in our sales?"
)
response = rag.query(question)
print(response)

## 4. SQL-Enhanced RAG Queries

In [None]:
# Query with specific SQL: top 10 customers ranked by total spend.
#
# avg_order_value is total spend divided by the number of distinct invoices.
# NOTE(review): this assumes fact_sales holds one row per invoice line, as
# line_total and COUNT(DISTINCT invoice_id) suggest — confirm against the
# schema. Under that assumption AVG(line_total) would give the average line
# value rather than the average order value.
# NULLIF turns a zero invoice count into NULL, so the division yields NULL
# instead of a divide-by-zero error.
sql_query = """
SELECT
  dc.name as customer_name,
  COUNT(DISTINCT fs.invoice_id) as total_orders,
  SUM(fs.line_total) as total_spent,
  ROUND(SUM(fs.line_total) / NULLIF(COUNT(DISTINCT fs.invoice_id), 0), 2) as avg_order_value
FROM retail_analytics.gold.fact_sales fs
JOIN retail_analytics.gold.dim_customers dc ON fs.customer_sk = dc.customer_sk
GROUP BY dc.customer_sk, dc.name
ORDER BY total_spent DESC
LIMIT 10
"""

question = "Who are our top 10 customers by spending? What can you tell me about their purchasing behavior?"
# Pass the question and the SQL text together to the RAG service.
response = rag.query_with_sql(question, sql_query)
print(response)

## 5. Interactive Mode

In [None]:
# To start an interactive conversation instead, uncomment the next line:
# rag.multi_turn_conversation()

# Otherwise, run a fixed list of questions one at a time.
questions = [
    "What's the total revenue for this year?",
    "Which stores are performing best?",
    "What are the top-selling products?",
]

for q in questions:
    # Banner: blank line, 60 '=' characters, the question, 60 '=' characters.
    print(f"\n{'=' * 60}\nQ: {q}\n{'=' * 60}")
    response = rag.query(q)
    print(response)

## 6. Advanced SQL Analysis with RAG

In [None]:
# Product category analysis: per-category order count, units sold, revenue and
# mean unit price, ordered by revenue from highest to lowest.
sql_query = """
SELECT 
  dp.category,
  COUNT(DISTINCT fs.invoice_id) as total_orders,
  SUM(fs.quantity) as total_units_sold,
  SUM(fs.line_total) as revenue,
  ROUND(AVG(fs.unit_price), 2) as avg_unit_price
FROM retail_analytics.gold.fact_sales fs
JOIN retail_analytics.gold.dim_products dp ON fs.product_sk = dp.product_sk
GROUP BY dp.category
ORDER BY revenue DESC
"""

question = (
    "Analyze sales by product category. "
    "Which categories are our best performers and why?"
)
# Pass the question and the SQL text together to the RAG service.
response = rag.query_with_sql(question, sql_query)
print(response)