# Build Retrieval System

## 1. Imports

In [5]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from pathlib import Path
import time

print("‚úì Libraries loaded")


‚úì Libraries loaded


## 2. Load Data and Model


In [6]:
# Load data with summaries
df = pd.read_csv('../data/processed/data_with_summaries.csv')
print(f"Loaded {len(df)} policies with summaries")

# Load embedding model (this downloads ~80MB first time)
print("\nLoading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("‚úì Model loaded")

Loaded 58592 policies with summaries

Loading embedding model...
‚úì Model loaded


## 3. Generate Embeddings

In [5]:
# Extract all summaries
texts = df['summary'].tolist()

print(f"Encoding {len(texts)} summaries...")
start_time = time.time()

# Generate embeddings (vectors)
embeddings = model.encode(
    texts,
    show_progress_bar=True,
    convert_to_numpy=True,
    batch_size=32
)

elapsed = time.time() - start_time
print(f"‚úì Created embeddings in {elapsed:.1f}s")
print(f"Embedding shape: {embeddings.shape}")
print(f"Each summary is now a {embeddings.shape[1]}-dimensional vector")

Encoding 58592 summaries...


Batches:   0%|          | 0/1831 [00:00<?, ?it/s]

‚úì Created embeddings in 2433.3s
Embedding shape: (58592, 384)
Each summary is now a 384-dimensional vector


## 4. Save Embeddings

In [6]:
# Save embeddings for reuse
embeddings_path = '../models/embeddings.npy'
np.save(embeddings_path, embeddings)

print(f"‚úì Saved embeddings to {embeddings_path}")
print(f"File size: {Path(embeddings_path).stat().st_size / 1024 / 1024:.1f} MB")

‚úì Saved embeddings to ../models/embeddings.npy
File size: 85.8 MB


## 5.Build FAISS Index

In [7]:
# Create FAISS index for fast similarity search
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance

# Add all vectors to the index
index.add(embeddings)

print(f"‚úì FAISS index built")
print(f"Index contains {index.ntotal} vectors")

‚úì FAISS index built
Index contains 58592 vectors


## 6. Save FAISS Index

In [8]:
# Save the index
index_path = '../models/faiss_index.bin'
faiss.write_index(index, index_path)

print(f"‚úì Saved FAISS index to {index_path}")

‚úì Saved FAISS index to ../models/faiss_index.bin


## 7. Test Retrieval - Search Function

In [9]:
def search_similar_cases(query_text, k=5):
    """Find k most similar past policies"""
    
    # Encode the query
    query_vector = model.encode([query_text])
    
    # Search the index
    distances, indices = index.search(query_vector, k)
    
    # Get the similar cases
    results = df.iloc[indices[0]].copy()
    results['similarity_distance'] = distances[0]
    
    return results

# Test it
query = "30-year-old with a 5-year-old Petrol Toyota Corolla, 4 airbags, ESC, urban region"
print(f"Query: {query}\n")

results = search_similar_cases(query, k=3)
print("Top 3 similar cases:")
print(results[['policy_id', 'summary', 'claim_status', 'similarity_distance']])

Query: 30-year-old with a 5-year-old Petrol Toyota Corolla, 4 airbags, ESC, urban region

Top 3 similar cases:
       policy_id                                            summary  \
53596  POL022317  A 42-year-old driver in region C2 with a 0.8-y...   
47263  POL048677  A 38-year-old driver in region C2 with a 0.4-y...   
13971  POL049050  A 39-year-old driver in region C2 with a 2.8-y...   

       claim_status  similarity_distance  
53596             0             0.737614  
47263             0             0.737786  
13971             0             0.739616  


## 8. Analyze Results

In [10]:
# Calculate claim rate among retrieved cases
claim_rate = results['claim_status'].mean()
total = len(results)
claims = results['claim_status'].sum()

print(f"\nRisk Assessment:")
print(f"Among {total} similar past cases:")
print(f"- {claims} resulted in claims ({claim_rate:.0%})")
print(f"- Average similarity distance: {results['similarity_distance'].mean():.3f}")

print("\nDetailed breakdown:")
for idx, row in results.iterrows():
    status = "CLAIM" if row['claim_status'] == 1 else "NO CLAIM"
    print(f"\n{status} | Distance: {row['similarity_distance']:.3f}")
    print(f"  {row['summary']}")


Risk Assessment:
Among 3 similar past cases:
- 0 resulted in claims (0%)
- Average similarity distance: 0.738

Detailed breakdown:

NO CLAIM | Distance: 0.738
  A 42-year-old driver in region C2 with a 0.8-year-old Petrol M2. Vehicle has 2 airbags and ESC, brake assist, parking sensors. NCAP rating: 2 stars. Policy: 0.6 months. Claim filed: No.

NO CLAIM | Distance: 0.738
  A 38-year-old driver in region C2 with a 0.4-year-old Petrol M2. Vehicle has 2 airbags and ESC, brake assist, parking sensors. NCAP rating: 2 stars. Policy: 0.9 months. Claim filed: No.

NO CLAIM | Distance: 0.740
  A 39-year-old driver in region C2 with a 2.8-year-old Petrol M2. Vehicle has 2 airbags and ESC, brake assist, parking sensors. NCAP rating: 2 stars. Policy: 0.9 months. Claim filed: No.


## 9. Create Explanation Generator


In [11]:
def generate_explanation(query, similar_cases):
    """Create human-readable risk explanation"""
    
    total = len(similar_cases)
    claims = similar_cases['claim_status'].sum()
    claim_rate = claims / total
    
    # Determine risk level
    if claim_rate >= 0.6:
        risk_level = "HIGH"
        color = "üî¥"
    elif claim_rate >= 0.3:
        risk_level = "MEDIUM"
        color = "üü°"
    else:
        risk_level = "LOW"
        color = "üü¢"
    
    explanation = f"""
{color} RISK ASSESSMENT: {risk_level}

Query: {query}

Evidence from {total} similar past policies:
- Claims filed: {claims}/{total} ({claim_rate:.0%})
- Average similarity score: {similar_cases['similarity_distance'].mean():.3f}

Similar cases:
"""
    
    for i, (idx, row) in enumerate(similar_cases.iterrows(), 1):
        status_icon = "‚ùå" if row['claim_status'] == 1 else "‚úÖ"
        explanation += f"\n{i}. {status_icon} {row['summary']}"
    
    # Add recommendation
    explanation += f"\n\nRecommendation: "
    if risk_level == "HIGH":
        explanation += "Review manually. Consider higher premium or additional coverage restrictions."
    elif risk_level == "MEDIUM":
        explanation += "Standard processing with careful verification of safety features."
    else:
        explanation += "Low risk profile. Standard premium applicable."
    
    return explanation

# Test the explanation
print(generate_explanation(query, results))


üü¢ RISK ASSESSMENT: LOW

Query: 30-year-old with a 5-year-old Petrol Toyota Corolla, 4 airbags, ESC, urban region

Evidence from 3 similar past policies:
- Claims filed: 0/3 (0%)
- Average similarity score: 0.738

Similar cases:

1. ‚úÖ A 42-year-old driver in region C2 with a 0.8-year-old Petrol M2. Vehicle has 2 airbags and ESC, brake assist, parking sensors. NCAP rating: 2 stars. Policy: 0.6 months. Claim filed: No.
2. ‚úÖ A 38-year-old driver in region C2 with a 0.4-year-old Petrol M2. Vehicle has 2 airbags and ESC, brake assist, parking sensors. NCAP rating: 2 stars. Policy: 0.9 months. Claim filed: No.
3. ‚úÖ A 39-year-old driver in region C2 with a 2.8-year-old Petrol M2. Vehicle has 2 airbags and ESC, brake assist, parking sensors. NCAP rating: 2 stars. Policy: 0.9 months. Claim filed: No.

Recommendation: Low risk profile. Standard premium applicable.


## 10. Test Multiple Scenarios

In [12]:
# Test different risk profiles
test_queries = [
    "22-year-old with 10-year-old Diesel vehicle, 2 airbags, no ESC",
    "45-year-old with 2-year-old Electric Tesla, 6 airbags, all safety features",
    "35-year-old with 6-year-old Petrol Honda Civic, 4 airbags, ESC, brake assist"
]

for i, query in enumerate(test_queries, 1):
    print(f"\n{'='*70}")
    print(f"TEST CASE {i}")
    print('='*70)
    
    results = search_similar_cases(query, k=5)
    print(generate_explanation(query, results))


TEST CASE 1

üü¢ RISK ASSESSMENT: LOW

Query: 22-year-old with 10-year-old Diesel vehicle, 2 airbags, no ESC

Evidence from 5 similar past policies:
- Claims filed: 0/5 (0%)
- Average similarity score: 0.610

Similar cases:

1. ‚úÖ A 42-year-old driver in region C2 with a 2.2-year-old Diesel M4. Vehicle has 6 airbags and ESC, brake assist, parking sensors. NCAP rating: 3 stars. Policy: 1.1 months. Claim filed: No.
2. ‚úÖ A 42-year-old driver in region C12 with a 1.8-year-old Diesel M4. Vehicle has 6 airbags and ESC, brake assist, parking sensors. NCAP rating: 3 stars. Policy: 0.1 months. Claim filed: No.
3. ‚úÖ A 42-year-old driver in region C13 with a 2.6-year-old Diesel M4. Vehicle has 6 airbags and ESC, brake assist, parking sensors. NCAP rating: 3 stars. Policy: 1.1 months. Claim filed: No.
4. ‚úÖ A 42-year-old driver in region C13 with a 2.2-year-old Diesel M4. Vehicle has 6 airbags and ESC, brake assist, parking sensors. NCAP rating: 3 stars. Policy: 1.2 months. Claim filed: No

In [10]:
"""
LOAD SAVED EMBEDDINGS - Add this as a new cell
Use this instead of re-encoding (saves 40 minutes!)
"""

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from pathlib import Path

# Load the data
df = pd.read_csv('../data/processed/data_with_summaries.csv')
print(f"‚úì Loaded {len(df)} policies with summaries")

# Load the model (needed for new queries)
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("‚úì Model loaded")

# Load the SAVED embeddings (this is FAST - just a few seconds!)
print("\nLoading saved embeddings...")
embeddings_path = '../models/embeddings.npy'
embeddings = np.load(embeddings_path)
print(f"‚úì Loaded embeddings in seconds (avoided 40min re-encoding!)")
print(f"Embedding shape: {embeddings.shape}")

# Load the main FAISS index
print("\nLoading FAISS index...")
index_path = '../models/faiss_index.bin'
index = faiss.read_index(index_path)
print(f"‚úì Loaded FAISS index with {index.ntotal} vectors")

print("\n" + "="*70)
print("‚úÖ ALL COMPONENTS LOADED - Ready to build improved system!")
print("="*70)

‚úì Loaded 58592 policies with summaries
Loading embedding model...
‚úì Model loaded

Loading saved embeddings...
‚úì Loaded embeddings in seconds (avoided 40min re-encoding!)
Embedding shape: (58592, 384)

Loading FAISS index...
‚úì Loaded FAISS index with 58592 vectors

‚úÖ ALL COMPONENTS LOADED - Ready to build improved system!


In [8]:
import re

# ============================================================================
# STEP 1: Calculate Risk Factors from Historical Data
# ============================================================================
print("="*70)
print("STEP 1: Calculating risk factors from historical data...")
print("="*70)

risk_factors = {}
base_claim_rate = df['claim_status'].mean()

# Age-based risk
age_risk = df.groupby('age_risk')['claim_status'].agg(['mean', 'count'])
age_risk['risk_multiplier'] = age_risk['mean'] / base_claim_rate
risk_factors['age_risk'] = age_risk
print("\nüìä Age Risk Factors:")
print(age_risk)

# Vehicle age risk
vehicle_age_risk = df.groupby('vehicle_age_category')['claim_status'].agg(['mean', 'count'])
vehicle_age_risk['risk_multiplier'] = vehicle_age_risk['mean'] / base_claim_rate
risk_factors['vehicle_age'] = vehicle_age_risk
print("\nüìä Vehicle Age Risk Factors:")
print(vehicle_age_risk)

# Safety score impact
df['safety_category'] = pd.cut(df['safety_score'], bins=[0, 3, 6, 20], labels=['low', 'medium', 'high'])
safety_risk = df.groupby('safety_category')['claim_status'].agg(['mean', 'count'])
safety_risk['risk_multiplier'] = safety_risk['mean'] / base_claim_rate
risk_factors['safety'] = safety_risk
print("\nüìä Safety Score Risk Factors:")
print(safety_risk)

print(f"\n‚úì Risk factors calculated. Base claim rate: {base_claim_rate:.2%}")





STEP 1: Calculating risk factors from historical data...

üìä Age Risk Factors:
              mean  count  risk_multiplier
age_risk                                  
mature    0.066860  37272         1.045211
middle    0.057030  19814         0.891549
senior    0.083665   1506         1.307929

üìä Vehicle Age Risk Factors:
                          mean  count  risk_multiplier
vehicle_age_category                                  
medium                0.044621   4415         0.697548
new                   0.065586  54143         1.025291
old                   0.000000     29         0.000000

üìä Safety Score Risk Factors:
                     mean  count  risk_multiplier
safety_category                                  
low              0.061336  16157         0.958852
medium           0.064807  23516         1.013119
high             0.065173  18919         1.018834

‚úì Risk factors calculated. Base claim rate: 6.40%


  safety_risk = df.groupby('safety_category')['claim_status'].agg(['mean', 'count'])


In [11]:

# ============================================================================
# STEP 2: Build Separate Indices for Claims and No-Claims
# ============================================================================
print("\n" + "="*70)
print("STEP 2: Building separate indices for balanced retrieval...")
print("="*70)

# Split the data
claim_mask = df['claim_status'] == 1
claims_df = df[claim_mask].copy().reset_index(drop=True)
no_claims_df = df[~claim_mask].copy().reset_index(drop=True)

print(f"Claims: {len(claims_df)} ({len(claims_df)/len(df):.1%})")
print(f"No Claims: {len(no_claims_df)} ({len(no_claims_df)/len(df):.1%})")

# Split embeddings
claims_embeddings = embeddings[claim_mask]
no_claims_embeddings = embeddings[~claim_mask]

# Build separate FAISS indices
dimension = embeddings.shape[1]

claims_index = faiss.IndexFlatL2(dimension)
claims_index.add(claims_embeddings)

no_claims_index = faiss.IndexFlatL2(dimension)
no_claims_index.add(no_claims_embeddings)

print(f"\n‚úì Built separate indices")
print(f"   Claims index: {claims_index.ntotal} vectors")
print(f"   No-claims index: {no_claims_index.ntotal} vectors")



STEP 2: Building separate indices for balanced retrieval...
Claims: 3748 (6.4%)
No Claims: 54844 (93.6%)

‚úì Built separate indices
   Claims index: 3748 vectors
   No-claims index: 54844 vectors


In [12]:
# ============================================================================
# STEP 3: Define Helper Functions
# ============================================================================
print("\n" + "="*70)
print("STEP 3: Defining improved search and analysis functions...")
print("="*70)

def extract_features_from_query(query_text):
    """Extract key features from query text"""
    features = {
        'age_risk': None,
        'vehicle_age': None,
        'safety': None,
        'fuel_type': None
    }
    
    # Extract age
    age_match = re.search(r'(\d+)-year-old', query_text)
    if age_match:
        age = int(age_match.group(1))
        if age < 25:
            features['age_risk'] = 'young'
        elif age < 40:
            features['age_risk'] = 'middle'
        elif age < 60:
            features['age_risk'] = 'mature'
        else:
            features['age_risk'] = 'senior'
    
    # Extract vehicle age
    vehicle_age_match = re.search(r'with a (\d+)-year-old', query_text)
    if vehicle_age_match:
        v_age = int(vehicle_age_match.group(1))
        if v_age <= 3:
            features['vehicle_age'] = 'new'
        elif v_age <= 7:
            features['vehicle_age'] = 'medium'
        else:
            features['vehicle_age'] = 'old'
    
    # Detect safety features
    safety_keywords = ['ESC', 'brake assist', '6 airbags', '8 airbags', 'all safety']
    danger_keywords = ['no ESC', '2 airbags', 'basic safety']
    
    if any(keyword in query_text for keyword in danger_keywords):
        features['safety'] = 'low'
    elif any(keyword in query_text for keyword in safety_keywords):
        features['safety'] = 'high'
    else:
        features['safety'] = 'medium'
    
    # Extract fuel type
    for fuel in ['Diesel', 'Petrol', 'Electric', 'CNG']:
        if fuel in query_text:
            features['fuel_type'] = fuel
            break
    
    return features


def calculate_feature_based_risk(query_text):
    """Calculate risk based on features alone"""
    
    features = extract_features_from_query(query_text)
    
    risk_multiplier = 1.0
    explanations = []
    
    # Apply age risk
    if features['age_risk'] and features['age_risk'] in age_risk.index:
        age_mult = age_risk.loc[features['age_risk'], 'risk_multiplier']
        risk_multiplier *= age_mult
        explanations.append(f"Age ({features['age_risk']}): {age_mult:.2f}x")
    
    # Apply vehicle age risk
    if features['vehicle_age'] and features['vehicle_age'] in vehicle_age_risk.index:
        v_age_mult = vehicle_age_risk.loc[features['vehicle_age'], 'risk_multiplier']
        risk_multiplier *= v_age_mult
        explanations.append(f"Vehicle age ({features['vehicle_age']}): {v_age_mult:.2f}x")
    
    # Apply safety risk
    if features['safety'] and features['safety'] in safety_risk.index:
        safety_mult = safety_risk.loc[features['safety'], 'risk_multiplier']
        risk_multiplier *= safety_mult
        explanations.append(f"Safety ({features['safety']}): {safety_mult:.2f}x")
    
    estimated_risk = base_claim_rate * risk_multiplier
    
    return {
        'estimated_risk': estimated_risk,
        'base_rate': base_claim_rate,
        'risk_multiplier': risk_multiplier,
        'explanations': explanations,
        'features': features
    }


def search_dual_index(query_text, k_per_group=5):
    """Search both indices separately and combine results"""
    
    query_vector = model.encode([query_text])
    
    # Search claims index
    claim_distances, claim_indices = claims_index.search(query_vector, k_per_group)
    claim_results = claims_df.iloc[claim_indices[0]].copy()
    claim_results['similarity_distance'] = claim_distances[0]
    
    # Search no-claims index
    no_claim_distances, no_claim_indices = no_claims_index.search(query_vector, k_per_group)
    no_claim_results = no_claims_df.iloc[no_claim_indices[0]].copy()
    no_claim_results['similarity_distance'] = no_claim_distances[0]
    
    # Combine
    all_results = pd.concat([claim_results, no_claim_results]).sort_values('similarity_distance')
    
    return all_results


def calculate_weighted_risk_score(similar_cases):
    """Calculate risk score weighted by similarity"""
    
    # Convert distance to similarity (inverse)
    max_distance = similar_cases['similarity_distance'].max()
    if max_distance > 0:
        similar_cases['similarity_score'] = 1 - (similar_cases['similarity_distance'] / max_distance)
    else:
        similar_cases['similarity_score'] = 1.0
    
    # Weighted claim rate
    weighted_claims = (similar_cases['claim_status'] * similar_cases['similarity_score']).sum()
    total_weight = similar_cases['similarity_score'].sum()
    weighted_claim_rate = weighted_claims / total_weight if total_weight > 0 else 0
    
    # Regular claim rate
    regular_claim_rate = similar_cases['claim_status'].mean()
    
    return {
        'weighted_rate': weighted_claim_rate,
        'regular_rate': regular_claim_rate,
        'total_cases': len(similar_cases),
        'total_claims': int(similar_cases['claim_status'].sum())
    }


def hybrid_risk_assessment(query_text, k_per_group=5):
    """Combine feature-based risk with RAG retrieval"""
    
    # Step 1: Feature-based risk
    feature_risk = calculate_feature_based_risk(query_text)
    
    # Step 2: RAG retrieval
    similar_cases = search_dual_index(query_text, k_per_group=k_per_group)
    rag_risk = calculate_weighted_risk_score(similar_cases)
    
    # Step 3: Combine (60% RAG, 40% features)
    combined_risk = (0.6 * rag_risk['weighted_rate']) + (0.4 * feature_risk['estimated_risk'])
    
    # Determine risk level
    if combined_risk >= 0.15:
        risk_level = "HIGH"
        color = "üî¥"
    elif combined_risk >= 0.10:
        risk_level = "MEDIUM-HIGH"
        color = "üü†"
    elif combined_risk >= 0.07:
        risk_level = "MEDIUM"
        color = "üü°"
    elif combined_risk >= 0.05:
        risk_level = "MEDIUM-LOW"
        color = "üü¢"
    else:
        risk_level = "LOW"
        color = "üü¢"
    
    # Build explanation
    explanation = f"""
{color} HYBRID RISK ASSESSMENT: {risk_level}

Query: {query_text}

‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
üìä COMBINED RISK SCORE: {combined_risk:.2%}
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

üîç Component 1: FEATURE-BASED ANALYSIS (40% weight)
   Estimated Risk: {feature_risk['estimated_risk']:.2%}
   
   Risk Factors:
"""
    for exp in feature_risk['explanations']:
        explanation += f"   ‚Ä¢ {exp}\n"
    
    explanation += f"""
   Extracted Features: {feature_risk['features']}

üîç Component 2: RAG SIMILAR CASES (60% weight)
   Weighted Risk: {rag_risk['weighted_rate']:.2%}
   Regular Rate: {rag_risk['regular_rate']:.2%}
   Sample: {rag_risk['total_claims']}/{rag_risk['total_cases']} claims

   Top 5 Most Similar Cases:
"""
    
    for i, (idx, row) in enumerate(similar_cases.head(5).iterrows(), 1):
        status_icon = "‚ùå CLAIM" if row['claim_status'] == 1 else "‚úÖ NO CLAIM"
        sim_score = row.get('similarity_score', 0)
        explanation += f"   {i}. {status_icon} | Similarity: {sim_score:.3f}\n"
        summary = row['summary'][:100] + "..." if len(row['summary']) > 100 else row['summary']
        explanation += f"      {summary}\n"
    
    # Recommendations
    explanation += f"\n{'‚îÅ'*70}\nüí° RECOMMENDATION: "
    
    if risk_level == "HIGH":
        explanation += """
   ‚ö†Ô∏è HIGH RISK PROFILE
   ‚Ä¢ Require manual underwriter review
   ‚Ä¢ Consider premium increase: 25-40%
   ‚Ä¢ Request additional documentation
   ‚Ä¢ Stricter policy terms recommended"""
    elif risk_level == "MEDIUM-HIGH":
        explanation += """
   ‚ö†Ô∏è ELEVATED RISK
   ‚Ä¢ Manual review recommended
   ‚Ä¢ Consider premium increase: 15-25%
   ‚Ä¢ Verify all safety features"""
    elif risk_level == "MEDIUM":
        explanation += """
   ‚ö° MODERATE RISK
   ‚Ä¢ Standard processing acceptable
   ‚Ä¢ Consider premium increase: 5-15%
   ‚Ä¢ Regular verification process"""
    elif risk_level == "MEDIUM-LOW":
        explanation += """
   ‚úÖ ACCEPTABLE RISK
   ‚Ä¢ Standard processing
   ‚Ä¢ Base premium applicable"""
    else:
        explanation += """
   ‚úÖ LOW RISK PROFILE
   ‚Ä¢ Fast-track processing eligible
   ‚Ä¢ Competitive premium rates"""
    
    explanation += f"\n{'‚îÅ'*70}"
    
    return explanation

print("‚úì All functions defined successfully!")



STEP 3: Defining improved search and analysis functions...
‚úì All functions defined successfully!


In [6]:

# Load your data
df = pd.read_csv('../data/processed/data_with_summaries.csv')
print(f"Dataset: {len(df)} policies")
print(f"Claim rate: {df['claim_status'].mean():.2%}")
print(f"Claims: {df['claim_status'].sum()} | No claims: {(~df['claim_status'].astype(bool)).sum()}")

# Load model and embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = np.load('../models/embeddings.npy')

print("\n" + "="*70)
print("STRATEGY 1: DUAL INDEX APPROACH (RECOMMENDED)")
print("="*70)

# Create separate indices for claims and non-claims
claim_mask = df['claim_status'] == 1
no_claim_mask = df['claim_status'] == 0

# Split data
df_claims = df[claim_mask].copy()
df_no_claims = df[no_claim_mask].copy()

embeddings_claims = embeddings[claim_mask]
embeddings_no_claims = embeddings[no_claim_mask]

print(f"\n‚úì Claims index: {len(df_claims)} cases")
print(f"‚úì No-claims index: {len(df_no_claims)} cases")

# Build separate FAISS indices
dimension = embeddings.shape[1]

index_claims = faiss.IndexFlatL2(dimension)
index_claims.add(embeddings_claims)

index_no_claims = faiss.IndexFlatL2(dimension)
index_no_claims.add(embeddings_no_claims)

print("\n‚úì Both indices built successfully")

# Save indices
faiss.write_index(index_claims, '../models/faiss_index_claims.bin')
faiss.write_index(index_no_claims, '../models/faiss_index_no_claims.bin')
print("‚úì Indices saved")



Dataset: 58592 policies
Claim rate: 6.40%
Claims: 3748 | No claims: 54844

STRATEGY 1: DUAL INDEX APPROACH (RECOMMENDED)

‚úì Claims index: 3748 cases
‚úì No-claims index: 54844 cases

‚úì Both indices built successfully
‚úì Indices saved


In [7]:

def balanced_search(query_text, k_per_group=5):
    """
    Search both indices and return balanced results
    k_per_group: number of similar cases from each group (claims and no-claims)
    """
    # Encode query
    query_vector = model.encode([query_text])
    
    # Search claims index
    distances_claims, indices_claims = index_claims.search(query_vector, k_per_group)
    results_claims = df_claims.iloc[indices_claims[0]].copy()
    results_claims['similarity_distance'] = distances_claims[0]
    results_claims['source'] = 'claims_index'
    
    # Search no-claims index
    distances_no_claims, indices_no_claims = index_no_claims.search(query_vector, k_per_group)
    results_no_claims = df_no_claims.iloc[indices_no_claims[0]].copy()
    results_no_claims['similarity_distance'] = distances_no_claims[0]
    results_no_claims['source'] = 'no_claims_index'
    
    # Combine results
    combined = pd.concat([results_claims, results_no_claims], ignore_index=True)
    
    # Sort by similarity (lower distance = more similar)
    combined = combined.sort_values('similarity_distance').reset_index(drop=True)
    
    # Add similarity score
    combined['similarity_score'] = 1 / (1 + combined['similarity_distance'])
    
    return combined


print("\n" + "="*70)
print("STRATEGY 2: WEIGHTED RISK CALCULATION")
print("="*70)



STRATEGY 2: WEIGHTED RISK CALCULATION


In [8]:

def calculate_weighted_risk(similar_cases):
    """
    Calculate risk with weighted scores (closer matches count more)
    """
    # Normalize distances to 0-1 scale
    max_dist = similar_cases['similarity_distance'].max()
    min_dist = similar_cases['similarity_distance'].min()
    
    if max_dist > min_dist:
        normalized_dist = (similar_cases['similarity_distance'] - min_dist) / (max_dist - min_dist)
    else:
        normalized_dist = pd.Series([0.5] * len(similar_cases))
    
    # Convert to similarity scores (1 = most similar, 0 = least similar)
    similarity_weights = 1 - normalized_dist
    
    # Weighted claim rate
    weighted_sum = (similar_cases['claim_status'] * similarity_weights).sum()
    total_weight = similarity_weights.sum()
    weighted_rate = weighted_sum / total_weight if total_weight > 0 else 0
    
    # Regular claim rate (unweighted)
    regular_rate = similar_cases['claim_status'].mean()
    
    return {
        'weighted_rate': weighted_rate,
        'regular_rate': regular_rate,
        'total_cases': len(similar_cases),
        'total_claims': similar_cases['claim_status'].sum(),
        'weights': similarity_weights
    }


print("\n" + "="*70)
print("STRATEGY 3: ADJUSTED THRESHOLDS FOR 6.4% BASE RATE")
print("="*70)



STRATEGY 3: ADJUSTED THRESHOLDS FOR 6.4% BASE RATE


In [9]:

def determine_risk_level(weighted_rate, base_rate=0.064):
    """
    Adjusted thresholds based on actual base rate
    """
    # Risk multipliers relative to base rate
    if weighted_rate >= base_rate * 2.5:  # 16%+
        return "HIGH", "üî¥"
    elif weighted_rate >= base_rate * 2.0:  # 12.8%+
        return "MEDIUM-HIGH", "üü†"
    elif weighted_rate >= base_rate * 1.5:  # 9.6%+
        return "MEDIUM", "üü°"
    elif weighted_rate >= base_rate * 1.2:  # 7.7%+
        return "MEDIUM-LOW", "üü¢"
    else:  # < 7.7%
        return "LOW", "üü¢"


print(f"Base rate: {df['claim_status'].mean():.2%}")
print(f"Thresholds:")
print(f"  HIGH:        >= {df['claim_status'].mean() * 2.5:.1%} (2.5x base)")
print(f"  MEDIUM-HIGH: >= {df['claim_status'].mean() * 2.0:.1%} (2.0x base)")
print(f"  MEDIUM:      >= {df['claim_status'].mean() * 1.5:.1%} (1.5x base)")
print(f"  MEDIUM-LOW:  >= {df['claim_status'].mean() * 1.2:.1%} (1.2x base)")
print(f"  LOW:         <  {df['claim_status'].mean() * 1.2:.1%}")


print("\n" + "="*70)
print("COMPLETE BALANCED ASSESSMENT FUNCTION")
print("="*70)


Base rate: 6.40%
Thresholds:
  HIGH:        >= 16.0% (2.5x base)
  MEDIUM-HIGH: >= 12.8% (2.0x base)
  MEDIUM:      >= 9.6% (1.5x base)
  MEDIUM-LOW:  >= 7.7% (1.2x base)
  LOW:         <  7.7%

COMPLETE BALANCED ASSESSMENT FUNCTION


In [11]:

def balanced_risk_assessment(query_text, k_per_group=5):
    """
    Complete balanced RAG assessment
    """
    # Get balanced similar cases
    similar_cases = balanced_search(query_text, k_per_group)
    
    # Calculate weighted risk
    risk_metrics = calculate_weighted_risk(similar_cases)
    
    # Determine risk level
    risk_level, emoji = determine_risk_level(risk_metrics['weighted_rate'])
    
    # Build explanation
    explanation = f"""
{emoji} BALANCED RISK ASSESSMENT: {risk_level}

Query: {query_text}

‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
üìä RISK SCORES
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
Weighted Claim Rate:  {risk_metrics['weighted_rate']:.2%}
Regular Claim Rate:   {risk_metrics['regular_rate']:.2%}
Dataset Base Rate:    {df['claim_status'].mean():.2%}
Risk Multiplier:      {risk_metrics['weighted_rate'] / df['claim_status'].mean():.2f}x

Sample Composition:   {risk_metrics['total_claims']}/{risk_metrics['total_cases']} claims
                      ({k_per_group} from claims, {k_per_group} from no-claims)

‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
üîç TOP {min(10, len(similar_cases))} MOST SIMILAR CASES (BALANCED SAMPLE)
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
"""
    
    for i, (idx, row) in enumerate(similar_cases.head(10).iterrows(), 1):
        status_icon = "‚ùå CLAIM" if row['claim_status'] == 1 else "‚úÖ NO CLAIM"
        weight = risk_metrics['weights'].iloc[i-1] if i-1 < len(risk_metrics['weights']) else 0
        
        explanation += f"\n{i}. {status_icon} | Similarity: {row['similarity_score']:.3f} | Weight: {weight:.3f} | Source: {row['source']}\n"
        explanation += f"   {row['summary'][:120]}...\n"
    
    # Recommendation
    explanation += f"\n{'‚îÅ'*70}\nüí° RECOMMENDATION:\n"
    
    if risk_level == "HIGH":
        explanation += f"""
‚ö†Ô∏è HIGH RISK (>{df['claim_status'].mean() * 2.5:.1%})
‚Ä¢ Similar cases are {risk_metrics['weighted_rate'] / df['claim_status'].mean():.1f}x more likely to claim
‚Ä¢ Require manual underwriter review
‚Ä¢ Premium increase: +30-50%
‚Ä¢ Additional documentation required
‚Ä¢ Consider coverage limitations
"""
    elif risk_level == "MEDIUM-HIGH":
        explanation += f"""
‚ö†Ô∏è ELEVATED RISK ({df['claim_status'].mean() * 2.0:.1%}-{df['claim_status'].mean() * 2.5:.1%})
‚Ä¢ Similar cases are {risk_metrics['weighted_rate'] / df['claim_status'].mean():.1f}x more likely to claim
‚Ä¢ Manual review recommended
‚Ä¢ Premium increase: +20-30%
‚Ä¢ Verify all safety features
"""
    elif risk_level == "MEDIUM":
        explanation += f"""
‚ö° MODERATE RISK ({df['claim_status'].mean() * 1.5:.1%}-{df['claim_status'].mean() * 2.0:.1%})
‚Ä¢ Similar cases are {risk_metrics['weighted_rate'] / df['claim_status'].mean():.1f}x more likely to claim
‚Ä¢ Standard processing with verification
‚Ä¢ Premium increase: +10-20%
"""
    elif risk_level == "MEDIUM-LOW":
        explanation += f"""
‚úÖ ACCEPTABLE RISK ({df['claim_status'].mean() * 1.2:.1%}-{df['claim_osity'].mean() * 1.5:.1%})
‚Ä¢ Risk similar to base rate
‚Ä¢ Standard processing
‚Ä¢ Standard premium
"""
    else:
        explanation += f"""
‚úÖ LOW RISK (<{df['claim_status'].mean() * 1.2:.1%})
‚Ä¢ Similar cases are at or below base rate
‚Ä¢ Fast-track eligible
‚Ä¢ Competitive rates applicable
"""
    
    return explanation


In [13]:


print("\n" + "="*70)
print("TESTING WITH YOUR TEST CASES")
print("="*70)

test_cases = [
    "22-year-old with 10-year-old Diesel vehicle, 2 airbags, no ESC",
    "45-year-old with 2-year-old Electric Tesla, 6 airbags, ESC, brake assist, parking sensors",
    "32-year-old with 6-year-old Petrol Honda Civic, 4 airbags, ESC",
    "28-year-old with 8-year-old Diesel vehicle, 2 airbags, basic safety",
    "50-year-old with 1-year-old Electric vehicle, 8 airbags, all safety features"
]

for i, query in enumerate(test_cases, 1):
    print(f"\n{'='*70}")
    print(f"TEST CASE {i}")
    print("="*70)
    print(balanced_risk_assessment(query, k_per_group=5))
    print("\n")


print("\n" + "="*70)
print("BONUS: COMPARISON - OLD VS NEW APPROACH")
print("="*70)

# Test with one high-risk case
high_risk_query = "22-year-old with 10-year-old Diesel vehicle, 2 airbags, no ESC"

print("\nüî¥ OLD APPROACH (Unbalanced):")
print("-" * 70)
# Old way: search all together
query_vector = model.encode([high_risk_query])
old_index = faiss.read_index('../models/faiss_index.bin')
distances, indices = old_index.search(query_vector, 10)
old_results = df.iloc[indices[0]]
print(f"Claims found: {old_results['claim_status'].sum()}/10")
print(f"Claim rate: {old_results['claim_status'].mean():.2%}")

print("\nüü¢ NEW APPROACH (Balanced):")
print("-" * 70)
new_results = balanced_search(high_risk_query, k_per_group=5)
print(f"Claims found: {new_results['claim_status'].sum()}/10")
print(f"Claim rate: {new_results['claim_status'].mean():.2%}")
print(f"Guaranteed balance: 5 claims + 5 no-claims")


print("\n" + "="*70)
print("SUMMARY: WHY THIS FIXES THE PROBLEM")
print("="*70)

print("""
‚úÖ DUAL INDEX APPROACH:
   ‚Ä¢ Searches claims and no-claims separately
   ‚Ä¢ Forces 50/50 representation (5 from each)
   ‚Ä¢ Eliminates sampling bias from 6.4% base rate

‚úÖ WEIGHTED SCORING:
   ‚Ä¢ Closer matches have more influence
   ‚Ä¢ Prevents distant irrelevant cases from diluting signal
   ‚Ä¢ More accurate risk estimation

‚úÖ ADJUSTED THRESHOLDS:
   ‚Ä¢ Based on actual 6.4% base rate
   ‚Ä¢ Uses risk multipliers (1.2x, 1.5x, 2.0x, 2.5x)
   ‚Ä¢ More meaningful risk levels

‚úÖ BALANCED SAMPLING:
   ‚Ä¢ Always sees both outcomes equally
   ‚Ä¢ Better differentiation between risk profiles
   ‚Ä¢ More robust to class imbalance

RESULT: Now you'll see HIGH, MEDIUM, and LOW risk cases properly distinguished!
""")


TESTING WITH YOUR TEST CASES

TEST CASE 1

üü¢ BALANCED RISK ASSESSMENT: LOW

Query: 22-year-old with 10-year-old Diesel vehicle, 2 airbags, no ESC

‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
üìä RISK SCORES
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
Weighted Claim Rate:  0.56%
Regular Claim Rate:   50.00%
Dataset Base Rate:    6.40%
Risk Multiplier:      0.09x

Sample Composition:   5/10 claims
                      (5 from claims, 5 from no-claims)

‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚

KeyError: 'claim_osity'