# Correlation Script Test - SEC x Regulations Field-by-Field Matching

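This notebook tests field-by-field correlation between SEC company data and regulations: each company–regulation pair is scored per field with token-level Jaccard similarity, and the field scores are combined into a weighted total.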

In [None]:
import pandas as pd
import re
from pathlib import Path
from datetime import datetime
import os

# Create the shared output directory up front; exist_ok=True makes re-running
# this cell harmless when the directory is already there.
os.makedirs('/home/sagemaker-user/shared/outputs', exist_ok=True)

## Configuration and Helper Functions

In [6]:
# Company matrix CSV; read into `companies` by the data-loading cell.
COMP_PATH = Path("/home/sagemaker-user/shared/outputs/sec_matrix.csv")
# Regulations CSV; read into `regulations` by the data-loading cell.
REGS_PATH = Path("/home/sagemaker-user/shared/regulations_example.csv")
# Destination CSV for the scored company x regulation pairs.
OUT_PATH = Path("/home/sagemaker-user/shared/outputs/sec_x_laws_matches_strong.csv")

def normalize_sector(x: str) -> str:
    """Canonicalise a sector label so that two labels can be compared.

    The label is stripped of surrounding whitespace and lower-cased. If the
    cleaned label is one of a few known short forms it is replaced by its
    long form; any other label is returned cleaned but otherwise unchanged.

    Args:
        x: Raw sector label.

    Returns:
        The canonical lower-case label, or "" when ``x`` is not a string.
    """
    if isinstance(x, str):
        cleaned = x.strip().lower()
        # Only exact matches on the whole cleaned label are expanded.
        for short_form, canonical in (
            ("tech", "information technology"),
            ("it", "information technology"),
            ("healthcare", "health care"),
            ("finance", "financials"),
        ):
            if cleaned == short_form:
                return canonical
        return cleaned
    return ""

def tokenize(s: str):
    """Split text into a set of lower-case tokens.

    A token is a maximal run of ASCII letters, digits, or any of ``- + _ /``
    in the lower-cased input; every other character acts as a separator.

    Args:
        s: Text to tokenize.

    Returns:
        The set of distinct tokens. Empty when ``s`` is not a string or is
        blank.
    """
    if isinstance(s, str) and s.strip():
        return {hit.group(0) for hit in re.finditer(r"[a-z0-9\-\+_/]+", s.lower())}
    return set()

def jaccard(a: str, b: str) -> float:
    """Jaccard similarity of the token sets of two texts.

    Both inputs are tokenized with ``tokenize``; the score is the number of
    shared tokens divided by the number of distinct tokens across both.

    Args:
        a: First text.
        b: Second text.

    Returns:
        A float in [0.0, 1.0]. 0.0 when neither text yields any token.
    """
    left = tokenize(a)
    right = tokenize(b)
    combined = left.union(right)
    # An empty union means both sides were empty; report "no similarity".
    if len(combined) == 0:
        return 0.0
    shared = left.intersection(right)
    return len(shared) / len(combined)

def date_overlap(sa, ea, sb, eb) -> float:
    """Score how much two date ranges overlap, as overlap / union in whole days.

    Args:
        sa: Start of the first range, as an ISO-8601 string.
        ea: End of the first range, as an ISO-8601 string.
        sb: Start of the second range, as an ISO-8601 string.
        eb: End of the second range, as an ISO-8601 string.

        Any bound that is not a non-blank string parseable by
        ``datetime.fromisoformat`` is treated as missing. A missing start
        becomes ``datetime.min`` and a missing end becomes ``datetime.max``
        (an open-ended range). Values carrying a UTC offset are converted to
        naive UTC so they can be compared with naive values.

    Returns:
        A float in [0.0, 1.0]. 0.0 when all four bounds are missing or when
        the ranges do not overlap by at least one whole day.
    """
    def parse(x):
        if not isinstance(x, str) or not x.strip():
            return None
        try:
            parsed = datetime.fromisoformat(x.strip())
        except ValueError:
            return None
        offset = parsed.utcoffset()
        if offset is not None:
            # Aware and naive datetimes cannot be compared or subtracted
            # (TypeError), and the datetime.min/max sentinels are naive, so
            # bring every aware value to naive UTC.
            try:
                parsed = (parsed - offset).replace(tzinfo=None)
            except OverflowError:
                # Shifting a value at the edge of the datetime range would
                # leave that range; keep the wall-clock time instead.
                parsed = parsed.replace(tzinfo=None)
        return parsed

    sa, ea, sb, eb = parse(sa), parse(ea), parse(sb), parse(eb)
    if sa is None and ea is None and sb is None and eb is None:
        return 0.0
    # Missing bounds make the range open-ended on that side.
    ea = ea or datetime.max
    eb = eb or datetime.max
    sa = sa or datetime.min
    sb = sb or datetime.min
    latest_start = max(sa, sb)
    earliest_end = min(ea, eb)
    overlap = (earliest_end - latest_start).days
    if overlap <= 0:
        return 0.0
    union = (max(ea, eb) - min(sa, sb)).days
    return max(0.0, min(1.0, overlap / union)) if union > 0 else 0.0

## Test Similarity Functions

In [7]:
# Test similarity with sample data
# Each case is (first text, second text, label of the field being illustrated).
test_cases = [
    ("US", "United States", "country"),
    ("USA", "United States", "country"),
    ("Technology", "Information Technology", "sector"),
    ("Healthcare", "Health Care Equipment & Services", "sector"),
    ("data processing", "consumer data", "activities")
]

print("Testing Jaccard Similarity Scores:")
for text1, text2, field_type in test_cases:
    # jaccard is applied to the raw strings; normalize_sector is not called
    # here, so sector aliases are not expanded before scoring.
    jaccard_score = jaccard(text1, text2)
    print(f"{field_type}: '{text1}' vs '{text2}' -> Jaccard: {jaccard_score:.3f}")

Testing Jaccard Similarity Scores:
country: 'US' vs 'United States' -> Jaccard: 0.000
country: 'USA' vs 'United States' -> Jaccard: 0.000
sector: 'Technology' vs 'Information Technology' -> Jaccard: 0.500
sector: 'Healthcare' vs 'Health Care Equipment & Services' -> Jaccard: 0.000
activities: 'data processing' vs 'consumer data' -> Jaccard: 0.333


## Load and Inspect Data

In [8]:
# Report whether both input files are present. This is informational only:
# the reads below run regardless of the result.
print(f"SEC Matrix exists: {COMP_PATH.exists()}")
print(f"Regulations exists: {REGS_PATH.exists()}")

# Load both inputs with pandas' default CSV parsing.
companies = pd.read_csv(COMP_PATH)
regulations = pd.read_csv(REGS_PATH)

# Shape, column names and a three-row preview of the company data.
print(f"\nCompanies data shape: {companies.shape}")
print(f"Companies columns: {list(companies.columns)}")
print("\nFirst few companies:")
print(companies.head(3))

# Same overview for the regulations data.
print(f"\nRegulations data shape: {regulations.shape}")
print(f"Regulations columns: {list(regulations.columns)}")
print("\nFirst few regulations:")
print(regulations.head(3))

SEC Matrix exists: True
Regulations exists: True

Companies data shape: (500, 23)
Companies columns: ['ticker', 'company', 'sector', 'headquarters_country', 'revenues_total_usd', 'revenue_by_region_notes', 'region_exposure_US', 'region_exposure_Europe', 'region_exposure_China', 'region_exposure_India', 'supply_chain_regions', 'key_suppliers', 'key_customers', 'critical_dependencies', 'regulatory_dependencies', 'sanctions_exposure', 'environmental_regulatory_risk', 'labor_regulatory_risk', 'cybersecurity_regulatory_risk', 'ai_governance_risk', 'overall_regulatory_risk_score', 'confidence', 'sources']

First few companies:
  ticker                     company                            sector  \
0      A  Agilent Technologies, Inc.  Health Care Equipment & Services   
1   AAPL                  Apple Inc.                        Technology   
2   ABBV                 AbbVie Inc.                        Healthcare   

  headquarters_country revenues_total_usd  \
0        United States       

## Run Correlation Analysis

In [9]:
# Helper functions
# Lower-cased column name -> actual column name in `companies`, so that
# get_company_field can look columns up without caring about their casing.
comp_cols = {c.lower(): c for c in companies.columns}

def get_company_field(row, logical):
    """Fetch a logical field from a company row, trying several column names.

    Each logical field has an ordered list of candidate column names. The
    candidates are looked up in the module-level ``comp_cols`` mapping, and
    the first one that is present and holds a non-null value in ``row`` wins.

    Args:
        row: One row of the companies frame, indexable by column name.
        logical: Logical field name, e.g. "sector" or "company_name".

    Returns:
        The first non-null value found, or "" when the logical name is
        unknown, no candidate column is present, or every candidate is null.
    """
    candidates_by_field = {
        "ticker": ["ticker"],
        "company_name": ["company","company_name","name"],
        "jurisdiction_country": ["jurisdiction_country","country","headquarters_country"],
        "sector": ["sector","gics_sector","industry"],
        "activities": ["activity","activities","business_function"],
        "regulatory_domain": ["regulatory_dependencies","regulatory_theme","theme"],
        "impact_type": ["impact_type","impact","risk_type"],
        "regulator_entity": ["regulator","regulator_entity"]
    }
    for column_key in candidates_by_field.get(logical, []):
        if column_key not in comp_cols:
            continue
        value = row[comp_cols[column_key]]
        if pd.notna(value):
            return value
    return ""

# Multiplier applied to each per-field Jaccard score when the correlation loop
# sums them into the total score. The regulator match is computed and reported
# by that loop but has no weight here, so it does not contribute to the total.
WEIGHTS = {
    "jurisdiction_country": 2.0,
    "sector": 1.8,
    "activities": 1.8,
    "regulatory_domain": 1.0,
    "impact_type": 0.8
}

print("Running correlation analysis...")


def _reg_text(reg_row, column):
    """Return one regulation cell as text, mapping missing values to ""."""
    value = reg_row.get(column, "")
    # NaN is the only float that is not equal to itself. Without this check a
    # missing cell would become the literal token "nan" after str().
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


# The regulation-side text does not depend on the company, so extract and
# normalise it once here rather than once per (company, regulation) pair.
reg_records = [
    {
        "law_id": rrow.get("law_id", ""),
        "country": _reg_text(rrow, "jurisdiction_country"),
        "sector": normalize_sector(_reg_text(rrow, "sector")),
        "activity": _reg_text(rrow, "activity"),
        "domain": _reg_text(rrow, "regulatory_domain"),
        "impact": _reg_text(rrow, "impact_type"),
        "regulator": _reg_text(rrow, "regulator_entity"),
    }
    for _, rrow in regulations.iterrows()
]

# One dict per (company, regulation) pair; turned into a DataFrame later.
rows = []

for _, crow in companies.iterrows():
    c_name = get_company_field(crow, "company_name")
    c_ticker = get_company_field(crow, "ticker")
    c_country = str(get_company_field(crow, "jurisdiction_country"))
    c_sector = normalize_sector(str(get_company_field(crow, "sector")))
    c_acts = str(get_company_field(crow, "activities"))
    c_theme = str(get_company_field(crow, "regulatory_domain"))
    c_impact = str(get_company_field(crow, "impact_type"))
    c_reg = str(get_company_field(crow, "regulator_entity"))
    # Activities are matched on the company's activity text plus its sector.
    c_acts_and_sector = c_acts + " " + c_sector

    for reg in reg_records:
        # Field-by-field correlation matching
        m_country = jaccard(c_country, reg["country"])
        m_sector = jaccard(c_sector, reg["sector"])
        m_acts = jaccard(c_acts_and_sector, reg["activity"])
        m_theme = jaccard(c_theme, reg["domain"])
        m_impact = jaccard(c_impact, reg["impact"])
        # Reported in the output only; WEIGHTS has no entry for it.
        m_reg = jaccard(c_reg, reg["regulator"])

        score = (
            WEIGHTS["jurisdiction_country"] * m_country +
            WEIGHTS["sector"] * m_sector +
            WEIGHTS["activities"] * m_acts +
            WEIGHTS["regulatory_domain"] * m_theme +
            WEIGHTS["impact_type"] * m_impact
        )

        rows.append({
            "company_ticker": c_ticker,
            "company_name": c_name,
            "law_id": reg["law_id"],
            "country_match": round(m_country, 3),
            "sector_match": round(m_sector, 3),
            "activities_match": round(m_acts, 3),
            "domain_match": round(m_theme, 3),
            "impact_match": round(m_impact, 3),
            "regulator_match": round(m_reg, 3),
            "score_total": round(score, 4),
        })

print(f"Generated {len(rows)} correlation pairs")

Running correlation analysis...
Generated 500 correlation pairs


## Analyze Results

In [10]:
# One row per scored (company, regulation) pair.
matches = pd.DataFrame(rows)

if matches.empty:
    print("No matches generated")
else:
    # Order by ticker, then best score first within each ticker, and persist.
    matches = matches.sort_values(
        by=["company_ticker", "score_total"],
        ascending=[True, False],
    ).reset_index(drop=True)
    matches.to_csv(OUT_PATH, index=False)

    print("\n=== TOP CORRELATIONS ===")
    print(matches.head(10))

    print("\n=== SCORE DISTRIBUTION ===")
    print(matches['score_total'].describe())

    print("\n=== TOP MATCHES BY COMPANY ===")
    for ticker in matches['company_ticker'].unique():
        # Rows are already sorted by score, so the first three are the best.
        company_matches = matches.loc[matches['company_ticker'] == ticker].head(3)
        print(f"\n{ticker}:")
        for _, row in company_matches.iterrows():
            print(f"  {row['law_id']} ({row['score_total']:.3f}) - Country:{row['country_match']:.2f} Sector:{row['sector_match']:.2f}")

    print(f"\nResults saved to: {OUT_PATH}")


=== TOP CORRELATIONS ===
  company_ticker                company_name  \
0              A  Agilent Technologies, Inc.   
1           AAPL                  Apple Inc.   
2           ABBV                 AbbVie Inc.   
3           ABNB                Airbnb, Inc.   
4            ABT         Abbott Laboratories   
5           ACGL     Arch Capital Group Ltd.   
6            ACN                   Accenture   
7           ADBE                  Adobe Inc.   
8            ADI        Analog Devices, Inc.   
9            ADM                         ADM   

                                              law_id  country_match  \
0  1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉE...            0.0   
1  1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉE...            0.0   
2  1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉE...            0.0   
3  1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉE...            0.0   
4  1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉE...            0.0   
5  1.DIRECTIVE (UE)

## Field-by-Field Analysis

In [11]:
# Per-field view of the match scores: summary statistics for every scored
# field, then the three highest-scoring pairs for a subset of fields.
if not matches.empty:
    print("\n=== FIELD CORRELATION ANALYSIS ===")

    # Fields that get mean / max / non-zero counts.
    summary_fields = (
        'country_match',
        'sector_match',
        'activities_match',
        'domain_match',
        'impact_match',
    )
    for field in summary_fields:
        print(f"\n{field.upper()}:")
        print(f"  Mean: {matches[field].mean():.3f}")
        print(f"  Max: {matches[field].max():.3f}")
        print(f"  Non-zero: {(matches[field] > 0).sum()}/{len(matches)}")

    # Fields for which the top three pairs are listed.
    ranked_fields = ('country_match', 'sector_match', 'activities_match')
    print("\n=== BEST MATCHES BY FIELD ===")
    for field in ranked_fields:
        shown_columns = ['company_ticker', 'law_id', field, 'score_total']
        best = matches.nlargest(n=3, columns=field).loc[:, shown_columns]
        print(f"\nTop {field}:")
        print(best.to_string(index=False))


=== FIELD CORRELATION ANALYSIS ===

COUNTRY_MATCH:
  Mean: 0.000
  Max: 0.000
  Non-zero: 0/500

SECTOR_MATCH:
  Mean: 0.013
  Max: 0.059
  Non-zero: 115/500

ACTIVITIES_MATCH:
  Mean: 0.007
  Max: 0.035
  Non-zero: 180/500

DOMAIN_MATCH:
  Mean: 0.009
  Max: 0.068
  Non-zero: 188/500

IMPACT_MATCH:
  Mean: 0.000
  Max: 0.000
  Non-zero: 0/500

=== BEST MATCHES BY FIELD ===

Top country_match:
company_ticker                                                        law_id  country_match  score_total
             A 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL            0.0       0.1205
          AAPL 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL            0.0       0.0169
          ABBV 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL            0.0       0.0000

Top sector_match:
company_ticker                                                        law_id  sector_match  score_total
           BBY 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROP