In [None]:
# Cell 1: Install dependencies       
!pip install pyrate_limiter-3.9.0-py3-none-any.whl
!pip install sec_edgar_downloader-5.0.3-py3-none-any.whl
from sec_edgar_downloader import Downloader                                                                                                                                      
from snowflake.snowpark.context import get_active_session                                                                                                                        
import os                                                                                                                                                                        
from datetime import datetime, timedelta                                                                                                                                         
import tempfile                                                                                                                                                                  
                   

In [None]:
                                                                                                                                                               
# Obtain the Snowflake session object for this notebook via
# snowflake.snowpark.context.get_active_session(); later cells reuse `session`.
session = get_active_session()

# Switch the session's role to SYSADMIN before any objects are created below.
session.sql("USE ROLE SYSADMIN").collect()

In [None]:
                                                                                                                                                
                                                                                                                                                                                  
# Scratch directory for downloaded filings. tempfile.mkdtemp() leaves cleanup
# to the caller, and every run of this cell creates a new directory; the
# prefix makes leftovers easy to recognise and remove.
temp_dir = tempfile.mkdtemp(prefix="sec_edgar_")
print(f"Temporary download directory: {temp_dir}")

# Single source of truth for the stage identifier so the DDL and the
# confirmation message below cannot drift apart.
STAGE_NAME = "sec_filings_stage"
STAGE_FQN = f"ipo_research_db.public.{STAGE_NAME}"

# Create the internal stage if it is missing: directory table enabled,
# Snowflake server-side encryption (SNOWFLAKE_SSE).
session.sql(f"""
    CREATE STAGE IF NOT EXISTS {STAGE_FQN}
    DIRECTORY = (ENABLE = TRUE)
    ENCRYPTION = (TYPE = 'SNOWFLAKE_SSE')
    COMMENT = 'Stage for SEC EDGAR S-1 filings w/ SSE'
""").collect()

print(f"Stage created/verified: {STAGE_NAME}")
                                                               

In [None]:
-- Everything in this cell runs as ACCOUNTADMIN inside IPO_RESEARCH_DB.PUBLIC.
USE ROLE ACCOUNTADMIN;

USE DATABASE IPO_RESEARCH_DB;
USE SCHEMA PUBLIC;

-- Egress rule listing the SEC hosts this notebook is allowed to reach.
CREATE OR REPLACE NETWORK RULE sec_edgar_network_rule
    MODE       = EGRESS
    TYPE       = HOST_PORT
    VALUE_LIST = ('sec.gov', 'www.sec.gov', 'efts.sec.gov', 'data.sec.gov')
    COMMENT    = 'Allow access to SEC EDGAR filing system';

-- External access integration that wraps the rule above.
CREATE OR REPLACE EXTERNAL ACCESS INTEGRATION sec_edgar_integration
    ALLOWED_NETWORK_RULES = (sec_edgar_network_rule)
    ENABLED               = TRUE
    COMMENT               = 'Integration for accessing SEC EDGAR filings';

-- Let SYSADMIN use the integration.
GRANT USAGE ON INTEGRATION sec_edgar_integration TO ROLE SYSADMIN;

-- Attach the integration to the notebook named "Edgar DB Extract".
ALTER NOTEBOOK "Edgar DB Extract"
    SET EXTERNAL_ACCESS_INTEGRATIONS = (sec_edgar_integration);

-- CSV format with no field or record delimiter; judging by its name this is
-- meant to read a whole HTML file as a single row (confirm against the loader).
CREATE OR REPLACE FILE FORMAT html_single_row_format
    TYPE             = 'CSV'
    FIELD_DELIMITER  = NONE
    RECORD_DELIMITER = NONE
    SKIP_HEADER      = 0
    BINARY_FORMAT    = 'UTF8';

In [None]:
                                                                                                                                                                                                    
import requests                                                                                                                                                                                    
                                                                                                                                                                                                    
# Smoke test: confirm this notebook can reach www.sec.gov over HTTPS
# Small public SEC file used purely as a reachability probe.
url = "https://www.sec.gov/files/company_tickers.json"
headers = {'User-Agent': 'MyCompany myemail@company.com'}

# Report the HTTP status and payload size; any exception raised while
# requesting or reporting is printed instead of propagating.
try:
    response = requests.get(url, headers=headers, timeout=10)
    print(f"Status: {response.status_code}")
    print(f"Response length: {len(response.content)}")
    if response.status_code != 200:
        print(f"✗ HTTP error: {response.status_code}")
    else:
        print("✓ Connection successful!")
except Exception as e:
    print(f"✗ Connection failed: {str(e)}")

# If the request above fails, restart the notebook session and re-run this cell.

In [None]:
import requests                                                                                                                                                                  
import pandas as pd                                                                                                                                                              
from datetime import datetime                                                                                                                                                    
                                                                                                                                                                                
# ========================================
# CONFIGURE YOUR DATE RANGE HERE
# ========================================
START_YEAR = 2025
START_QUARTER = 1  # 1, 2, 3, or 4

END_YEAR = 2025
END_QUARTER = 1    # 1, 2, 3, or 4
# ========================================


def validate_quarter_range(start_year, start_quarter, end_year, end_quarter):
    """
    Fail fast on a mis-typed date range.

    Raises:
        ValueError: if either quarter is not 1, 2, 3, or 4, or if the start
            (year, quarter) comes after the end (year, quarter).
    """
    for label, value in (("START_QUARTER", start_quarter), ("END_QUARTER", end_quarter)):
        if value not in (1, 2, 3, 4):
            raise ValueError(f"{label} must be 1, 2, 3, or 4; got {value!r}")

    # Tuple comparison orders by year first, then quarter; equal bounds are a
    # valid single-quarter range.
    if (start_year, start_quarter) > (end_year, end_quarter):
        raise ValueError(
            f"Start {start_year} Q{start_quarter} is after end {end_year} Q{end_quarter}"
        )


# Check the values above as soon as this cell runs, before any network calls.
validate_quarter_range(START_YEAR, START_QUARTER, END_YEAR, END_QUARTER)
                                                                                                                                                                                
def get_s1_filers(year, quarter, timeout=30):
    """
    Fetch S-1 and S-1/A filings from one SEC EDGAR quarterly form index.

    Downloads ``full-index/{year}/QTR{quarter}/form.idx`` from www.sec.gov and
    keeps the rows whose form type is exactly ``S-1`` or ``S-1/A``.

    Args:
        year: Index year, interpolated into the URL as-is.
        quarter: Index quarter (1, 2, 3, or 4), interpolated into the URL as-is.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        pandas.DataFrame with one row per matching line and string columns
        ``form_type``, ``company_name``, ``cik`` (leading zeros removed),
        ``date_filed`` and ``file_name``. If the download fails or nothing
        matches, the DataFrame is empty and has no columns.
    """
    # SEC index URL format
    url = f"https://www.sec.gov/Archives/edgar/full-index/{year}/QTR{quarter}/form.idx"

    headers = {
        'User-Agent': 'YourCompany your.email@company.com',  # Replace with your info
        'Accept-Encoding': 'gzip, deflate',
        'Host': 'www.sec.gov'
    }

    # Only network/HTTP failures are treated as "no data"; programming errors
    # (e.g. a missing import) are allowed to surface.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch index for {year} Q{quarter}: {str(e)}")
        return pd.DataFrame()

    lines = response.text.split('\n')

    # Data rows start right after the first long dashed separator line.
    data_start_idx = next(
        (i + 1 for i, line in enumerate(lines)
         if '---' in line and len(line.strip()) > 50),
        None,
    )

    # No separator found: fall back to skipping the first 10 lines as headers.
    if data_start_idx is None:
        print(f"Warning: Could not find separator line for {year} Q{quarter}, trying alternative parsing")
        data_start_idx = 10

    # Fixed-width layout used below:
    #   Form Type (0-17), Company Name (17-75), CIK (75-87),
    #   Date Filed (87-99), File Name (99+)
    # NOTE(review): these offsets are taken as given; confirm them against a
    # current form.idx file, since a wrong offset silently yields zero matches.
    wanted_forms = ('S-1', 'S-1/A')
    filings = []
    for line in lines[data_start_idx:]:
        # Skip blank lines and lines too short to contain every field.
        if not line.strip() or len(line) < 99:
            continue

        form_type = line[0:17].strip()
        if form_type not in wanted_forms:
            continue

        cik = line[75:87].strip()
        filings.append({
            'form_type': form_type,
            'company_name': line[17:75].strip(),
            # Remove leading zeros from CIK; an all-zero CIK becomes '0'.
            'cik': cik.lstrip('0') or '0',
            'date_filed': line[87:99].strip(),
            'file_name': line[99:].strip()
        })

    return pd.DataFrame(filings)
                                                                                                                                                                                
# Generate list of (year, quarter) tuples in the range                                                                                                                           
def generate_quarter_range(start_year, start_quarter, end_year, end_quarter):
    """
    List every (year, quarter) pair from the start period through the end period.

    Both endpoints are inclusive. Quarters advance by one and roll over to
    quarter 1 of the following year once they pass 4. An empty list is
    returned when the start period lies after the end period.

    Args:
        start_year: Year of the first period.
        start_quarter: Quarter of the first period.
        end_year: Year of the last period.
        end_quarter: Quarter of the last period.

    Returns:
        list of (year, quarter) tuples in chronological order.
    """
    final_period = (end_year, end_quarter)
    year, quarter = start_year, start_quarter
    periods = []

    # Tuples compare year first and quarter second, i.e. "not yet past the end"
    while (year, quarter) <= final_period:
        periods.append((year, quarter))

        following = quarter + 1
        if following > 4:
            # Past the fourth quarter: wrap into quarter 1 of the next year
            year, quarter = year + 1, 1
        else:
            quarter = following

    return periods
                                                                                                                                                                                
# Expand the configured start/end period into individual (year, quarter) pairs
quarters_to_process = generate_quarter_range(START_YEAR, START_QUARTER, END_YEAR, END_QUARTER)

print(f"Processing {len(quarters_to_process)} quarter(s): {START_YEAR} Q{START_QUARTER} to {END_YEAR} Q{END_QUARTER}")
print("-" * 60)

# Holds one DataFrame per quarter that returned at least one S-1 filing
all_s1_filers = []

for year, quarter in quarters_to_process:
    print(f"Fetching S-1 filings for {year} Q{quarter}...")
    df = get_s1_filers(year, quarter)

    if df.empty:
        # Nothing for this quarter: report it and move on
        print(f"  ⚠ No S-1 filings found")
        continue

    all_s1_filers.append(df)
    print(f"  ✓ Found {len(df)} S-1 filings")

# Merge the per-quarter results (or fall back to an empty CIK list)
if not all_s1_filers:
    print("\n⚠ No S-1 filings found in the specified range")
    unique_ciks = []
else:
    s1_filers_df = pd.concat(all_s1_filers, ignore_index=True)

    # A company can file more than once, so de-duplicate on CIK
    unique_ciks = s1_filers_df['cik'].unique().tolist()

    print("\n" + "=" * 60)
    print(f"✓ Total S-1 filings found: {len(s1_filers_df)}")
    print(f"✓ Unique companies: {len(unique_ciks)}")
    # The estimate scales the company count by 2 and 3 (presumably seconds per
    # company) and integer-divides by 60 to express the range in minutes
    print(f"✓ Estimated download time: {len(unique_ciks) * 2 // 60} - {len(unique_ciks) * 3 // 60} minutes")
    print("=" * 60)

    # Preview the first 20 combined rows
    print("\nSample S-1 filers:")
    display(s1_filers_df[['company_name', 'cik', 'date_filed', 'form_type']].head(20))

In [None]:
# Create the temporary directory that will receive the downloaded filings
temp_dir = tempfile.mkdtemp()
print(f"Download directory: {temp_dir}")

# Initialize downloader
# NOTE(review): company_name / email_address look like template placeholders;
# presumably they must identify the real requester before running — confirm.
dl = Downloader(
    company_name="YourCompany",
    email_address="your.email@company.com",
    download_folder=temp_dir
)

# Calculate date range: the trailing 365 days ending now.
# NOTE(review): this window is independent of the START_YEAR/START_QUARTER ..
# END_YEAR/END_QUARTER range that produced unique_ciks. If that range lies
# outside the last year, the downloads below will match nothing — confirm intent.
end_date = datetime.now()
start_date = end_date - timedelta(days=365)

# The bounds are the same for every company, so format them once
after_date = start_date.strftime("%Y-%m-%d")
before_date = end_date.strftime("%Y-%m-%d")

print(f"\nDownloading S-1 filings for {len(unique_ciks)} companies...")
print("This will take several minutes due to SEC rate limits (10 req/sec max)\n")

successful_downloads = 0      # companies for which at least one filing was saved
failed_downloads = 0          # companies for which dl.get raised
empty_downloads = 0           # companies for which dl.get saved nothing
total_filings_downloaded = 0  # filings saved across all companies

for i, cik in enumerate(unique_ciks):
    try:
        # Download S-1 filings for this CIK; get() reports how many it saved
        filings_fetched = dl.get(
            "S-1",
            cik,  # Use CIK instead of ticker
            after=after_date,
            before=before_date,
            download_details=True
        )
    except Exception as e:
        failed_downloads += 1
        if "No filings" not in str(e):  # Don't print if just no filings found
            print(f"  ✗ CIK {cik}: {str(e)}")
    else:
        if filings_fetched == 0:
            # The call worked but nothing matched the date window; counting
            # this as a success would overstate what was actually downloaded
            empty_downloads += 1
        else:
            successful_downloads += 1
            total_filings_downloaded += filings_fetched or 0

    # Report progress by companies attempted, so a failure on the 10th, 20th,
    # ... company does not suppress the update
    if (i + 1) % 10 == 0:
        print(f"Progress: {i + 1}/{len(unique_ciks)} companies processed")

print(f"\n✓ Download Summary:")
print(f"    Successful: {successful_downloads}")
print(f"    No filings in date range: {empty_downloads}")
print(f"    Failed: {failed_downloads}")
print(f"    Filings downloaded: {total_filings_downloaded}")
                                                                                                                                                                                 
                                                                   

In [None]:
import glob                                                                                                                                                                      
                                                                                                                                                                                
# Find all downloaded filing files (every file under temp_dir, at any depth)
downloaded_files = [
    os.path.join(root, file)
    for root, dirs, files in os.walk(temp_dir)
    for file in files
]

print(f"Found {len(downloaded_files)} files to upload")

# Initialise the counters up front so they exist even when nothing was downloaded
uploaded_count = 0
failed_count = 0

if len(downloaded_files) == 0:
    print("⚠ No files found. Check if download completed successfully.")
else:
    # Upload files to stage, one PUT per file
    for i, file_path in enumerate(downloaded_files):
        try:
            # Get relative path to preserve directory structure on the stage
            rel_path = os.path.relpath(file_path, temp_dir)
            # Stage paths use '/', so normalise OS-specific separators (this
            # matches the PUT issued by truncate_and_save_documents)
            dir_path = os.path.dirname(rel_path).replace(os.sep, '/')

            # Upload using PUT command; files are stored uncompressed and
            # replace any existing file of the same name
            session.sql(f"""
                PUT 'file://{file_path}' @sec_filings_stage/{dir_path}
                AUTO_COMPRESS = FALSE
                OVERWRITE = TRUE
            """).collect()

            uploaded_count += 1

            # Progress update every 50 files
            if (i + 1) % 50 == 0:
                print(f"  Progress: {i + 1}/{len(downloaded_files)} files uploaded")

        except Exception as e:
            # Best effort: report the failure and continue with the next file
            print(f"  ✗ Failed: {os.path.basename(file_path)}: {str(e)}")
            failed_count += 1

    print(f"\n✓ Upload Summary:")
    print(f"    Successfully uploaded: {uploaded_count}")
    print(f"    Failed uploads: {failed_count}")

In [None]:
import re
import os
import tempfile
import shutil

def truncate_and_save_documents():
    """
    Truncate every primary filing document on the stage to its first 100 pages
    and upload the result next to the original.

    Each ``primary-document.html`` under ``@sec_filings_stage`` is read as a
    single row, passed through ``truncate_to_pages(..., max_pages=100)``, written
    to a local scratch directory as ``primary-document-100pg.html`` under the
    same relative path, and then PUT back to the stage. A ``-100pg`` copy is
    written for every usable document, whether or not it needed truncation.

    Rows whose content is None, or shorter than 100 characters once stripped,
    are skipped. Failures on individual documents or uploads are collected and
    reported in the printed summary instead of being raised. The scratch
    directory is removed on exit, including when an unexpected error escapes.

    Returns:
        tuple: ``(processed_files, truncated_count, upload_count)`` where
        ``processed_files`` counts documents written locally,
        ``truncated_count`` counts those that were actually shortened, and
        ``upload_count`` counts successful PUTs.
    """

    # Get all primary documents from stage
    documents_query = """
    SELECT 
        metadata$filename AS file_path,
        SPLIT_PART(metadata$filename, '/', 2) AS company_cik,
        SPLIT_PART(metadata$filename, '/', 4) AS accession_number,
        $1 as raw_content
    FROM @sec_filings_stage (file_format => html_single_row_format)
    WHERE metadata$filename LIKE '%/primary-document.html%'
    """

    # Create temporary directory for processing
    temp_dir = tempfile.mkdtemp()
    print(f"Processing directory: {temp_dir}")

    try:
        documents_df = session.sql(documents_query).to_pandas()
        print(f"Retrieved {len(documents_df)} documents for processing")

        processed_files = 0
        truncated_count = 0
        errors = []

        for _, row in documents_df.iterrows():
            try:
                file_path = row['FILE_PATH']
                raw_content = row['RAW_CONTENT']

                if raw_content is None:
                    continue

                # Convert to string if needed
                if isinstance(raw_content, bytes):
                    html_content = raw_content.decode('utf-8', errors='ignore')
                elif isinstance(raw_content, str):
                    html_content = raw_content
                else:
                    html_content = str(raw_content)

                # Check if content is meaningful
                if len(html_content.strip()) < 100:
                    continue

                # Truncate to first 100 pages
                result = truncate_to_pages(html_content, max_pages=100)

                # Mirror the stage directory layout inside the scratch folder.
                # Leading slashes are dropped so os.path.join cannot discard
                # temp_dir and write outside it.
                original_dir = os.path.dirname(file_path).lstrip('/')
                full_dir = os.path.join(temp_dir, original_dir)
                os.makedirs(full_dir, exist_ok=True)

                # Save truncated file in the same directory as original
                truncated_path = os.path.join(full_dir, "primary-document-100pg.html")
                with open(truncated_path, 'w', encoding='utf-8') as f:
                    f.write(result['content'])

                # Every document written counts as processed; only those that
                # were actually shortened count as truncated.
                processed_files += 1
                if result['was_truncated']:
                    truncated_count += 1

            except Exception as e:
                error_msg = f"Error processing {row['FILE_PATH']}: {str(e)}"
                print(f"  ✗ {error_msg}")
                errors.append(error_msg)

        # Upload truncated files back to stage
        print(f"\nUploading truncated files to stage...")
        upload_count = 0
        upload_errors = []

        for root, dirs, files in os.walk(temp_dir):
            for file in files:
                if file != 'primary-document-100pg.html':
                    continue

                local_file_path = os.path.join(root, file)
                rel_path = os.path.relpath(local_file_path, temp_dir)
                # Stage paths always use '/', whatever the local separator is
                stage_dir = os.path.dirname(rel_path.replace(os.sep, '/'))

                try:
                    # Upload to stage preserving directory structure
                    put_query = f"""
                        PUT 'file://{local_file_path}' @sec_filings_stage/{stage_dir}
                        AUTO_COMPRESS = FALSE
                        OVERWRITE = TRUE
                    """
                    session.sql(put_query).collect()
                    upload_count += 1

                except Exception as e:
                    # Name the relative path: the file name alone is the same
                    # for every upload and would not identify the failure
                    upload_errors.append(f"Upload failed for {rel_path}: {str(e)}")

        print(f"\n" + "="*60)
        print("PROCESSING SUMMARY")
        print("="*60)
        print(f"Files processed: {processed_files}")
        print(f"Files truncated: {truncated_count}")
        print(f"Files uploaded to stage: {upload_count}")
        print(f"Processing errors: {len(errors)}")
        print(f"Upload errors: {len(upload_errors)}")

        if errors:
            print(f"\nFirst few processing errors:")
            for error in errors[:3]:
                print(f"  - {error}")

        if upload_errors:
            print(f"\nFirst few upload errors:")
            for error in upload_errors[:3]:
                print(f"  - {error}")

        return processed_files, truncated_count, upload_count

    finally:
        # Cleanup temp directory even when something above raised;
        # ignore_errors keeps a cleanup problem from masking the real exception
        shutil.rmtree(temp_dir, ignore_errors=True)

def truncate_to_pages(html_content, max_pages=100, paragraphs_per_page=15):
    """
    Truncate an HTML document to (approximately) its first ``max_pages`` pages.

    Three strategies are tried in order and the first one that applies is used:

    1. ``PAGE_TAGS``: the document contains ``<PAGE>`` markers (matched
       case-insensitively). Each marker counts as one page and the document is
       cut right after the ``max_pages``-th marker.
    2. ``PARAGRAPH_COUNT``: the document has at least ``paragraphs_per_page``
       opening ``<p>`` tags. Pages are estimated as
       ``paragraph count // paragraphs_per_page`` and the document is cut after
       the closing ``</p>`` of paragraph number ``max_pages * paragraphs_per_page``.
    3. ``CHARACTER_ESTIMATE``: a page is assumed to be 3000 characters; the cut
       is moved back to the nearest ``>`` when one lies within 500 characters.

    Whenever content is cut, ``ensure_html_closed`` is applied to the result.

    Args:
        html_content: HTML string to truncate.
        max_pages: Maximum number of pages to keep (must be >= 1).
        paragraphs_per_page: Number of <p> tags to consider as one page (default: 15).

    Returns:
        dict with:
        - content: truncated (or untouched) HTML content
        - original_pages: number of pages counted/estimated in the original
        - final_pages: number of pages in the result
        - was_truncated: whether truncation occurred
        - method: which strategy produced the result, or 'ERROR'

        This function never raises: on any failure the error is printed and
        the original content ('' for non-string input) is returned with
        ``was_truncated=False``, zero page counts and ``method='ERROR'``.
    """
    try:
        # Ensure we have a string
        if not isinstance(html_content, str):
            raise ValueError(f"Expected string, got {type(html_content)}")

        # A limit below 1 would become a negative list index further down and
        # silently cut at the *last* page/paragraph instead of the first one.
        if max_pages < 1:
            raise ValueError(f"max_pages must be >= 1, got {max_pages}")

        # Method 1: Try SEC EDGAR <PAGE> tags first
        page_breaks = list(re.finditer(r'<PAGE>', html_content, re.IGNORECASE))

        if page_breaks:
            original_pages = len(page_breaks)

            if original_pages <= max_pages:
                return {
                    'content': html_content,
                    'original_pages': original_pages,
                    'final_pages': original_pages,
                    'was_truncated': False,
                    'method': 'PAGE_TAGS'
                }

            # original_pages > max_pages here, so this index is always valid.
            truncate_position = page_breaks[max_pages - 1].end()
            truncated_content = ensure_html_closed(html_content[:truncate_position])

            return {
                'content': truncated_content,
                'original_pages': original_pages,
                'final_pages': max_pages,
                'was_truncated': True,
                'method': 'PAGE_TAGS'
            }

        # Method 2: Fallback to paragraph-based pagination
        # Find all paragraph opening tags
        paragraphs = list(re.finditer(r'<p[\s>]', html_content, re.IGNORECASE))

        if len(paragraphs) >= paragraphs_per_page:
            paragraph_method = f'PARAGRAPH_COUNT ({paragraphs_per_page} per page)'
            estimated_pages = max(1, len(paragraphs) // paragraphs_per_page)

            if estimated_pages <= max_pages:
                return {
                    'content': html_content,
                    'original_pages': estimated_pages,
                    'final_pages': estimated_pages,
                    'was_truncated': False,
                    'method': paragraph_method
                }

            # Truncate after N paragraphs (where N = max_pages * paragraphs_per_page)
            target_paragraph_index = max_pages * paragraphs_per_page - 1

            if target_paragraph_index < len(paragraphs):
                truncate_start = paragraphs[target_paragraph_index].start()

                # Keep the whole target paragraph when its closing tag exists;
                # otherwise cut just before the paragraph starts.
                closing_p = re.search(r'</p>', html_content[truncate_start:], re.IGNORECASE)
                if closing_p:
                    truncate_position = truncate_start + closing_p.end()
                else:
                    truncate_position = truncate_start

                truncated_content = ensure_html_closed(html_content[:truncate_position])

                return {
                    'content': truncated_content,
                    'original_pages': estimated_pages,
                    'final_pages': max_pages,
                    'was_truncated': True,
                    'method': paragraph_method
                }

        # Method 3: Final fallback to character-based estimation
        chars_per_page = 3000
        estimated_pages = max(1, len(html_content) // chars_per_page)

        if estimated_pages <= max_pages:
            return {
                'content': html_content,
                'original_pages': estimated_pages,
                'final_pages': estimated_pages,
                'was_truncated': False,
                'method': 'CHARACTER_ESTIMATE'
            }

        # Truncate based on character count
        truncate_position = min(max_pages * chars_per_page, len(html_content))

        # Try to truncate at a tag boundary (only if one is close enough that
        # we do not throw away more than ~500 characters)
        tag_end = html_content.rfind('>', 0, truncate_position)
        if tag_end > truncate_position - 500:
            truncate_position = tag_end + 1

        truncated_content = ensure_html_closed(html_content[:truncate_position])

        return {
            'content': truncated_content,
            'original_pages': estimated_pages,
            'final_pages': max_pages,
            'was_truncated': True,
            'method': 'CHARACTER_ESTIMATE'
        }

    except Exception as e:
        print(f"Error in truncate_to_pages: {str(e)}")
        # Return original content on error, with the same keys as every other
        # result so callers can read result['method'] unconditionally.
        return {
            'content': html_content if isinstance(html_content, str) else '',
            'original_pages': 0,
            'final_pages': 0,
            'was_truncated': False,
            'method': 'ERROR'
        }



def ensure_html_closed(html_content):
    """
    Append closing tags for every element still open at the end of ``html_content``.

    The markup is scanned with a regular expression (not a real HTML parser)
    while a stack of open elements is maintained:

    - an opening tag pushes its name, unless it is a void element, the
      ``<PAGE>`` marker, or written in self-closing form (``<x/>``);
    - a closing tag pops back to the nearest matching opener, implicitly
      closing anything opened after it (e.g. a ``<p>`` never closed before
      ``</div>``); a closing tag with no matching opener is ignored.

    Tag names are compared case-insensitively, but the appended closers reuse
    the spelling of the opening tag so case-sensitive markup stays consistent.
    Namespaced names such as ``ix:nonFraction`` are kept whole.

    Limitation: an attribute value containing ``>`` ends the tag match early.

    Args:
        html_content: HTML string, typically cut off mid-document.

    Returns:
        ``html_content`` followed by the closing tags, innermost element first.
    """
    # Elements that never take a closing tag. 'page' is included because this
    # file treats <PAGE> as a page-break marker rather than a container.
    void_tags = {
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr', 'page',
    }

    # group 1: '/' for closing tags; group 2: tag name (may contain ':', '.', '-');
    # group 3: the rest of the tag (attributes and an optional trailing '/').
    tag_pattern = r'<(/?)([A-Za-z][\w:.-]*)([^>]*)>'

    open_tags = []  # stack of (lowercased name, name as written)

    for match in re.finditer(tag_pattern, html_content):
        is_closing = match.group(1) == '/'
        written_name = match.group(2)
        tag_name = written_name.lower()

        if tag_name in void_tags:
            continue

        if is_closing:
            # Unwind to the nearest matching opener. Only checking the top of
            # the stack would leave the stack permanently out of sync after a
            # single unclosed child and produce duplicate closers.
            for depth in range(len(open_tags) - 1, -1, -1):
                if open_tags[depth][0] == tag_name:
                    del open_tags[depth:]
                    break
        elif not match.group(3).rstrip().endswith('/'):
            open_tags.append((tag_name, written_name))

    # Add closing tags for any unclosed tags, innermost first
    closing_tags = ''.join(f'</{written}>' for _, written in reversed(open_tags))

    return html_content + closing_tags


# Execute the truncation and upload.
# The call returns, in order: files processed, files truncated, and
# files uploaded to the stage.
processed, truncated, uploaded = truncate_and_save_documents()

In [None]:
import requests                                                                                                                                                                  
import pandas as pd                                                                                                                                                              
                                                                                                                                                                                
# Fetch SEC's company ticker mapping
url = "https://www.sec.gov/files/company_tickers.json"
headers = {
    'User-Agent': 'YourCompany your.email@company.com'  # Replace with your info
}

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error as such instead of letting it show
# up as a JSON decoding failure on the next line.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
company_data = response.json()

# Convert to DataFrame; the CIK is kept as a string
companies_df = pd.DataFrame.from_dict(company_data, orient='index')
companies_df['cik_str'] = companies_df['cik_str'].astype(str)

# Rename columns for clarity ('ticker' already has its final name)
companies_df = companies_df.rename(columns={
    'cik_str': 'cik',
    'title': 'company_name'
})

print(f"Loaded {len(companies_df)} companies")
display(companies_df[['cik', 'ticker', 'company_name']].head(10))

# Create Snowflake table with company mappings (column names upper-cased,
# existing table contents replaced)
session.create_dataframe(
    companies_df[['cik', 'ticker', 'company_name']].rename(columns=str.upper).reset_index(drop=True)
).write.mode('overwrite').save_as_table('sec_company_names')

print("✓ Company names saved to sec_company_names table")

In [None]:
import requests
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import json

def get_company_description(cik, headers, max_retries=3, timeout=10):
    """
    Fetch company metadata for one CIK from the SEC EDGAR submissions API.

    The lookup is best-effort: it never raises. Any failure (non-200 response,
    exhausted retries, network or decoding error) yields a record whose fields
    other than ``cik`` are empty strings.

    Args:
        cik: Company CIK; converted to ``str`` and left-padded with zeros to
            10 digits to build the request URL.
        headers: HTTP headers passed to ``requests.get``.
        max_retries: Maximum number of request attempts. A 429 response or an
            exception consumes one attempt.
        timeout: Seconds to wait for the server before an attempt is treated
            as failed. Without it a stalled connection would block the
            calling worker thread indefinitely.

    Returns:
        dict with keys ``cik`` (the value passed in, unchanged),
        ``business_description``, ``sic``, ``sicDescription``,
        ``fiscalYearEnd`` and ``stateOfIncorporation``.
    """
    # Single definition of the "nothing found" record used by every failure path.
    empty_result = {
        'cik': cik,
        'business_description': '',
        'sic': '',
        'sicDescription': '',
        'fiscalYearEnd': '',
        'stateOfIncorporation': ''
    }

    # Format CIK with leading zeros (10 digits)
    formatted_cik = str(cik).zfill(10)
    url = f"https://data.sec.gov/submissions/CIK{formatted_cik}.json"

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)

            # Handle rate limiting: wait 1 second, then use the next attempt
            if response.status_code == 429:
                time.sleep(1)
                continue

            # Any other non-success status is not retried
            if response.status_code != 200:
                return empty_result

            data = response.json()
            # Extract business description and other metadata
            return {
                'cik': cik,
                'business_description': data.get('description', ''),
                'sic': data.get('sic', ''),
                'sicDescription': data.get('sicDescription', ''),
                'fiscalYearEnd': data.get('fiscalYearEnd', ''),
                'stateOfIncorporation': data.get('stateOfIncorporation', '')
            }

        except Exception:
            # Deliberately broad: one failing company must not abort the whole
            # batch. Pause before retrying, but not after the last attempt.
            if attempt < max_retries - 1:
                time.sleep(0.5)

    return empty_result

# Fetch SEC's company ticker mapping
url = "https://www.sec.gov/files/company_tickers.json"
headers = {
    'User-Agent': 'YourCompany your.email@company.com'  # Replace with your info
}

# Bound the wait and report HTTP errors as such rather than as a JSON decoding failure.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
company_data = response.json()

# Convert to DataFrame; the CIK is kept as a string
companies_df = pd.DataFrame.from_dict(company_data, orient='index')
companies_df['cik_str'] = companies_df['cik_str'].astype(str)

# Rename columns for clarity ('ticker' already has its final name)
companies_df = companies_df.rename(columns={
    'cik_str': 'cik',
    'title': 'company_name'
})

print(f"Loaded {len(companies_df)} companies")

# Get company descriptions using threading for efficiency
print("Fetching company descriptions from SEC EDGAR API...")
print("This may take several minutes due to API rate limits...")

# Request each CIK only once. A CIK that appears on several rows would
# otherwise be fetched repeatedly, and because the merge below joins on 'cik',
# every repeated description row would multiply the matching company rows.
unique_ciks = companies_df['cik'].drop_duplicates().tolist()

descriptions = []
with ThreadPoolExecutor(max_workers=10) as executor:  # Limit concurrent requests
    # Submit all tasks
    future_to_cik = {
        executor.submit(get_company_description, cik, headers): cik
        for cik in unique_ciks
    }

    # Collect results as they finish (completion order, not submission order)
    for future in as_completed(future_to_cik):
        descriptions.append(future.result())

# Convert descriptions to DataFrame and merge.
# validate='many_to_one' makes pandas raise if descriptions_df ever contains
# a CIK twice, instead of silently duplicating company rows.
descriptions_df = pd.DataFrame(descriptions)
companies_enhanced_df = companies_df.merge(
    descriptions_df, on='cik', how='left', validate='many_to_one'
)

# Display sample of enhanced data
print(f"\nEnhanced dataset with {len(companies_enhanced_df)} companies")
display(companies_enhanced_df[['cik', 'ticker', 'company_name', 'business_description', 'sicDescription']].head(5))

# Create Snowflake table with enhanced company data
final_df = companies_enhanced_df[[
    'cik', 'ticker', 'company_name', 'business_description',
    'sic', 'sicDescription', 'fiscalYearEnd', 'stateOfIncorporation'
]]

# Column names are upper-cased and the existing table contents are replaced
session.create_dataframe(
    final_df.rename(columns=str.upper).reset_index(drop=True)
).write.mode('overwrite').save_as_table('sec_company_names')

print("✓ Enhanced company data saved to sec_company_names table")
print(f"  New columns: CIK, TICKER, COMPANY_NAME, BUSINESS_DESCRIPTION, SIC, SICDESCRIPTION, FISCALYEAREND, STATEOFINCORPORATION")

In [None]:
USE ROLE SYSADMIN;

-- Table holding the S-1 filings: stage paths, identifiers, company attributes
-- and the raw document text. upload_date defaults to the insert time.
CREATE OR REPLACE TABLE sec_s1_documents (
    file_path         VARCHAR,
    trunc_file_path   VARCHAR,
    company_cik       NUMBER,
    accession_number  VARCHAR,
    company_name      VARCHAR,
    sicdescription    VARCHAR,
    filing_date       VARCHAR,
    raw_content       VARCHAR,
    upload_date       TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP()
);

-- Enable change tracking (required for Cortex Search)
ALTER TABLE sec_s1_documents SET CHANGE_TRACKING = TRUE;

In [None]:
 -- Load the original (untruncated) documents from the stage.
 -- company_cik and accession_number are taken from the 2nd and 4th
 -- '/'-separated segments of the staged file path.
 COPY INTO sec_s1_documents (file_path, company_cik, accession_number, raw_content)
 FROM (
     SELECT
         metadata$filename,
         SPLIT_PART(metadata$filename, '/', 2),
         SPLIT_PART(metadata$filename, '/', 4),
         $1  -- first column produced by html_single_row_format (presumably the whole file; confirm against the format definition)
     FROM @sec_filings_stage
 )
 FILE_FORMAT = html_single_row_format
 -- PATTERN is a regular expression: [.] matches a literal dot, whereas a bare
 -- '.' would accept any character in front of "html".
 PATTERN = '.*primary-document[.]html'
 ON_ERROR = CONTINUE;

 -- Update truncated file paths from stage: match staged
 -- primary-document-<N>pg.html files to loaded rows by CIK and accession number.
MERGE INTO sec_s1_documents AS docs
USING (
    SELECT DISTINCT
        SPLIT_PART(metadata$filename, '/', 2) AS company_cik,
        SPLIT_PART(metadata$filename, '/', 4) AS accession_number,
        metadata$filename AS trunc_file_path
    FROM @sec_filings_stage
    -- [.] is used for the literal dot: inside a single-quoted string constant
    -- the backslash of '\.' is consumed by string escaping, leaving a '.'
    -- that matches any character.
    WHERE metadata$filename RLIKE '.*primary-document-[0-9]+pg[.]html$'
) AS stage_data
ON docs.company_cik = CAST(stage_data.company_cik AS NUMBER)
   AND docs.accession_number = stage_data.accession_number
WHEN MATCHED THEN
    UPDATE SET trunc_file_path = stage_data.trunc_file_path;

 -- Keep one filing per company: for every CIK found on the stage, only the
 -- greatest accession number (string comparison) survives; all other rows go.
 -- NOTE(review): this treats the greatest accession number as the newest
 -- filing; confirm that ordering holds for the filings loaded here.
DELETE FROM sec_s1_documents
WHERE accession_number NOT IN (
    SELECT latest.accession_number
    FROM (
        SELECT
            CAST(SPLIT_PART(metadata$filename, '/', 2) AS NUMBER) AS company_cik,
            MAX(SPLIT_PART(metadata$filename, '/', 4))            AS accession_number
        FROM @sec_filings_stage
        WHERE metadata$filename LIKE '%/primary-document.html%'
          AND metadata$filename NOT LIKE '%full-submission%'
        GROUP BY 1
    ) AS latest
);

 -- Back-fill company_name and sicdescription from sec_company_names,
 -- matching on the company's CIK.
UPDATE sec_s1_documents
SET
    company_name   = ccnm.company_name,
    sicdescription = ccnm.sicdescription
FROM (
    SELECT DISTINCT
        CIK,
        COMPANY_NAME,
        SICDESCRIPTION
    FROM sec_company_names
) AS ccnm
WHERE company_cik = ccnm.cik;

In [None]:
-- extract_s1_info: look up the S-1 document whose company name best matches
-- the given name and run AI_EXTRACT on the staged file.
--   company_name      : name to match (Jaro-Winkler similarity must exceed 80)
--   extraction_fields : JSON text describing the response format; parsed with
--                       PARSE_JSON before being passed to AI_EXTRACT
-- Returns the AI_EXTRACT result, or an object with a single 'error' key when
-- no document matches or the extraction fails.
CREATE OR REPLACE PROCEDURE extract_s1_info(
    company_name VARCHAR,
    extraction_fields VARCHAR
)
RETURNS OBJECT
LANGUAGE SQL
AS
$$
DECLARE
    -- Named doc_path (not file_path) so it cannot be confused with the
    -- file_path column of sec_s1_documents inside the query below.
    doc_path VARCHAR;
    extraction_result OBJECT;
    parsed_fields OBJECT;
BEGIN
    -- Parse the JSON string into an OBJECT
    SELECT PARSE_JSON(:extraction_fields) INTO :parsed_fields;

    -- Pick the best-matching document; prefer the truncated file when present.
    -- ORDER BY makes LIMIT 1 return the closest name instead of an arbitrary
    -- row that happens to pass the threshold. Columns are qualified with the
    -- table alias because the company_name argument shares the column's name.
    SELECT COALESCE(d.trunc_file_path, d.file_path)
    INTO :doc_path
    FROM sec_s1_documents AS d
    WHERE JAROWINKLER_SIMILARITY(:company_name, d.company_name) > 80
    ORDER BY JAROWINKLER_SIMILARITY(:company_name, d.company_name) DESC
    LIMIT 1;

    IF (:doc_path IS NULL) THEN
        RETURN OBJECT_CONSTRUCT('error', 'No S-1 filing found for: ' || :company_name);
    END IF;

    -- Run the extraction; report a failure as an error object instead of raising
    BEGIN
        SELECT AI_EXTRACT(
            file => TO_FILE('@sec_filings_stage', :doc_path),
            responseFormat => :parsed_fields
        ) INTO :extraction_result;
    EXCEPTION
        WHEN OTHER THEN
            RETURN OBJECT_CONSTRUCT('error', 'AI_EXTRACT failed: ' || SQLERRM);
    END;

    RETURN :extraction_result;
END;
$$;

In [None]:
-- Semantic view over sec_s1_documents (alias docs) and sec_company_names
-- (alias companies), related through the company CIK.
CREATE OR REPLACE SEMANTIC VIEW sec_s1_semantic_view
-- Logical tables: alias, underlying table, declared key, synonyms and comment.
-- NOTE(review): PRIMARY KEY (cik) presumes sec_company_names holds one row per
-- CIK; confirm against how that table is loaded.
TABLES (
    docs as sec_s1_documents
    PRIMARY KEY (company_cik, accession_number)
    WITH SYNONYMS = ('s1 documents', 'sec filings', 'registration statements')
    COMMENT = 'Core table containing SEC S-1 filing documents',
    
    companies as sec_company_names  
    PRIMARY KEY (cik)
    WITH SYNONYMS = ('company info', 'tickers', 'company mappings')
    COMMENT = 'Additional company information from SEC Edgar Database'
)
-- Join path from documents to company information.
-- NOTE(review): company_cik is declared as a number in the CREATE TABLE cell,
-- while the loader casts cik to a string before writing sec_company_names;
-- confirm the two sides compare as intended.
RELATIONSHIPS (
    docs (company_cik) REFERENCES companies (cik)
)
-- Facts: the document text and the two stage paths, all taken from docs.
FACTS (
    PUBLIC docs.raw_content AS raw_content
    WITH SYNONYMS = ('filing content', 'document text', 'full text')
    COMMENT = 'Full text content of the S-1 filing document for AI extraction',
    
    PUBLIC docs.file_path AS file_path
    WITH SYNONYMS = ('document path', 'storage path')
    COMMENT = 'Original document file path in storage',
    
    PUBLIC docs.trunc_file_path AS trunc_file_path
    WITH SYNONYMS = ('short path', 'truncated path')
    COMMENT = 'Truncated document file path (first 100 pages) for faster processing'
)
-- Dimensions: identifiers and descriptive attributes from docs, plus the
-- ticker, which is the only column exposed from companies.
DIMENSIONS (
    PUBLIC docs.company_cik AS company_cik
    WITH SYNONYMS = ('cik', 'central index key', 'company id')
    COMMENT = 'Central Index Key - unique SEC identifier for each company',
    
    PUBLIC docs.accession_number AS accession_number
    WITH SYNONYMS = ('filing number', 'document id', 'accession')
    COMMENT = 'SEC accession number - unique identifier for each filing',
    
    PUBLIC docs.company_name AS company_name
    WITH SYNONYMS = ('company', 'business name', 'corporation')
    COMMENT = 'Official company name as registered with SEC',

    PUBLIC docs.sicdescription AS sicdescription
    WITH SYNONYMS = ('company sic', 'company description', 'business type')
    COMMENT = 'Official SIC business description of the company',
    
    PUBLIC companies.ticker AS ticker
    WITH SYNONYMS = ('stock symbol', 'trading symbol')
    COMMENT = 'Stock ticker symbol for the company'
    
)
COMMENT = 'Semantic view for SEC S-1 filing documents analysis and AI extraction. Enables natural language queries about company filings, IPO documents, risk factors, and document characteristics.';