# Corporate Intelligence Extractor

An advanced AI-powered system for extracting comprehensive company information from text data, featuring intelligent analysis and business intelligence capabilities.

### **Unique Features:**
- **Confidence Scoring**: Advanced confidence metrics for extraction quality assessment
- **Smart Date Processing**: Fuzzy matching and intelligent date normalization
- **Geographic Intelligence**: Location extraction with validation
- **Industry Analytics**: Comprehensive sector analysis and insights
- **Enhanced Data Export**: Multi-format output with enriched metadata
- **Validation Framework**: Built-in quality assurance and data verification

### **Advanced Output:**
- Company information with confidence scores
- Industry distribution analysis
- Founder demographics and insights
- Geographic mapping and validation
- Decade-based founding trends
- Enhanced CSV export with metadata

## Setup and Installation

In [None]:
# required packages
!pip install langchain langchain-google-genai pydantic pandas

Collecting langchain
  Downloading langchain-0.3.27-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.1.9-py3-none-any.whl.metadata (7.2 kB)
Collecting pydantic
  Using cached pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting pandas
  Using cached pandas-2.3.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting langchain-core<1.0.0,>=0.3.72 (from langchain)
  Downloading langchain_core-0.3.74-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.9 (from langchain)
  Downloading langchain_text_splitters-0.3.9-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith>=0.1.17 (from langchain)
  Downloading langsmith-0.4.13-py3-none-any.whl.metadata (14 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading sqlalchemy-2.0.43-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.6 kB)
Collecting annotated-types>=0.6.0 (from pydantic)
  Using cached annotated_types-0.7.0-py3-none-any.w

In [None]:
import os
import re
from datetime import datetime
from typing import List, Dict, Optional, Any

import pandas as pd
from pydantic import BaseModel, ConfigDict, Field

# google.colab only exists inside Google Colab. Fall back to None elsewhere so
# the remaining imports in this cell still run on a local machine.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain.schema import BaseOutputParser

ModuleNotFoundError: No module named 'google.colab'

## API Configuration

In [None]:
# Configure the Gemini API key for langchain-google-genai.
# Inside Google Colab the key is read from the notebook's user secrets;
# anywhere else it must already be present in the environment.
try:
    from google.colab import userdata as _colab_userdata
except ImportError:  # not running inside Colab
    _colab_userdata = None

_gemini_api_key = None
if _colab_userdata is not None:
    _gemini_api_key = _colab_userdata.get("GEMINI_API_KEY")
if not _gemini_api_key:
    _gemini_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")

# os.environ only accepts strings, so fail with a readable message instead of
# a TypeError/NameError when no key could be found.
if not _gemini_api_key:
    raise RuntimeError(
        "No Gemini API key found: add GEMINI_API_KEY to the Colab secrets or "
        "set the GEMINI_API_KEY / GOOGLE_API_KEY environment variable."
    )
os.environ["GOOGLE_API_KEY"] = _gemini_api_key

NameError: name 'userdata' is not defined

In [None]:
# Gemini chat model instance; the extraction chain defined later in this
# notebook pipes its prompt into this object.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0.3,  # low sampling temperature
    max_tokens=None  # no explicit output cap; presumably the model default applies — TODO confirm
)

In [None]:
# Quick smoke test of the helper functions.
# NOTE: normalize_date_advanced, extract_location and infer_industry are
# defined in the "Step 2" cell further down; run that cell before this one.
print("Testing Enhanced Date Normalization:")
test_dates = ["1946", "founded in 1955", "est. 2001", "started 1999"]
for date_str in test_dates:
    # The helper returns a (normalized_date, confidence) pair.
    normalized, confidence = normalize_date_advanced(date_str)
    print(f"  '{date_str}' -> {normalized} (confidence: {confidence:.2f})")

print("\nTesting Location Extraction:")
test_locations = ["Founded in New York", "California-based company", "Tokyo, Japan headquarters"]
for loc_str in test_locations:
    # The helper returns a (location_or_None, confidence) pair.
    location, confidence = extract_location(loc_str)
    print(f"  '{loc_str}' -> {location} (confidence: {confidence:.2f})")

print("\nTesting Industry Classification:")
test_descriptions = [
    "technology company developing software",
    "automotive manufacturer",
    "financial services provider"
]
for desc in test_descriptions:
    # infer_industry takes (company_name, context); there is no company name
    # for these free-text descriptions, so pass an empty one.
    industry, confidence = infer_industry("", desc)
    print(f"  '{desc}' -> {industry} (confidence: {confidence:.2f})")

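### Data Models for Extracted Company Information

The Pydantic models below define the structure of each extracted company record and of the metrics and analytics built on top of them (the extraction prompt template itself is defined later, in Step 3):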

In [None]:
class CompanyInfo(BaseModel):
    """Enhanced data model for extracted company information with advanced features."""

    # Pydantic v2 configuration. This replaces the nested ``class Config``,
    # which pydantic v2 still accepts but reports as deprecated.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "company_name": "Microsoft Corporation",
                "founding_date": "1975-04-04",
                "founders": ["Bill Gates", "Paul Allen"],
                "confidence_score": 0.95,
                "company_age": 49,
                "founder_count": 2,
                "founding_location": "Albuquerque, New Mexico",
                "industry_category": "Technology"
            }
        }
    )

    # Core fields: required, no defaults.
    company_name: str = Field(description="The full name of the company")
    founding_date: str = Field(description="The founding date in YYYY-MM-DD format")
    founders: List[str] = Field(description="List of founder names")

    # Derived / enrichment fields: optional, filled in during post-processing.
    confidence_score: float = Field(default=0.0, description="Extraction confidence (0-1)")
    company_age: Optional[int] = Field(default=None, description="Age of company in years")
    founder_count: int = Field(default=0, description="Number of founders")
    founding_location: Optional[str] = Field(default=None, description="City/Country of founding")
    industry_category: Optional[str] = Field(default=None, description="Inferred industry category")

In [None]:
class CompanyExtractionResult(BaseModel):
    """Container for multiple company extractions from a paragraph."""
    # Required: one CompanyInfo record per company found in the paragraph.
    companies: List[CompanyInfo] = Field(description="List of extracted companies")
    # Required: the source paragraph the records were extracted from.
    paragraph_processed: str = Field(description="The original paragraph text")
    # Optional timing value, defaults to 0.0 (unit presumably seconds — TODO confirm).
    processing_time: float = Field(default=0.0, description="Time taken to process")
    # Optional mean of the companies' confidence scores, defaults to 0.0.
    avg_confidence: float = Field(default=0.0, description="Average confidence score")

class ExtractionMetrics(BaseModel):
    """Quality and performance metrics for the extraction process."""
    # All fields are required (no defaults); the pipeline fills them in one go.
    total_companies: int = Field(description="Total companies extracted")
    avg_confidence_score: float = Field(description="Average confidence across all extractions")
    # "High confidence" means a confidence score strictly above 0.8.
    high_confidence_count: int = Field(description="Number of high-confidence extractions (>0.8)")
    processing_time_total: float = Field(description="Total processing time in seconds")
    paragraphs_processed: int = Field(description="Number of paragraphs processed")
    # Number of extracted companies divided by the number of paragraphs.
    extraction_rate: float = Field(description="Companies per paragraph ratio")

class IndustryAnalytics(BaseModel):
    """Industry-specific analytics and insights."""
    # All fields are required. The three Dict[str, int] fields map a label
    # (industry, decade, location) to a count of companies.
    industry_distribution: Dict[str, int] = Field(description="Count by industry category")
    decade_distribution: Dict[str, int] = Field(description="Companies founded per decade")
    # Free-form statistics; the value types are not constrained here.
    founder_analytics: Dict[str, Any] = Field(description="Founder-related statistics")
    geographic_distribution: Dict[str, int] = Field(description="Geographic founding locations")

## Testing Phase

### Step 1: Breaking Text into Paragraphs

In [None]:
# Sample essay about company founding histories, used as the input text for the tests below
essay_text = """
In the ever-evolving landscape of global commerce, the origin stories of major corporations are not merely tales of personal ambition and entrepreneurial spirit but also reflections of broader socio-economic trends and technological revolutions that have reshaped industries. These narratives, which often begin with modest ambitions, unfold into chronicles of innovation and strategic foresight that define industries and set benchmarks for future enterprises.

Early Foundations: Pioneers of Industry
One of the earliest examples is The Coca-Cola Company, founded on May 8, 1886, by Dr. John Stith Pemberton in Atlanta, Georgia. Initially sold at Jacob's Pharmacy as a medicinal beverage, Coca-Cola would become one of the most recognized brands worldwide, revolutionizing the beverage industry.
Similarly, Sony Corporation was established on May 7, 1946, by Masaru Ibuka and Akio Morita in Tokyo, Japan. Starting with repairing and building electrical equipment in post-war Japan, Sony would grow to pioneer electronics, entertainment, and technology.
As the mid-20th century progressed, McDonald's Corporation emerged as a game-changer in the fast-food industry. Founded on April 15, 1955, in Des Plaines, Illinois, by Ray Kroc, McDonald's built upon the original concept of Richard and Maurice McDonald to standardize and scale fast-food service globally. Around the same period, Intel Corporation was established on July 18, 1968, by Robert Noyce and Gordon Moore in Mountain View, California

driving advancements in semiconductors and microprocessors that became the backbone of modern computing.

The Rise of Technology Titans
Samsung Electronics Co., Ltd., founded on January 13, 1969, by Lee Byung-chul in Su-dong, South Korea, initially focused on producing electrical appliances like televisions and refrigerators. As Samsung expanded into semiconductors, telecommunications, and digital media, it
grew into a global technology leader. Similarly, Microsoft Corporation was founded on April 4, 1975, by Bill Gates and Paul Allen in Albuquerque, New Mexico, with the vision of placing a computer on every desk and in every home.
In Cupertino, California, Apple Inc. was born on April 1, 1976, founded by Steve Jobs, Steve Wozniak, and Ronald Wayne. Their mission to make personal computing accessible and elegant revolutionized technology and design. A few years later, Oracle Corporation was established on June 16, 1977, by Larry Ellison, Bob Miner, and Ed Oates in Santa Clara, California.
Specializing in relational databases, Oracle would become a cornerstone of enterprise software and cloud computing.
NVIDIA Corporation, founded on April 5, 1993, by Jensen Huang, Chris Malachowsky, and Curtis Priem in Santa Clara, California, began with a focus on graphics processing units (GPUs) for gaming. Today, NVIDIA is a leader in artificial intelligence, deep learning, and autonomous systems, showcasing the power of continuous innovation.

E-Commerce and the Internet Revolution
The 1990s witnessed a dramatic shift toward e-commerce and internet technologies. Amazon.com Inc. was founded on July 5, 1994, by Jeff Bezos in a garage in Bellevue, Washington, with the vision of becoming the world's largest online bookstore. This vision rapidly expanded to encompass
e-commerce, cloud computing, and digital streaming. Similarly, Google LLC was founded on September 4, 1998, by Larry Page and Sergey Brin, PhD students at Stanford University, in a garage in Menlo Park, California.
Google's mission to "organize the world's information" transformed how we search, learn, and connect.
In Asia, Alibaba Group Holding Limited was founded on June 28, 1999, by Jack Ma and 18 colleagues in Hangzhou, China. Originally an e-commerce platform connecting manufacturers with buyers, Alibaba expanded into cloud

computing, digital entertainment, and financial technology, becoming a global powerhouse.
In Europe, SAP SE was founded on April 1, 1972, by Dietmar Hopp,
Hans-Werner Hector, Hasso Plattner, Klaus Tschira, and Claus Wellenreuther in Weinheim, Germany. Specializing in enterprise resource planning (ERP) software, SAP revolutionized how businesses manage operations and data.

Social Media and Digital Platforms
The 2000s brought a wave of social media and digital platforms that reshaped communication and commerce. LinkedIn Corporation was founded on December 28, 2002, by Reid Hoffman and a team from PayPal and Socialnet.com in Mountain View, California, focusing on professional networking.
Facebook, Inc. (now Meta Platforms, Inc.) was launched on February 4, 2004, by Mark Zuckerberg and his college roommates in Cambridge, Massachusetts, evolving into a global social networking behemoth.
Another transformative platform, Twitter, Inc., was founded on March 21, 2006, by Jack Dorsey, Biz Stone, and Evan Williams in San Francisco, California. Starting as a microblogging service, Twitter became a critical tool for communication and social commentary. Spotify AB, founded on April 23, 2006, by Daniel Ek and Martin Lorentzon in Stockholm, Sweden, leveraged streaming technology to democratize music consumption, fundamentally altering the music industry.
In the realm of video-sharing, YouTube LLC was founded on February 14, 2005, by Steve Chen, Chad Hurley, and Jawed Karim in San Mateo, California. YouTube became the leading platform for user-generated video content, influencing global culture and media consumption.

Innovators in Modern Technology
Tesla, Inc., founded on July 1, 2003, by a group including Elon Musk, Martin Eberhard, Marc Tarpenning, JB Straubel, and Ian Wright, in San Carlos, California, championed the transition to sustainable energy with its electric vehicles and energy solutions. Airbnb, Inc., founded in August 2008 by Brian Chesky, Joe Gebbia, and Nathan Blecharczyk in San Francisco, California, disrupted traditional hospitality with its peer-to-peer lodging platform.
In the realm of fintech, PayPal Holdings, Inc. was established in December 1998 by Peter Thiel, Max Levchin, Luke Nosek, and Ken Howery in Palo Alto,

California. Originally a cryptography company, PayPal became a global leader in online payments. Stripe, Inc., founded in 2010 by Patrick and John Collison in Palo Alto, California, followed suit, simplifying online payments and enabling digital commerce.
Square, Inc. (now Block, Inc.), founded on February 20, 2009, by Jack Dorsey and Jim McKelvey in San Francisco, California, revolutionized mobile payment systems with its simple and accessible card readers.

Recent Disruptors
Zoom Video Communications, Inc. was founded on April 21, 2011, by Eric Yuan in San Jose, California. Initially designed for video conferencing, Zoom became essential during the COVID-19 pandemic, transforming remote work and communication. Slack Technologies, LLC, founded in 2009 by Stewart Butterfield, Eric Costello, Cal Henderson, and Serguei Mourachov in Vancouver, Canada, redefined workplace communication with its innovative messaging platform.
Rivian Automotive, Inc., founded on June 23, 2009, by RJ Scaringe in Plymouth, Michigan, entered the electric vehicle market with a focus on adventure and sustainability. SpaceX, established on March 14, 2002, by Elon Musk in Hawthorne, California, revolutionized aerospace with reusable rockets and ambitious plans for Mars exploration.
TikTok, developed by ByteDance and launched in September 2016 by Zhang Yiming in Beijing, China, revolutionized short-form video content, becoming a cultural phenomenon worldwide.

Conclusion
These corporations, with their diverse beginnings and visionary founders, exemplify the interplay of innovation, timing, and strategic foresight that shapes industries and transforms markets. From repairing electronics in post-war Japan to building global e-commerce empires and redefining space exploration, their stories are milestones in the narrative of global economic transformation. Each reflects not only the aspirations of their founders but also the technological advancements and socio-economic trends of their time, serving as inspirations for future innovators.
"""

In [None]:
def split_into_paragraphs(text: str, min_length: int = 50) -> List[str]:
    """Split text into paragraphs, dropping empty and very short chunks.

    Paragraphs are separated by one or more blank lines. A blank line may
    contain spaces or tabs, and Windows (CRLF) line endings are handled.

    Args:
        text: Raw text to split. ``None`` or an empty string yields ``[]``.
        min_length: A paragraph is kept only when its stripped length is
            strictly greater than this value. The default of 50 filters out
            headings and stray fragments.

    Returns:
        The stripped paragraphs, in their original order.
    """
    if not text:
        return []
    # "\n\s*\n" also matches "\r\n\r\n" and blank lines holding only whitespace,
    # which a plain split on "\n\n" would miss.
    chunks = (chunk.strip() for chunk in re.split(r'\n\s*\n', text))
    return [chunk for chunk in chunks if len(chunk) > min_length]

In [None]:
# Break the sample essay into paragraphs and preview the first one.
paragraphs = split_into_paragraphs(essay_text)
print(f"Total paragraphs found: {len(paragraphs)}")
print("First paragraph preview:")
opening_paragraph = paragraphs[0]
print(f"{opening_paragraph[:200]}...")

### Step 2: Fixing Date Formats

In [None]:
def normalize_date_advanced(date_str: str) -> tuple[str, float]:
    """
    Advanced date normalization with confidence scoring.

    Recognized inputs, most precise first:
      * "Month D, YYYY" (full or abbreviated English month name)  -> 1.0
      * "MM/DD/YYYY" or "MM-DD-YYYY"                              -> 1.0
      * "YYYY-MM-DD" or "YYYY/MM/DD"                              -> 1.0
      * month name plus year (day defaults to 01)                 -> 0.7
      * year only (month and day default to 01)                   -> 0.5

    Years from 1000 to 2099 are accepted. Full dates that do not exist on
    the calendar (e.g. February 30) are not emitted; the function falls back
    to the next, less precise level instead.

    Returns: (normalized_date, confidence_score). When no year can be found,
    or the input is not a string, the sentinel ("1900-01-01", 0.1) is returned.
    """
    fallback = ("1900-01-01", 0.1)
    if not isinstance(date_str, str):
        return fallback

    text = date_str.strip().lower()
    text = re.sub(r'^(on|in)\s+', '', text)  # drop a leading "on " / "in "

    month_patterns = {
        'january': '01', 'jan': '01', 'february': '02', 'feb': '02',
        'march': '03', 'mar': '03', 'april': '04', 'apr': '04',
        'may': '05', 'june': '06', 'jun': '06', 'july': '07', 'jul': '07',
        'august': '08', 'aug': '08', 'september': '09', 'sep': '09', 'sept': '09',
        'october': '10', 'oct': '10', 'november': '11', 'nov': '11',
        'december': '12', 'dec': '12'
    }

    # Four-digit year between 1000 and 2099 (19th-century founders included).
    year_re = r'(?:1\d|20)\d{2}'

    year_match = re.search(r'\b' + year_re + r'\b', text)
    if not year_match:
        return fallback
    year = year_match.group()

    def is_real_date(y: str, m: str, d: str) -> bool:
        # datetime() raises ValueError for impossible calendar dates.
        try:
            datetime(int(y), int(m), int(d))
        except ValueError:
            return False
        return True

    # 1) "may 8, 1886" / "sept. 4, 1998"
    match = re.search(r'\b([a-z]+)\.?\s+(\d{1,2}),?\s+(' + year_re + r')\b', text)
    if match:
        month_name, day, full_year = match.groups()
        month = month_patterns.get(month_name)
        if month and is_real_date(full_year, month, day):
            return f"{full_year}-{month}-{day.zfill(2)}", 1.0

    # 2) "04/04/1975" (month first)
    match = re.search(r'\b(\d{1,2})[/-](\d{1,2})[/-](' + year_re + r')\b', text)
    if match:
        month, day, full_year = match.groups()
        if is_real_date(full_year, month, day):
            return f"{full_year}-{month.zfill(2)}-{day.zfill(2)}", 1.0

    # 3) "1975-04-04"
    match = re.search(r'\b(' + year_re + r')[/-](\d{1,2})[/-](\d{1,2})\b', text)
    if match:
        full_year, month, day = match.groups()
        if is_real_date(full_year, month, day):
            return f"{full_year}-{month.zfill(2)}-{day.zfill(2)}", 1.0

    # 4) Month + year only. Match whole words so that e.g. "market" is not
    #    mistaken for "mar"; longest names first so "sept" wins over "sep".
    month_names = '|'.join(sorted(month_patterns, key=len, reverse=True))
    month_match = re.search(r'\b(' + month_names + r')\b', text)
    if month_match:
        return f"{year}-{month_patterns[month_match.group(1)]}-01", 0.7

    # 5) Year only.
    return f"{year}-01-01", 0.5

def extract_location(text: str) -> tuple[Optional[str], float]:
    """Extract founding location with confidence scoring.

    Looks for the word "in" followed by capitalized words:
      * "in City, Region" -> ("City, Region", 0.9)
      * "in Place"        -> ("Place", 0.7)

    The comma is required for the two-part form, so a multi-word place such
    as "in New York" is kept whole instead of being split into "New, York".
    "in" must be a separate word, so the tail of "Berlin" does not count.

    Returns:
        (location, confidence), or (None, 0.0) when nothing matches or the
        text is empty.
    """
    if not text:
        return None, 0.0

    # One or more capitalized words, e.g. "San Francisco".
    place = r'[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*'

    city_region = re.search(r'\bin\s+(' + place + r'),\s*(' + place + r')', text)
    if city_region:
        city, region = city_region.groups()
        return f"{city}, {region}", 0.9

    single_place = re.search(r'\bin\s+(' + place + r')', text)
    if single_place:
        return single_place.group(1), 0.7

    return None, 0.0

def infer_industry(company_name: str, context: str) -> tuple[Optional[str], float]:
    """Infer industry category from company name and context.

    Every industry is scored by how many of its keywords occur in
    "<company_name> <context>" (case-insensitive). The industry with the most
    hits wins; ties go to the industry listed first.

    Keywords are matched at the start of a word, so "tech" matches
    "technology" but not "fintech". Keywords of three letters or fewer
    ("AI", "car") must be whole words (an optional plural "s" is allowed), so
    "car" no longer fires on "healthcare" or "card".

    Returns:
        (industry, confidence) where confidence is 0.3 per matched keyword,
        capped at 0.9, or (None, 0.0) when no keyword matches.
    """
    industry_keywords = {
        'Technology': ['software', 'tech', 'computer', 'digital', 'electronics', 'semiconductor', 'AI', 'data'],
        'E-commerce': ['online', 'e-commerce', 'marketplace', 'shopping', 'retail'],
        'Social Media': ['social', 'networking', 'media', 'platform', 'communication'],
        'Automotive': ['automotive', 'car', 'vehicle', 'electric', 'transportation'],
        'Finance': ['financial', 'payment', 'fintech', 'banking', 'investment'],
        'Entertainment': ['entertainment', 'streaming', 'music', 'video', 'gaming'],
        'Food & Beverage': ['food', 'beverage', 'restaurant', 'fast-food'],
        'Aerospace': ['aerospace', 'space', 'aviation', 'rocket'],
        'Healthcare': ['health', 'medical', 'pharmaceutical', 'biotech']
    }

    combined_text = f"{company_name} {context}".lower()

    def count_hits(keywords: List[str]) -> int:
        hits = 0
        for keyword in keywords:
            # The text is lowercased, so the keyword must be lowercased too
            # (otherwise "AI" could never match).
            escaped = re.escape(keyword.lower())
            if len(keyword) <= 3:
                pattern = r'\b' + escaped + r's?\b'
            else:
                pattern = r'\b' + escaped
            if re.search(pattern, combined_text):
                hits += 1
        return hits

    best_industry, best_hits = None, 0
    for industry, keywords in industry_keywords.items():
        hits = count_hits(keywords)
        if hits > best_hits:  # strict ">" keeps the first industry on ties
            best_industry, best_hits = industry, hits

    if best_industry is None:
        return None, 0.0
    return best_industry, min(0.9, best_hits * 0.3)

def calculate_company_age(founding_date: str) -> Optional[int]:
    """Calculate company age in years.

    Args:
        founding_date: Date string whose first "-"-separated part is the
            founding year, e.g. "1975-04-04".

    Returns:
        Current calendar year minus the founding year, or None when the year
        cannot be read from the input (wrong type, empty or non-numeric).
    """
    try:
        founding_year = int(founding_date.split('-')[0])
    except (AttributeError, TypeError, ValueError):
        # Only parsing problems are swallowed; anything else should surface.
        return None
    return datetime.now().year - founding_year

In [None]:
# Fixtures for the three helper demos below.
test_dates = ["May 8, 1886", "April 4, 1975", "2009", "August 2008", "December 1998"]
test_locations = [
    "founded in San Francisco, California",
    "established in Tokyo, Japan",
    "created in London",
]
test_companies = [
    ("Microsoft Corporation", "computer software technology"),
    ("Tesla Inc", "electric vehicle automotive"),
    ("Facebook", "social networking platform"),
]

# Date normalization: raw string -> (ISO date, confidence).
print("🔍 Testing Enhanced Date Normalization:")
print("-" * 50)
for date in test_dates:
    normalized, confidence = normalize_date_advanced(date)
    print(f"'{date}' → '{normalized}' (confidence: {confidence:.2f})")

# Location extraction: sentence -> (location, confidence).
print("\n🌍 Testing Location Extraction:")
print("-" * 40)
for text in test_locations:
    location, confidence = extract_location(text)
    print(f"'{text}' → Location: {location} (confidence: {confidence:.2f})")

# Industry inference: (company, context) -> (industry, confidence).
print("\n🏭 Testing Industry Classification:")
print("-" * 45)
for company, context in test_companies:
    industry, confidence = infer_industry(company, context)
    print(f"'{company}' → Industry: {industry} (confidence: {confidence:.2f})")

### Step 3: Testing with One Paragraph

In [None]:
# Prompt asking the model to return one JSON object with a "companies" list.
# Each entry carries name, founding date, founders, a self-reported confidence
# score, founding location and an industry description.
# Doubled braces ({{ }}) are literal braces in the template; {paragraph} is
# the only input variable.
extraction_prompt = ChatPromptTemplate.from_template("""
You are an elite corporate intelligence analyst with expertise in extracting structured business information from unstructured text. Your mission is to perform comprehensive company data extraction with maximum accuracy.

**EXTRACTION TARGETS:**
For each company mentioned, extract:
1. **Company Name**: Full official legal name (prefer complete form over abbreviations)
2. **Founding Date**: Exact date as mentioned (maintain original format precision)
3. **Founders**: All individual founders mentioned (separate persons, not groups)
4. **Confidence Score**: Your confidence in the extraction accuracy (0.0 to 1.0)
5. **Location**: Founding city/country if mentioned
6. **Industry Context**: Brief industry description based on context

**EXTRACTION PROTOCOLS:**
- Extract ALL companies in the paragraph (comprehensive scan)
- Prioritize precision over speed - verify each extraction
- Use exact company names as written in source text
- Include all founders mentioned individually
- Assign confidence scores based on information clarity
- If founding location is mentioned, extract it precisely

Return results in this enhanced JSON structure:
{{
  "companies": [
    {{
      "company_name": "Complete Official Company Name",
      "founding_date": "exact date as mentioned in text",
      "founders": ["Individual Founder 1", "Individual Founder 2"],
      "confidence_score": 0.95,
      "founding_location": "City, State/Country (if mentioned)",
      "industry_context": "Brief industry description"
    }}
  ]
}}

**TEXT TO ANALYZE:**
{paragraph}

**AI RESPONSE:**
""")

# Chain: fill the prompt -> call the model -> parse the reply as JSON.
extraction_chain = extraction_prompt | llm | JsonOutputParser()

In [None]:
# Use the second paragraph (index 1) as the single-paragraph test input and
# show its first 200 characters.
test_paragraph = paragraphs[1]
print("Testing extraction on paragraph:")
print(test_paragraph[:200] + "...")

In [None]:
# Run the extraction chain on the test paragraph. Any failure is reported
# instead of raised, so `result` only exists if the call succeeded.
try:
    result = extraction_chain.invoke({"paragraph": test_paragraph})
    print("\nExtraction Result:")
    print(result)
except Exception as e:
    print(f" Extraction failed: {e}")

In [None]:
def _clean_founder_names(raw_founders: Any) -> List[str]:
    """Return founder names with one leading title removed and whitespace collapsed."""
    if isinstance(raw_founders, str):
        raw_founders = [raw_founders]
    if not isinstance(raw_founders, (list, tuple)):
        # Covers a JSON null or any other unexpected shape.
        return []

    cleaned = []
    for founder in raw_founders:
        if not isinstance(founder, str):
            continue
        clean_name = founder.strip()
        clean_name = re.sub(r'^(Dr\.|Prof\.|Mr\.|Ms\.|Mrs\.)\s+', '', clean_name)
        clean_name = re.sub(r'\s+', ' ', clean_name)
        if len(clean_name) > 1:
            cleaned.append(clean_name)
    return cleaned


def _coerce_confidence(value: Any, default: float = 0.5) -> float:
    """Convert a raw confidence value to a float in [0, 1], or return default."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return default
    if score != score:  # NaN
        return default
    return max(0.0, min(1.0, score))


def process_extraction_result_advanced(raw_result: Dict, original_paragraph: str) -> List[CompanyInfo]:
    """Advanced processing with enhanced features and validation.

    Converts the parsed model output into CompanyInfo records: normalizes the
    founding date, cleans founder names, computes the company age, fills in
    location and industry from the helper functions, and combines the
    confidence values.

    Null values are tolerated: a null "companies", "founders",
    "confidence_score" or "company_name" falls back to an empty list, an
    empty list, 0.5 and "Unknown" respectively, instead of dropping the
    company or raising.

    Args:
        raw_result: Parsed JSON; expected to be a dict with a "companies" list.
        original_paragraph: Source text, used as a fallback when the model
            gave no location or industry context.

    Returns:
        One CompanyInfo per entry that could be processed. Entries that still
        fail are reported with a printed message and skipped.
    """
    companies = []

    if not isinstance(raw_result, dict):
        return companies

    for company_data in raw_result.get("companies") or []:
        try:
            raw_date = company_data.get("founding_date", "")
            if raw_date:
                # str() lets a bare numeric year (e.g. 1998) be normalized too.
                normalized_date, date_confidence = normalize_date_advanced(str(raw_date))
            else:
                normalized_date, date_confidence = "1900-01-01", 0.1

            # standardized founder names
            cleaned_founders = _clean_founder_names(company_data.get("founders"))

            # Extract additional information
            company_name = company_data.get("company_name") or "Unknown"
            extraction_confidence = _coerce_confidence(company_data.get("confidence_score", 0.5))

            company_age = calculate_company_age(normalized_date)

            founding_location = company_data.get("founding_location")
            if not founding_location:
                founding_location, _ = extract_location(original_paragraph)

            # Infer industry from the model's description when there is one,
            # otherwise from the whole paragraph.
            industry_context = company_data.get("industry_context") or ""
            if not industry_context:
                industry_category, _ = infer_industry(company_name, original_paragraph)
            else:
                industry_category, _ = infer_industry(company_name, industry_context)

            final_confidence = extraction_confidence * date_confidence
            if len(cleaned_founders) > 0:
                final_confidence *= 1.1  # Boost for having founders
            if founding_location:
                final_confidence *= 1.1  # Boost for location info
            final_confidence = min(1.0, final_confidence)

            company_info = CompanyInfo(
                company_name=company_name,
                founding_date=normalized_date,
                founders=cleaned_founders,
                confidence_score=round(final_confidence, 3),
                company_age=company_age,
                founder_count=len(cleaned_founders),
                founding_location=founding_location,
                industry_category=industry_category
            )
            companies.append(company_info)

            if final_confidence > 0.8:
                print(f"✅ High-confidence extraction: {company_name} (confidence: {final_confidence:.3f})")

        except Exception as e:
            # Best effort: one malformed entry must not abort the whole batch.
            print(f"❌ Error processing company data: {e}")
            continue

    return companies

In [None]:
# Post-process the single-paragraph result (only if the extraction cell
# produced one) and print a summary card per company.
if 'result' in locals():
    processed_companies = process_extraction_result_advanced(result, test_paragraph)
    print("\n🎯 Enhanced Processing Results:")
    print("=" * 60)
    for company in processed_companies:
        print(f"\n🏢 Company: {company.company_name}")

        # The age is shown only when it is truthy (not None and not 0).
        if company.company_age:
            print(f"📅 Founded: {company.founding_date} ({company.company_age} years old)")
        else:
            print(f"📅 Founded: {company.founding_date}")

        print(f"👥 Founders: {', '.join(company.founders)} ({company.founder_count} total)")

        if company.founding_location:
            print(f"📍 Location: {company.founding_location}")
        else:
            print("📍 Location: Not specified")

        if company.industry_category:
            print(f"🏭 Industry: {company.industry_category}")
        else:
            print("🏭 Industry: Not classified")

        print(f"⭐ Confidence: {company.confidence_score:.1%}")
        print("-" * 40)

## Building the Complete System

### Creating the Main Pipeline

In [None]:
class AdvancedCompanyExtractionPipeline:
    """🚀 Next-generation corporate intelligence extraction system with advanced analytics."""

    def __init__(self, llm):
        """Store the model, reset the result/metric lists and build the chain.

        Args:
            llm: Chat model that the extraction prompt is piped into.
        """
        self.llm = llm
        self.companies_extracted = []
        # One metrics dict is appended here per processed batch
        # (see _process_with_analytics).
        self.processing_metrics = []

        # Prompt asking for one JSON object with a "companies" list. Doubled
        # braces are literal braces; {paragraph} is the only input variable.
        self.extraction_prompt = ChatPromptTemplate.from_template("""
You are an elite corporate intelligence analyst specializing in extracting structured business data from narrative text. Your analysis must be thorough, accurate, and confidence-rated.

🎯 **MISSION**: Extract comprehensive company intelligence from the provided text segment.

📊 **REQUIRED DATA POINTS** (for each company):
1. **Company Name**: Official legal name (prefer full form over abbreviations)
2. **Founding Date**: Precise date as mentioned in source
3. **Founders**: Individual founder names (separate people, not collective terms)
4. **Confidence Assessment**: Rate your extraction confidence (0.0-1.0)
5. **Geographic Origin**: Founding location if specified
6. **Industry Classification**: Business sector based on context clues

🔍 **ANALYSIS PROTOCOLS**:
- Scan entire paragraph for ALL company mentions
- Verify founder names are individuals, not groups/teams
- Maintain source accuracy - extract exactly as written
- Rate confidence based on information clarity and completeness
- Include partial information with appropriate confidence scoring

📋 **OUTPUT FORMAT**:
{{
  "companies": [
    {{
      "company_name": "Exact Company Name from Text",
      "founding_date": "date exactly as mentioned",
      "founders": ["Founder Name 1", "Founder Name 2"],
      "confidence_score": 0.85,
      "founding_location": "City, Region (if mentioned)",
      "industry_context": "Industry description based on context"
    }}
  ]
}}

📄 **SOURCE TEXT**:
{paragraph}

🤖 **INTELLIGENCE REPORT**:
""")

        # Chain: prompt -> model -> JSON parser -> post-processing that turns
        # the parsed dict into CompanyInfo objects and records metrics.
        self.extraction_chain = (
            self.extraction_prompt
            | self.llm
            | JsonOutputParser()
            | RunnableLambda(self._process_with_analytics)
        )

    def _process_with_analytics(self, raw_result: Dict) -> List[CompanyInfo]:
        """Convert one raw parser result into CompanyInfo records and log batch metrics.

        The raw result is passed to process_extraction_result_advanced with an
        empty paragraph string. One dict holding the elapsed time, company
        count, mean confidence and number of scores above 0.8 is appended to
        self.processing_metrics.
        """
        import time

        started_at = time.time()
        companies = process_extraction_result_advanced(raw_result, "")
        elapsed = time.time() - started_at

        # Per-batch statistics; both default to zero for an empty batch.
        scores = [c.confidence_score for c in companies]
        batch_metrics = {
            'processing_time': elapsed,
            'companies_count': len(companies),
            'avg_confidence': sum(scores) / len(scores) if scores else 0.0,
            'high_confidence_count': len([s for s in scores if s > 0.8]) if scores else 0,
        }
        self.processing_metrics.append(batch_metrics)

        return companies

    def process_paragraph_advanced(self, paragraph: str, paragraph_index: int = 0) -> List[CompanyInfo]:
        """Run the extraction chain on one paragraph and print a progress line.

        Any exception raised while extracting or reporting is printed and an
        empty list is returned, so one bad paragraph does not stop a batch.
        """
        try:
            print(f"🔄 Processing paragraph {paragraph_index + 1}... ", end="")

            companies = self.extraction_chain.invoke({"paragraph": paragraph})

            # Nothing extracted: report it and hand back the empty result.
            if not companies:
                print("❌ No companies found")
                return companies

            print(f"✅ Found {len(companies)} companies")
            for company in companies:
                print(f"   🏢 {company.company_name} (confidence: {company.confidence_score:.1%})")
            return companies

        except Exception as e:
            print(f"💥 Error: {str(e)}")
            return []

    def process_text_with_analytics(self, text: str, request_delay: float = 0.3) -> tuple[List[CompanyInfo], ExtractionMetrics]:
        """Process complete text with comprehensive analytics generation.

        Splits the text into paragraphs, runs the extraction on each one,
        then builds and prints the summary metrics.

        Args:
            text: Full text to analyze.
            request_delay: Seconds to pause between consecutive paragraph
                requests, for rate limiting. Defaults to the previous fixed
                value of 0.3; pass 0 to disable.

        Returns:
            (all extracted companies, metrics for the whole run).
        """
        import time
        total_start_time = time.time()

        paragraphs = split_into_paragraphs(text)
        all_companies = []

        print("🚀 Starting Advanced Corporate Intelligence Extraction")
        print(f"📄 Processing {len(paragraphs)} text segments...")
        print("=" * 70)

        last_index = len(paragraphs) - 1
        for i, paragraph in enumerate(paragraphs):
            companies = self.process_paragraph_advanced(paragraph, i)
            all_companies.extend(companies)

            # Pause between requests for rate limiting. No request follows
            # the final paragraph, so sleeping there would only waste time.
            if request_delay > 0 and i < last_index:
                time.sleep(request_delay)

        total_processing_time = time.time() - total_start_time

        metrics = self._generate_metrics(all_companies, len(paragraphs), total_processing_time)

        print("\n" + "=" * 70)
        print("🎯 Extraction Complete! Summary:")
        print(f"   📊 Total Companies: {metrics.total_companies}")
        print(f"   ⭐ Average Confidence: {metrics.avg_confidence_score:.1%}")
        print(f"   🏆 High-Confidence Extractions: {metrics.high_confidence_count}")
        print(f"   ⏱️  Total Processing Time: {metrics.processing_time_total:.1f}s")
        print(f"   📈 Extraction Rate: {metrics.extraction_rate:.2f} companies/paragraph")

        return all_companies, metrics

    def _generate_metrics(self, companies: List[CompanyInfo], paragraphs_count: int, total_time: float) -> ExtractionMetrics:
        """Build the ``ExtractionMetrics`` summary for one extraction run.

        With no companies, every score-derived value is zero. Otherwise the
        mean confidence, the number of companies scoring above 0.8, and the
        companies-per-paragraph rate (paragraph count floored at 1) are
        computed from ``companies``.
        """
        company_total = len(companies) if companies else 0

        if company_total:
            scores = [company.confidence_score for company in companies]
            mean_score = sum(scores) / company_total
            strong_count = len([score for score in scores if score > 0.8])
            rate = company_total / max(paragraphs_count, 1)
        else:
            mean_score, strong_count, rate = 0.0, 0, 0.0

        return ExtractionMetrics(
            total_companies=company_total,
            avg_confidence_score=mean_score,
            high_confidence_count=strong_count,
            processing_time_total=total_time,
            paragraphs_processed=paragraphs_count,
            extraction_rate=rate,
        )

In [None]:
# Instantiate the extraction pipeline and list its advertised features.
print("🚀 Initializing Advanced Corporate Intelligence Extraction Pipeline...")
print("🔧 Loading AI models and configuring enhanced analytics...")

# Construct the pipeline *before* announcing readiness, so a constructor
# failure is not preceded by a misleading "ready" message.
advanced_pipeline = AdvancedCompanyExtractionPipeline(llm)
print("✅ Pipeline ready with advanced features enabled!")

print("\n🎯 Pipeline Features:")
print("   • Multi-modal data extraction")
print("   • Confidence scoring and validation")
print("   • Industry classification")
print("   • Geographic intelligence")
print("   • Real-time quality metrics")
print("   • Advanced analytics generation")

### Processing All Text

In [None]:
# Run the full extraction, then list the results grouped by confidence band.
print("🚀 Initiating Advanced Corporate Intelligence Extraction...")
print("🔍 Applying AI-powered analysis with confidence scoring...")

all_companies, extraction_metrics = advanced_pipeline.process_text_with_analytics(essay_text)

print("\n🎉 Advanced Extraction Results:")
print("=" * 70)

# Single pass over the results: > 0.8 is high, (0.5, 0.8] is medium,
# <= 0.5 is low.
high_confidence_companies = []
medium_confidence_companies = []
low_confidence_companies = []
for company in all_companies:
    score = company.confidence_score
    if score > 0.8:
        high_confidence_companies.append(company)
    elif score > 0.5:
        medium_confidence_companies.append(company)
    elif score <= 0.5:
        low_confidence_companies.append(company)

# High-confidence entries get the detailed, multi-line layout.
print(f"\n🏆 HIGH CONFIDENCE EXTRACTIONS ({len(high_confidence_companies)}):")
for i, company in enumerate(high_confidence_companies, 1):
    age_note = f" ({company.company_age} years)" if company.company_age else ""
    founder_names = ", ".join(company.founders)

    print(f"{i:2d}. 🏢 {company.company_name}")
    print(f"    📅 Founded: {company.founding_date}{age_note}")
    print(f"    👥 Founders: {founder_names}")
    if company.founding_location:
        print(f"    📍 Location: {company.founding_location}")
    if company.industry_category:
        print(f"    🏭 Industry: {company.industry_category}")
    print(f"    ⭐ Confidence: {company.confidence_score:.1%}")
    print()

# The remaining bands share a compact one-line layout and are only shown
# when they contain at least one company.
for heading, bucket in (
    ("\n⚠️  MEDIUM CONFIDENCE EXTRACTIONS", medium_confidence_companies),
    ("\n🔍 NEEDS REVIEW", low_confidence_companies),
):
    if not bucket:
        continue
    print(f"{heading} ({len(bucket)}):")
    for i, company in enumerate(bucket, 1):
        print(f"{i:2d}. 🏢 {company.company_name} - {company.founding_date} (⭐ {company.confidence_score:.1%})")

### Saving to CSV File

In [None]:
def export_enhanced_results(companies: List[CompanyInfo], metrics: ExtractionMetrics,
                           analytics: IndustryAnalytics, base_filename: str = "company_info"):
    """Export extraction results to four CSV files and return the DataFrames.

    Files written (each prefixed with ``base_filename``):
      * ``<base>.csv``: serial number, company name, founding date, founders.
      * ``<base>_enhanced.csv``: the above plus confidence score, company
        age, founder count, location and industry.
      * ``<base>_quality_report.csv``: metric/value pairs read from ``metrics``.
      * ``<base>_analytics.csv``: industry and founding-decade counts read
        from ``analytics``, with their share of ``metrics.total_companies``.

    Args:
        companies: Extracted companies, exported in the given order.
        metrics: Run metrics used for the quality report and percentages.
        analytics: Object providing ``industry_distribution`` and
            ``decade_distribution`` mappings of label to count.
        base_filename: Path prefix (without extension) for the output files.

    Returns:
        A dict with the four DataFrames (``primary_df``, ``enhanced_df``,
        ``metrics_df``, ``analytics_df``) and ``files_created``, the list of
        written file names in that same order.
    """
    total_companies = metrics.total_companies

    def _share(count) -> float:
        """Return ``count`` as a fraction of all companies (0.0 when there are none)."""
        return count / total_companies if total_companies else 0.0

    # 1. PRIMARY CSV EXPORT (Standard Format)
    print("📁 Exporting Primary Results...")
    primary_data = [
        {
            "S.N.": i,
            "Company Name": company.company_name,
            "Founded in": company.founding_date,
            # Join the names directly: going through str(list) mangles names
            # that contain apostrophes (e.g. O'Brien).
            "Founded by": ", ".join(company.founders),
        }
        for i, company in enumerate(companies, 1)
    ]

    # Explicit columns keep the CSV header intact even when nothing was extracted.
    primary_df = pd.DataFrame(
        primary_data,
        columns=["S.N.", "Company Name", "Founded in", "Founded by"],
    )
    primary_filename = f"{base_filename}.csv"
    primary_df.to_csv(primary_filename, index=False)
    print(f"✅ Primary CSV exported: {primary_filename}")

    # 2. ENHANCED CSV EXPORT (With All Fields)
    enhanced_data = [
        {
            "S.N.": i,
            "Company Name": company.company_name,
            "Founded in": company.founding_date,
            "Founded by": ", ".join(company.founders),
            "Confidence Score": f"{company.confidence_score:.3f}",
            # An age of 0 is a valid value; only a missing age is "Unknown".
            "Company Age": company.company_age if company.company_age is not None else "Unknown",
            "Founder Count": company.founder_count,
            "Location": company.founding_location or "Not specified",
            "Industry": company.industry_category or "Not classified",
        }
        for i, company in enumerate(companies, 1)
    ]

    enhanced_df = pd.DataFrame(
        enhanced_data,
        columns=[
            "S.N.", "Company Name", "Founded in", "Founded by", "Confidence Score",
            "Company Age", "Founder Count", "Location", "Industry",
        ],
    )
    enhanced_filename = f"{base_filename}_enhanced.csv"
    enhanced_df.to_csv(enhanced_filename, index=False)
    print(f"✅ Enhanced CSV exported: {enhanced_filename}")

    # 3. QUALITY METRICS REPORT
    metrics_data = {
        "Metric": [
            "Total Companies Extracted",
            "Average Confidence Score",
            "High-Confidence Extractions",
            "Processing Time (seconds)",
            "Paragraphs Processed",
            "Extraction Rate (companies/paragraph)",
        ],
        "Value": [
            total_companies,
            f"{metrics.avg_confidence_score:.1%}",
            # _share() guards against division by zero on an empty run.
            f"{metrics.high_confidence_count} ({_share(metrics.high_confidence_count):.1%})",
            f"{metrics.processing_time_total:.2f}",
            metrics.paragraphs_processed,
            f"{metrics.extraction_rate:.2f}",
        ],
    }

    metrics_df = pd.DataFrame(metrics_data)
    metrics_filename = f"{base_filename}_quality_report.csv"
    metrics_df.to_csv(metrics_filename, index=False)
    print(f"✅ Quality Report exported: {metrics_filename}")

    # 4. ANALYTICS SUMMARY (industry breakdown first, then decade breakdown)
    analytics_data = []
    for category, distribution in (
        ("Industry", analytics.industry_distribution),
        ("Founding Decade", analytics.decade_distribution),
    ):
        for label, count in distribution.items():
            analytics_data.append({
                "Category": category,
                "Subcategory": label,
                "Count": count,
                "Percentage": f"{_share(count) * 100:.1f}%",
            })

    analytics_df = pd.DataFrame(
        analytics_data,
        columns=["Category", "Subcategory", "Count", "Percentage"],
    )
    analytics_filename = f"{base_filename}_analytics.csv"
    analytics_df.to_csv(analytics_filename, index=False)
    print(f"✅ Analytics Report exported: {analytics_filename}")

    # Display primary results preview
    print(f"\n📋 PRIMARY CSV PREVIEW ({primary_filename}):")
    print("=" * 80)
    print(primary_df.head(10).to_string(index=False))

    return {
        "primary_df": primary_df,
        "enhanced_df": enhanced_df,
        "metrics_df": metrics_df,
        "analytics_df": analytics_df,
        "files_created": [primary_filename, enhanced_filename, metrics_filename, analytics_filename],
    }

# Execute enhanced export
# `analytics` is created by the "Advanced Analytics & Insights" cell further
# down the notebook. Fail with an actionable message instead of a bare
# NameError when this cell is run before that one.
if "analytics" not in globals():
    raise NameError(
        "'analytics' is not defined yet: run the 'Advanced Analytics & Insights' "
        "cell (generate_industry_analytics) before exporting."
    )

print("🚀 Initiating Enhanced Multi-Format Export System...")
export_results = export_enhanced_results(all_companies, extraction_metrics, analytics)

print(f"\n🎉 Export Complete! Created {len(export_results['files_created'])} files:")
for filename in export_results['files_created']:
    print(f"   📄 {filename}")

In [None]:
# 🎯 FINAL COMPREHENSIVE RESULTS DASHBOARD
# Summarises the run: headline metrics, confidence-band breakdown and a
# preview of the primary CSV produced by the export step.
print("=" * 80)
print("🏆 CORPORATE INTELLIGENCE EXTRACTION - FINAL REPORT")
print("=" * 80)

primary_df = export_results['primary_df']
enhanced_df = export_results['enhanced_df']

print(f"\n📊 EXTRACTION SUMMARY:")
print(f"   🏢 Total Companies Identified: {len(all_companies)}")
print(f"   ⭐ Average Confidence Score: {extraction_metrics.avg_confidence_score:.1%}")
print(f"   🏆 High-Confidence Extractions: {extraction_metrics.high_confidence_count}")
print(f"   ⏱️  Processing Time: {extraction_metrics.processing_time_total:.1f} seconds")
print(f"   📈 Extraction Efficiency: {extraction_metrics.extraction_rate:.2f} companies/paragraph")

print(f"\n🎯 QUALITY BREAKDOWN:")
high_conf = sum(1 for c in all_companies if c.confidence_score > 0.8)
med_conf = sum(1 for c in all_companies if 0.5 < c.confidence_score <= 0.8)
low_conf = sum(1 for c in all_companies if c.confidence_score <= 0.5)

# The low band includes a score of exactly 0.5, so it is labelled "≤50%".
total_identified = len(all_companies)
for band_label, band_size in (
    ("🟢 High Confidence (>80%)", high_conf),
    ("🟡 Medium Confidence (50-80%)", med_conf),
    ("🔴 Low Confidence (≤50%)", low_conf),
):
    # Report 0.0% instead of dividing by zero when nothing was extracted.
    band_pct = band_size / total_identified * 100 if total_identified else 0.0
    print(f"   {band_label}: {band_size} companies ({band_pct:.1f}%)")

# Use the file name actually written by the export instead of a hard-coded one.
print(f"\n📋 STANDARD CSV FORMAT ({export_results['files_created'][0]}):")
print("-" * 60)
print(primary_df.to_string(index=False, max_rows=15))

if len(primary_df) > 15:
    print(f"\n... and {len(primary_df) - 15} more companies")

print(f"\n🎉 SUCCESS! Advanced Corporate Intelligence Extraction Complete!")
print(f"📁 All results exported to multiple formats for comprehensive analysis.")

## 📊 Advanced Analytics & Insights

In [None]:
def generate_industry_analytics(companies: List[CompanyInfo]) -> IndustryAnalytics:
    """Aggregate industry, founding-decade, founder and location statistics.

    Args:
        companies: Extracted companies. Each is read for ``industry_category``,
            ``founding_date``, ``founders``, ``founder_count`` and
            ``founding_location``.

    Returns:
        An ``IndustryAnalytics`` built from four aggregates:
          * industry label -> count (missing industry counted as "Unknown");
          * decade label such as "1990s" -> count, taken from the part of
            ``founding_date`` before the first "-" ("Unknown" when that part
            is missing or not an integer);
          * founder statistics (unique founders, average founders per
            company, distribution of founder counts);
          * location -> count (missing location counted as "Unknown").
    """
    from collections import Counter

    # Industry distribution
    industry_counts = Counter(
        company.industry_category or "Unknown" for company in companies
    )

    # Decade distribution
    decade_counts = Counter()
    for company in companies:
        try:
            year = int(company.founding_date.split('-')[0])
        except (AttributeError, TypeError, ValueError):
            # Missing or non-numeric founding date. Only these expected
            # failures are absorbed; anything else should surface.
            decade = "Unknown"
        else:
            decade = f"{year//10*10}s"
        decade_counts[decade] += 1

    # Founder analytics
    unique_founders = {name for company in companies for name in company.founders}
    founder_counts = Counter(company.founder_count for company in companies)

    founder_analytics = {
        "total_unique_founders": len(unique_founders),
        "avg_founders_per_company": sum(c.founder_count for c in companies) / len(companies) if companies else 0,
        "founder_count_distribution": dict(founder_counts),
        # Placeholder: this key is kept for consumers but is not populated.
        "most_common_founder_names": {},
    }

    # Geographic distribution
    geo_counts = Counter(
        company.founding_location or "Unknown" for company in companies
    )

    # Hand plain dicts to the model; Counter keeps first-seen insertion order.
    return IndustryAnalytics(
        industry_distribution=dict(industry_counts),
        decade_distribution=dict(decade_counts),
        founder_analytics=founder_analytics,
        geographic_distribution=dict(geo_counts)
    )

# Compute the analytics once, then print each breakdown as a small report.
print("📊 Generating Advanced Business Intelligence...")
analytics = generate_industry_analytics(all_companies)

print("\n🏭 INDUSTRY ANALYSIS:")
print("-" * 40)
for industry, count in sorted(analytics.industry_distribution.items(), key=lambda x: x[1], reverse=True):
    percentage = (count / len(all_companies)) * 100
    print(f"{industry:<20} {count:>3} companies ({percentage:>5.1f}%)")

print("\n📅 FOUNDING DECADE ANALYSIS:")
print("-" * 40)
for decade, count in sorted(analytics.decade_distribution.items()):
    if decade != "Unknown":
        percentage = (count / len(all_companies)) * 100
        print(f"{decade:<10} {count:>3} companies ({percentage:>5.1f}%)")

print("\n👥 FOUNDER INSIGHTS:")
print("-" * 30)
founder_stats = analytics.founder_analytics
print(f"Total Unique Founders: {founder_stats['total_unique_founders']}")
print(f"Average Founders per Company: {founder_stats['avg_founders_per_company']:.1f}")
print("\nFounder Count Distribution:")
for count, companies_count in sorted(founder_stats['founder_count_distribution'].items()):
    print(f"  {count} founder(s): {companies_count} companies")

print("\n🌍 GEOGRAPHIC DISTRIBUTION:")
print("-" * 35)
# Drop the "Unknown" bucket *before* taking the top ten; filtering after the
# slice would show fewer than ten real locations whenever "Unknown" ranks
# among the ten largest groups.
known_locations = [
    (place, place_count)
    for place, place_count in analytics.geographic_distribution.items()
    if place != "Unknown"
]
top_locations = sorted(known_locations, key=lambda x: x[1], reverse=True)[:10]
for location, count in top_locations:
    percentage = (count / len(all_companies)) * 100
    print(f"{location:<25} {count:>2} companies ({percentage:>4.1f}%)")

### 🎯 System Performance & Capabilities Summary

In [None]:
# Closing showcase: banner followed by the headline performance figures
# taken from the extraction metrics.
print("🚀 ADVANCED CORPORATE INTELLIGENCE SYSTEM - CAPABILITIES SHOWCASE")
print("=" * 80)

print("\n📊 SYSTEM PERFORMANCE METRICS:")
for performance_line in (
    f"   Processing Speed: {extraction_metrics.processing_time_total:.1f} seconds total",
    f"   Accuracy Rate: {extraction_metrics.avg_confidence_score:.1%} average confidence",
    f"   Quality Distribution: {extraction_metrics.high_confidence_count}/{extraction_metrics.total_companies} high-confidence extractions",
    f"   Coverage Rate: {extraction_metrics.extraction_rate:.2f} companies per paragraph",
):
    print(performance_line)