# Software Business Model Study - Pipeline Analysis

This notebook analyzes the data collected by the auto-research pipeline to identify new business models in SMB software companies.

## 1. Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import os
import sys

# Add pipeline directory to path so the project-local module below can be
# imported. The path is relative, so it is resolved against the current
# working directory at import time.
# NOTE(review): presumably the notebook is launched from a directory that is a
# sibling of `pipeline/` — confirm.
sys.path.append('../pipeline')
from merge_evidence import merge_evidence_csvs

# Use matplotlib's built-in 'default' style sheet and make 10x6 inches the
# default figure size for every plot that does not pass its own `figsize`.
plt.style.use('default')
plt.rcParams['figure.figsize'] = (10, 6)

## 2. Merge and Load Data

In [None]:
# Combine the per-company evidence CSVs into one DataFrame. The later cells
# treat a `None` result as "nothing to analyze" and skip their work.
df = merge_evidence_csvs()

if df is None:
    print("No data found. Make sure you've run the pipeline first.")
else:
    # Quick sanity report: dimensions and column names of the merged data.
    column_names = list(df.columns)
    print("\nDataset loaded successfully!")
    print(f"Shape: {df.shape}")
    print(f"Columns: {column_names}")

## 3. Basic Data Overview

In [None]:
if df is not None:
    # Overview of the merged dataset: number of distinct companies, number of
    # search rows, and how many rows carry a non-blank evidence quote.

    # Treat missing quotes as empty strings. pandas' CSV reader turns empty
    # fields into NaN by default, and `NaN != ''` evaluates to True, which
    # would otherwise count every missing quote as "evidence found".
    quotes = df['EvidenceQuote'].fillna('').astype(str)
    evidence_found = int((quotes.str.strip() != '').sum())
    total_searches = len(df)
    # An empty dataset reports a 0.0% success rate instead of raising
    # ZeroDivisionError.
    success_rate = evidence_found / total_searches if total_searches else 0.0

    print("=== BASIC STATISTICS ===")
    print(f"Total companies: {df['Company'].nunique()}")
    print(f"Total search attempts: {total_searches}")
    print(f"Evidence found: {evidence_found}")
    print(f"Success rate: {success_rate:.1%}")

    print("\n=== COMPANIES ANALYZED ===")
    print(df['Company'].value_counts())

## 4. Evidence Quality Analysis

In [None]:
if df is not None:
    # Add per-row evidence quality columns, then roll them up per company.

    # Missing quotes (NaN) are normalised to '' first: `NaN != ''` is True, so
    # without this every missing quote would be flagged as evidence and its
    # length would be NaN rather than 0.
    quotes = df['EvidenceQuote'].fillna('').astype(str)
    df['has_evidence'] = quotes.str.strip() != ''
    # Length of the raw (unstripped) quote text, in characters.
    df['evidence_length'] = quotes.str.len()

    # Company-level success rates. Named aggregation yields flat, explicitly
    # named columns directly:
    #   Evidence_Found - rows with a non-blank quote
    #   Total_Searches - all rows for the company
    #   Success_Rate   - Evidence_Found / Total_Searches
    #   Total_Text     - total characters of quote text
    company_stats = (
        df.groupby('Company')
        .agg(
            Evidence_Found=('has_evidence', 'sum'),
            Total_Searches=('has_evidence', 'count'),
            Success_Rate=('has_evidence', 'mean'),
            Total_Text=('evidence_length', 'sum'),
        )
        .round(2)
        .sort_values('Evidence_Found', ascending=False)
    )

    print("=== COMPANY PERFORMANCE ===")
    print(company_stats)

## 5. Simple Visualizations

In [None]:
if df is not None:
    # 2x2 dashboard built from `company_stats` and the per-row columns
    # `has_evidence` / `evidence_length` computed in the previous section.
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    (ax_found, ax_rate), (ax_keywords, ax_lengths) = axes

    # Top-left: number of evidence quotes per company.
    company_stats['Evidence_Found'].plot(kind='bar', ax=ax_found)
    ax_found.set(
        title='Evidence Found by Company',
        ylabel='Number of Evidence Quotes',
    )

    # Top-right: share of searches that produced evidence, per company.
    company_stats['Success_Rate'].plot(kind='bar', ax=ax_rate, color='orange')
    ax_rate.set(title='Success Rate by Company', ylabel='Success Rate')

    # Bottom-left: the ten search keywords with the highest hit rate.
    keyword_hit_rate = df.groupby('SearchKeyword')['has_evidence'].mean()
    keyword_stats = keyword_hit_rate.sort_values(ascending=False).head(10)
    keyword_stats.plot(kind='barh', ax=ax_keywords, color='green')
    ax_keywords.set(
        title='Top 10 Most Effective Keywords',
        xlabel='Success Rate',
    )

    # Bottom-right: quote length histogram, restricted to rows with evidence.
    evidence_data = df.loc[df['has_evidence'], 'evidence_length']
    ax_lengths.hist(evidence_data, bins=20, alpha=0.7, color='purple')
    ax_lengths.set(
        title='Evidence Text Length Distribution',
        xlabel='Characters',
        ylabel='Frequency',
    )

    plt.tight_layout()
    plt.show()

## 6. Business Model Detection

In [None]:
if df is not None:
    # Local import keeps this cell self-contained; `re` is not among the
    # notebook's top-level imports.
    import re

    # Simple business model keywords (all lowercase; the evidence text is
    # lowercased before matching).
    business_models = {
        'SaaS': ['saas', 'subscription', 'monthly', 'cloud'],
        'Platform': ['platform', 'marketplace', 'api'],
        'Consulting': ['consulting', 'custom', 'implementation'],
        'Open Source': ['open source', 'github', 'community'],
        'AI/ML': ['ai', 'machine learning', 'artificial intelligence']
    }

    def _keyword_pattern(keyword):
        """Compile a whole-word pattern for *keyword*.

        Plain substring counting over-counts badly: 'ai' occurs inside
        "email" and "available", 'custom' inside "customer", 'api' inside
        "rapid". The pattern therefore requires word boundaries, tolerates a
        trailing plural "s", and lets the words of a multi-word keyword be
        separated by whitespace or hyphens ("open source" / "open-source").
        """
        words = [re.escape(word) for word in keyword.split()]
        return re.compile(r'\b' + r'[\s-]+'.join(words) + r's?\b')

    # Concatenate all evidence quotes into one lowercase text. Missing (NaN)
    # quotes are dropped so that the join cannot fail on non-string values.
    evidence_quotes = (
        df.loc[df['has_evidence'], 'EvidenceQuote'].dropna().astype(str)
    )
    evidence_text = ' '.join(evidence_quotes.str.lower())

    # Count mentions of each business model: total whole-word matches over
    # all of the model's keywords.
    model_counts = {
        model: sum(
            len(_keyword_pattern(keyword).findall(evidence_text))
            for keyword in keywords
        )
        for model, keywords in business_models.items()
    }

    print("=== BUSINESS MODEL MENTIONS ===")
    for model, count in sorted(model_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"{model}: {count} mentions")

    # Simple bar chart (one colour per business model, in dictionary order).
    plt.figure(figsize=(10, 6))
    models = list(model_counts.keys())
    counts = list(model_counts.values())
    plt.bar(models, counts, color=['skyblue', 'lightgreen', 'salmon', 'gold', 'plum'])
    plt.title('Business Model Keyword Mentions in Evidence')
    plt.ylabel('Number of Mentions')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 7. Sample Evidence Review

In [None]:
if df is not None:
    # Show some example evidence quotes (at most five), each cut to a short
    # preview.
    preview_chars = 200

    # Rows flagged as evidence whose quote is actually present; a missing
    # (NaN) quote cannot be sliced and would raise TypeError below.
    evidence_df = df[df['has_evidence'] & df['EvidenceQuote'].notna()].copy()

    if not evidence_df.empty:
        print("=== SAMPLE EVIDENCE QUOTES ===")
        for _, row in evidence_df.head(5).iterrows():
            quote = str(row['EvidenceQuote'])
            # Only mark the preview with an ellipsis when text was cut off.
            ellipsis = '...' if len(quote) > preview_chars else ''
            print(f"\nCompany: {row['Company']}")
            print(f"Keyword: {row['SearchKeyword']}")
            print(f"Evidence: {quote[:preview_chars]}{ellipsis}")
            print("-" * 80)
    else:
        print("No evidence quotes found.")

## 8. Export Summary Results

In [None]:
if df is not None:
    # Write the per-company statistics and the business model mention counts
    # (both computed in earlier sections) to CSV files.
    output_dir = '../out'
    # Create the output directory if it does not exist yet; `to_csv` does not
    # create missing parent directories and would fail otherwise.
    os.makedirs(output_dir, exist_ok=True)

    # Save summary to CSV (the company name index is written as a column).
    summary_file = f'{output_dir}/analysis_summary.csv'
    company_stats.to_csv(summary_file)

    # Save business model analysis as two columns: model name, mention count.
    model_file = f'{output_dir}/business_model_analysis.csv'
    model_df = pd.DataFrame(list(model_counts.items()), columns=['Business_Model', 'Mentions'])
    model_df.to_csv(model_file, index=False)

    print("Results saved to:")
    print(f"- {summary_file}")
    print(f"- {model_file}")

    print("\n=== ANALYSIS COMPLETE ===")
    print(f"Check the '{output_dir}/' directory for detailed results!")