In [1]:
# Imports for Modeling

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (
    cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
)
from sklearn.metrics import (
    classification_report, confusion_matrix, precision_recall_curve,
    roc_auc_score, roc_curve, precision_score, recall_score, f1_score
)
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from scipy import stats

In [2]:
# Silence all warnings for the remainder of the session.
# NOTE: this is a blanket filter -- it matches every category, message and module.

warnings.simplefilter('ignore')

## 1. Load Processed Data

### Loading Logic

- Stratified splits for academic replication
- Scaled versions for SVM algorithms
- Proper Series conversion for target variables

### Data Integrity

- Training: 28,860 samples × 22 features
- Test: 7,215 samples × 22 features
- Matching success rates across splits (7.72% in both train and test)
- Feature categories sum correctly (3 + 15 + 4 = 22)

### Methodological Rationale

- This follows the academic replication strategy from the previous preprocessing and feature-engineering notebook, using stratified splits to match the methodology of Żbikowski & Antosiuk (2021)

In [3]:
# Loading Data for Academic Paper Replication Modeling
#
# Reads the stratified train/test splits plus their pre-scaled counterparts,
# then checks that the pieces line up before any model is trained.

print("Loading Data for Academic Paper Replication Modeling")

# Single place to edit if the processed-data folder moves
PROCESSED_DIR = '../data/processed'

# Loads the stratified splits (academic paper replication)
print("Loading processed data first")
X_train = pd.read_csv(f'{PROCESSED_DIR}/X_train_stratified.csv')
X_test = pd.read_csv(f'{PROCESSED_DIR}/X_test_stratified.csv')
# Targets are read as frames; the first column is taken so downstream code gets a Series
y_train = pd.read_csv(f'{PROCESSED_DIR}/y_train_stratified.csv').iloc[:, 0]
y_test = pd.read_csv(f'{PROCESSED_DIR}/y_test_stratified.csv').iloc[:, 0]

# Loads scaled versions for SVM
print("Loading scaled versions for SVM second")
X_train_scaled = pd.read_csv(f'{PROCESSED_DIR}/X_train_scaled.csv')
X_test_scaled = pd.read_csv(f'{PROCESSED_DIR}/X_test_scaled.csv')

# Integrity checks: fail here with a clear message rather than deep inside a grid search
assert len(X_train) == len(y_train) == len(X_train_scaled), \
    "Training features, targets and scaled features have different row counts"
assert len(X_test) == len(y_test) == len(X_test_scaled), \
    "Test features, targets and scaled features have different row counts"
assert list(X_train.columns) == list(X_test.columns), \
    "Train and test feature columns differ"
assert list(X_train_scaled.columns) == list(X_test_scaled.columns), \
    "Scaled train and test feature columns differ"
# The "success rate" below is the target mean, which is only a rate for 0/1 labels
assert y_train.isin([0, 1]).all() and y_test.isin([0, 1]).all(), \
    "Targets must be binary (0/1)"

print(f"Training set: {X_train.shape[0]:,} samples, {X_train.shape[1]} features")
print(f"Test set: {X_test.shape[0]:,} samples, {X_test.shape[1]} features")
print(f"Training success rate: {y_train.mean()*100:.2f}%")
print(f"Test success rate: {y_test.mean()*100:.2f}%")

# Feature categories for analysis
geographic_features = ['region_startup_density', 'city_startup_density', 'is_usa']
industry_features = [col for col in X_train.columns if col.startswith('category_')]
temporal_features = ['founded_year_std', 'era_dotcom_era', 'era_post_crash', 'era_recovery']

# The geographic/temporal names are hard-coded, so confirm they really exist in the data
missing_features = [
    col for col in geographic_features + temporal_features
    if col not in X_train.columns
]
assert not missing_features, f"Feature names not found in X_train: {missing_features}"

print(f"\nFeature breakdown:")
print(f"- Geographic features: {len(geographic_features)}")
print(f"- Industry features: {len(industry_features)}")
print(f"- Temporal features: {len(temporal_features)}")
print(f"- Total features: {len(geographic_features) + len(industry_features) + len(temporal_features)}")


Loading Data for Academic Paper Replication Modeling
Loading processed data first
Loading sclaed versions for SVM second
Training set: 28,860 samples, 22 features
Test set: 7,215 samples, 22 features
Training success rate: 7.72%
Test success rate: 7.72%

Feature breakdown:
- Geographic features: 3
- Industry features: 15
- Temporal features: 4
- Total features: 22


## 2. Baseline Model Setup

### Cross Validation Strategy

- 5-fold stratified CV is an appropriate choice for an imbalanced dataset (7.72% success rate)
- shuffle=True with random_state=42 ensures reproducible results
- Stratification maintains class balance across all folds

### Evaluation Metrics

- Proper choices for imbalanced classification: precision, recall, f1, roc_auc
- Matches academic paper's (Żbikowski & Antosiuk (2021)) evaluation framework
- Avoids accuracy (it would be misleading with 92.28% negative class)

### Academic Benchmarks

- Target values are taken from Żbikowski & Antosiuk (2021)
- Clear performance targets for validation
- F1-score of 43% reflects the precision/recall balance

In [7]:
# Baseline Model Setup/Configuration
#
# Shared configuration for every model cell below: the CV splitter, the metric
# names of interest, and the benchmark figures the models are compared against.

print("Baseline Model Setup")

# Benchmarks reported by Żbikowski & Antosiuk (2021)
TARGET_PRECISION, TARGET_RECALL, TARGET_F1 = 0.57, 0.34, 0.43

# Metric names used for the academic paper replication
scoring_metrics = ['precision', 'recall', 'f1', 'roc_auc']

# Stratified, shuffled 5-fold splitter with a fixed seed
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Cross-validation: 5-fold stratified")
print(f"Evaluation metrics: {scoring_metrics}")

print(
    f"\nAcademic targets to match/exceed:",
    f"- Precision: {TARGET_PRECISION:.0%}",
    f"- Recall: {TARGET_RECALL:.0%}",
    f"- F1-Score: {TARGET_F1:.0%}",
    sep="\n",
)

Baseline Model Setup
Cross-validation: 5-fold stratified
Evaluation metrics: ['precision', 'recall', 'f1', 'roc_auc']

Academic targets to match/exceed:
- Precision: 57%
- Recall: 34%
- F1-Score: 43%


## 3. MODEL 1: Logistic Regression (With Regularization)

### Transformation Applied

- **Hyperparameter Grid Implementation**: Designed systematic parameter space exploration covering regularization strength (C: 0.001-100), penalty types (L1/L2), class weighting strategies (None/balanced), and solver configuration (liblinear) to identify optimal model configuration for imbalanced startup classification task
- **Grid Search Cross-Validation Execution**: Applied 5 fold stratified cross validation across 24 hyperparameter combinations (120 total model fits), optimizing for F1-score to balance precision-recall tradeoffs critical for startup success prediction where both false positives and false negatives carry significant business costs
- **Optimal Configuration Selection**: Identified best performing parameters through systematic evaluation: C=1 (moderate regularization), L1 penalty (feature selection capability), balanced class weighting (addressing 7.72% positive class imbalance), and liblinear solver (efficient for L1/L2 penalties)
- **Model Evaluation Pipeline**: Executed comprehensive performance assessment using test set predictions, calculating precision (16.9%), recall (70.9%), F1-score (27.3%), and AUC-ROC (78.1%) metrics against academic benchmarks from Żbikowski & Antosiuk (2021) research

### Methodological Rationale

- **Academic Replication Compliance**: Applied identical evaluation framework to published research methodology, enabling direct performance comparison with established benchmarks (57% precision, 34% recall, 43% F1-score) while maintaining consistent cross validation and scoring approaches
- **Imbalanced Classification Optimization**: Implemented balanced class weighting to address severe class imbalance (92.28% negative class), preventing model bias toward majority class predictions that would achieve high accuracy but fail to identify startup success patterns
- **Regularization Strategy Selection**: L1 penalty selection enables automatic feature selection during training, identifying most predictive features among 22 founding time variables while preventing overfitting in high dimensional feature space relative to positive class sample size
- **F1-Score Optimization Focus**: Targeted F1-score maximization during hyperparameter tuning to achieve optimal precision recall balance, reflecting real world venture capital decision-making where both missed opportunities (false negatives) and wasted due diligence (false positives) impose significant costs

### Performance Analysis

- **Recall Excellence vs Precision Challenge**: Achieved exceptional recall performance (70.9%) exceeding academic target by 2.1x, successfully identifying 71% of actual startup successes, but suffered from low precision (16.9%) indicating high false positive rate with only 17% of positive predictions being correct
- **Class Imbalance Impact**: Balanced class weighting strategy effectively addressed minority class detection but created a precision-recall tradeoff where the improvement in model sensitivity came at the cost of prediction specificity, so that 83% of predicted successes are false positives (a false discovery rate of 1 − precision ≈ 83%, which is distinct from the false positive rate)
- **Discriminative Capability Validation**: Strong AUC-ROC performance (78.1%) demonstrates robust ranking ability to distinguish successful from unsuccessful startups, indicating feature set contains meaningful predictive signals despite precision challenges
- **Academic Benchmark Comparison**: Performance gap relative to published targets (F1: 27.3% vs 43% target) suggests potential differences in dataset characteristics, feature engineering approaches, or evaluation methodologies between current implementation and original research

### ML Pipeline Impact

- **Feature Selection Insights**: L1 regularization with C=1 provides automatic feature selection capability, enabling identification of most predictive founding-time characteristics while eliminating noisy variables that could degrade model generalization performance
- **Threshold Optimization Potential**: High AUC-ROC (78.1%) combined with precision recall imbalance indicates significant opportunity for prediction threshold tuning to achieve business-specific cost-sensitive optimization balancing investor risk tolerance and opportunity identification
- **Baseline Model Establishment**: Results provide a solid foundation for ensemble method comparison, with 70.9% recall serving as a high-recall reference point (not a theoretical upper bound) for startup success detection while highlighting the need for precision improvement through alternative algorithms
- **Production Deployment Considerations**: Model demonstrates strong sensitivity for screening applications where missing potential successes carries higher cost than investigating false positives, suitable for initial filtering in venture capital deal flow management systems
- **Cross-Validation Stability**: Best CV F1-score (26.7%) closely matches test performance (27.3%), indicating robust generalization without overfitting concerns and reliable performance estimation for production deployment scenarios
- **Business Interpretation Framework**: Balanced class weighting with L1 regularization creates interpretable model where feature coefficients directly indicate founding time characteristics that increase/decrease acquisition probability, supporting stakeholder communication and investment decision justification processes

In [None]:
# MODEL 1: Logistic Regression w/ regularization
#
# Grid-searches a regularized logistic regression on the unscaled training
# features (selection metric: F1), then scores the refit winner on the test set.

print("MODEL 1: Logistic Regression with Regularization")

# Search space; liblinear is pinned because it works with both L1 and L2
lr_param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
    class_weight=[None, 'balanced'],
    max_iter=[1000],
)

print("Hyperparameter Grid:")
for param in lr_param_grid:
    values = lr_param_grid[param]
    print(f"  {param}: {values}")

# Exhaustive search, scored by F1 on the shared CV splitter
print("Running Grid Search Now")
lr_grid = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=lr_param_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

print("Best Logistic Regression parameters:")
print(lr_grid.best_params_)
print(f"Best Cross Validation F1-score: {lr_grid.best_score_:.3f}")

# Test-set predictions from the refit best estimator
lr_best = lr_grid.best_estimator_
lr_pred = lr_best.predict(X_test)
lr_pred_proba = lr_best.predict_proba(X_test)[:, 1]  # probability of the positive class

# Threshold metrics use the hard labels; AUC uses the positive-class probability
lr_precision, lr_recall, lr_f1 = (
    metric(y_test, lr_pred) for metric in (precision_score, recall_score, f1_score)
)
lr_auc = roc_auc_score(y_test, lr_pred_proba)

print(
    f"\nLogistic Regression Results:",
    f"- Precision: {lr_precision:.3f} (Target: {TARGET_PRECISION:.3f})",
    f"- Recall: {lr_recall:.3f} (Target: {TARGET_RECALL:.3f})",
    f"- F1-Score: {lr_f1:.3f} (Target: {TARGET_F1:.3f})",
    f"- AUC-ROC: {lr_auc:.3f}",
    sep="\n",
)

MODEL 1: Logistic Regression with Regularization
Hyperparameter Grid:
  C: [0.001, 0.01, 0.1, 1, 10, 100]
  penalty: ['l1', 'l2']
  solver: ['liblinear']
  class_weight: [None, 'balanced']
  max_iter: [1000]
Running Grid Search Running Now
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Logistic Regression parameters:
{'C': 1, 'class_weight': 'balanced', 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
Best Cross Validation F1-score: 0.267

Logistic Regression Results:
- Precision: 0.169 (Target: 0.570)
- Recall: 0.709 (Target: 0.340)
- F1-Score: 0.273 (Target: 0.430)
- AUC-ROC: 0.781


## 4. MODEL 2: SVM (Support Vector Machine) with RBF (Radial Basis Function) Kernel

### Transformation Applied

- **Reduced Hyperparameter Grid Strategy**: Implemented computationally efficient parameter space exploration with focused ranges: regularization parameter C (0.5-5), RBF kernel gamma values (0.05-0.5), fixed RBF kernel selection, and balanced class weighting, totaling 16 hyperparameter combinations for faster training while maintaining comprehensive coverage
- **Scaled Feature Preprocessing**: Applied StandardScaler transformation to training and test datasets ensuring zero mean unit variance normalization critical for SVM distance-based optimization, preventing feature scale bias that could dominate decision boundary formation in high dimensional startup characteristic space
- **Probability Calibration Integration**: Enabled probability estimation through SVC probability=True parameter for AUC-ROC calculation compatibility, allowing comprehensive model evaluation across multiple metrics while maintaining SVM's discriminative classification capabilities
- **Optimal Configuration Identification**: Selected best performing parameters through systematic grid search: C=1 (moderate regularization), gamma=0.1 (balanced kernel width), RBF kernel (non linear decision boundaries), and balanced class weighting, achieving 25.7% cross-validation F1-score across 80 total model fits

### Methodological Rationale

- **Feature Scaling Necessity**: SVM sensitivity to feature magnitude differences requires putting all features on a common scale so that no variable dominates the RBF distance computation purely because of its numeric range (e.g., geographic density features on a 1-5 range versus the standardized founding year variable), ensuring equal contribution to decision boundary optimization and preventing algorithmic bias toward specific feature scales
- **RBF Kernel Selection Justification**: Radial Basis Function kernel enables detection of non linear relationships between founding time characteristics and startup success probability, capturing complex interaction patterns (geographic-industry synergies, temporal-sector dependencies) that linear models cannot represent effectively
- **Gamma Parameter Optimization Focus**: Gamma controls RBF kernel width determining decision boundary complexity, with optimal value (0.1) balancing model flexibility to capture startup success patterns against overfitting risk in 22-dimensional feature space with 7.72% positive class samples
- **Computational Efficiency Strategy**: Reduced parameter grid (16 vs potential 100+ combinations) maintains thorough hyperparameter exploration while enabling practical training time on 28,860-sample dataset, balancing model optimization thoroughness with computational resource constraints

### Performance Analysis

- **Precision-Recall Tradeoff Consistency**: Achieved a similar performance profile to Logistic Regression with low precision (15.5%) and strong recall (68.9%), indicating both algorithms identify similar startup success patterns but struggle with false positive control in a severely imbalanced classification scenario
- **Performance Gap vs Linear Model**: SVM F1-score (25.2%) trails Logistic Regression (27.3%) by 2.1 percentage points, suggesting linear decision boundaries may be sufficient for founding time feature relationships, questioning complexity benefit of non linear kernel approach for this specific prediction task
- **Academic Benchmark Underperformance**: Results fall significantly short of published targets (F1: 25.2% vs 43% target, precision: 15.5% vs 57% target), indicating either dataset differences, feature engineering variations, or need for advanced ensemble methods to achieve competitive performance levels
- **AUC-ROC Discriminative Assessment**: Moderate AUC-ROC performance (74.0%) demonstrates reasonable ranking ability but 4.1 percentage point decline vs Logistic Regression (78.1%) suggests RBF kernel complexity may introduce noise rather than capturing meaningful non linear patterns in startup success prediction

### ML Pipeline Impact

- **Algorithm Comparison Baseline**: SVM results provide critical comparison point validating that linear relationships dominate founding-time startup characteristics, informing ensemble strategy to emphasize linear models over complex non-linear approaches for optimal performance balance
- **Feature Engineering Validation**: Similar precision recall patterns across linear and non-linear models confirm feature set captures primary startup success signals effectively, indicating preprocessing pipeline success while highlighting need for advanced sampling techniques rather than feature complexity increases
- **Computational Resource Assessment**: Reduced training time through a focused hyperparameter grid reflects practical deployment considerations for real time prediction systems, where SVM computational overhead is hard to justify given that it did not outperform the more efficient linear alternative
- **Class Imbalance Handling Effectiveness**: Balanced class weighting produces consistent recall performance across algorithm types (68.9% for SVM vs 70.9% for Logistic Regression), validating the imbalanced classification strategy while emphasizing the need for precision improvement through threshold optimization or ensemble resampling techniques
- **Production Deployment Considerations**: SVM model provides reliable backup algorithm with different algorithmic assumptions, enabling ensemble diversity and robustness against edge cases where linear assumptions fail, while scaled feature requirements ensure consistent preprocessing pipeline across deployment scenarios
- **Model Interpretability Limitations**: RBF kernel decision boundaries lack direct feature coefficient interpretation compared to L1 regularized Logistic Regression, reducing stakeholder communication effectiveness and limiting business insight generation for venture capital investment decision support frameworks

In [None]:
# MODEL 2: SVM w/ RBF Kernel
#
# Grid-searches an RBF-kernel SVC on the pre-scaled features (selection metric:
# F1), then refits only the winning configuration with probability estimates
# enabled so that AUC-ROC can be computed on the test set.

print("MODEL 2: SVM w/ RBF Kernel")

# Hyperparameter grid for SVM (using scaled features)
svm_param_grid = {
    'C': [0.5, 1, 2, 5],
    'gamma': [0.05, 0.1, 0.2, 0.5],
    'kernel': ['rbf'],
    'class_weight': ['balanced']
}

print("Reduced Hyperparameter Grid (faster training):")
for param, values in svm_param_grid.items():
    print(f"  {param}: {values}")

# Grid search with cross validation (using scaled data).
# probability=True is deliberately NOT set here: F1 scoring only needs
# predict(), and per the scikit-learn SVC documentation enabling probability
# slows fit() because it runs an extra internal 5-fold cross-validation.
# Paying that cost on every one of the CV fits buys nothing for model selection.
print("\nRunning Grid Search on scaled features...")
svm_grid = GridSearchCV(
    SVC(random_state=42),
    svm_param_grid,
    cv=cv_strategy,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

svm_grid.fit(X_train_scaled, y_train)

print("Best SVM parameters:")
print(svm_grid.best_params_)
print(f"Best CV F1-score: {svm_grid.best_score_:.3f}")

# Evaluating the best model.
# Refit the winning configuration once with probability estimates enabled so
# predict_proba() is available for AUC. Note that svm_grid.best_estimator_
# itself has no predict_proba(); use svm_best for anything needing probabilities.
svm_best = SVC(probability=True, random_state=42, **svm_grid.best_params_)
svm_best.fit(X_train_scaled, y_train)
svm_pred = svm_best.predict(X_test_scaled)
svm_pred_proba = svm_best.predict_proba(X_test_scaled)[:, 1]  # positive-class column

# Calculates the metrics
svm_precision = precision_score(y_test, svm_pred)
svm_recall = recall_score(y_test, svm_pred)
svm_f1 = f1_score(y_test, svm_pred)
svm_auc = roc_auc_score(y_test, svm_pred_proba)

print(f"\nSVM Results:")
print(f"- Precision: {svm_precision:.3f} (Target: {TARGET_PRECISION:.3f})")
print(f"- Recall: {svm_recall:.3f} (Target: {TARGET_RECALL:.3f})")
print(f"- F1-Score: {svm_f1:.3f} (Target: {TARGET_F1:.3f})")
print(f"- AUC-ROC: {svm_auc:.3f}")

Reduced Hyperparameter Grid (faster training):
  C: [0.5, 1, 2, 5]
  gamma: [0.05, 0.1, 0.2, 0.5]
  kernel: ['rbf']
  class_weight: ['balanced']

Running Grid Search on scaled features...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best SVM parameters:
{'C': 1, 'class_weight': 'balanced', 'gamma': 0.1, 'kernel': 'rbf'}
Best CV F1-score: 0.257

SVM Results:
- Precision: 0.155 (Target: 0.570)
- Recall: 0.689 (Target: 0.340)
- F1-Score: 0.252 (Target: 0.430)
- AUC-ROC: 0.740


## 5. MODEL 3: XGBoost

In [None]:
# MODEL 3: XGBoost (Academic Paper Best Performing Model)
#
# Randomized hyperparameter search over an XGBoost classifier on the unscaled
# features (selection metric: F1), followed by test-set scoring of the refit winner.

print("MODEL 3: XGBoost")

# Hyperparameter search space for XGBoost
xgb_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5, 6],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
    scale_pos_weight=[1, 5, 10],  # Handles class imbalance (12:1 ratio)
)

print("Hyperparameter Grid:")
for param in xgb_param_grid:
    values = xgb_param_grid[param]
    print(f"  {param}: {values}")

# Randomized search: samples 50 combinations instead of the full grid
print("\nRunning Randomized Search...")
xgb_random = RandomizedSearchCV(
    estimator=xgb.XGBClassifier(
        random_state=42,
        eval_metric='logloss',
        use_label_encoder=False,
    ),
    param_distributions=xgb_param_grid,
    n_iter=50,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
    random_state=42,
).fit(X_train, y_train)

print("Best XGBoost parameters:")
print(xgb_random.best_params_)
print(f"Best CV F1-score: {xgb_random.best_score_:.3f}")

# Test-set predictions from the refit best estimator
xgb_best = xgb_random.best_estimator_
xgb_pred = xgb_best.predict(X_test)
xgb_pred_proba = xgb_best.predict_proba(X_test)[:, 1]  # probability of the positive class

# Threshold metrics use the hard labels; AUC uses the positive-class probability
xgb_precision, xgb_recall, xgb_f1 = (
    metric(y_test, xgb_pred) for metric in (precision_score, recall_score, f1_score)
)
xgb_auc = roc_auc_score(y_test, xgb_pred_proba)

print(
    f"\nXGBoost Results:",
    f"- Precision: {xgb_precision:.3f} (Target: {TARGET_PRECISION:.3f})",
    f"- Recall: {xgb_recall:.3f} (Target: {TARGET_RECALL:.3f})",
    f"- F1-Score: {xgb_f1:.3f} (Target: {TARGET_F1:.3f})",
    f"- AUC-ROC: {xgb_auc:.3f}",
    sep="\n",
)

MODEL 3: XGBoost
Hyperparameter Grid:
  n_estimators: [100, 200, 300]
  max_depth: [3, 4, 5]
  learning_rate: [0.01, 0.1]
  subsample: [0.8, 0.9, 1.0]
  colsample_bytree: [0.8, 0.9, 1.0]
  scale_pos_weight: [1, 5, 10]

Running Randomized Search...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best XGBoost parameters:
{'subsample': 0.8, 'scale_pos_weight': 5, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
Best CV F1-score: 0.297

XGBoost Results:
- Precision: 0.234 (Target: 0.570)
- Recall: 0.388 (Target: 0.340)
- F1-Score: 0.291 (Target: 0.430)
- AUC-ROC: 0.790
