In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import json

## 1. Load Data

In [3]:
# Read the raw startup records into `df`.
# If the CSV is missing, print a short hint and re-raise so the notebook stops here.
try:
    df = pd.read_csv('startup_valuation_dataset.csv')
except FileNotFoundError:
    print("Error: Dataset not found.")
    raise
else:
    # Reached only when the read succeeded.
    print(f"Dataset loaded: {len(df)} records")

Dataset loaded: 50000 records


## 2. Define Target Variable - Valuation Success

In [4]:
# Label definition: a company is "successful" when its valuation is strictly above
# the 75th percentile of the whole dataset.
# Missing valuations are replaced by 0 first, so they take part in the percentile
# computation and always end up in the negative class.
valuations = df['estimated_valuation_usd'].fillna(0)
df['estimated_valuation_usd'] = valuations

valuation_threshold = valuations.quantile(0.75)
print(f"Success Threshold: Valuation > ${valuation_threshold:,.0f}")

# 1 = above the threshold, 0 = everything else.
df['is_successful'] = valuations.gt(valuation_threshold).astype(int)
print(f"\nClass distribution:")
print(df['is_successful'].value_counts())

Success Threshold: Valuation > $5,465,729,126

Class distribution:
is_successful
0    37500
1    12500
Name: count, dtype: int64


## 3. Filter Training Data (Mature Companies)

In [5]:
# Reference year used to derive company age; also written to the saved model config.
current_year = 2024
df['startup_age'] = current_year - df['founded_year']

# Keep only companies that are at least 3 years old; work on an independent copy
# so later feature columns are not written back into `df`.
is_mature = df['startup_age'].ge(3)
training_df = df.loc[is_mature].copy()
print(f"Training on {len(training_df)} companies (age >= 3 years)")

Training on 44809 companies (age >= 3 years)


## 4. Feature Engineering

In [6]:
# Feature engineering for the mature-company training frame.
# Reads `training_df`; produces the feature matrix `X`, the label vector `y`, and the
# `round_map` / `features_to_encode` settings that are later written to the model config.

# Funding features: a missing amount is treated as 0, then log-compressed.
training_df['funding_amount_usd'] = training_df['funding_amount_usd'].fillna(0)
training_df['log_funding'] = np.log1p(training_df['funding_amount_usd'])

# Revenue features: same treatment as funding.
training_df['estimated_revenue_usd'] = training_df['estimated_revenue_usd'].fillna(0)
training_df['log_revenue'] = np.log1p(training_df['estimated_revenue_usd'])

# Headcount: filled like the other numeric inputs. It is used both as a raw feature and
# as the divisor below, so a NaN here would otherwise propagate into two model columns.
training_df['employee_count'] = training_df['employee_count'].fillna(0)

# Efficiency metric: a headcount of 0 is replaced by 1 to avoid dividing by zero.
training_df['revenue_per_employee'] = training_df['estimated_revenue_usd'] / training_df['employee_count'].replace(0, 1)

# Investor features: count the non-blank names in the comma-separated `co_investors` cell.
# Counting tokens (instead of commas + 1) keeps whitespace-only cells at 0 and ignores
# trailing or doubled commas; a missing cell counts as 0 investors.
investor_names = training_df['co_investors'].fillna('').astype(str).str.split(',')
training_df['investor_count'] = investor_names.apply(
    lambda names: sum(1 for name in names if name.strip())
)

# Round encoding: ordinal stage of the funding round; rounds not listed here map to 0.
round_map = {'Pre-Seed': 1, 'Seed': 2, 'Angel': 2, 'Series A': 3, 'Series B': 4, 'Series C': 5, 'Series D': 6, 'IPO': 7}
training_df['round_encoded'] = training_df['funding_round'].map(round_map).fillna(0)

# Categorical encoding: one-hot columns, dropping the first level of each category.
features_to_encode = ['industry', 'region']
X_encoded = pd.get_dummies(training_df[features_to_encode], drop_first=True)

# Combine features: numeric columns first, then the one-hot columns.
# This column order is what gets saved as `feature_columns` in the model config.
X = pd.concat([
    training_df[['log_funding', 'log_revenue', 'revenue_per_employee', 'employee_count', 'investor_count', 'round_encoded']],
    X_encoded
], axis=1)

y = training_df['is_successful']

print(f"Feature matrix shape: {X.shape}")
print(f"\nFeatures: {list(X.columns)}")

Feature matrix shape: (44809, 17)

Features: ['log_funding', 'log_revenue', 'revenue_per_employee', 'employee_count', 'investor_count', 'round_encoded', 'industry_Blockchain', 'industry_E-commerce', 'industry_Fintech', 'industry_Healthcare', 'industry_Logistics', 'industry_SaaS', 'region_Europe', 'region_Latin America', 'region_MENA', 'region_North America', 'region_Oceania']


## 5. Train-Test Split

In [7]:
# Hold out 20% of the rows for evaluation; the split is stratified on `y`
# and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"Training set: {len(X_train)} records")
print(f"Test set: {len(X_test)} records")

Training set: 35847 records
Test set: 8962 records


## 6. Train Random Forest Model

In [8]:
print("Training Random Forest model...")

# Hyperparameters: 200 trees, balanced class weights, at least 3 samples per leaf,
# and a fixed seed so repeated runs build the same forest.
rf_params = dict(
    n_estimators=200,
    class_weight='balanced',
    min_samples_leaf=3,
    random_state=42,
)

# `fit` returns the estimator itself, so construction and fitting can be chained.
rf_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)
print("Model training complete!")

Training Random Forest model...
Model training complete!


## 7. Evaluate Model

In [9]:
# Score the held-out rows and print a text report.
y_pred = rf_model.predict(X_test)

banner = "="*40
print("\n" + banner)
print("   VALUATION PREDICTION REPORT")
print(banner)

# Label 0 is reported as 'Standard', label 1 as 'High Value (Top 25%)'.
class_names = ['Standard', 'High Value (Top 25%)']
print(classification_report(y_test, y_pred, target_names=class_names))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"\nOverall Accuracy: {accuracy:.2%}")


   VALUATION PREDICTION REPORT
                      precision    recall  f1-score   support

            Standard       0.96      0.89      0.93      6722
High Value (Top 25%)       0.74      0.89      0.81      2240

            accuracy                           0.89      8962
           macro avg       0.85      0.89      0.87      8962
        weighted avg       0.91      0.89      0.90      8962


Confusion Matrix:
[[6002  720]
 [ 239 2001]]

Overall Accuracy: 89.30%


## 8. Feature Importance Analysis

In [10]:
# Pair each feature column with the importance reported by the fitted forest,
# then rank from most to least important.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
})
feature_importances = importance_table.sort_values(by='Importance', ascending=False)

header_rule = "="*40
print("\n" + header_rule)
print("      WHAT DRIVES VALUATION?")
print(header_rule)

# Show only the ten highest-ranked features, without the row index.
top_ten = feature_importances.head(10)
print(top_ten.to_string(index=False))


      WHAT DRIVES VALUATION?
             Feature  Importance
         log_funding    0.574743
         log_revenue    0.217890
revenue_per_employee    0.129814
      employee_count    0.035726
       round_encoded    0.011635
      investor_count    0.006942
region_North America    0.002368
      region_Oceania    0.002245
region_Latin America    0.002198
       region_Europe    0.002181


## 9. Save Model and Configuration

In [11]:
# Persist the fitted forest to disk.
joblib.dump(rf_model, 'startup_success_model.pkl')
print("Model saved as 'startup_success_model.pkl'")

# Persist the settings needed to rebuild the same feature columns later:
# column order, label threshold, round encoding, reference year and the
# categorical columns that were one-hot encoded.
model_config = dict(
    feature_columns=list(X.columns),
    valuation_threshold=float(valuation_threshold),  # plain float so it serialises to JSON
    round_map=round_map,
    current_year=current_year,
    features_to_encode=features_to_encode,
)

with open('model_config.json', 'w') as f:
    json.dump(model_config, f, indent=2)
print("Model configuration saved as 'model_config.json'")

print("\n✅ Model training and saving complete!")

Model saved as 'startup_success_model.pkl'
Model configuration saved as 'model_config.json'

✅ Model training and saving complete!
