In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. LOAD DATA
# Read the raw startup dataset from the notebook's working directory.
# A missing file is re-raised with an actionable message instead of being
# handled with print() + exit(): exit() is a helper meant for the interactive
# shell, and under a Jupyter kernel it asks for the kernel to shut down rather
# than cleanly aborting the cell, which hides the original error. Raising stops
# execution (including "Run All") and keeps the traceback.
try:
    df = pd.read_csv('startup_valuation_dataset.csv')
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Error: Dataset not found. Place 'startup_valuation_dataset.csv' "
        "in the notebook's working directory."
    ) from err

# ---------------------------------------------------------
# NEW TARGET: VALUATION SUCCESS
# ---------------------------------------------------------
# "Success" means a valuation strictly above the 75th percentile, i.e. being
# in the top 25% of valuations. This captures both high-value Exits AND
# high-value Operating companies.
# Missing valuations are filled with 0 first, so from here on they are treated
# exactly like a valuation of 0 (both in the percentile and in the label).
# NOTE(review): the cutoff is computed over every row of df at this point;
# confirm that including zero-filled rows in the percentile is intended.
valuations = df['estimated_valuation_usd'].fillna(0)
df['estimated_valuation_usd'] = valuations
valuation_threshold = valuations.quantile(0.75)  # Top 25% cutoff
print(f"Success Threshold: Valuation > ${valuation_threshold:,.0f}")

# Binary label: 1 when the valuation exceeds the cutoff, 0 otherwise.
df['is_successful'] = valuations.gt(valuation_threshold).astype(int)

# 2. FILTERING (Keep Mature Companies)
# Young companies are excluded so they are not penalized for being small *yet*.
# Age is measured against a fixed reference year, which keeps the filter
# reproducible regardless of when the notebook is re-run.
current_year = 2024
min_startup_age = 3  # Reduced age filter slightly
df['startup_age'] = current_year - df['founded_year']

# Rows with an unknown founding year yield a NaN age and fail the comparison,
# so they are dropped here as well. The copy decouples training_df from df.
is_mature = df['startup_age'].ge(min_startup_age)
training_df = df.loc[is_mature].copy()

# 3. FEATURE ENGINEERING
# Builds the model matrix X and target y from training_df.
# Every numeric input is NaN-filled with 0 before use so no missing value
# reaches the classifier.

# Funding: missing amounts count as 0; log1p(x) = log(1 + x) keeps 0 at 0.
training_df['funding_amount_usd'] = training_df['funding_amount_usd'].fillna(0)
training_df['log_funding'] = np.log1p(training_df['funding_amount_usd'])

# Revenue (Critical)
training_df['estimated_revenue_usd'] = training_df['estimated_revenue_usd'].fillna(0)
training_df['log_revenue'] = np.log1p(training_df['estimated_revenue_usd'])

# Head count: filled like the other numeric inputs. Without this, a missing
# employee_count would survive `.replace(0, 1)` below and put NaN into both
# `employee_count` and `revenue_per_employee`.
training_df['employee_count'] = training_df['employee_count'].fillna(0)

# Efficiency (Revenue per Employee)
# A head count of 0 is replaced by 1 so the division never hits zero.
training_df['revenue_per_employee'] = (
    training_df['estimated_revenue_usd'] / training_df['employee_count'].replace(0, 1)
)

# Investor count: number of non-blank, comma-separated names in `co_investors`.
# Counting tokens (rather than commas + 1) keeps blank strings, trailing commas
# and stray whitespace from inflating the count; missing values count as 0.
investor_tokens = training_df['co_investors'].fillna('').astype(str).str.split(',')
training_df['investor_count'] = investor_tokens.apply(
    lambda names: sum(1 for name in names if name.strip())
)

# Round Encoding
# Ordinal stage of the funding round; rounds absent from the map (or missing)
# are encoded as 0.
round_map = {'Pre-Seed': 1, 'Seed': 2, 'Angel': 2, 'Series A': 3, 'Series B': 4, 'Series C': 5, 'Series D': 6, 'IPO': 7}
training_df['round_encoded'] = training_df['funding_round'].map(round_map).fillna(0)

# One-hot encode the categorical columns; drop_first removes one level per
# column so the dummies are not perfectly collinear.
features_to_encode = ['industry', 'region']
X_encoded = pd.get_dummies(training_df[features_to_encode], drop_first=True)

# Combine Features
X = pd.concat([
    training_df[['log_funding', 'log_revenue', 'revenue_per_employee', 'employee_count', 'investor_count', 'round_encoded']],
    X_encoded
], axis=1)

y = training_df['is_successful']

# 4. SPLIT DATA
# Hold out 20% of the rows for evaluation. The split is stratified on y and
# uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 5. TRAIN MODEL (Random Forest is sufficient here)
print(f"Training Model on {len(X_train)} records...")

forest_settings = {
    'n_estimators': 200,
    # Still useful, but data is less imbalanced now (75/25 split)
    'class_weight': 'balanced',
    'min_samples_leaf': 3,
    'random_state': 42,
}

# fit() returns the estimator itself, so construction and training chain.
rf_model = RandomForestClassifier(**forest_settings).fit(X_train, y_train)

# 6. EVALUATION
# Score the held-out rows, then print a per-class report, the confusion
# matrix and the five most important features.
banner = "=" * 40
class_labels = ['Standard', 'High Value (Top 25%)']

y_pred = rf_model.predict(X_test)

print("\n" + banner)
print("   VALUATION PREDICTION REPORT")
print(banner)
print(classification_report(y_test, y_pred, target_names=class_labels))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature Importance
# One row per column of X, ordered from most to least important.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values(by='Importance', ascending=False)

print("\n" + banner)
print("      WHAT DRIVES VALUATION?")
print(banner)
print(feature_importances.head(5).to_string(index=False))

Success Threshold: Valuation > $5,465,729,126
Training Model on 35847 records...

   VALUATION PREDICTION REPORT
                      precision    recall  f1-score   support

            Standard       0.96      0.89      0.93      6722
High Value (Top 25%)       0.74      0.89      0.81      2240

            accuracy                           0.89      8962
           macro avg       0.85      0.89      0.87      8962
        weighted avg       0.91      0.89      0.90      8962


Confusion Matrix:
[[6003  719]
 [ 240 2000]]

      WHAT DRIVES VALUATION?
             Feature  Importance
         log_funding    0.576341
         log_revenue    0.214013
revenue_per_employee    0.132098
      employee_count    0.035721
       round_encoded    0.011760
