In [None]:
import pandas as pd
import numpy as np
import joblib
import json

## 1. Load Trained Model and Configuration

In [None]:
# Restore the fitted estimator from its serialized file.
# NOTE(review): joblib.load unpickles the file, so it should only be pointed
# at artifacts from a trusted source.
model = joblib.load('startup_success_model.pkl')
print("✅ Model loaded successfully")

# Read the settings stored alongside the model.
with open('model_config.json', 'r') as config_file:
    config = json.load(config_file)

# Echo the configuration values one by one, looking each key up right before
# it is printed.
print("\nModel Configuration:")
valuation_threshold = config['valuation_threshold']
print(f"- Valuation Threshold: ${valuation_threshold:,.0f}")
n_features = len(config['feature_columns'])
print(f"- Number of Features: {n_features}")
reference_year = config['current_year']
print(f"- Reference Year: {reference_year}")

## 2. Load New Data for Prediction

In [None]:
# Read the records to score (swap the path to predict on a different dataset).
try:
    df = pd.read_csv('startup_valuation_dataset.csv')
except FileNotFoundError:
    # Report the problem in plain words, then let the original error surface.
    print("Error: Dataset not found.")
    raise
else:
    record_count = len(df)
    print(f"Dataset loaded: {record_count} records")

# Preview the top of the frame as the cell's rich output.
print("\nSample data:")
df.head()

## 3. Preprocess Data (Same as Training)

In [None]:
def preprocess_data(df, config):
    """
    Apply the same preprocessing steps used during training.

    Builds the numeric and one-hot encoded features and aligns them to the
    exact column set and order listed in ``config['feature_columns']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records. Must contain ``funding_amount_usd``,
        ``estimated_revenue_usd``, ``employee_count``, ``co_investors``,
        ``funding_round`` and every column named in
        ``config['features_to_encode']``. The input frame is not modified.
    config : dict
        Saved configuration providing ``round_map`` (funding round -> number),
        ``features_to_encode`` (categorical column names) and
        ``feature_columns`` (final column names, in model order).

    Returns
    -------
    pandas.DataFrame
        One row per input row, with columns exactly equal to
        ``config['feature_columns']``.

    Raises
    ------
    KeyError
        If a required input column or configuration key is missing.
    """
    df = df.copy()

    # Funding features: missing amounts count as 0, then log(1 + x).
    df['funding_amount_usd'] = df['funding_amount_usd'].fillna(0)
    df['log_funding'] = np.log1p(df['funding_amount_usd'])

    # Revenue features: same treatment as funding.
    df['estimated_revenue_usd'] = df['estimated_revenue_usd'].fillna(0)
    df['log_revenue'] = np.log1p(df['estimated_revenue_usd'])

    # Efficiency metric; a headcount of 0 is treated as 1 to avoid dividing by zero.
    # NOTE(review): a missing employee_count is passed through as NaN here --
    # confirm this matches what the model saw during training.
    df['revenue_per_employee'] = df['estimated_revenue_usd'] / df['employee_count'].replace(0, 1)

    # Investor features: number of comma-separated names, 0 when empty/missing.
    df['investor_count'] = df['co_investors'].fillna('').apply(lambda x: x.count(',') + 1 if x != '' else 0)

    # Round encoding: rounds absent from the map are encoded as 0.
    round_map = config['round_map']
    df['round_encoded'] = df['funding_round'].map(round_map).fillna(0)

    # Categorical encoding.
    # Do NOT pass drop_first=True here: it drops whichever category sorts
    # first in *this* frame, which need not be the one dropped when the
    # feature list was built. For a single-row frame it drops every dummy
    # column, silently zeroing all categorical features. Instead, encode every
    # category that is present and let the alignment step below keep only the
    # columns named in config['feature_columns'].
    features_to_encode = config['features_to_encode']
    X_encoded = pd.get_dummies(df[features_to_encode])

    # Combine features
    X = pd.concat([
        df[['log_funding', 'log_revenue', 'revenue_per_employee', 'employee_count', 'investor_count', 'round_encoded']],
        X_encoded
    ], axis=1)

    # Align to the saved feature list in one step: expected columns that are
    # absent are added as 0, columns not in the list are discarded, and the
    # order follows the list.
    X = X.reindex(columns=config['feature_columns'], fill_value=0)

    return X

# Turn the loaded records into the model's feature matrix and report its size.
X_predict = preprocess_data(df, config)
feature_matrix_shape = X_predict.shape
print(f"Preprocessed data shape: {feature_matrix_shape}")

## 4. Make Predictions

In [None]:
# Score every row: per-class probabilities first, then the hard class labels.
predictions_proba = model.predict_proba(X_predict)
predictions = model.predict(X_predict)

# Attach both outputs to the source frame.
# NOTE(review): taking column 1 assumes the high-value class is the second
# entry of the model's class ordering -- confirm against model.classes_.
high_value_proba = predictions_proba[:, 1]
df['predicted_success'] = predictions
df['success_probability'] = high_value_proba

print("✅ Predictions complete!")
print("\nPredicted Distribution:")
class_counts = df['predicted_success'].value_counts()
print(class_counts)

## 5. View Predictions

In [None]:
# Columns worth showing next to each prediction.
result_columns = [
    'company_name',
    'industry',
    'founded_year',
    'funding_amount_usd',
    'estimated_revenue_usd',
    'employee_count',
    'predicted_success',
    'success_probability',
]

# Human-readable names for the two predicted classes.
label_by_class = {0: 'Standard', 1: 'High Value'}

# Take the display columns, add the readable label, and rank rows from the
# most to the least likely high-value startup.
results_df = (
    df[result_columns]
    .assign(prediction_label=lambda frame: frame['predicted_success'].map(label_by_class))
    .sort_values('success_probability', ascending=False)
)

print("\nTop 10 Predicted High-Value Startups:")
print("=" * 80)
results_df.head(10)

## 6. Analyze High-Probability Predictions

In [None]:
# Keep only the rows whose high-value probability is strictly above 75%.
is_high_confidence = results_df['success_probability'] > 0.75
high_confidence = results_df.loc[is_high_confidence]

high_confidence_count = len(high_confidence)
print(f"\nHigh-Confidence High-Value Predictions: {high_confidence_count} startups")
print("=" * 80)
high_confidence.head(20)

## 7. Save Predictions

In [None]:
# Write the full ranking first, then the high-confidence subset, each to its
# own CSV (without the index column), confirming every file as it is written.
csv_exports = (
    (results_df, 'startup_predictions.csv', 'Predictions'),
    (high_confidence, 'high_value_predictions.csv', 'High-confidence predictions'),
)
for export_frame, export_path, export_label in csv_exports:
    export_frame.to_csv(export_path, index=False)
    print(f"✅ {export_label} saved to '{export_path}'")

## 8. Predict on Custom Input (Optional)

In [None]:
# Example: describe one hypothetical startup and wrap it in a one-row frame.
custom_profile = {
    'company_name': 'AI Innovations Inc.',
    'founded_year': 2020,
    'industry': 'Artificial Intelligence',
    'region': 'North America',
    'funding_amount_usd': 50000000,
    'estimated_revenue_usd': 15000000,
    'employee_count': 100,
    'funding_round': 'Series B',
    'co_investors': 'Sequoia Capital,Andreessen Horowitz,Accel',
}
custom_startup = pd.DataFrame([custom_profile])

# Run the same preprocessing as the batch data, then score the single row.
X_custom = preprocess_data(custom_startup, config)
custom_prediction = model.predict(X_custom)[0]
custom_probability = model.predict_proba(X_custom)[0, 1]

# Translate the predicted class into the label shown in the report.
if custom_prediction == 1:
    custom_label = 'High Value (Top 25%)'
else:
    custom_label = 'Standard'

banner = "=" * 80
print("\n" + banner)
print("CUSTOM STARTUP PREDICTION")
print(banner)
print(f"Company: {custom_startup['company_name'].iloc[0]}")
print(f"Industry: {custom_startup['industry'].iloc[0]}")
print(f"Funding: ${custom_startup['funding_amount_usd'].iloc[0]:,.0f}")
print(f"Revenue: ${custom_startup['estimated_revenue_usd'].iloc[0]:,.0f}")
print(f"\nPrediction: {custom_label}")
print(f"Success Probability: {custom_probability:.1%}")