In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import pickle
import warnings
warnings.filterwarnings('ignore')

In [19]:
# SIMULATED DATA CREATION (REPLACE WITH YOUR ACTUAL DATA LOADING)
np.random.seed(42)
N = 8500
data = {
    'Total_Funding_USD': np.exp(np.random.normal(loc=14.5, scale=2.5, size=N)),
    'Industry_Sector': np.random.choice(['Software', 'Biotech', 'E-commerce', 'Hardware', 'Media'], N, p=[0.45, 0.15, 0.20, 0.10, 0.10]),
    'Country': np.random.choice(['USA', 'UK', 'India', 'Germany', 'Other'], N, p=[0.55, 0.15, 0.10, 0.10, 0.10]),
    'Founders_Count': np.random.randint(1, 6, N),
    'Previous_Success': np.random.choice([1, 0], N, p=[0.1, 0.9]),
    'Company_Age_Years': np.random.randint(1, 10, N),
    'Is_Successful': np.random.choice([1, 0], N, p=[0.165, 0.835]) # Simulating Imbalance
}
df = pd.DataFrame(data)

# Introduce simulated missing values (as identified in Phase 1 EDA)
df.loc[df.sample(frac=0.15).index, 'Total_Funding_USD'] = np.nan
df.loc[df.sample(frac=0.08).index, 'Company_Age_Years'] = np.nan

# Drop the (simulated) few records where the target is missing
df.dropna(subset=['Is_Successful'], inplace=True)
df['Is_Successful'] = df['Is_Successful'].astype(int)

# Separate features (X) and target (y)
X = df.drop('Is_Successful', axis=1)
y = df['Is_Successful']

print(f"Dataset loaded/simulated. Shape: {X.shape}")
print(f"Success Ratio: {y.sum() / len(y) * 100:.2f}%")

Dataset loaded/simulated. Shape: (8500, 6)
Success Ratio: 16.09%


In [20]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print(f"Train set shape: {X_train.shape}, Test set shape: {X_test.shape}")

Train set shape: (6800, 6), Test set shape: (1700, 6)


In [21]:
# Identify feature types
numerical_features = ['Total_Funding_USD', 'Founders_Count', 'Company_Age_Years']
categorical_features = ['Industry_Sector', 'Country']
binary_features = ['Previous_Success']

from sklearn.preprocessing import FunctionTransformer

# Apply log transform to 'Total_Funding_USD' separately before preprocessing
X_train_transformed = X_train.copy()
X_test_transformed = X_test.copy()

# Use np.log1p and fill NaN values after transformation
X_train_transformed['Total_Funding_USD'] = np.log1p(X_train_transformed['Total_Funding_USD'])
X_test_transformed['Total_Funding_USD'] = np.log1p(X_test_transformed['Total_Funding_USD'])


# 1. Pipeline for Numerical Features (Imputation, Scaling)
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with median
    ('scaler', StandardScaler())                    # Standardize features
])

# 2. Pipeline for Categorical Features (Imputation, One-Hot Encoding)
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # Impute missing with mode
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) # OHE
])

# 3. Combine Preprocessing Steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
        ('bin', 'passthrough', binary_features) # Pass binary features through
    ],
    remainder='drop',
    verbose_feature_names_out=True # Set to True here for compatibility
)

# Fit and Transform the Training Data
X_train_processed = preprocessor.fit_transform(X_train_transformed)
X_test_processed = preprocessor.transform(X_test_transformed)

# Convert back to DataFrame for better inspection (Optional)
feature_names = preprocessor.get_feature_names_out()
X_train_processed_df = pd.DataFrame(X_train_processed, columns=feature_names)

print("Preprocessing complete. Final feature count:", X_train_processed.shape[1])

Preprocessing complete. Final feature count: 14


In [22]:
# Apply SMOTE to the training data only
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

print(f"Original training shape: {X_train_processed.shape}")
print(f"Resampled training shape: {X_train_resampled.shape}")
print(f"New Success Ratio (Resampled): {y_train_resampled.sum() / len(y_train_resampled) * 100:.2f}%")

Original training shape: (6800, 14)
Resampled training shape: (11412, 14)
New Success Ratio (Resampled): 50.00%


In [23]:
# Initialize the model
rf_model = RandomForestClassifier(random_state=42)

# Define the parameter grid for GridSearch (tuning for imbalance)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 15],
    'min_samples_split': [5, 10],
    'class_weight': ['balanced', None] # Crucial for imbalanced data
}

# Use GridSearchCV for tuning (optimizing for ROC AUC is common with imbalance)
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='roc_auc',  # Using ROC AUC score
    cv=3,
    n_jobs=-1,
    verbose=1
)

print("Starting Grid Search...")
grid_search.fit(X_train_resampled, y_train_resampled)

# Select the best model
best_rf_model = grid_search.best_estimator_
print("\nBest Model Parameters:", grid_search.best_params_)

Starting Grid Search...
Fitting 3 folds for each of 16 candidates, totalling 48 fits

Best Model Parameters: {'class_weight': 'balanced', 'max_depth': 15, 'min_samples_split': 5, 'n_estimators': 200}


In [24]:
# Predict probabilities on the test set
y_pred_proba = best_rf_model.predict_proba(X_test_processed)[:, 1]

# Predict class labels
y_pred = best_rf_model.predict(X_test_processed)

# Calculate key metrics
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print a comprehensive classification report
print("\n" + "="*40)
print("     FINAL MODEL EVALUATION (TEST SET)")
print("="*40)
print(f"ROC AUC Score: {roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Failure (0)', 'Success (1)']))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Interpretation of the Report:
# - Precision (Success): Of all the startups predicted to succeed, how many actually did? (Important for investors)
# - Recall (Success): Of all the startups that actually succeeded, how many did the model find? (Important for finding all gems)


     FINAL MODEL EVALUATION (TEST SET)
ROC AUC Score: 0.5646

Classification Report:
              precision    recall  f1-score   support

 Failure (0)       0.86      0.84      0.85      1426
 Success (1)       0.25      0.27      0.26       274

    accuracy                           0.75      1700
   macro avg       0.55      0.56      0.55      1700
weighted avg       0.76      0.75      0.75      1700


Confusion Matrix:
[[1199  227]
 [ 199   75]]


In [25]:
# Save the preprocessor
with open('preprocessor.pkl', 'wb') as file:
    pickle.dump(preprocessor, file)

# Save the best trained model
with open('best_rf_model.pkl', 'wb') as file:
    pickle.dump(best_rf_model, file)

print("\nModel and Preprocessor saved successfully.")


Model and Preprocessor saved successfully.


In [26]:
# Load the saved model and preprocessor
loaded_preprocessor = pd.read_pickle('preprocessor.pkl')
loaded_model = pd.read_pickle('best_rf_model.pkl')

# New Startup Data (Hypothetical Input)
new_startup_data = pd.DataFrame([{
    'Total_Funding_USD': 50000000.0,  # $50 Million
    'Industry_Sector': 'Software',
    'Country': 'USA',
    'Founders_Count': 3,
    'Previous_Success': 1,
    'Company_Age_Years': 4
}])

# 1. Preprocess the new data
new_startup_processed = loaded_preprocessor.transform(new_startup_data)

# 2. Predict the success probability
prediction_proba = loaded_model.predict_proba(new_startup_processed)[:, 1][0]
prediction_class = loaded_model.predict(new_startup_processed)[0]

print("\n" + "="*40)
print("     NEW STARTUP PREDICTION RESULT")
print("="*40)
print(f"Predicted Success Probability: {prediction_proba * 100:.2f}%")
print(f"Predicted Outcome: {'SUCCESS' if prediction_class == 1 else 'FAILURE'}")


     NEW STARTUP PREDICTION RESULT
Predicted Success Probability: 14.26%
Predicted Outcome: FAILURE


In [27]:
# Get feature names after preprocessing
feature_names = loaded_preprocessor.get_feature_names_out()
# Get importances from the Random Forest model
importances = loaded_model.feature_importances_
# Create a DataFrame and sort
feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print("\nTop 10 Feature Importances:\n", feature_importances.head(10))


Top 10 Feature Importances:
                           Feature  Importance
2          num__Company_Age_Years    0.265784
0          num__Total_Funding_USD    0.240603
1             num__Founders_Count    0.193229
13          bin__Previous_Success    0.079386
12               cat__Country_USA    0.036325
7   cat__Industry_Sector_Software    0.025522
5   cat__Industry_Sector_Hardware    0.022386
6      cat__Industry_Sector_Media    0.021865
3    cat__Industry_Sector_Biotech    0.020632
11                cat__Country_UK    0.020235
