In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score

# Load and clean
# Read the workbook, discard the columns the model never looks at, and keep
# only rows where every field needed further down is present.
df = (
    pd.read_excel("startup_funding.xlsx")
    .drop(columns=['Sr No', 'Startup Name', 'Remarks', 'Date dd/mm/yyyy'])
    .dropna(
        subset=[
            'Industry Vertical',
            'City Location',
            'SubVertical',
            'Investors Name',
            'Amount in USD',
        ]
    )
)

In [None]:
# Outlier removal (top 5%)
# Rows whose amount exceeds the 95th percentile are dropped.
# NOTE: this cell rebinds `df`, so running it a second time recomputes the
# percentile on the already-trimmed frame and trims again. Re-run from the
# load cell to get back to the full data.
cap = df['Amount in USD'].quantile(0.95)
# .copy() detaches the filtered rows from the original frame so that the
# column assignments made in later cells write to an independent DataFrame
# instead of triggering SettingWithCopyWarning.
df = df[df['Amount in USD'] <= cap].copy()

In [None]:
# Feature: Num_Investors
# The investor field is treated as a comma-separated list, so the number of
# entries is the number of commas plus one.
df['Num_Investors'] = df['Investors Name'].map(lambda names: str(names).count(',') + 1)

# Group rare values
def group_top(series, top_n=10):
    """Keep the ``top_n`` most frequent values of ``series``; relabel the rest.

    Parameters
    ----------
    series : pandas.Series
        Column whose values should be grouped.
    top_n : int, default 10
        Number of most frequent values to keep unchanged.

    Returns
    -------
    pandas.Series
        Series with the same index in which every value outside the
        ``top_n`` most frequent ones is replaced by the string ``'Other'``.
    """
    frequent = series.value_counts().nlargest(top_n).index

    def _keep_or_other(value):
        # Membership is tested against the labels of the most frequent values.
        if value in frequent:
            return value
        return 'Other'

    return series.apply(_keep_or_other)

# Collapse the long tail of each categorical column into 'Other'
# (group_top keeps its default of the 10 most frequent values).
for column_name in ('City Location', 'Industry Vertical', 'SubVertical'):
    df[column_name] = df[column_name].pipe(group_top)

In [None]:
# Frequency encode
# Each category is replaced by the number of rows in `df` that carry it; the
# result is stored next to the original column as '<column>_freq'.
for column_name in ('City Location', 'Industry Vertical', 'SubVertical'):
    occurrences = df[column_name].value_counts()
    df[column_name + '_freq'] = df[column_name].map(occurrences)

In [None]:
# Final feature set
features = [
    'City Location_freq',
    'Industry Vertical_freq',
    'SubVertical_freq',
    'Num_Investors',
]
X = df.loc[:, features]

# Target as a single-column 2-D array (one row per sample).
target = df['Amount in USD']
y = target.values.reshape(-1, 1)

# Tier assignment (only for model selection, not used in input!)
def assign_tier(amount):
    """Map a funding amount to its tier label.

    Parameters
    ----------
    amount : number
        Funding amount to classify.

    Returns
    -------
    str
        ``'Low'`` when ``amount <= 5e5``, ``'Medium'`` when
        ``amount <= 5e6``, otherwise ``'High'``.
    """
    # Inclusive upper bounds, checked in ascending order; the first match wins.
    for upper_bound, label in ((5e5, 'Low'), (5e6, 'Medium')):
        if amount <= upper_bound:
            return label
    return 'High'

# Store the tier next to each row; this label is not part of `features`.
df['Tier_Label'] = df['Amount in USD'].map(assign_tier)


In [None]:
# Split
X_train, X_test, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Tier of every row, derived from the first (only) column of the raw targets.
tier_train = list(map(assign_tier, y_train_raw[:, 0]))
tier_test = list(map(assign_tier, y_test_raw[:, 0]))

# Train 3 models (with PowerTransformer per tier)
# models[tier] -> fitted RandomForestRegressor
# pts[tier]    -> PowerTransformer fitted on that tier's targets
models, pts = {}, {}

for tier in ('Low', 'Medium', 'High'):
    # Boolean mask selecting the training rows that belong to this tier
    in_tier = np.array(tier_train) == tier
    X_tier = X_train[in_tier]
    y_tier = y_train_raw[in_tier]

    # The target is transformed per tier; the forest learns the transformed value.
    pt = PowerTransformer()
    y_trans = pt.fit_transform(y_tier)

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_tier, y_trans.ravel())

    models[tier], pts[tier] = model, pt


In [None]:
# Predict using corresponding model
#
# NOTE(review): every test row is routed to a tier model by applying
# assign_tier to its TRUE amount. That value is unknown for unseen data, so
# metrics computed from these predictions are optimistic, and using the saved
# models elsewhere requires some other way of choosing the tier.
y_trues = [row[0] for row in y_test_raw]
test_tiers = np.array([assign_tier(amount) for amount in y_trues], dtype=object)

# One batched predict / inverse_transform per tier instead of one call per
# row: same numbers, far fewer model invocations. Results are written back
# through the mask so y_preds stays aligned with y_trues.
pred_buffer = np.full(len(y_trues), np.nan)
for tier in ['Low', 'Medium', 'High']:
    tier_mask = test_tiers == tier
    if not tier_mask.any():
        continue  # no test rows fall into this tier

    model = models[tier]
    pt = pts[tier]

    y_pred_trans = model.predict(X_test[tier_mask])
    # Undo the per-tier power transform to get back to the original scale.
    pred_buffer[tier_mask] = pt.inverse_transform(y_pred_trans.reshape(-1, 1)).ravel()

y_preds = pred_buffer.tolist()

# Evaluation
# Metrics compare the true amounts with the predictions on the original
# (inverse-transformed) scale, pooled over all tiers.
r2 = r2_score(y_trues, y_preds)
mae = mean_absolute_error(y_trues, y_preds)
mape = mean_absolute_percentage_error(y_trues, y_preds)
rmse = np.sqrt(mean_squared_error(y_trues, y_preds))

print("📊 Final Unified Evaluation Across All Tiers:")
print(f"R² Score: {r2:.4f}")
# The target column is 'Amount in USD', so absolute errors are in dollars
# (the rupee sign used previously mislabelled the unit).
print(f"MAE: ${mae:,.2f}")
print(f"MAPE: {mape*100:.2f}%")
print(f"RMSE: ${rmse:,.2f}")


📊 Final Unified Evaluation Across All Tiers:
R² Score: 0.4037
MAE: ₹3,781,212.35
MAPE: 68.91%
RMSE: ₹8,870,585.02


In [None]:
import pickle

# Save each model and transformer
# Two pickle files are written per tier into the current working directory:
# '<tier>_model.pkl' and '<tier>_pt.pkl'.
for tier in ('Low', 'Medium', 'High'):
    artefacts = {
        f'{tier}_model.pkl': models[tier],
        f'{tier}_pt.pkl': pts[tier],
    }
    for path, obj in artefacts.items():
        with open(path, 'wb') as f:
            pickle.dump(obj, f)
