# Improved Uruti ML model for Funding Class Prediction using LogisticRegression, RandomForest, GradientBoosting, SVC, and XGBoost ML algorithms

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from collections import Counter
import warnings
# Silence every Python warning for the remainder of the notebook session.
warnings.filterwarnings(action='ignore')

# Read the startup funding dataset from a CSV file in the working directory.
DATASET_CSV = 'handled_big_startup_secsees_dataset.csv'
data = pd.read_csv(DATASET_CSV)
print("✅ Packages loaded.")
# Trailing expression: the notebook renders the first rows as the cell output.
data.head()


✅ Packages loaded.


Unnamed: 0,permalink,name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at,funding_class
0,/organization/-fame,#fame,http://livfame.com,Media,10000000.0,operating,IND,16,Mumbai,Mumbai,1,2014-01-05,2015-01-05,2015-01-05,Funding Eligible
1,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,700000.0,operating,USA,DE,DE - Other,Delaware City,2,2014-09-04,2014-03-01,2014-10-14,Funding Eligible
2,/organization/-the-one-of-them-inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,3406878.0,operating,Unknown_country_code,Unknown_state_code,Unknown_region,Unknown_city,1,2013-01-30,2014-01-30,2014-01-30,Funding Eligible
3,/organization/0-6-com,0-6.com,http://www.0-6.com,Curated Web,2000000.0,operating,CHN,22,Beijing,Beijing,1,2007-01-01,2008-03-19,2008-03-19,Funding Eligible
4,/organization/004-technologies,004 Technologies,http://004gmbh.de/en/004-interact,Software,0.0,operating,USA,IL,"Springfield, Illinois",Champaign,1,2010-01-01,2014-07-24,2014-07-24,Mentorship Needed


In [2]:
# Feature engineering: derive calendar-year columns from the raw date columns.
# errors='coerce' makes unparseable dates missing instead of raising.
for date_col, year_col in (
    ('founded_at', 'founded_year'),
    ('first_funding_at', 'first_funding_year'),
):
    data[year_col] = pd.to_datetime(data[date_col], errors='coerce').dt.year

def simplify_category(cat):
    """Collapse a raw category string into one of four coarse buckets.

    Matching is a case-insensitive substring search, evaluated in a fixed
    order so that the first matching bucket wins:

    1. ``'agri'``               -> ``'AgriTech'``
    2. ``'bio'`` or ``'health'`` -> ``'Biotech'``
    3. ``'tech'`` or ``'software'`` -> ``'Software'``

    Missing values (anything ``pd.isna`` reports as missing) and strings
    that match none of the keywords map to ``'Other'``.

    Parameters
    ----------
    cat : str or missing value
        Raw category text, e.g. ``'Apps|Games|Mobile'``.

    Returns
    -------
    str
        One of ``'AgriTech'``, ``'Biotech'``, ``'Software'``, ``'Other'``.
    """
    if pd.isna(cat):
        return 'Other'

    lowered = cat.lower()

    # Ordered rules: order matters because e.g. "biotechnology" contains
    # both 'bio' and 'tech' and must land in the Biotech bucket.
    rules = (
        ('AgriTech', ('agri',)),
        ('Biotech', ('bio', 'health')),
        ('Software', ('tech', 'software')),
    )
    for bucket, keywords in rules:
        if any(keyword in lowered for keyword in keywords):
            return bucket
    return 'Other'

# Bucket every raw category_list value element-wise into its coarse category.
data['simplified_category'] = data['category_list'].map(simplify_category)


In [3]:
# Model inputs (column order kept as-is) and the prediction target.
feature_columns = [
    'funding_total_usd',
    'funding_rounds',
    'status',
    'founded_year',
    'first_funding_year',
    'simplified_category',
]
X = data[feature_columns]
y = data['funding_class']

# Column groups routed to the two preprocessing branches below.
numeric_features = ['funding_total_usd', 'funding_rounds', 'founded_year', 'first_funding_year']
categorical_features = ['status', 'simplified_category']

# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode into a dense array; levels unseen during fit are ignored.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Apply each branch to its own column group.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


In [4]:
# Seed shared by the split and the oversampler so the cell is repeatable.
RANDOM_STATE = 42

# Map the string class labels to integer codes.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows, preserving class proportions in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=RANDOM_STATE,
)

# Fit the preprocessing on the training split only, then reuse it on the test split.
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

# Oversample the training split only; the test split keeps its original class mix.
# NOTE(review): SMOTE is applied after one-hot encoding, so synthetic rows may
# carry fractional values in the one-hot columns — confirm this is acceptable
# or consider a categorical-aware sampler.
smote = SMOTE(random_state=RANDOM_STATE)
X_train_res, y_train_res = smote.fit_resample(X_train_pre, y_train)

print("✅ Data preprocessing and balancing complete.")


✅ Data preprocessing and balancing complete.


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Candidate classifiers, keyed by the name used in the comparison table and in
# the filename of the saved model.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=200, random_state=42),
    # probability=True makes SVC fit an internal cross-validated probability
    # model that shuffles the data; seeding it keeps predict_proba (and thus
    # ROC AUC) repeatable between runs.
    "SVC": SVC(kernel='rbf', probability=True, random_state=42),
    # use_label_encoder is deprecated and recent XGBoost releases ignore it
    # with a "parameters are not used" warning, so it is no longer passed.
    # Labels are already integer-encoded before training.
    "XGBoost": XGBClassifier(eval_metric='mlogloss')
}

# One metrics record per model; turned into a comparison table afterwards.
# NOTE(review): the recorded run shows the tree ensembles scoring 1.0 on every
# metric. Check whether funding_class is derived from input features such as
# funding_total_usd or status (target leakage) before trusting these numbers.
results = []

for name, model in models.items():
    # Train on the SMOTE-balanced data, score on the untouched test split.
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test_pre)
    y_proba = model.predict_proba(X_test_pre)

    # Macro averaging weights every class equally regardless of its support;
    # ROC AUC is computed one-vs-rest from the class probabilities.
    metrics = {
        "model": name,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average='macro'),
        "recall": recall_score(y_test, y_pred, average='macro'),
        "f1": f1_score(y_test, y_pred, average='macro'),
        "roc_auc": roc_auc_score(y_test, y_proba, multi_class='ovr'),
    }
    results.append(metrics)
    print(f"🔍 {name} → Accuracy: {metrics['accuracy']:.4f}, F1: {metrics['f1']:.4f}, ROC AUC: {metrics['roc_auc']:.4f}")

# Rank the models from best to worst macro F1.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="f1", ascending=False)
print("\n📊 Model comparison (sorted by F1):")
print(results_df)


Found existing installation: xgboost 2.1.4
Uninstalling xgboost-2.1.4:
  Successfully uninstalled xgboost-2.1.4
Collecting xgboost
  Using cached xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Using cached xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl (1.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.4
🔍 LogisticRegression → Accuracy: 0.7738, F1: 0.8021, ROC AUC: 0.9627
🔍 RandomForest → Accuracy: 1.0000, F1: 1.0000, ROC AUC: 1.0000
🔍 GradientBoosting → Accuracy: 1.0000, F1: 1.0000, ROC AUC: 1.0000
🔍 SVC → Accuracy: 0.7282, F1: 0.7684, ROC AUC: 0.9748
🔍 XGBoost → Accuracy: 0.9968, F1: 0.9961, ROC AUC: 0.9993

📊 Model comparison (sorted by F1):
                model  accuracy  precision    recall        f1   roc_auc
1        RandomForest  1.000000   1.000000  1.000000  1.000000  1.000000
2    GradientBoosting  1.000000   1.000000  1.000000  1.000000  1.000000
4             XGBoost  0.996837   0.993673  0.998576  0.996093  0.999306
0  Logist

In [8]:
# Save the best model (first row of the F1-sorted comparison table) together
# with the fitted objects needed to use it on raw data.
import os
import joblib

best_model_name = results_df.iloc[0]['model']
best_model = models[best_model_name]

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("Models", exist_ok=True)
joblib.dump(best_model, f"Models/best_model_{best_model_name}.pkl")

# The model was trained on the preprocessor's output and on integer-encoded
# labels, so both fitted objects are required to score raw rows and to turn
# predictions back into class names. They are written next to the model.
joblib.dump(preprocessor, f"Models/best_model_{best_model_name}_preprocessor.pkl")
joblib.dump(label_encoder, f"Models/best_model_{best_model_name}_label_encoder.pkl")
print(f"✅ Best model ({best_model_name}) saved successfully!")


✅ Best model (RandomForest) saved successfully!


In [7]:
# Sanity check: predict the first held-out row and compare it with its true label.
sample_entry = X_test.iloc[0:1]
sample_entry_pre = preprocessor.transform(sample_entry)
sample_prediction = best_model.predict(sample_entry_pre)
sample_prediction_label = label_encoder.inverse_transform(sample_prediction)
print(f"🔍 Sample entry prediction: {sample_prediction_label[0]}")

# y_test is positionally aligned with X_test, so its first element is the
# true (encoded) label of the sampled row.
original_label = label_encoder.inverse_transform(y_test[:1])
prediction_matches = sample_prediction_label[0] == original_label[0]
print(
    "✅ The prediction matches the original data."
    if prediction_matches
    else "❌ The prediction does not match the original data."
)

# Per-class precision/recall/F1 of the best model on the full test split.
print("\n📋 Classification Report:")
y_test_pred = best_model.predict(X_test_pre)
print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_))

🔍 Sample entry prediction: Funding Eligible
✅ The prediction matches the original data.

📋 Classification Report:
                   precision    recall  f1-score   support

 Funding Eligible       1.00      1.00      1.00      9598
Mentorship Needed       1.00      1.00      1.00      2119
         Rejected       1.00      1.00      1.00      1247

         accuracy                           1.00     12964
        macro avg       1.00      1.00      1.00     12964
     weighted avg       1.00      1.00      1.00     12964

