In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/start-up-founder-retention-prediction/sample_submission.csv
/kaggle/input/start-up-founder-retention-prediction/train.csv
/kaggle/input/start-up-founder-retention-prediction/test.csv


In [9]:
!pip install category_encoders imbalanced-learn

[0m^C


In [1]:
import time

import numpy as np
import pandas as pd
import sklearn
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.svm import SVC

# ======================================================
# Logging helper
# ======================================================

# Time of the most recent log() call; each call overwrites it.
last_time = time.time()


def log(msg):
    """Print `msg` with the current clock time and the seconds since the last call.

    The line has the form ``[HH:MM:SS] <msg> | +<elapsed>s``. The module-level
    `last_time` is updated afterwards so the next call reports its own delta.
    """
    global last_time
    current = time.time()
    delta = current - last_time
    stamp = time.strftime('%H:%M:%S')
    print(f"[{stamp}] {msg} | +{delta:.2f}s")
    last_time = current

# ======================================================
# 1) Load TRAIN and TEST
# ======================================================
log("STARTING SCRIPT")
log("Loading datasets...")

train_path = "/kaggle/input/start-up-founder-retention-prediction/train.csv"
test_path  = "/kaggle/input/start-up-founder-retention-prediction/test.csv"

# `df` is the labelled training table; `df_ext_test` is the external test table
# that predictions are generated for at the end of the script.
df, df_ext_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

log(f"Train shape: {df.shape}, Test shape: {df_ext_test.shape}")

# ======================================================
# 2) Column Groups
# ======================================================
log("Setting column groups...")

# Columns routed through the numeric pipeline.
# NOTE(review): "founder_id" looks like a row identifier rather than a predictive
# signal — confirm it is intentionally used as a model feature.
numeric_cols = [
    "founder_id",
    "founder_age",
    "years_with_startup",
    "monthly_revenue_generated",
    "funding_rounds_led",
    "distance_from_investor_hub",
    "num_dependents",
]

# Columns whose text labels are replaced by ranks via `ordinal_mappings`.
ordinal_cols = [
    "founder_visibility",
    "startup_reputation",
    "team_size_category",
    "startup_stage",
    "startup_performance_rating",
    "venture_satisfaction",
    "work_life_balance_rating",
]

# Columns that are one-hot encoded.
categorical_cols = [
    "founder_gender",
    "founder_role",
    "education_background",
    "personal_status",
    "innovation_support",
]

# Yes/no style columns that are normalised to 0/1.
boolean_cols = [
    "working_overtime",
    "remote_operations",
    "leadership_scope",
]

# Name of the label column in the training table.
target_col = "retention_status"

# ======================================================
# 3) Normalize Boolean Columns
# ======================================================
log("Normalizing boolean columns...")

def normalize_boolean(col):
    """Convert a yes/no-style column to nullable 0/1 integers.

    Values are stringified, stripped and lower-cased before lookup.
    Truthy tokens: "true", "yes", "1", "1.0". Falsy tokens: "false", "no",
    "0", "0.0". Anything else, including missing values, becomes <NA>.

    The "1.0"/"0.0" tokens are accepted because a 0/1 column that contains
    blanks is held as float64, so its values stringify as "1.0"/"0.0"; without
    them such a column would be turned entirely into <NA>.

    Parameters
    ----------
    col : pandas.Series
        Raw column values (strings, bools, ints or floats).

    Returns
    -------
    pandas.Series
        Series of dtype "Int64" with values 1, 0 or <NA>.
    """
    token_to_flag = {
        "true": 1, "yes": 1, "1": 1, "1.0": 1,
        "false": 0, "no": 0, "0": 0, "0.0": 0,
    }
    tokens = col.astype(str).str.strip().str.lower()
    return tokens.map(token_to_flag).astype("Int64")

# Normalise every boolean column in both the training and external test tables.
for frame in (df, df_ext_test):
    for col in boolean_cols:
        frame[col] = normalize_boolean(frame[col])

log("Boolean normalization done.")

# ======================================================
# 4) Ordinal Encoding
# ======================================================
log("Applying ordinal encoding...")


def _ranked(*labels):
    """Map each label to its position in the given low-to-high order."""
    return {label: rank for rank, label in enumerate(labels)}


# Lower-case label -> rank, one mapping per ordinal column.
ordinal_mappings = {
    "founder_visibility": _ranked("low", "medium", "high", "very high"),
    "startup_reputation": _ranked("poor", "fair", "good", "excellent"),
    "team_size_category": _ranked("small", "medium", "large"),
    "startup_stage": _ranked("entry", "mid", "senior"),
    "startup_performance_rating": _ranked("low", "below average", "average", "high"),
    "venture_satisfaction": _ranked("low", "medium", "high", "very high"),
    "work_life_balance_rating": _ranked("fair", "good", "excellent"),
}

def apply_ordinal(df, col, mapping):
    """Replace `df[col]` in place with the values looked up in `mapping`.

    Labels are stringified, stripped and lower-cased before the lookup; any
    label without an entry in `mapping` becomes NaN. Nothing is returned.
    """
    cleaned_labels = df[col].astype(str).str.strip().str.lower()
    df[col] = cleaned_labels.map(mapping)

# Apply each column's label->rank mapping to both tables.
for col in ordinal_cols:
    col_mapping = ordinal_mappings[col]
    for frame in (df, df_ext_test):
        apply_ordinal(frame, col, col_mapping)

log("Ordinal encoding done.")

# ======================================================
# 5) Convert Target
# ======================================================
log("Converting target variable...")

# Positive class (1) is "Stayed"; labels outside this mapping become NaN.
target_codes = {"Stayed":1, "Left":0}
df[target_col] = df[target_col].map(target_codes)

# ======================================================
# 6) Train/Validation Split
# ======================================================
log("Splitting train/validation sets...")

# Separate the label from the features, then hold out 20% for validation while
# keeping the class proportions of `y` in both parts.
y = df[target_col]
X = df.drop(columns=[target_col])

split_options = dict(test_size=0.20, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

log(f"Train size: {X_train.shape}, Val size: {X_val.shape}")

# ======================================================
# 7) Preprocessing Pipelines
# ======================================================
log("Building preprocessing pipelines...")

# Features designated as skewed, restricted to those present in numeric_cols.
skewed_numeric_cols = [
    name
    for name in ("monthly_revenue_generated", "distance_from_investor_hub")
    if name in numeric_cols
]
# Positions of those features within numeric_cols.
skew_indices = list(map(numeric_cols.index, skewed_numeric_cols))


def log_transform_selected(X):
    """Return a float copy of `X` with log1p applied to the skewed columns.

    Columns are addressed by the positions in the module-level `skew_indices`;
    values below 0 are clipped to 0 before taking the log. `X` is indexed as a
    2-D array (`X[:, j]`) and is not modified.
    """
    transformed = X.copy().astype(float)
    for position in skew_indices:
        clipped = np.clip(transformed[:, position], 0, None)
        transformed[:, position] = np.log1p(clipped)
    return transformed

# Numeric features: fill gaps with the median, log1p the skewed columns, then
# standardise.
numeric_steps = [
    ("impute", SimpleImputer(strategy="median")),
    ("log", FunctionTransformer(log_transform_selected, validate=False)),
    ("scale", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

def make_ohe():
    """Build a dense-output OneHotEncoder that ignores unseen categories.

    Newer scikit-learn releases spell the dense-output switch `sparse_output`,
    older ones spell it `sparse`. The constructor is probed directly instead of
    comparing `sklearn.__version__` as a string: string comparison orders
    "1.10" before "1.2" and would then select the wrong keyword.

    Returns
    -------
    OneHotEncoder
        Unfitted encoder with handle_unknown="ignore" and dense output.
    """
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # This scikit-learn version does not accept `sparse_output`; fall back
        # to the older `sparse` keyword.
        return OneHotEncoder(handle_unknown="ignore", sparse=False)

def _mode_imputer():
    """Return a fresh imputer that fills gaps with the most frequent value."""
    return SimpleImputer(strategy="most_frequent")


# Categorical features: mode-impute, then one-hot encode.
categorical_pipeline = Pipeline(steps=[
    ("impute", _mode_imputer()),
    ("ohe", make_ohe()),
])

# Boolean features: mode-impute only.
boolean_pipeline = Pipeline(steps=[
    ("impute", _mode_imputer()),
])

# Ordinal features: mode-impute, then standardise.
ordinal_pipeline = Pipeline(steps=[
    ("impute", _mode_imputer()),
    ("scale", StandardScaler()),
])

# Route every column group to its own pipeline; columns not listed in any group
# are not passed on by this transformer configuration.
column_routes = [
    ("num", numeric_pipeline, numeric_cols),
    ("ord", ordinal_pipeline, ordinal_cols),
    ("cat", categorical_pipeline, categorical_cols),
    ("bool", boolean_pipeline, boolean_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

log("Preprocessing setup ready.")

# ======================================================
# 8) Train SVM Model
# ======================================================
log("Training SVM model... This may take a while.")

# RBF-kernel SVM. probability=True is needed because the evaluation step calls
# predict_proba on the fitted pipeline.
svm_options = dict(
    kernel="rbf",
    C=1.5,
    gamma="scale",
    probability=True,
    random_state=42,
)
svm = SVC(**svm_options)

# Preprocessing and model in one estimator, fitted on the training split only.
clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", svm),
])
clf.fit(X_train, y_train)

log("SVM training completed.")

# ======================================================
# 9) K-Fold Evaluation
# ======================================================
log("Starting K-fold validation...")

# NOTE: `clf` is not refit inside this loop. The already-fitted model is scored
# on five stratified slices of the held-out validation split, so the per-fold
# numbers show metric variability across slices, not cross-validated training.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_f1, fold_auc = [], []

# Pin the label order so every per-fold confusion matrix is 2x2. Without it a
# slice lacking one class yields a smaller matrix, which would either fail to
# add or be broadcast into every cell of the running total.
class_labels = [0, 1]
combined_cm = np.zeros((2,2), dtype=int)

# Positional indices from skf.split are applied with .iloc / array indexing, so
# both objects are re-indexed from 0.
X_val_df = X_val.reset_index(drop=True)
y_val_arr = y_val.reset_index(drop=True).to_numpy()

for fold, (_, idx) in enumerate(skf.split(X_val_df, y_val_arr), 1):
    log(f"Evaluating Fold {fold}...")

    # Only the "test" side of each split is used as the slice to score.
    X_fold = X_val_df.iloc[idx]
    y_fold = y_val_arr[idx]

    y_pred = clf.predict(X_fold)
    y_prob = clf.predict_proba(X_fold)[:, 1]

    f1 = f1_score(y_fold, y_pred)
    auc = roc_auc_score(y_fold, y_prob)
    cm = confusion_matrix(y_fold, y_pred, labels=class_labels)

    fold_f1.append(f1)
    fold_auc.append(auc)
    combined_cm += cm

    print(f"Fold {fold} - F1={f1:.4f}, AUC={auc:.4f}")
    print(cm)

log("K-fold validation finished.")
print("\nAverage F1:", np.mean(fold_f1))
print("Average AUC:", np.mean(fold_auc))
print("Combined Confusion Matrix:\n", combined_cm)

# ======================================================
# 10) Retrain on Full Data
# ======================================================
log("Training final model on full dataset...")

# clone() returns an unfitted copy of the pipeline with identical settings.
# Wrapping the same `preprocessor` and `svm` objects in a new Pipeline would
# refit those shared objects in place, silently replacing the state of the
# validation-time `clf` with a model trained on the full data.
clf_full = clone(clf)

clf_full.fit(X, y)
log("Full training completed.")

# ======================================================
# 11) Predict on External Test
# ======================================================
log("Predicting on external test...")

# Predict class codes and translate them back to the original text labels.
ext_pred = clf_full.predict(df_ext_test)
ext_pred_label = np.where(ext_pred == 1, "Stayed", "Left")

# Two-column submission: the founder id next to its predicted label.
submission = df_ext_test[["founder_id"]].assign(retention_status=ext_pred_label)
submission.to_csv("submission.csv", index=False)
log("submission.csv saved!")

preview = submission.head()
print(preview)
log("SCRIPT FINISHED.")


[12:17:56] STARTING SCRIPT | +0.00s
[12:17:56] Loading datasets... | +0.00s
[12:17:57] Train shape: (59611, 24), Test shape: (14900, 23) | +0.38s
[12:17:57] Setting column groups... | +0.00s
[12:17:57] Normalizing boolean columns... | +0.00s
[12:17:57] Boolean normalization done. | +0.11s
[12:17:57] Applying ordinal encoding... | +0.00s
[12:17:57] Ordinal encoding done. | +0.22s
[12:17:57] Converting target variable... | +0.00s
[12:17:57] Splitting train/validation sets... | +0.00s
[12:17:57] Train size: (47688, 23), Val size: (11923, 23) | +0.06s
[12:17:57] Building preprocessing pipelines... | +0.00s
[12:17:57] Preprocessing setup ready. | +0.00s
[12:17:57] Training SVM model... This may take a while. | +0.00s
[12:37:19] SVM training completed. | +1162.50s
[12:37:19] Starting K-fold validation... | +0.00s
[12:37:19] Evaluating Fold 1... | +0.01s
Fold 1 - F1=0.7526, AUC=0.8322
[[847 287]
 [323 928]]
[12:37:31] Evaluating Fold 2... | +11.53s
Fold 2 - F1=0.7499, AUC=0.8237
[[838 296]
 [