In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/start-up-founder-retention-prediction/sample_submission.csv
/kaggle/input/start-up-founder-retention-prediction/train.csv
/kaggle/input/start-up-founder-retention-prediction/test.csv


In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from xgboost import XGBClassifier

# ======================================================
# 1) Load TRAIN and EXTERNAL TEST CSVs
# ======================================================

train_path = "/kaggle/input/start-up-founder-retention-prediction/train.csv"
test_path  = "/kaggle/input/start-up-founder-retention-prediction/test.csv"

df = pd.read_csv(train_path)
df_ext_test = pd.read_csv(test_path)

# ======================================================
# 2) FIXED COLUMN GROUPS
# ======================================================

numeric_cols = [
    "founder_id",
    "founder_age",
    "years_with_startup",
    "monthly_revenue_generated",
    "funding_rounds_led",
    "distance_from_investor_hub",
    "num_dependents"
]

categorical_cols = [
    "founder_gender",
    "founder_role",
    "work_life_balance_rating",
    "venture_satisfaction",
    "startup_performance_rating",
    "education_background",
    "personal_status",
    "startup_stage",
    "team_size_category",
    "years_since_founding",
    "innovation_support",
    "startup_reputation"
]

boolean_cols = [
    "working_overtime",
    "remote_operations",
    "founder_visibility",
    "leadership_scope"
]

target_col = "retention_status"

# ======================================================
# 3) Normalize Booleans (handles messy labels)
# ======================================================

def normalize_boolean(col):
    return (
        col.astype(str)
           .str.strip()
           .str.lower()
           .map({
               "true": 1, "false": 0,
               "yes": 1, "no": 0,
               "1": 1, "0": 0
           })
           .astype("Int64")
    )

# TRAIN
for col in boolean_cols:
    df[col] = normalize_boolean(df[col])

# EXTERNAL TEST
for col in boolean_cols:
    if col in df_ext_test.columns:
        df_ext_test[col] = normalize_boolean(df_ext_test[col])

# Convert target
df[target_col] = df[target_col].map({"Stayed": 1, "Left": 0})

# ======================================================
# 4) IDENTIFY AND REMOVE REDUNDANT COLUMNS
# ======================================================

cols_to_drop = []

# A) Near-zero variance columns
for col in numeric_cols + categorical_cols + boolean_cols:
    if df[col].nunique() <= 1:
        cols_to_drop.append(col)

# B) High-correlation numeric columns
numerics_df = df[numeric_cols].astype(float)
corr_matrix = numerics_df.corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

high_corr_cols = [col for col in upper_tri.columns if any(upper_tri[col] > 0.95)]
cols_to_drop += high_corr_cols

# Remove duplicates in drop list
cols_to_drop = list(set(cols_to_drop))

print("\n===== REDUNDANT COLUMNS REMOVED =====")
print(cols_to_drop)

# Remove redundant columns
df = df.drop(columns=cols_to_drop)
df_ext_test = df_ext_test.drop(columns=cols_to_drop)

# Update column groups
numeric_cols = [col for col in numeric_cols if col not in cols_to_drop]
categorical_cols = [col for col in categorical_cols if col not in cols_to_drop]
boolean_cols = [col for col in boolean_cols if col not in cols_to_drop]

# ======================================================
# 5) 80:20 SPLIT FOR VALIDATION
# ======================================================

X = df.drop(columns=[target_col])
y = df[target_col]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# ======================================================
# 6) PREPROCESSING PIPELINES
# ======================================================

skewed_numeric_cols = ["monthly_revenue_generated", "distance_from_investor_hub"]
skewed_numeric_cols = [col for col in skewed_numeric_cols if col in numeric_cols]
skew_indices = [numeric_cols.index(c) for c in skewed_numeric_cols]

def log_transform_selected(X):
    X = X.copy().astype(float)
    for idx in skew_indices:
        col = X[:, idx]
        col = np.where(col < 0, 0.0, col)
        X[:, idx] = np.log1p(col)
    return X

numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("log", FunctionTransformer(log_transform_selected, validate=False)),
    ("scaler", StandardScaler())
])

def make_ohe():
    if sklearn.__version__ >= "1.2":
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    else:
        return OneHotEncoder(handle_unknown="ignore", sparse=False)

categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", make_ohe())
])

boolean_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_cols),
    ("cat", categorical_pipeline, categorical_cols),
    ("bool", boolean_pipeline, boolean_cols)
])
from sklearn.linear_model import LogisticRegression

# ======================================================
# 7) LOGISTIC REGRESSION MODEL (replaces XGBoost)
# ======================================================

log_reg = LogisticRegression(
    penalty="l2",
    solver="lbfgs",
    max_iter=500,
    class_weight="balanced",     # (optional but helps if target is imbalanced)
    random_state=42
)

clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", log_reg)
])

clf.fit(X_train, y_train)

# ======================================================
# 8) STRATIFIED K-FOLD VALIDATION
# ======================================================

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_f1, fold_auc = [], []
combined_cm = np.zeros((2, 2), dtype=int)

X_val_df = X_val.reset_index(drop=True)
y_val_arr = y_val.reset_index(drop=True).to_numpy()

print("\n===== Validation (20%) with K-Fold =====")
for i, (_, val_idx) in enumerate(skf.split(X_val_df, y_val_arr), 1):
    X_fold = X_val_df.iloc[val_idx]
    y_fold = y_val_arr[val_idx]

    y_pred = clf.predict(X_fold)
    y_prob = clf.predict_proba(X_fold)[:, 1]

    f1 = f1_score(y_fold, y_pred)
    auc = roc_auc_score(y_fold, y_prob)
    cm = confusion_matrix(y_fold, y_pred)

    fold_f1.append(f1)
    fold_auc.append(auc)
    combined_cm += cm

    print(f"\nFold {i} → F1={f1:.4f}, AUC={auc:.4f}")
    print(cm)

print("\n===== FINAL VALIDATION METRICS =====")
print("Average F1:", np.mean(fold_f1))
print("Average AUC:", np.mean(fold_auc))
print("Combined CM:\n", combined_cm)

# ======================================================
# 9) RETRAIN LOGISTIC REGRESSION ON FULL TRAIN.CSV
# ======================================================

clf_full = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", log_reg)
])

clf_full.fit(X, y)
print("\nModel retrained on FULL train.csv ✔")

# ======================================================
# 10) PREDICT ON EXTERNAL TEST & EXPORT CSV
# ======================================================

ext_pred_binary = clf_full.predict(df_ext_test)
ext_pred_label = np.where(ext_pred_binary == 1, "Stayed", "Left")

submission = pd.DataFrame({
    "founder_id": df_ext_test["founder_id"],
    "retention_status": ext_pred_label
})

submission.to_csv("submission.csv", index=False)

print("\nsubmission.csv created successfully!")
print(submission.head())

  return op(a, b)



===== REDUNDANT COLUMNS REMOVED =====
['founder_visibility']

===== Validation (20%) with K-Fold =====

Fold 1 → F1=0.7550, AUC=0.8389
[[873 261]
 [334 917]]

Fold 2 → F1=0.7466, AUC=0.8305
[[854 280]
 [339 912]]

Fold 3 → F1=0.7504, AUC=0.8253
[[838 296]
 [322 929]]

Fold 4 → F1=0.7565, AUC=0.8359
[[847 287]
 [315 935]]

Fold 5 → F1=0.7489, AUC=0.8327
[[827 307]
 [318 932]]

===== FINAL VALIDATION METRICS =====
Average F1: 0.7514875479817921
Average AUC: 0.8326628502629994
Combined CM:
 [[4239 1431]
 [1628 4625]]

Model retrained on FULL train.csv ✔

submission.csv created successfully!
   founder_id retention_status
0       52685           Stayed
1       30585             Left
2       54656           Stayed
3       33442             Left
4       15667           Stayed
