In [1]:
!pip install category_encoders


Collecting category_encoders
  Downloading category_encoders-2.9.0-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.9.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.9/85.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.9.0


In [2]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler,LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.utils import class_weight
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import category_encoders as ce


import joblib



In [5]:
import os
if not os.path.exists('train.csv') and not os.path.exists('test.csv'):
    !unzip -o start-up-founder-retention-prediction.zip
else:
    print("Files 'train.csv' and 'test.csv' already exist. Skipping unzip.")

Archive:  start-up-founder-retention-prediction.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [6]:
# Load the three competition tables from the current working directory.
train, test, sample_sub = map(
    pd.read_csv, ("train.csv", "test.csv", "sample_submission.csv")
)

# Report the table dimensions, then preview the first training rows.
for caption, frame in (("Train shape:", train), ("Test shape:", test)):
    print(caption, frame.shape)
display(train.head())


Train shape: (59611, 24)
Test shape: (14900, 23)


Unnamed: 0,founder_id,founder_age,founder_gender,years_with_startup,founder_role,monthly_revenue_generated,work_life_balance_rating,venture_satisfaction,startup_performance_rating,funding_rounds_led,...,num_dependents,startup_stage,team_size_category,years_since_founding,remote_operations,leadership_scope,innovation_support,startup_reputation,founder_visibility,retention_status
0,8410,31,Male,19,Education,5390.0,Excellent,Medium,Average,2,...,0.0,Mid,Medium,89.0,No,No,No,Excellent,Medium,Stayed
1,64756,59,Female,4,Media,5534.0,Poor,High,Low,3,...,3.0,Mid,Medium,21.0,No,No,No,Fair,Low,Stayed
2,30257,24,Female,10,Healthcare,8159.0,Good,High,Low,0,...,3.0,Mid,Medium,74.0,No,No,No,Poor,Low,Stayed
3,65791,36,Female,7,Education,3989.0,Good,High,High,1,...,2.0,Mid,Small,50.0,Yes,No,No,Good,Medium,Stayed
4,65026,56,Male,41,Education,4821.0,,,Average,0,...,0.0,Senior,Medium,68.0,No,No,No,Fair,Medium,Stayed


In [7]:



# Column roles shared by the rest of the notebook.
TARGET = "retention_status"
IDCOL = "founder_id"

# Fail early with the available column names if the label column is absent.
if TARGET not in train.columns:
    raise KeyError(f"Target column '{TARGET}' not found in train columns: {list(train.columns)[:30]}")

# Features are everything except the label and the row identifier. The ID is
# referenced through IDCOL (not a repeated literal) so the constant stays the
# single source of truth; errors="ignore" tolerates a frame without an ID column.
X = train.drop(columns=[TARGET, IDCOL], errors="ignore")
y_raw = train[TARGET].copy()

# Encode the string labels as integers; le_target is kept so predictions can be
# mapped back to the original label text later.
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y_raw)

print("Classes:", list(le_target.classes_))

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print("Shapes -> X_train:", X_train.shape, "X_val:", X_val.shape, "y_train:", y_train.shape, "y_val:", y_val.shape)


Classes: ['Left', 'Stayed']
Shapes -> X_train: (47688, 22) X_val: (11923, 22) y_train: (47688,) y_val: (11923,)


In [8]:
# Partition the feature names by dtype: text/categorical versus numeric.
cat_cols = list(X.select_dtypes(include=['object', 'category']).columns)
num_cols = list(X.select_dtypes(include=[np.number]).columns)
print("cat_cols:", len(cat_cols), "num_cols:", len(num_cols))

# Target-encode the categorical columns, impute with the median, standardise,
# then classify with an RBF-kernel SVM (no probability calibration).
te_svm_steps = [
    ("targ_enc", ce.TargetEncoder(cols=cat_cols, smoothing=0.2)),
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", SVC(kernel="rbf", probability=False, random_state=42)),
]
pipe_te_df = Pipeline(steps=te_svm_steps)
pipe_te_df


cat_cols: 15 num_cols: 7


In [9]:
# Fraction of the training split to use; 1.0 keeps every row.
SAMPLE_FRAC = 1.0
if SAMPLE_FRAC < 1.0:
    X_train_sample = X_train.sample(frac=SAMPLE_FRAC, random_state=42)
    # y_train is a NumPy array aligned with X_train by POSITION, whereas
    # X_train_sample.index holds the original row LABELS of the full table.
    # Indexing the array with those labels would pick the wrong rows (or run
    # past the end), so translate the labels into positions within X_train.
    sample_positions = X_train.index.get_indexer(X_train_sample.index)
    y_train_sample = np.asarray(y_train)[sample_positions]
    print("Using sampled training set:", X_train_sample.shape)
else:
    X_train_sample = X_train
    y_train_sample = y_train
    print("Using full training set:", X_train_sample.shape)

Using full training set: (47688, 22)


In [10]:
# Partition the feature names by dtype: text/categorical versus numeric.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
print("Numeric cols:", len(num_cols), "Categorical cols:", len(cat_cols))

# Numeric branch: median imputation followed by standardisation.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: mode imputation followed by dense one-hot encoding;
# categories not seen during fit are encoded as all zeros.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor_keras = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols)
    ],
    remainder='drop'
)

# Choose the train/validation rows BEFORE fitting the preprocessor. Splitting
# row positions with the same arguments selects the same rows as splitting the
# transformed matrix would.
row_positions = np.arange(len(X))
train_idx, val_idx, y_train_k, y_val_k = train_test_split(
    row_positions, y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42
)

# Fit imputers/scaler/encoder on the training rows only, so that validation
# rows do not influence the learned medians, means, variances or categories
# (fitting on all of X leaked validation statistics into preprocessing).
preprocessor_keras.fit(X.iloc[train_idx])

# One fitted transformer is applied everywhere, which keeps the feature width
# identical across the train split, the validation split, X_all and the test set.
X_all = preprocessor_keras.transform(X)
X_test_all = preprocessor_keras.transform(test.drop(columns=[IDCOL], errors='ignore'))

print("X_all shape:", X_all.shape)
print("X_test_all shape:", X_test_all.shape)

X_train_k, X_val_k = X_all[train_idx], X_all[val_idx]

print("Train split:", X_train_k.shape, "Val split:", X_val_k.shape)



Numeric cols: 7 Categorical cols: 15
X_all shape: (59611, 56)
X_test_all shape: (14900, 56)
Train split: (47688, 56) Val split: (11923, 56)


In [11]:

import time, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.decomposition import PCA

import os

np.random.seed(42)

# The rest of this cell edits train_df in place (it fills missing values before
# deriving the *_missing_ind flags and clamps years_with_startup before deriving
# inconsistent_years). Re-running the cell on the already-edited frame would
# therefore reset those flags to all zeros. To keep the cell idempotent, an
# untouched snapshot is captured the first time the cell runs and every run
# starts from a fresh copy of it.
if 'train_df_raw' not in globals():
    if 'train_df' in globals():
        # Same preference as before: an existing train_df wins over the CSV.
        train_df_raw = train_df.copy()
    elif os.path.exists("train.csv"):
        train_df_raw = pd.read_csv("train.csv")
    else:
        raise RuntimeError("train_df not found in notebook and train.csv is not in working directory. Place train_df or train.csv and rerun this cell.")
train_df = train_df_raw.copy()

# Normalise the label text, rename it to the Retained/Left vocabulary used by
# this cell, and drop rows carrying any other label.
train_df['retention_status'] = train_df['retention_status'].astype(str).str.strip().str.title()
train_df['retention_status'] = train_df['retention_status'].replace({'Stayed':'Retained','Exited':'Left'})
train_df = train_df[train_df['retention_status'].isin(['Retained','Left'])].reset_index(drop=True)


# Small constant that keeps the ratio feature below from dividing by zero.
eps = 1e-6

# --- Ordinal performance score ---
# Normalise the rating text and fold synonymous levels together. A missing
# column is handled explicitly (every row becomes 'Unknown'), in line with the
# other optional columns in this cell; the previous `.get(col, '')` default was
# a plain string, on which `.astype` raised AttributeError.
if 'startup_performance_rating' in train_df.columns:
    perf_rating = train_df['startup_performance_rating'].astype(str).str.strip().str.title()
    perf_rating = perf_rating.replace({'Below Average':'Poor','Low':'Poor','High':'Excellent'})
    # astype(str) + title() turns missing values into the text 'Nan'/'None';
    # convert those back to real NaN before labelling them 'Unknown'.
    perf_rating = perf_rating.replace({'Nan': np.nan, 'None': np.nan}).fillna('Unknown')
else:
    perf_rating = pd.Series('Unknown', index=train_df.index)
train_df['startup_performance_rating'] = perf_rating
perf_map = {'Poor':1,'Average':2,'Good':3,'Excellent':4,'Unknown':0}
train_df['startup_performance_num'] = train_df['startup_performance_rating'].map(perf_map).fillna(0)


# --- Team size as a number ---
# Each size category is mapped to a fixed head-count; unmapped values become 0.
team_map = {'Small':10,'Medium':50,'Large':200}
if 'team_size_category' in train_df.columns:
    train_df['team_size_num'] = train_df['team_size_category'].map(team_map).fillna(0)
else:
    train_df['team_size_num'] = 0


# --- Missing-value indicators + median fill ---
# The indicator must be recorded BEFORE the fill, otherwise it is always 0.
numeric_fill_cols = [c for c in ['monthly_revenue_generated','num_dependents','years_since_founding'] if c in train_df.columns]
for c in numeric_fill_cols:
    train_df[c + '_missing_ind'] = train_df[c].isnull().astype(int)
    train_df[c] = train_df[c].fillna(train_df[c].median())


# --- Tenure consistency ---
# Where years_with_startup exceeds years_since_founding, clamp it to the
# founding age and flag the row.
if 'years_with_startup' in train_df.columns and 'years_since_founding' in train_df.columns:
    mask = train_df['years_with_startup'] > train_df['years_since_founding']
    train_df.loc[mask, 'years_with_startup'] = train_df.loc[mask, 'years_since_founding']
    train_df['inconsistent_years'] = mask.astype(int)
else:
    train_df['inconsistent_years'] = 0
    # Neutral placeholders so the ratio below can still be computed.
    if 'years_with_startup' not in train_df.columns:
        train_df['years_with_startup'] = 0
    if 'years_since_founding' not in train_df.columns:
        train_df['years_since_founding'] = 1


# --- Interaction features ---
# Tenure relative to company age, limited to the range [0, 2].
train_df['experience_ratio'] = (train_df['years_with_startup'] / (train_df['years_since_founding'] + eps)).clip(0,2)
# Revenue weighted by the ordinal performance score.
train_df['revenue_perf'] = train_df.get('monthly_revenue_generated', 0) * train_df['startup_performance_num']
# Revenue per team member; the +1 avoids division by zero when team_size_num is 0.
train_df['revenue_per_head'] = train_df.get('monthly_revenue_generated', 0) / (train_df.get('team_size_num',0) + 1)


# Candidate model inputs; only the ones actually present in train_df are kept.
numeric_candidates = (
    'founder_age', 'years_with_startup', 'monthly_revenue_generated', 'funding_rounds_led',
    'distance_from_investor_hub', 'num_dependents', 'years_since_founding',
    'experience_ratio', 'startup_performance_num', 'team_size_num', 'revenue_perf', 'revenue_per_head',
    'monthly_revenue_generated_missing_ind', 'num_dependents_missing_ind', 'years_since_founding_missing_ind',
    'inconsistent_years',
)
categorical_candidates = (
    'founder_gender', 'working_overtime', 'remote_operations', 'innovation_support', 'startup_stage',
)

available = set(train_df.columns)
numeric_cols = [name for name in numeric_candidates if name in available]
categorical_cols = [name for name in categorical_candidates if name in available]

# Categorical inputs become plain strings, with missing entries labelled 'Unknown'.
for col in categorical_cols:
    train_df[col] = train_df[col].fillna('Unknown').astype(str)

# Model matrix (numeric columns first, then categorical) and the 0/1 target.
X_full_df = train_df.loc[:, numeric_cols + categorical_cols].copy()
y_full = train_df['retention_status'].map({'Retained':1,'Left':0}).astype(int)

# Stratified 20% subset of the rows; the complementary 80% is discarded.
subset_parts = train_test_split(X_full_df, y_full, train_size=0.2, stratify=y_full, random_state=42)
X_small_df, y_small = subset_parts[0], subset_parts[2]


# Numeric branch: median imputation, then outlier-robust scaling.
numeric_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])
# Categorical branch: constant 'Unknown' imputation, then dense one-hot encoding.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Two independent transformers with the same layout: one for all rows, one for
# the 20% subset.
preprocessor_full = ColumnTransformer([('num', numeric_pipe, numeric_cols), ('cat', cat_pipe, categorical_cols)])
preprocessor_small = ColumnTransformer([('num', numeric_pipe, numeric_cols), ('cat', cat_pipe, categorical_cols)])

X_full_trans = preprocessor_full.fit(X_full_df).transform(X_full_df)
X_small_trans = preprocessor_small.fit(X_small_df).transform(X_small_df)

# PCA on each matrix; the component count is capped at 40 (full) / 20 (subset)
# and never exceeds the number of available features.
n_comp_full = min(40, X_full_trans.shape[1])
n_comp_small = min(20, X_small_trans.shape[1])

pca_full = PCA(n_components=n_comp_full, random_state=42)
pca_small = PCA(n_components=n_comp_small, random_state=42)

X_full_pca = pca_full.fit_transform(X_full_trans)
X_small_pca = pca_small.fit_transform(X_small_trans)


# Summarise the prepared matrices: row counts, width after preprocessing, width after PCA.
print("Prepared datasets:")
print(f"  Full dataset rows: {X_full_df.shape[0]}, features after preproc: {X_full_trans.shape[1]}, PCA dims: {X_full_pca.shape[1]}")
print(f"  Small dataset rows: {X_small_df.shape[0]}, features after preproc: {X_small_trans.shape[1]}, PCA dims: {X_small_pca.shape[1]}")

# Names consumed by the model-comparison cell. Top-level assignments in a
# notebook cell already live in the kernel's global namespace, so the former
# globals().update(...) call duplicated these bindings and has been removed.
X_full_pca_var = X_full_pca
y_full_var = y_full.values
X_small_pca_var = X_small_pca
y_small_var = y_small.values


Prepared datasets:
  Full dataset rows: 59611, features after preproc: 27, PCA dims: 27
  Small dataset rows: 11922, features after preproc: 27, PCA dims: 20


In [None]:

import time
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd


# Short aliases for the PCA-reduced matrices and label arrays prepared earlier.
X_full, y_full = X_full_pca_var, y_full_var
X_small, y_small = X_small_pca_var, y_small_var

def train_eval(X, y, label):
    """Fit a linear SVM and an MLP on a stratified 80/20 split and score both.

    Args:
        X: Feature matrix.
        y: Binary labels aligned with ``X``.
        label: Dataset tag supplied by the caller; not used inside the function.

    Returns:
        dict with, in this order, ``svm_time_sec``, ``svm_f1``, ``svm_acc``,
        ``mlp_time_sec``, ``mlp_f1``, ``mlp_acc`` (fit time rounded to 2
        decimals, scores to 4), followed by ``n_train_rows`` and ``n_val_rows``.
    """
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    def _fit_and_score(estimator):
        # Time only the fit; predictions are scored on the held-out 20%.
        started = time.time()
        estimator.fit(X_tr, y_tr)
        finished = time.time()
        predicted = estimator.predict(X_val)
        return (
            round(finished - started, 2),
            round(f1_score(y_val, predicted), 4),
            round(accuracy_score(y_val, predicted), 4),
        )

    out = {}

    out['svm_time_sec'], out['svm_f1'], out['svm_acc'] = _fit_and_score(
        SVC(kernel='linear', C=0.0624, random_state=42)
    )

    out['mlp_time_sec'], out['mlp_f1'], out['mlp_acc'] = _fit_and_score(
        MLPClassifier(hidden_layer_sizes=(128,64), learning_rate_init=0.001, max_iter=300, batch_size=128, random_state=42)
    )

    out['n_train_rows'] = X_tr.shape[0]
    out['n_val_rows'] = X_val.shape[0]
    return out

# Run the same benchmark on all rows and on the 20% subset.
print("Training on FULL dataset...")
res_full = train_eval(X_full, y_full, 'full')

print("Training on 20% subset...")
res_small = train_eval(X_small, y_small, 'small')


# One table row per dataset, with the metric columns in a fixed order.
rows = [
    {
        'dataset': tag,
        'n_rows_train': res['n_train_rows'],
        'svm_f1': res['svm_f1'],
        'svm_acc': res['svm_acc'],
        'svm_time_s': res['svm_time_sec'],
        'mlp_f1': res['mlp_f1'],
        'mlp_acc': res['mlp_acc'],
        'mlp_time_s': res['mlp_time_sec'],
    }
    for tag, res in (('full', res_full), ('20pct', res_small))
]

df_comp = pd.DataFrame(rows).set_index('dataset')
print("\n=== Comparison table ===")
print(df_comp)


# A model counts as "improved" only when its F1 on the full data is strictly
# higher than on the subset.
svm_gain = df_comp.at['full', 'svm_f1'] > df_comp.at['20pct', 'svm_f1']
mlp_gain = df_comp.at['full', 'mlp_f1'] > df_comp.at['20pct', 'mlp_f1']

interpret = [
    "Linear SVM improved on the full dataset (higher F1)." if svm_gain
    else "Linear SVM did not improve on the full dataset.",
    "MLP improved on the full dataset (higher F1)." if mlp_gain
    else "MLP did not improve on the full dataset.",
]

print("\nNotes:")
for note in interpret:
    print(" -", note)


Training on FULL dataset...


In [None]:
# Width of the preprocessed feature matrix; read by build_mlp_model().
input_dim = X_train_k.shape[1]
print("Input dimension:", input_dim)

# "Balanced" weights computed from the full encoded label vector.
classes = np.unique(y_encoded)
class_weights_array = compute_class_weight(
    class_weight="balanced", classes=classes, y=y_encoded
)

# Keras expects a plain {class_index: weight} dict of Python scalars.
# Note: this rebinds the name `class_weight`, which the notebook's import cell
# also bound to the sklearn.utils.class_weight module.
class_weight = dict(zip(map(int, classes), map(float, class_weights_array)))
print("Class weights:", class_weight)

def build_mlp_model():
    """Create and compile a fresh binary-classification MLP.

    The input width comes from the notebook-level ``input_dim``. The stack is
    Dense(128, relu) -> Dropout(0.2) -> Dense(64, relu) -> Dropout(0.2)
    -> Dense(1, sigmoid).

    Returns:
        A ``keras.Sequential`` model compiled with Adam (learning rate 1e-3),
        binary cross-entropy loss and the accuracy metric.
    """
    stack = [layers.Input(shape=(input_dim,))]
    for width in (128, 64):
        # Each hidden block is a ReLU dense layer followed by 20% dropout.
        stack.append(layers.Dense(width, activation='relu'))
        stack.append(layers.Dropout(0.2))
    stack.append(layers.Dense(1, activation='sigmoid'))

    net = keras.Sequential(stack)
    adam = keras.optimizers.Adam(learning_rate=1e-3)
    net.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return net

# Seed the random number generators used by Keras before the model is created,
# so weight initialisation, dropout masks and batch shuffling start from the
# same state on every run. (Some accelerator kernels may still be
# non-deterministic.)
keras.utils.set_random_seed(42)

model = build_mlp_model()

# Both callbacks watch the validation loss: stop after 5 epochs without
# improvement (rolling back to the best weights) and halve the learning rate
# after 3 such epochs, never going below 1e-5.
callbacks = [
    keras.callbacks.EarlyStopping(
        patience=5,
        restore_best_weights=True,
        monitor='val_loss'
    ),
    keras.callbacks.ReduceLROnPlateau(
        patience=3,
        factor=0.5,
        min_lr=1e-5,
        monitor='val_loss'
    )
]

print("\nTraining Keras MLP (with class_weight) on train/val split")
history = model.fit(
    X_train_k, y_train_k,
    validation_data=(X_val_k, y_val_k),
    epochs=50,
    batch_size=256,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=2
)

print("\nTuning threshold on validation")
val_probs = model.predict(X_val_k, verbose=0).ravel()

# 31 evenly spaced cut-offs between 0.35 and 0.65.
thresholds = np.linspace(0.35, 0.65, 31)

# Defaults that remain in place if no cut-off reaches an accuracy above zero.
best_t = 0.5
best_acc = 0

# Score every cut-off; max() returns the first entry among ties, i.e. the
# lowest threshold that attains the top validation accuracy.
scored = [
    (accuracy_score(y_val_k, (val_probs >= cut).astype(int)), cut)
    for cut in thresholds
]
top_acc, top_cut = max(scored, key=lambda pair: pair[0])
if top_acc > best_acc:
    best_acc, best_t = top_acc, top_cut

print("Best threshold on val:", best_t)
print("Best val accuracy:", round(best_acc, 5))
# Retrain a fresh network on every labelled row for the final submission.
model_full = build_mlp_model()

# No validation data exists in this fit, so early stopping watches the
# training loss instead.
es_full = keras.callbacks.EarlyStopping(
    patience=4,
    restore_best_weights=True,
    monitor='loss'
)

# Epoch budget = the epoch at which the validated model reached its lowest
# val_loss. The validated run restores those weights, but its history also
# contains the extra "patience" epochs that ran afterwards, so using
# len(history.history['loss']) would train the final model for longer than the
# model whose threshold was tuned.
best_epoch = int(np.argmin(history.history['val_loss'])) + 1

history_full = model_full.fit(
    X_all, y_encoded,
    epochs=best_epoch,
    batch_size=256,
    class_weight=class_weight,
    callbacks=[es_full],
    verbose=2
)

print("\nPredicting on test")
test_probs = model_full.predict(X_test_all, verbose=0).ravel()

# Apply the validation-tuned cut-off, then map 0/1 back to the label text.
preds_num = (test_probs >= best_t).astype(int)
preds_str = le_target.inverse_transform(preds_num)

# Use the real identifiers when the test table has them, else a running index.
ids = test["founder_id"].values if "founder_id" in test.columns else np.arange(len(preds_str))

out_path = "submission_keras_mlp.csv"
submission_keras = pd.DataFrame({"founder_id": ids, "retention_status": preds_str})
submission_keras.to_csv(out_path, index=False)


print("\nSubmission saved:", out_path)
print("Used threshold:", best_t)
print("Used class weights:", class_weight)


In [None]:


import numpy as np, pandas as pd, joblib

# NOTE(review): `rnd_svm` is not created by any cell visible in this notebook;
# presumably it is a fitted hyper-parameter search object — confirm that the
# cell defining it runs before this one.
best_svm_pipe = rnd_svm.best_estimator_
print("Using best SVM params:", rnd_svm.best_params_)


# Refit the winning pipeline on every labelled row before predicting.
print("\n Fitting best SVM on FULL training data...")
best_svm_pipe.fit(X, y_encoded)
print(" Full-data SVM fit complete.")


test_features = test.drop(columns=[IDCOL], errors="ignore")

svm_preds_num = best_svm_pipe.predict(test_features)

# Map the integer predictions back to the original label text.
svm_preds_str = le_target.inverse_transform(svm_preds_num)


# Use the real identifiers when the test table has them, else a running index.
if IDCOL in test.columns:
    ids = test[IDCOL].values
else:
    ids = np.arange(len(svm_preds_str))

submission_svm_tuned = pd.DataFrame({
    "founder_id": ids,
    "retention_status": svm_preds_str
})

# Outputs are written relative to the working directory (the same place the
# input CSVs are read from and the Keras submission is written to) instead of
# the hard-coded Colab folder /content, so the cell also runs outside Colab.
out_svm = "submission_svm_tuned.csv"
svm_model_path = "svm_tuned_full.joblib"

submission_svm_tuned.to_csv(out_svm, index=False)
joblib.dump(best_svm_pipe, svm_model_path)

print("\n Tuned SVM submission saved:", out_svm)
print(" Tuned SVM model saved:", svm_model_path)
print(" Submission rows:", len(submission_svm_tuned))


In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.exceptions import NotFittedError
import joblib

# NOTE(review): `preprocessor` and `test_df` are not created by any cell visible
# in this notebook (the visible cells define preprocessor_full /
# preprocessor_small / preprocessor_keras and `test`). Confirm which objects are
# intended, and that the test frame carries the same columns as train_df.
X_full = train_df.drop(columns=['retention_status'])
y_full = train_df['retention_status'].map({'Retained':1,'Left':0})

# Determine the post-preprocessing feature width, fitting the preprocessor
# first if it has not been fitted yet.
try:
    n_features_after_preproc = preprocessor.transform(X_full).shape[1]
except NotFittedError:
    preprocessor.fit(X_full)
    n_features_after_preproc = preprocessor.transform(X_full).shape[1]

# PCA keeps at most 40 components and never more than the available features.
n_pca = min(40, n_features_after_preproc)

lr_pipe = Pipeline([
    ('preprocess', preprocessor),
    ('pca', PCA(n_components=n_pca, random_state=42)),
    ('clf', LogisticRegression(solver='saga', penalty='l2', C=1.0, max_iter=1000, class_weight='balanced', random_state=42, n_jobs=-1))
])

lr_pipe.fit(X_full, y_full)

test_ids = test_df['founder_id']
test_X = test_df.drop(columns=['founder_id']).copy()
test_preds = lr_pipe.predict(test_X)

# The submission uses the competition's label text: class 1 ('Retained' in this
# cell's encoding) is written as 'Stayed'.
label_map = {1:'Stayed', 0:'Left'}
submission = pd.DataFrame({'founder_id': test_ids, 'retention_status': [label_map[int(p)] for p in test_preds]})

# Outputs are written relative to the working directory (the same place the
# input CSVs are read from) instead of the hard-coded Colab folder /content,
# so the cell also runs outside Colab.
submission_path = 'submission_lr.csv'
model_path = 'best_logreg.joblib'
submission.to_csv(submission_path, index=False, encoding='utf-8')
joblib.dump(lr_pipe, model_path)
print(f"Saved {submission_path} and {model_path}")
print(submission['retention_status'].value_counts())


In [None]:
# Re-read the three competition tables from the current working directory.
train, test, sample_sub = [
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
]

# Report the table dimensions, then preview the first training rows.
print("Train shape:", train.shape)
print("Test shape:", test.shape)
display(train.head())