In [None]:
import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [26]:
"""
AutoML pipeline for Kaggle (train + test prediction)

- Handles numeric and categorical features automatically
- Trains multiple models, finds best hyperparameters
- Predicts on provided test.csv and outputs submission.csv
"""

import pandas as pd
import numpy as np
import os
import time
import joblib
from sklearn.model_selection import StratifiedKFold, KFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error, r2_score

# Gradient boosting libraries
HAS_XGB = HAS_LGB = HAS_CAT = False
try: from xgboost import XGBClassifier, XGBRegressor; HAS_XGB = True
except: pass
try: from lightgbm import LGBMClassifier, LGBMRegressor; HAS_LGB = True
except: pass
try: from catboost import CatBoostClassifier, CatBoostRegressor; HAS_CAT = True
except: pass
!pip install catboost

RANDOM_STATE = 42

# ------------------- Utilities -------------------

def detect_task(y_series):
    if y_series.dtype.name in ['object', 'category']:
        return 'classification'
    unique = y_series.nunique(dropna=True)
    if np.issubdtype(y_series.dtype, np.integer) and unique <= 20:
        return 'classification'
    return 'regression'

def score_function(task, y_true, y_pred, y_proba=None):
    if task == 'classification':
        acc = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred, average='weighted')
        auc = None
        if y_proba is not None and len(np.unique(y_true)) == 2:
            try: auc = roc_auc_score(y_true, y_proba[:,1])
            except: pass
        return {'accuracy': acc, 'f1_weighted': f1, 'roc_auc': auc}
    else:
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_true, y_pred)
        return {'rmse': rmse, 'mse': mse, 'r2': r2}

# ------------------- Preprocessor -------------------

def build_preprocessor(X, max_onehot_levels=12):
    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    low_card_cols = [c for c in cat_cols if X[c].nunique(dropna=True) <= max_onehot_levels]
    high_card_cols = [c for c in cat_cols if c not in low_card_cols]

    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    low_card_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

    high_card_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    transformers = []
    if numeric_cols: transformers.append(('num', numeric_transformer, numeric_cols))
    if low_card_cols: transformers.append(('low_card', low_card_transformer, low_card_cols))
    if high_card_cols: transformers.append(('high_card', high_card_transformer, high_card_cols))

    return ColumnTransformer(transformers=transformers, remainder='drop', sparse_threshold=0)
    import warnings
from sklearn.preprocessing import OneHotEncoder


# ------------------- Models & Hyperparameter Spaces -------------------
def get_models_and_spaces(task):
    models = {}
    if task == 'regression':
        models['LinearRegression'] = {'model': LinearRegression(), 'space': {}}
        models['DecisionTreeRegressor'] = {'model': DecisionTreeRegressor(random_state=RANDOM_STATE),
                                           'space': {'model__max_depth': [None, 3, 5, 10, 20],
                                                     'model__min_samples_split': [2, 5, 10, 20]}}
        models['RandomForestRegressor'] = {'model': RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1),
                                           'space': {'model__n_estimators': [100, 200, 500],
                                                     'model__max_depth': [None, 5, 10, 20],
                                                     'model__min_samples_split': [2, 5, 10]}}
        if HAS_XGB:
            models['XGBRegressor'] = {'model': XGBRegressor(random_state=RANDOM_STATE, n_jobs=-1, verbosity=0),
                                      'space': {'model__n_estimators': [100, 300, 500],
                                                'model__max_depth': [3, 5, 8],
                                                'model__learning_rate': [0.01, 0.05, 0.1]}}
        if HAS_LGB:
            models['LGBMRegressor'] = {'model': LGBMRegressor(random_state=RANDOM_STATE, n_jobs=-1),
                                       'space': {'model__n_estimators': [100, 300, 500],
                                                 'model__num_leaves': [31, 50, 100],
                                                 'model__learning_rate': [0.01, 0.05, 0.1]}}
        if HAS_CAT:
            models['CatBoostRegressor'] = {'model': CatBoostRegressor(random_state=RANDOM_STATE, verbose=0),
                                           'space': {'model__iterations': [100, 300, 500],
                                                     'model__depth': [4, 6, 8],
                                                     'model__learning_rate': [0.01, 0.05, 0.1]}}
    else:
        models['LogisticRegression'] = {'model': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
                                        'space': {'model__C': [0.01, 0.1, 1, 10],
                                                  'model__penalty': ['l2']}}
        models['DecisionTreeClassifier'] = {'model': DecisionTreeClassifier(random_state=RANDOM_STATE),
                                            'space': {'model__max_depth': [None, 3, 5, 10],
                                                      'model__min_samples_split': [2, 5, 10]}}
        models['RandomForestClassifier'] = {'model': RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
                                            'space': {'model__n_estimators': [100, 300, 500],
                                                      'model__max_depth': [None, 5, 10, 20],
                                                      'model__min_samples_split': [2, 5, 10]}}
        if HAS_XGB:
            models['XGBClassifier'] = {'model': XGBClassifier(random_state=RANDOM_STATE, use_label_encoder=False, verbosity=0, n_jobs=-1),
                                       'space': {'model__n_estimators': [100, 300, 500],
                                                 'model__max_depth': [3, 5, 8],
                                                 'model__learning_rate': [0.01, 0.05, 0.1]}}
        if HAS_LGB:
            models['LGBMClassifier'] = {'model': LGBMClassifier(random_state=RANDOM_STATE, n_jobs=-1),
                                        'space': {'model__n_estimators': [100, 300, 500],
                                                  'model__num_leaves': [31, 50, 100],
                                                  'model__learning_rate': [0.01, 0.05, 0.1]}}
        if HAS_CAT:
            models['CatBoostClassifier'] = {'model': CatBoostClassifier(random_state=RANDOM_STATE, verbose=0),
                                            'space': {'model__iterations': [100, 300, 500],
                                                      'model__depth': [4, 6, 8],
                                                      'model__learning_rate': [0.01, 0.05, 0.1]}}
    return models




# ------------------- Main AutoML Runner -------------------

def run_auto_ml(train_df, test_df=None, target_col=None, n_iter_search=25, cv_folds=5, max_onehot_levels=12):
    start_time = time.time()

    if target_col is None:
        for candidate in ['target','TARGET','label','y','Survived']:
            if candidate in train_df.columns: target_col = candidate; break
        if target_col is None: raise ValueError("Provide target column name.")

    X_train = train_df.drop(columns=[target_col])
    y_train = train_df[target_col]
    task = detect_task(y_train)
    print(f"Detected task: {task}")

    # Combine train + test for preprocessing if test_df is provided
    if test_df is not None:
        combined = pd.concat([X_train, test_df], axis=0)
    else:
        combined = X_train.copy()

    preprocessor = build_preprocessor(combined, max_onehot_levels=max_onehot_levels)
    models_spaces = get_models_and_spaces(task)
    results = []

    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=RANDOM_STATE) if task=='classification' else KFold(n_splits=cv_folds, shuffle=True, random_state=RANDOM_STATE)
    scoring = 'accuracy' if task=='classification' else 'neg_root_mean_squared_error'

    for name,cfg in models_spaces.items():
        print('\n'+'='*40)
        print(f"Training model: {name}")
        pipe = Pipeline([('preprocessor', preprocessor), ('model', cfg['model'])])
        param_space = cfg.get('space', {})

        if param_space:
            search = RandomizedSearchCV(pipe, param_distributions=param_space, n_iter=min(n_iter_search,50),
                                        cv=cv, scoring=scoring, random_state=RANDOM_STATE, n_jobs=-1)
            search.fit(X_train,y_train)
            best = search.best_estimator_
            best_params = search.best_params_
        else:
            pipe.fit(X_train,y_train)
            best = pipe
            best_params = {}

        # Evaluate on training data
        y_train_pred = best.predict(X_train)
        y_train_proba = None
        try:
            if task=='classification' and hasattr(best.named_steps['model'],'predict_proba'):
                y_train_proba = best.predict_proba(X_train)
        except: pass
        train_scores = score_function(task, y_train, y_train_pred, y_train_proba)

        # Feature importances
        feature_importances = None
        try:
            m = best.named_steps['model']
            if hasattr(m,'feature_importances_'):
                X_trans = best.named_steps['preprocessor'].transform(X_train)
                feature_names = best.named_steps['preprocessor'].get_feature_names_out() if hasattr(best.named_steps['preprocessor'],'get_feature_names_out') else X_train.columns
                fi = pd.Series(m.feature_importances_, index=feature_names)
                feature_importances = fi.sort_values(ascending=False).head(20)
        except: pass

        results.append({'model': name, 'best_params': best_params,
                        'train_scores': train_scores,
                        'feature_importances': feature_importances})

        joblib.dump(best,f"best_model_{name}.joblib")
        print(f"Train scores for {name}: {train_scores}")
        if best_params:
            print("Best hyperparameters:")
            for k,v in best_params.items(): print(f"  {k}: {v}")
        if feature_importances is not None: print("Top feature importances:\n", feature_importances)

        # Predict on test_df if provided
        if test_df is not None:
            test_preds = best.predict(test_df)
            submission = pd.DataFrame({
                "PassengerId": test_df["PassengerId"],
                target_col: test_preds
            })
            submission_file = f"submission_{name}.csv"
            submission.to_csv(submission_file, index=False)
            print(f"Test predictions saved to {submission_file}")

    elapsed = time.time()-start_time
    print(f"\nAll models finished in {elapsed/60:.2f} minutes")

    return results

# ------------------- CLI -------------------

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Auto ML pipeline (multi-model search with Kaggle test prediction)')
    parser.add_argument('--train', type=str, default="/train.csv")
    parser.add_argument('--test', type=str, default="/test.csv")
    parser.add_argument('--target', type=str, default="Survived")
    parser.add_argument('--n_iter', type=int, default=25)
    parser.add_argument('--cv', type=int, default=5)
    parser.add_argument('--max_onehot', type=int, default=12)
    args,_ = parser.parse_known_args()

    train_df = pd.read_csv(args.train)
    test_df = pd.read_csv(args.test) if os.path.exists(args.test) else None

    results = run_auto_ml(train_df, test_df=test_df, target_col=args.target, n_iter_search=args.n_iter,
                          cv_folds=args.cv, max_onehot_levels=args.max_onehot)


Detected task: classification

Training model: LogisticRegression




Train scores for LogisticRegression: {'accuracy': 0.8080808080808081, 'f1_weighted': 0.806710661578735, 'roc_auc': np.float64(0.8617955027215886)}
Best hyperparameters:
  model__penalty: l2
  model__C: 10
Test predictions saved to submission_LogisticRegression.csv

Training model: DecisionTreeClassifier




Train scores for DecisionTreeClassifier: {'accuracy': 0.8282828282828283, 'f1_weighted': 0.825898782631449, 'roc_auc': np.float64(0.863284121049436)}
Best hyperparameters:
  model__min_samples_split: 2
  model__max_depth: 3
Top feature importances:
 low_card__Sex_female    0.621440
num__Pclass             0.155642
high_card__Ticket       0.062216
num__Age                0.057095
num__Fare               0.050200
num__SibSp              0.044395
high_card__Name         0.009012
num__PassengerId        0.000000
num__Parch              0.000000
low_card__Sex_male      0.000000
low_card__Embarked_Q    0.000000
low_card__Embarked_C    0.000000
low_card__Embarked_S    0.000000
high_card__Cabin        0.000000
dtype: float64
Test predictions saved to submission_DecisionTreeClassifier.csv

Training model: RandomForestClassifier
Train scores for RandomForestClassifier: {'accuracy': 1.0, 'f1_weighted': 1.0, 'roc_auc': np.float64(1.0)}
Best hyperparameters:
  model__n_estimators: 300
  model__min_



Train scores for LGBMClassifier: {'accuracy': 0.9259259259259259, 'f1_weighted': 0.9254226010645951, 'roc_auc': np.float64(0.9853588129400611)}
Best hyperparameters:
  model__num_leaves: 50
  model__n_estimators: 300
  model__learning_rate: 0.01
Top feature importances:
 high_card__Name         2099
high_card__Ticket       2028
num__PassengerId        1777
num__Fare               1746
num__Age                1431
num__Pclass              332
low_card__Sex_female     295
high_card__Cabin         247
num__SibSp               195
low_card__Embarked_C     105
low_card__Embarked_S      79
num__Parch                54
low_card__Sex_male         8
low_card__Embarked_Q       3
dtype: int32
Test predictions saved to submission_LGBMClassifier.csv

Training model: CatBoostClassifier




Train scores for CatBoostClassifier: {'accuracy': 0.9147025813692481, 'f1_weighted': 0.9138804927405357, 'roc_auc': np.float64(0.9567421894140329)}
Best hyperparameters:
  model__learning_rate: 0.1
  model__iterations: 100
  model__depth: 8
Top feature importances:
 low_card__Sex_female    24.972550
low_card__Sex_male      16.227276
num__Pclass             11.193541
high_card__Ticket        8.958246
num__Age                 8.765641
num__Fare                7.570820
high_card__Name          7.205843
num__PassengerId         6.183712
high_card__Cabin         2.866396
num__SibSp               2.448261
low_card__Embarked_C     1.167143
num__Parch               1.156842
low_card__Embarked_S     1.121396
low_card__Embarked_Q     0.162332
dtype: float64
Test predictions saved to submission_CatBoostClassifier.csv

All models finished in 5.13 minutes
