<a href="https://www.kaggle.com/code/samyakb/using-xgboost-lightgbm?scriptVersionId=158722148" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
X_full = pd.read_csv('/kaggle/input/playground-series-s4e1/train.csv', index_col='id')
X_test_full = pd.read_csv('/kaggle/input/playground-series-s4e1/test.csv', index_col='id')

# Remove rows with missing target, separate target from predictors
X_full.dropna(axis=0, subset=['Exited'], inplace=True)
y = X_full.Exited
X_full.drop(['Exited','CreditScore'], axis=1, inplace=True)

# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, 
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=0)

# Select categorical columns 
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [2]:
X_train.head()

Unnamed: 0_level_0,Surname,Geography,Gender,CustomerId,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
15225,Hsu,France,Male,15786717,32.0,10,0.0,2,1.0,1.0,49463.44
28152,Omeokachie,France,Female,15572461,32.0,0,0.0,1,1.0,1.0,75578.67
117535,Hsieh,Spain,Male,15682778,43.0,10,123817.85,2,1.0,1.0,92910.53
92286,P'eng,France,Male,15807525,41.0,4,0.0,1,1.0,1.0,132798.58
133713,Padovano,France,Female,15795564,39.0,7,0.0,1,1.0,0.0,131763.66


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from category_encoders import TargetEncoder

# Assume you already have X_train, X_valid, y_train, and y_valid defined

# Categorical columns with relatively low cardinality
categorical_cols = [cname for cname in X_train.columns if
                    X_train[cname].nunique() < 10 and 
                    X_train[cname].dtype == "object"]

# Numerical columns
numerical_cols = [cname for cname in X_train.columns if 
                  X_train[cname].dtype in ['int64', 'float64']]

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='constant'), numerical_cols),
        ('cat', TargetEncoder(), categorical_cols)
    ])

# Define the model
from lightgbm import LGBMClassifier
from lightgbm import early_stopping, log_evaluation
model=LGBMClassifier(
        random_state=2023,
        objective="binary",
        metric="auc",
        n_jobs=-1,
        n_estimators=5000,
        verbose=-1,    
    )
# model = model = XGBRegressor(
#     objective='reg:squarederror',  # for regression tasks
#     booster='gbtree',              # tree-based models
#     n_estimators=2024,              # number of trees (you can adjust this)
#     learning_rate=0.05,            # step size shrinkage to prevent overfitting
#     max_depth=5,                   # maximum depth of a tree
#     subsample=0.8,                 # fraction of samples used for each boosting round
#     colsample_bytree=0.8,          # fraction of features used for each boosting round
#     random_state=10                # random seed for reproducibility
# )


# Create and fit the pipeline
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

my_pipeline.fit(X_train, y_train)

# Make predictions on validation set
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)


MAE: 0.14142454630835882


In [4]:
preds_test = my_pipeline.predict(X_test)

In [5]:
# Save test predictions to file
output = pd.DataFrame({'id': X_test.index,
                       'Exited': preds_test})
output.to_csv('submission.csv', index=False)