In [None]:
import pandas as pd

In [None]:
# Read test.csv and train.csv from the current working directory into DataFrames
# (same read order as before: test first, then train).
test, train = map(pd.read_csv, ('test.csv', 'train.csv'))


In [None]:
# Print a schema summary and per-column missing-value counts for each split.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
for banner, frame in (
    (" =============== TRAIN ================", train),
    (" =============== TEST ================", test),
):
    print(banner)
    frame.info()
    print(frame.isna().sum())
    print(" ======================================")

### Data preprocessing  

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


In [None]:
# Display the column labels of the training frame.
train.columns

In [None]:
# Binary categorical features mapping: 'yes' -> 1, 'no' -> 0.
# Series.map yields NaN for any value that is not a key of binary_map, so this
# cell is not idempotent: re-running it on already-encoded columns blanks them.
binary_map = {'yes' : 1, 'no' : 0}
binary_columns = ['default', 'housing', 'loan']

for col in binary_columns:
    train[col] = train[col].map(binary_map)
    test[col] = test[col].map(binary_map)

# Ordinal encoding for the month feature, following calendar order
# ('jan' -> 0, ..., 'dec' -> 11).
month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

ord_enc = OrdinalEncoder(categories=[month_order])
train[['month']] = ord_enc.fit_transform(train[['month']]).astype(int)
# The encoder is fitted on the training split only; the test split is merely
# transformed so that no preprocessing step is ever fitted on test data.
test[['month']] = ord_enc.transform(test[['month']]).astype(int)


# One-Hot encode categorical features (excluding already encoded ones)






In [None]:
# Categorical columns that are expanded into 0/1 indicator columns.
categorical_cols = ['job', 'marital', 'education', 'contact', 'poutcome']

# Learn the categories from the training split only.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
encoder.fit(train[categorical_cols])
onehot_names = encoder.get_feature_names_out(categorical_cols)

# Apply the train-fitted encoder to both splits.
encoded_train = encoder.transform(train[categorical_cols])
encoded_test = encoder.transform(test[categorical_cols])

# Densify the encoded matrices into integer DataFrames labelled with the
# feature names generated by the encoder.
encoded_train_df = pd.DataFrame(
    encoded_train.toarray().astype(int),
    columns=onehot_names,
)
encoded_test_df = pd.DataFrame(
    encoded_test.toarray().astype(int),
    columns=onehot_names,
)

# Append the indicator columns side by side (indexes are reset so rows line up
# positionally), then remove the raw categorical columns.
train = pd.concat([train.reset_index(drop=True), encoded_train_df], axis=1)
train = train.drop(columns=categorical_cols)
test = pd.concat([test.reset_index(drop=True), encoded_test_df], axis=1)
test = test.drop(columns=categorical_cols)

In [None]:
# Report the training frame's dimensions and its first rows.
shape_after_encoding = train.shape
sample_rows = train.head()
print("DataFrame shape after encoding:", shape_after_encoding)
print("Sample data:\n", sample_rows)

In [None]:
# Display the training frame for a visual check.
train

In [None]:
# Interaction feature: element-wise product of the 'default' and 'housing'
# columns, added to the training frame first and then to the test frame.
for frame in (train, test):
    frame['default_housing'] = frame['default'] * frame['housing']




In [None]:
# Cyclic encoding for month feature
import numpy as np

# Vectorized: NumPy evaluates sin/cos over the whole column at once instead of
# calling a Python lambda per row. The angle 2*pi*month/12 is computed once per
# frame and shared by both features, giving a period of 12 month codes.
for frame in (train, test):
    month_angle = 2 * np.pi * frame['month'] / 12
    frame['month_sin'] = np.sin(month_angle)
    frame['month_cos'] = np.cos(month_angle)


In [None]:
# Signed log transformation for balance feature.
# Plain np.log1p(x) is NaN for x < -1 and -inf at x == -1, so any balance at or
# below -1 would be destroyed. sign(x) * log1p(|x|) equals log1p(x) for every
# x >= 0 and stays finite (and keeps the sign) for negative values.
# NOTE(review): assumes 'balance' can be negative — confirm against the data.
for frame in (train, test):
    balance = frame['balance']
    frame['balance_log'] = np.sign(balance) * np.log1p(balance.abs())

In [None]:
# Display the training frame for a visual check.
train

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data: every column except 'y' is a feature, 'y' is the target.
X = train.drop(columns=['y'])
y = train['y']
# Hold out 20% for validation. stratify=y keeps the class proportions equal in
# both parts, matching the StratifiedKFold used for cross-validation later on.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train a LightGBM model with default hyper-parameters and a fixed seed.
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train, y_train)

# Evaluate the model on the held-out validation rows.
y_pred = lgb_model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:

# -------------------- Models --------------------
# Candidate classifiers compared under the same cross-validation protocol.
models = {
    'Logistic Regression': Pipeline([
        ('clf', LogisticRegression(max_iter=1000))
    ]),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)
}

# -------------------- Handle Missing Values --------------------
from sklearn.impute import SimpleImputer

# Replace infinity values with NaN so the imputer treats them as missing.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
X = X.replace([np.inf, -np.inf], np.nan)

# Impute missing values with the column mean.
# NOTE(review): the imputer is fitted on all rows before cross-validation, so
# fold statistics are not fully independent; moving it into each model's
# pipeline would remove that leakage.
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# -------------------- Cross-Validation --------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Built-in scorer names are used instead of make_scorer(...):
# make_scorer(roc_auc_score) would score hard class predictions from
# predict(), whereas the 'roc_auc' scorer ranks by the model's continuous
# scores (probabilities / decision function), which is what ROC AUC requires.
# 'accuracy' and 'f1' are the same metrics as before.
scoring = {
    'accuracy': 'accuracy',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

# -------------------- Evaluation --------------------
# Mean of each metric over the five folds, per model.
results = {}
for name, model in models.items():
    print(f"Evaluating: {name}")
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring)
    results[name] = {
        'Accuracy': np.mean(scores['test_accuracy']),
        'F1 Score': np.mean(scores['test_f1']),
        'ROC AUC': np.mean(scores['test_roc_auc'])
    }

# -------------------- Display Results --------------------
# One row per model, best ROC AUC first.
results_df = pd.DataFrame(results).T.sort_values(by="ROC AUC", ascending=False)
print("\nModel Performance:\n")
print(results_df)


In [None]:
# Install CatBoost into the kernel's environment.
# NOTE(review): catboost is already imported in an earlier cell, so on a fresh
# environment this install has to be run before that import; the version is
# also unpinned.
%pip install CatBoost