In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import optuna
import matplotlib.pyplot as plt

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import kagglehub
# # Download latest version
# path = kagglehub.dataset_download("sushant097/bank-marketing-dataset-full")
# print("Path to dataset files:", path)

In [None]:
# Read the three competition CSV files: training data, test data and the
# sample submission (in that order).
train_df, test_df, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e8/train.csv',
        '/kaggle/input/playground-series-s5e8/test.csv',
        '/kaggle/input/playground-series-s5e8/sample_submission.csv',
    )
)

# RAW DATA, XGBoost tuned with Optuna:
# {'max_depth': 10, 'learning_rate': 0.088542375425159, 'n_estimators': 353, 'subsample': 0.7933680125492807, 'colsample_bytree': 0.665859932215261, 'gamma': 1.9591038948726576, 'reg_alpha': 4.135658777432998, 'reg_lambda': 3.0378978111339086}

# RAW DATA, XGBoost tuned with Optuna, without the `previous` feature:
# {'max_depth': 9, 'learning_rate': 0.11410300715659989, 'n_estimators': 368, 'subsample': 0.907977121199552, 'colsample_bytree': 0.6014268962223818, 'gamma': 0.7012459904827175, 'reg_alpha': 4.361523501536162, 'reg_lambda': 4.294305690785547}




# Engineer the pdays-based and loan-based features, applying exactly the same
# steps to the training frame and then to the test frame.
mapping = {'yes': 1, 'no': 0}

for frame in (train_df, test_df):
    pdays_is_sentinel = frame['pdays'].eq(-1)

    # 1 where pdays holds the -1 sentinel, 0 otherwise.
    frame['no_previous_contact'] = pdays_is_sentinel.astype(int)

    # Copy of pdays in which the -1 sentinel is replaced by NaN.
    frame['pdays_cleaned'] = frame['pdays'].mask(pdays_is_sentinel)

    # Numeric codes for the yes/no loan columns. Any value other than
    # 'yes'/'no' maps to NaN, which the astype(int) below would reject.
    frame['loan_code'] = frame['loan'].map(mapping)
    frame['housing_code'] = frame['housing'].map(mapping)

    # Sum of the two codes, i.e. how many of the two loan flags are 'yes'.
    frame['loan_plus_housing'] = (frame['loan_code'] + frame['housing_code']).astype(int)

def add_no_and_unknown_counts(df):
    """Add per-row counts of 'no' and 'unknown' answers to ``df``.

    Two columns are written to ``df`` in place:

    * ``no_count`` -- how many of ``default``, ``housing`` and ``loan``
      equal the string ``'no'``.
    * ``unknown_count`` -- how many of ``education``, ``contact`` and
      ``poutcome`` equal the string ``'unknown'``.

    Returns the same DataFrame object that was passed in.
    """
    no_source_cols = ['default', 'housing', 'loan']
    unknown_source_cols = ['education', 'contact', 'poutcome']

    # Boolean frames; summing across columns counts the True cells per row.
    is_no = df[no_source_cols] == 'no'
    df['no_count'] = is_no.sum(axis=1)

    is_unknown = df[unknown_source_cols] == 'unknown'
    df['unknown_count'] = is_unknown.sum(axis=1)

    return df

# Add the count features to both frames. The helper writes its columns into
# the frame it receives, so the return value is not needed here. Looping
# (instead of ending the cell on a bare call) also stops the notebook from
# rendering the whole returned DataFrame as the cell's output.
for frame in (train_df, test_df):
    add_no_and_unknown_counts(frame)


In [None]:
#train_df['deep_debt'] = ((train_df['balance'] < 0 ) & (train_df['loan_plus_housing'] == 2)).astype(int)

#test_df['deep_debt'] = ((test_df['balance'] < 0 ) & (test_df['loan_plus_housing'] == 2)).astype(int)

In [None]:
#train_df['day'] = train_df['day'].astype(str)

#test_df['day'] = test_df['day'].astype(str)

## Cyclic (sin/cos) encoding of the calendar position given by month + day.

# Three-letter month abbreviation -> month number.
MONTH_TO_NUM = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
                'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}

# Every month is given 31 slots, so the encoded "year" has 31 * 12 = 372
# positions and (month, day) pairs never collide.
DAYS_PER_MONTH = 31
DAYS_IN_YEAR = DAYS_PER_MONTH * 12


def add_day_of_year_features(df):
    """Add month number, day-of-year and its sin/cos encoding to ``df``.

    Reads the ``month`` (three-letter abbreviation) and ``day`` columns and
    writes ``month_as_num``, ``day_of_year``, ``day_of_year_sin`` and
    ``day_of_year_cos`` into ``df`` in place.

    Returns the same DataFrame object that was passed in.
    """
    df['month_as_num'] = df['month'].map(MONTH_TO_NUM)

    # Position of the day within the 372-slot year.
    df['day_of_year'] = (df['month_as_num'] - 1) * DAYS_PER_MONTH + df['day']

    # Map the position onto the unit circle so the end of the year sits next
    # to its beginning.
    angle = 2 * np.pi * df['day_of_year'] / DAYS_IN_YEAR
    df['day_of_year_sin'] = np.sin(angle)
    df['day_of_year_cos'] = np.cos(angle)
    return df


# Same transformation for the training and the test frame.
for frame in (train_df, test_df):
    add_day_of_year_features(frame)

In [None]:
# Display the column index of train_df (the cell's last expression is rendered as output).
train_df.columns 

In [None]:
# Check whether new columns can influence the model's learning process:
# add log1p-transformed copies of `balance` and `duration` to both frames.
#
# month_as_num, day_of_year, day_of_year_sin and day_of_year_cos are already
# built by the preceding cell with exactly the same formulas, so they are not
# recomputed here.
for frame in (train_df, test_df):
    for raw_col in ('balance', 'duration'):
        # Values at or below -1 (where log1p is not finite) are replaced by 0
        # before the transform, so those rows all end up as log1p(0) == 0.0.
        clipped = frame[raw_col].where(frame[raw_col] > -1, 0)
        frame[f'{raw_col}_log'] = np.log1p(clipped)

In [None]:
# ==== FEATURE COLUMNS ====
# Categorical features. The engineered columns 'no_previous_contact',
# 'no_count', 'unknown_count' and 'loan_plus_housing' are deliberately left
# out of this list.
categorical_cols = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
]

# Numerical features: the raw numeric columns plus the engineered log and
# cyclic day-of-year columns.
numerical_cols = [
    'age',
    'balance',
    'day',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'balance_log',
    'duration_log',
    'day_of_year_sin',
    'day_of_year_cos',
]

target_col = 'y'

# ==== BEST PARAMETERS FROM OPTUNA ====
best_params = dict(
    max_depth=8,
    learning_rate=0.02943605111668108,
    n_estimators=6031,
    subsample=0.9512194748967198,
    colsample_bytree=0.623801967411067,
    gamma=0.2564186517635848,
    reg_alpha=2.604997938973841,
    reg_lambda=2.3156755963499447,
    min_child_weight=7,
    max_bin=4576,
    grow_policy='lossguide',
)

In [None]:
# ==== LOAD DATA ====
# train_df and test_df were loaded (and feature-engineered) in earlier cells.
feature_cols = categorical_cols + numerical_cols

X = train_df[feature_cols].copy()
y = train_df[target_col].copy()

X_test = test_df[feature_cols].copy()

# ==== ENCODING ====
# Each categorical column gets its own LabelEncoder. The encoder is fitted on
# the union of the train and test values, so a category that occurs only in
# the test set no longer makes transform() raise. LabelEncoder assigns codes
# in sorted order of the fitted classes, so when the test set holds no
# unseen categories the codes are the same as with a train-only fit.
for col in categorical_cols:
    le = LabelEncoder()
    le.fit(pd.concat([X[col], X_test[col]], ignore_index=True))
    X[col] = le.transform(X[col])
    X_test[col] = le.transform(X_test[col])

# ==== K-FOLD ====
# Five stratified, shuffled folds with a fixed seed so the splits are repeatable.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


# ==== FINAL TRAINING ====
scores = []      # validation ROC-AUC of each fold
test_preds = []  # positive-class probabilities on X_test, one array per fold

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # `use_label_encoder` is intentionally not passed: it is a deprecated
    # no-op in the XGBoost releases that accept `early_stopping_rounds` in the
    # constructor, and in 2.x it only produces an "unused parameter" warning.
    model = XGBClassifier(
        **best_params,
        eval_metric='auc',
        n_jobs=-1,
        early_stopping_rounds=300,  # stop once validation AUC stalls for 300 rounds
        # scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum()
    )

    # The validation fold is the early-stopping monitor, and progress is
    # logged every 1500 boosting rounds.
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=1500,
    )

    # NOTE(review): the fold AUC below is measured on the same data that
    # chose the stopping round, so it is slightly optimistic.
    # NOTE(review): presumably predict_proba uses the best iteration found by
    # early stopping -- confirm for the installed XGBoost version.
    y_pred = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

    # Collect this fold's predictions on the test set.
    test_preds.append(model.predict_proba(X_test)[:, 1])

    print(f"Fold {fold+1} ROC-AUC: {auc:.5f}")

# ==== RESULTS ====
# Mean validation ROC-AUC across the folds.
print(f"\nŚredni ROC-AUC: {np.mean(scores):.5f}")

# ==== AVERAGE THE TEST PREDICTIONS ====
# One row per fold; averaging over axis 0 gives one probability per test row.
final_test_preds = np.asarray(test_preds).mean(axis=0)

# ==== BUILD THE SUBMISSION ====
# 'id' is taken from the test frame (adjust if the competition names its ID
# column differently); 'y' holds the fold-averaged probability.
submission = test_df[['id']].assign(y=final_test_preds)
submission.to_csv('submission.csv', index=False)

print("\nPlik submission.csv zapisany!")