In [10]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import optuna
import matplotlib.pyplot as plt

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e8/sample_submission.csv
/kaggle/input/playground-series-s5e8/train.csv
/kaggle/input/playground-series-s5e8/test.csv


In [11]:
# Read the three competition files (training set, test set, sample submission),
# in that order, into their own DataFrames.
train_df, test_df, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e8/train.csv',
        '/kaggle/input/playground-series-s5e8/test.csv',
        '/kaggle/input/playground-series-s5e8/sample_submission.csv',
    )
)

In [12]:
# 'yes' / 'no' answers are turned into 1 / 0 so that the two loan columns can be added up.
mapping = {'yes': 1, 'no': 0}

# A numeric month column was tried as well and is currently disabled:
# frame['month_as_num'] = frame['month'].map({'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nov':11, 'dec':12})

# The training and the test frame receive exactly the same derived columns.
for frame in (train_df, test_df):
    is_minus_one = frame['pdays'] == -1

    # Binary flag marking the rows where pdays holds the value -1.
    frame['no_previous_contact'] = is_minus_one.astype(int)

    # Copy of pdays in which every -1 is replaced by NaN.
    frame['pdays_cleaned'] = frame['pdays'].where(~is_minus_one, np.nan)

    # Numeric versions of the 'loan' and 'housing' answers, plus their sum.
    frame['loan_code'] = frame['loan'].map(mapping)
    frame['housing_code'] = frame['housing'].map(mapping)
    frame['loan_plus_housing'] = (frame['loan_code'] + frame['housing_code']).astype(int)

In [13]:
def add_no_and_unknown_counts(df):
    """Add per-row counts of 'no' and 'unknown' answers to ``df``.

    Two columns are written onto the frame that is passed in:

    * ``no_count`` - how many of 'default', 'housing' and 'loan' equal 'no'.
    * ``unknown_count`` - how many of 'education', 'contact' and 'poutcome'
      equal 'unknown'.

    The same (modified) frame is returned.
    """
    no_source_cols = ['default', 'housing', 'loan']
    unknown_source_cols = ['education', 'contact', 'poutcome']

    # Comparing the sub-frame to the literal yields booleans; summing along
    # axis=1 counts the matches in each row.
    df['no_count'] = (df[no_source_cols] == 'no').sum(axis=1)
    df['unknown_count'] = (df[unknown_source_cols] == 'unknown').sum(axis=1)

    return df

# Add 'no_count' and 'unknown_count' to both frames. The helper writes the columns
# onto the frame it receives, so the return values are not reassigned here.
add_no_and_unknown_counts(train_df)
# As the last expression of the cell, the returned test frame is what gets displayed.
add_no_and_unknown_counts(test_df)


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,...,month,duration,campaign,pdays,previous,poutcome,no_previous_contact,pdays_cleaned,no_count,unknown_count
0,750000,32,blue-collar,married,secondary,no,1397,yes,no,unknown,...,may,224,1,-1,0,unknown,1,,2,2
1,750001,44,management,married,tertiary,no,23,yes,no,cellular,...,apr,586,2,-1,0,unknown,1,,2,1
2,750002,36,self-employed,married,primary,no,46,yes,yes,cellular,...,may,111,2,-1,0,unknown,1,,1,1
3,750003,58,blue-collar,married,secondary,no,-1380,yes,yes,unknown,...,may,125,1,-1,0,unknown,1,,1,2
4,750004,28,technician,single,secondary,no,1950,yes,no,cellular,...,jul,181,1,-1,0,unknown,1,,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,999995,43,management,married,tertiary,no,0,yes,no,cellular,...,nov,65,2,-1,0,unknown,1,,2,1
249996,999996,40,services,married,unknown,no,522,yes,no,cellular,...,nov,531,1,189,1,failure,0,189.0,2,1
249997,999997,63,retired,married,primary,no,33,no,no,cellular,...,jul,178,1,92,8,success,0,92.0,3,0
249998,999998,50,blue-collar,married,primary,no,2629,yes,no,unknown,...,may,163,2,-1,0,unknown,1,,2,2


# deep_debt is 1 when the balance is negative and loan_plus_housing equals 2, otherwise 0.
# The flag is added to the training and the test frame alike.
for frame in (train_df, test_df):
    negative_balance = frame['balance'].lt(0)
    both_loan_flags = frame['loan_plus_housing'].eq(2)
    frame['deep_debt'] = (negative_balance & both_loan_flags).astype(int)

In [14]:
# Store 'day' as strings in both frames.
for frame in (train_df, test_df):
    frame['day'] = frame['day'].astype(str)

In [15]:
# Display the column labels of the training frame.
train_df.columns

Index(['id', 'age', 'job', 'marital', 'education', 'default', 'balance',
       'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign',
       'pdays', 'previous', 'poutcome', 'y', 'no_previous_contact',
       'pdays_cleaned', 'no_count', 'unknown_count'],
      dtype='object')

In [16]:
# ==== FEATURE COLUMNS ====
# Categorical feature columns ('loan_plus_housing' is currently left out).
categorical_cols = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
    'day',
    'no_previous_contact',
    'no_count',
    'unknown_count',
]
# Numerical feature columns ('previous' and 'pdays' are currently left out).
numerical_cols = [
    'age',
    'balance',
    'duration',
    'campaign',
    'pdays_cleaned',
]
target_col = 'y'

# ==== BEST PARAMETERS FROM OPTUNA ====
best_params = dict(
    max_depth=10,
    learning_rate=0.05576641173716932,
    n_estimators=492,
    subsample=0.7568779660915822,
    colsample_bytree=0.6384212073745787,
    gamma=2.0698430928317997,
    reg_alpha=2.9165075287479447,
    reg_lambda=2.040935203862332,
)


In [17]:
# ==== BUILD FEATURE MATRICES ====
# train_df and test_df are expected to be loaded (and extended with the engineered
# columns) before this cell runs.
X = train_df[categorical_cols + numerical_cols].copy()
y = train_df[target_col].copy()

X_test = test_df[categorical_cols + numerical_cols].copy()

# ==== ENCODING ====
# Every categorical column is replaced by integer codes learned from the training data.
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

    # LabelEncoder.transform raises ValueError for labels it did not see during fit.
    # The test column is therefore mapped through the learned label -> code table:
    # labels known from training get exactly the code transform() would give them,
    # anything else is reported and encoded as -1 instead of aborting the run.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    test_codes = X_test[col].map(code_of)
    n_unseen = int(test_codes.isna().sum())
    if n_unseen:
        print(f"Column {col!r}: {n_unseen} test rows with a category not present in train, encoded as -1")
    X_test[col] = test_codes.fillna(-1).astype(int)


# ==== K-FOLD ====
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


# ==== FINAL TRAINING ====
scores = []      # validation ROC-AUC of each fold
test_preds = []  # positive-class probabilities on the test set, one array per fold

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # use_label_encoder is intentionally not passed: the argument is deprecated in
    # XGBoost and only produces a warning on current releases.
    model = XGBClassifier(
        **best_params,
        eval_metric='auc',
        n_jobs=-1,
        # Ratio of negative to positive labels in this fold's training part.
        scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum()
    )

    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

    # Collect this fold's predictions on the test set.
    test_preds.append(model.predict_proba(X_test)[:, 1])

    print(f"Fold {fold+1} ROC-AUC: {auc:.5f}")

# ==== RESULTS ====
print(f"\nŚredni ROC-AUC: {np.mean(scores):.5f}")

# ==== AVERAGE THE TEST PREDICTIONS ACROSS FOLDS ====
final_test_preds = np.mean(test_preds, axis=0)

# ==== BUILD THE SUBMISSION ====
submission = pd.DataFrame({
    'id': test_df['id'],  # adjust to the ID column name used by the competition
    'y': final_test_preds
})
submission.to_csv('submission.csv', index=False)

print("\nPlik submission.csv zapisany!")

Fold 1 ROC-AUC: 0.96853
Fold 2 ROC-AUC: 0.96721
Fold 3 ROC-AUC: 0.96717
Fold 4 ROC-AUC: 0.96835
Fold 5 ROC-AUC: 0.96775

Średni ROC-AUC: 0.96780

Plik submission.csv zapisany!
