In [1]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import optuna
import matplotlib.pyplot as plt

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file found under the Kaggle input directory
# (walk order is preserved), then print one path per line.
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e8/sample_submission.csv
/kaggle/input/playground-series-s5e8/train.csv
/kaggle/input/playground-series-s5e8/test.csv


In [2]:
# Load the competition files: training data, test data and the sample
# submission, in that order.
train_df, test_df, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e8/train.csv',
        '/kaggle/input/playground-series-s5e8/test.csv',
        '/kaggle/input/playground-series-s5e8/sample_submission.csv',
    )
)

# RAW DATA  With optuna XGBoost
 {'max_depth': 10, 'learning_rate': 0.088542375425159, 'n_estimators': 353, 'subsample': 0.7933680125492807, 'colsample_bytree': 0.665859932215261, 'gamma': 1.9591038948726576, 'reg_alpha': 4.135658777432998, 'reg_lambda': 3.0378978111339086}

# RAW DATA  With optuna XGBoost, without the `previous` feature
 {'max_depth': 9, 'learning_rate': 0.11410300715659989, 'n_estimators': 368, 'subsample': 0.907977121199552, 'colsample_bytree': 0.6014268962223818, 'gamma': 0.7012459904827175, 'reg_alpha': 4.361523501536162, 'reg_lambda': 4.294305690785547}




In [3]:
# Derive two features from `pdays` on both the train and the test frame:
#   * no_previous_contact - 1 where pdays == -1, otherwise 0
#   * pdays_cleaned       - pdays with every -1 replaced by NaN
# (-1 is presumably the "never contacted before" sentinel, as the flag name suggests.)
for frame in (train_df, test_df):
    is_sentinel = frame['pdays'].eq(-1)
    frame['no_previous_contact'] = is_sentinel.astype(int)
    frame['pdays_cleaned'] = frame['pdays'].mask(is_sentinel)

# We can create additional column with numeric months
#train_df['month_as_num'] = train_df['month'].map({'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nov':11, 'dec':12})
#test_df['month_as_num'] = test_df['month'].map({'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nov':11, 'dec':12})

In [4]:
# Turn the yes/no `loan` and `housing` columns into 0/1 codes and add their
# sum (0, 1 or 2) as `loan_plus_housing`, for both the train and the test frame.
mapping = {'yes': 1, 'no': 0}

for frame in (train_df, test_df):
    for source_col in ('loan', 'housing'):
        frame[f'{source_col}_code'] = frame[source_col].map(mapping)
    # astype(int) raises if a value other than 'yes'/'no' was mapped to NaN.
    frame['loan_plus_housing'] = frame['loan_code'].add(frame['housing_code']).astype(int)

In [5]:
#train_df['deep_debt'] = ((train_df['balance'] < 0 ) & (train_df['loan_plus_housing'] == 2)).astype(int)

#test_df['deep_debt'] = ((test_df['balance'] < 0 ) & (test_df['loan_plus_housing'] == 2)).astype(int)

In [6]:
#train_df['day'] = train_df['day'].astype(str)

#test_df['day'] = test_df['day'].astype(str)

In [7]:
# ==== WPROWADŹ SWOJE KOLUMNY ====
# Columns that are label-encoded before training (edit this list to change
# the categorical features). The order here is the column order fed to the model.
categorical_cols = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
    'no_previous_contact',
    'loan_plus_housing',
]

# Columns passed to the model as-is (edit this list to change the numerical
# features). Raw 'pdays' and 'previous' are intentionally left out.
numerical_cols = [
    'age',
    'balance',
    'duration',
    'campaign',
    'pdays_cleaned',
    'day',
]

# Name of the label column in the training data.
target_col = 'y'


In [8]:
# ==== TRAIN THE FINAL MODEL AND PREDICT ====
# train_df and test_df must already be loaded (and feature-engineered) by the
# cells above; categorical_cols, numerical_cols and target_col come from the
# column-configuration cell.

# Hyperparameters from an Optuna search on the raw data without the
# `previous` feature (per the original note in this notebook).
best_params = {'max_depth': 8, 'learning_rate': 0.12763052427108496, 'n_estimators': 411, 'subsample': 0.953989568332575, 'colsample_bytree': 0.6023438935073028, 'gamma': 0.019491768244553676, 'reg_alpha': 3.961328967768697, 'reg_lambda': 1.131707666650681}

X = train_df[categorical_cols + numerical_cols].copy()
y = train_df[target_col].copy()
X_test = test_df[categorical_cols + numerical_cols].copy()

# ==== ENCODE CATEGORICAL FEATURES ====
# (With CatBoost this step could be skipped by passing the list of categorical
# columns directly.)
# Each encoder is fitted on the values of BOTH train and test, so a category
# that occurs only in the test set receives a code instead of making
# `transform` raise ValueError. When every test category also occurs in train,
# the resulting codes are identical to fitting on train alone.
for col in categorical_cols:
    le = LabelEncoder()
    le.fit(pd.concat([X[col], X_test[col]], ignore_index=True))
    X[col] = le.transform(X[col])
    X_test[col] = le.transform(X_test[col])

# Alternative (not used): hand the raw string columns to XGBoost as pandas categoricals.
# for col in ["job", "marital", "education", "default", "housing", "loan", "contact", "month", "poutcome"]:
#     X[col] = X[col].astype("category")
#     X_test[col] =  X_test[col].astype("category")

# scale_pos_weight is the ratio of negative (y == 0) to positive (y == 1)
# training rows; eval_metric is set to AUC.
final_model = XGBClassifier(
    **best_params,
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
    eval_metric='auc',
)
final_model.fit(X, y)

# Predicted probability of the positive class for every test row (used for the submission).
y_pred = final_model.predict_proba(X_test)[:, 1]



In [9]:
# Build and save the submission: one row per test id with the predicted
# probability of the positive class.
# The prediction column is named after the training target (`target_col`)
# rather than a hard-coded "Personality", which does not match this
# notebook's target column. NOTE(review): the header is expected to equal the
# columns of the sample submission loaded into `sub_df` - verify before submitting.
submission = pd.DataFrame({
    "id": test_df["id"],        # or e.g. range(len(y_pred))
    target_col: y_pred,
})

submission.to_csv("submission.csv", index=False)
print('Sukcesss!!!')


Sukcesss!!!
