In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Load the competition files.
train = pd.read_csv("/train.csv")
test_data = pd.read_csv("/test.csv")

# Drop the id column from the training frame only; test_data keeps its id
# column at this point.
train = train.drop(columns=["id"])
train.head()

: 

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the training set.
train.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y
count,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0
mean,40.926395,1204.067397,16.117209,256.229144,2.577008,22.412733,0.298545,0.120651
std,10.098829,2836.096759,8.250832,272.555662,2.718514,77.319998,1.335926,0.325721
min,18.0,-8019.0,1.0,1.0,1.0,-1.0,0.0,0.0
25%,33.0,0.0,9.0,91.0,1.0,-1.0,0.0,0.0
50%,39.0,634.0,17.0,133.0,2.0,-1.0,0.0,0.0
75%,48.0,1390.0,21.0,361.0,3.0,-1.0,0.0,0.0
max,95.0,99717.0,31.0,4918.0,63.0,871.0,200.0,1.0


In [4]:
# Count the null values in each column of the training set.
train.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

Awesome, no null values for training data.

In [5]:
# Count the null values in each column of the test set.
test_data.isnull().sum()

id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
dtype: int64

Awesome, no null values for testing data.

In [6]:
# Print every object-dtype column together with its distinct values.
object_columns = [name for name in train.columns if train[name].dtype == "object"]
for name in object_columns:
    print(name, train[name].unique())

job ['technician' 'blue-collar' 'student' 'admin.' 'management' 'entrepreneur'
 'self-employed' 'unknown' 'services' 'retired' 'housemaid' 'unemployed']
marital ['married' 'single' 'divorced']
education ['secondary' 'primary' 'tertiary' 'unknown']
default ['no' 'yes']
housing ['no' 'yes']
loan ['no' 'yes']
contact ['cellular' 'unknown' 'telephone']
month ['aug' 'jun' 'may' 'feb' 'apr' 'nov' 'jul' 'jan' 'oct' 'mar' 'sep' 'dec']
poutcome ['unknown' 'other' 'failure' 'success']


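Several columns are categorical, so we need to encode them numerically: the binary yes/no columns are mapped to 1/0, and the remaining categorical columns are one-hot encoded in the preprocessing pipeline.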

In [None]:
# Convert the binary yes/no columns (default, housing, loan) to 1/0 in both
# the training and the test frame.
cat_numeric_cols = ['default','housing','loan']
yes_no_to_int = {'yes': 1, 'no': 0}

for frame in (train, test_data):
    for col in cat_numeric_cols:
        # Only map columns that still hold text. Series.map returns NaN for
        # any value missing from the dict, so re-running this cell on already
        # encoded 0/1 values would otherwise wipe the column out.
        if not pd.api.types.is_numeric_dtype(frame[col]):
            frame[col] = frame[col].map(yes_no_to_int)


train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,42,technician,married,secondary,0,7,0,0,cellular,25,aug,117,3,-1,0,unknown,0
1,38,blue-collar,married,secondary,0,514,0,0,unknown,18,jun,185,1,-1,0,unknown,0
2,36,blue-collar,married,secondary,0,602,1,0,unknown,14,may,111,2,-1,0,unknown,0
3,27,student,single,secondary,0,34,1,0,unknown,28,may,10,2,-1,0,unknown,0
4,26,technician,married,secondary,0,889,1,0,cellular,3,feb,902,1,-1,0,unknown,1


In [8]:
# Separate the target column `y` from the feature columns.
y = train['y']
X = train.drop('y', axis=1)

In [None]:
# Group the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
feature_dtypes = X.dtypes

categorical_cols = [name for name, dtype in feature_dtypes.items()
                    if dtype == "object"]
numerical_cols = [name for name, dtype in feature_dtypes.items()
                  if dtype in ['int64', 'float64']]

print(categorical_cols,numerical_cols)

['job', 'marital', 'education', 'contact', 'month', 'poutcome'] ['age', 'default', 'balance', 'housing', 'loan', 'day', 'duration', 'campaign', 'pdays', 'previous']


In [10]:
# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder

# Preprocessing for numerical data: fill missing values with a constant
# (SimpleImputer's default fill value for numeric data is 0).
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data: fill missing values with the most
# frequent category, then one-hot encode into a dense array. Categories not
# seen during fit are ignored at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
    # keyword was removed in 1.4, so the new name is required on current versions.
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
holdout_parts = train_test_split(X, y, test_size=0.2, random_state=40)
X_train, X_test, y_train, y_test = holdout_parts

In [12]:
# # Define model
# model = XGBClassifier(
#     objective='binary:logistic',
#     eval_metric='auc',
#     use_label_encoder=False,
#     random_state=40,
#     learning_rate=0.05,
#     n_estimators=5000,
#     max_depth=6,
#     subsample=0.7,
#     colsample_bytree=0.8,
# )

In [13]:
from lightgbm import LGBMClassifier

# Gradient-boosted tree classifier for the binary target.
model = LGBMClassifier(
    objective='binary',
    n_estimators=5000,
    learning_rate=0.01,
    max_depth=16,
    num_leaves=512,
    min_child_samples=2,
    subsample=0.9,
    # LightGBM only applies `subsample` when bagging is enabled, and
    # `subsample_freq` defaults to 0 (disabled). Without this line the 0.9
    # row subsampling above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.9,
    reg_alpha=0.0,
    reg_lambda=0.0,
    random_state=42
)

In [14]:
# Chain preprocessing and the classifier into a single estimator.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
my_pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the preprocessing steps and the model on the full feature matrix X and
# target y (not on the X_train / y_train split).
my_pipeline.fit(X, y)



[LightGBM] [Info] Number of positive: 90488, number of negative: 659512
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058474 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1039
[LightGBM] [Info] Number of data points in the train set: 750000, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120651 -> initscore=-1.986283
[LightGBM] [Info] Start training from score -1.986283


In [16]:
# # ROC AUC score
# from sklearn.metrics import roc_auc_score
# y_pred_proba = my_pipeline.predict_proba(X_test)[:, 1]
# auc_score = roc_auc_score(y_test, y_pred_proba)

# print(auc_score)

In [17]:
# Score the test rows: drop the id column before predicting, keep the
# probability of the positive class (column 1), and clamp it to [0, 1].
test = test_data.drop('id', axis=1)
positive_class_proba = my_pipeline.predict_proba(test)[:, 1]
test_pred_proba = np.clip(positive_class_proba, 0, 1)

In [18]:
# Pair each test id with its predicted probability and write the submission
# file without the index column.
output = test_data[['id']].assign(y=test_pred_proba)
output.to_csv('submission.csv', index=False)