In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

from sklearn.metrics import confusion_matrix

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train = pd.read_csv("bank_marketing_train.csv")
test = pd.read_csv("bank_marketing_test.csv")

In [11]:
def simple_lgbc_model(train_df, test_df):
    """
    This function returns a list of prediction. 
    Several libraries required. Copy, paste the following codes and run in advance.
    
    import pandas as pd
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.preprocessing import LabelEncoder
    
    Parameters
    ----------
    train_df : Pandas DataFrame. Including 20 features cols and 1 target cols.
    test_df : Pandas DataFrame. Including 20 features cols.
    In advance, label encording is not necessary, as it automatically process.
    """
    combined = [train_df, test_df]

    target_col = ["y"]
    drop_cols = ["duration"] + target_col
    feature_cols = [col for col in train_df.columns if col not in drop_cols]
    categorical_cols =  train_df.select_dtypes(exclude="number").columns.tolist()
    numerical_cols = [col for col in train_df.columns if col not in categorical_cols]
    
    le = LabelEncoder()
    for df in combined:
        for col in categorical_cols:
            df[col] = le.fit_transform(df[col])
    
    X = train_df[feature_cols]
    y = train_df[target_col]
    X2 = test_df[feature_cols]
    
    #     from sklearn.model_selection import train_test_split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state = 1234)
            
    seed = 1234
    gbm = lgb.LGBMClassifier(class_weight="balanced")

    gbm.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, verbose=False)
    ret = mean_squared_error(y_val, gbm.predict(X_val))

    # make an importance dataframe
    importance = pd.DataFrame(gbm.feature_importances_, index=train[feature_cols].columns, columns=['importance'])
    print(importance.sort_index(by="importance", ascending=False))
    
    # prediction
    y_pred_clf = gbm.predict(X2)
    
    return y_pred_clf

In [4]:
train.head()
test.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,37,services,married,high.school,no,yes,no,telephone,mon,226,1,999,0,nonexistent,1.1,93.125,-35.4,1.806,5191.6,no
1,40,admin.,married,basic.6y,no,no,no,telephone,mon,151,1,999,0,nonexistent,1.1,93.125,-35.4,1.806,5191.6,no
2,25,services,single,high.school,no,yes,no,telephone,mon,50,1,999,0,nonexistent,1.1,93.125,-35.4,1.806,5191.6,no
3,41,blue-collar,married,unknown,unknown,no,no,telephone,mon,55,1,999,0,nonexistent,1.1,93.125,-35.4,1.806,5191.6,no
4,37,admin.,married,high.school,no,yes,no,telephone,mon,172,1,999,0,nonexistent,1.1,93.125,-35.4,1.806,5191.6,no


In [12]:
prediction_lgbc = simple_lgbc_model(train_df=train, test_df=test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


                importance
age                    535
euribor3m              502
campaign               304
education              236
job                    225
day_of_week            200
cons.conf.idx          141
cons.price.idx         140
housing                106
marital                105
pdays                   92
contact                 78
emp.var.rate            67
nr.employed             61
default                 53
loan                    53
previous                53
poutcome                49




In [6]:
def evaluation(answer, prediction):
    print("accuracy_score:", accuracy_score(answer, prediction))
    print("precision_score:", precision_score(answer, prediction))
    print("recall_score:",recall_score(answer, prediction))
    print("f1_score:",f1_score(answer, prediction))
    print("roc_auc_score:", roc_auc_score(answer, prediction))

In [7]:
true = test["y"]
evaluation(answer=test["y"], prediction=prediction_lgbc)

accuracy_score: 0.28918295495932983
precision_score: 0.11840471756672874
recall_score: 0.8143009605122732
f1_score: 0.20674705324481776
roc_auc_score: 0.5180408912150407


In [13]:
cm = confusion_matrix(true, prediction_lgbc)
print(cm)

[[6815  485]
 [ 825  112]]


In [9]:
true.sum()

937

In [10]:
prediction_lgbc.sum()

6444