In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

from sklearn.metrics import confusion_matrix

In [None]:
# Read the bank-marketing train and test splits from CSV files located in the
# current working directory.
train, test = (
    pd.read_csv("bank_marketing_train.csv"),
    pd.read_csv("bank_marketing_test.csv"),
)

In [None]:
def simple_lgbc_model(train_df, test_df):
    """
    Train a LightGBM classifier on ``train_df`` and predict labels for ``test_df``.

    The ``duration`` column and the target column ``y`` are excluded from the
    features. 30% of ``train_df`` is held out as a validation set that drives
    early stopping (10 rounds without improvement). The feature importances of
    the fitted model are printed, most important first.

    The following names must already be imported in the notebook:
    ``pandas as pd``, ``lightgbm as lgb``, and
    ``train_test_split`` / ``LabelEncoder`` from scikit-learn.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Feature columns plus the target column ``y``.
    test_df : pandas.DataFrame
        The same feature columns as ``train_df``. The target column ``y`` is
        optional; it is only encoded when present.

    Returns
    -------
    array-like
        Predicted class labels for the rows of ``test_df`` (integer codes when
        ``y`` was non-numeric in ``train_df``).

    Notes
    -----
    Label encoding does not have to be done in advance. Every column that is
    non-numeric in ``train_df`` (including ``y``) is label-encoded **in place**
    in both DataFrames, so the caller's frames are modified. Each encoder is
    fitted on the values of both frames together so that a given category maps
    to the same integer code in train and test.
    """
    seed = 1234
    target_col = "y"
    drop_cols = ["duration", target_col]
    feature_cols = [col for col in train_df.columns if col not in drop_cols]
    categorical_cols = train_df.select_dtypes(exclude="number").columns.tolist()

    for col in categorical_cols:
        # Only touch frames that actually contain the column (test_df may lack "y").
        frames = [df for df in (train_df, test_df) if col in df.columns]
        encoder = LabelEncoder()
        encoder.fit(pd.concat([df[col] for df in frames], ignore_index=True))
        # Compute every encoding before assigning, so that passing the same
        # object as train_df and test_df does not re-encode encoded values.
        encoded = [encoder.transform(df[col]) for df in frames]
        for df, values in zip(frames, encoded):
            df[col] = values

    X = train_df[feature_cols]
    y = train_df[target_col]  # 1-D Series, the shape scikit-learn estimators expect
    X2 = test_df[feature_cols]

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )

    gbm = lgb.LGBMClassifier(class_weight="balanced", random_state=seed)
    # Early stopping is configured through a callback; the fit() keyword
    # arguments `early_stopping_rounds` and `verbose` no longer exist in
    # current LightGBM releases.
    gbm.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
    )

    # Importance table indexed by the features the model was trained on
    # (taken from the function's own inputs, not from notebook globals).
    importance = pd.DataFrame(
        gbm.feature_importances_, index=feature_cols, columns=["importance"]
    )
    print(importance.sort_values(by="importance", ascending=False))

    # Predict on the test features.
    return gbm.predict(X2)

In [None]:
# Preview the first rows of both datasets. A notebook cell only renders its
# last bare expression, so display() is used to show the train preview as well
# as the test preview.
display(train.head(), test.head())

In [None]:
# Fit the LightGBM model on `train` and predict labels for `test`.
# Note: simple_lgbc_model label-encodes the non-numeric columns of both
# DataFrames in place, so `train` and `test` are modified by this call.
prediction_lgbc = simple_lgbc_model(train, test)

In [None]:
def evaluation(answer, prediction):
    """Print accuracy, precision, recall, F1 and ROC AUC for a set of predictions.

    Parameters
    ----------
    answer : array-like
        Ground-truth labels, passed to each metric as the first argument.
    prediction : array-like
        Predicted labels, passed to each metric as the second argument.

    Nothing is returned; each score is printed on its own line.
    """
    # NOTE(review): the notebook passes hard class labels as `prediction`, so
    # the ROC AUC is computed from labels rather than from predicted scores.
    scorers = (
        ("accuracy_score:", accuracy_score),
        ("precision_score:", precision_score),
        ("recall_score:", recall_score),
        ("f1_score:", f1_score),
        ("roc_auc_score:", roc_auc_score),
    )
    for label, scorer in scorers:
        print(label, scorer(answer, prediction))

In [None]:
# Ground-truth labels of the test set. simple_lgbc_model label-encodes the
# non-numeric columns of `test` in place, so if `y` was stored as text it
# holds integer codes by the time this cell runs.
true = test["y"]
evaluation(answer=true, prediction=prediction_lgbc)

In [None]:
# Confusion matrix of the test predictions: in scikit-learn's layout the rows
# are the true classes and the columns are the predicted classes.
cm = confusion_matrix(y_true=true, y_pred=prediction_lgbc)
print(cm)

In [None]:
# Sum of the true test labels; presumably the labels are 0/1 codes at this
# point, in which case this is the number of actual positives — TODO confirm.
true.sum()

In [None]:
# Sum of the predicted labels; presumably the predictions are 0/1 codes, in
# which case this is the number of predicted positives — TODO confirm.
prediction_lgbc.sum()