In [1]:
# https://lightgbm.readthedocs.io/en/v3.3.2/Python-Intro.html
# https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/
# https://coiled.io/blog/dask-dataframe-merge-join/


In [None]:
import numpy as np
import lightgbm as lgb
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from dask.distributed import Client


In [None]:
# https://docs.dask.org/en/latest/deploying-python.html
# https://coiled.io/blog/dask-dataframe-merge-join/  << Ctrl + F "Run Massive Joins on Dask Cluster"
# Example: 
# https://github.com/microsoft/LightGBM/blob/fdc582ea6ba13faf15ee6707c7c7542790c8821d/examples/python-guide/dask/prediction.py
# https://examples.dask.org/machine-learning.html
# Objective: create the Dask cluster (and client) used by the rest of the notebook.


# Worker layout for the Dask cluster backing this notebook.
N_WORKERS = 2
THREADS_PER_WORKER = 2

# Keep a handle on the client so the cluster can be inspected or restarted later.
client = Client(n_workers=N_WORKERS, threads_per_worker=THREADS_PER_WORKER)
# client.restart()


# Dashboard: shows task progress, CPU usage and per-worker load; bokeh must be installed for it to work.
# https://docs.dask.org/en/stable/dashboard.html < diagnostics
# https://youtu.be/N_GqzcuGLCY

In [None]:
%%time

# types of joins: https://www.geeksforgeeks.org/how-to-merge-two-csv-files-by-specific-column-using-pandas-in-python/
# how to merge df w/ dask: https://coiled.io/blog/dask-dataframe-merge-join/
# Objective: full outer join of the label CSV onto the feature CSV, keyed on customer_ID.



# Where the raw competition files live and how large each CSV partition should be.
TRAIN_FEATURES_CSV = "./originalDataset/train_data.csv"
TRAIN_LABELS_CSV = "./originalDataset/train_labels.csv"
CSV_BLOCKSIZE = 25e6  # bytes per Dask partition

# Register a global progress bar for Dask computations.
ProgressBar().register()

# Lazily read both CSVs; nothing is loaded until a computation is triggered.
training_features = dd.read_csv(TRAIN_FEATURES_CSV, blocksize=CSV_BLOCKSIZE)
training_label = dd.read_csv(TRAIN_LABELS_CSV, blocksize=CSV_BLOCKSIZE)

# Full outer join of features and labels on the customer identifier.
training_merged = dd.merge(
    training_features,
    training_label,
    how="outer",
    on=["customer_ID"],
)



In [None]:
%%time
# feature engineering
# Columns that are cast to the ``category`` dtype before training and prediction.
categorical_feature_array = [
    "B_30", "B_38", "D_114",
    "D_116", "D_117", "D_120",
    "D_126", "D_63", "D_64",
    "D_66", "D_68",
]


def feature_engineering(dataframe):
    """Cast every column named in ``categorical_feature_array`` to ``category``.

    Parameters
    ----------
    dataframe
        Frame containing all columns listed in ``categorical_feature_array``.
        It is updated in place through column assignment.

    Returns
    -------
    The same frame that was passed in, so callers can write
    ``frame = feature_engineering(frame)``.

    Raises
    ------
    KeyError
        If one of the categorical columns is missing from ``dataframe``.
    """
    for column in categorical_feature_array:
        dataframe[column] = dataframe[column].astype("category")
    # Without this return the callers, which rebind their variable to the
    # result, would end up holding ``None``.
    return dataframe


# Cast the categorical columns of the merged training frame.
# NOTE(review): the result is bound back to `training_merged`, so this only works
# if feature_engineering returns the frame — confirm it does.
training_merged = feature_engineering(training_merged)

In [None]:
%%time
# Build the training, validation and test sets (80% / 10% / 10%) from
# training_merged so the model can be trained, validated and tested.
# Note: a cell magic such as %%time is only valid on the first line of a cell,
# so it must not be repeated in the cell body.

# Columns that are not model inputs: the label itself and the row identifier.
NON_FEATURE_COLUMNS = ["target", "customer_ID"]


def _features_and_label(frame):
    """Return ``(features, label)`` for one split of the merged frame."""
    return frame.drop(columns=NON_FEATURE_COLUMNS), frame["target"]


# Fixed random_state keeps the split reproducible across runs.
training_set, validation_set, test_set = training_merged.random_split(
    [0.8, 0.1, 0.1], random_state=123
)

train_x, train_y = _features_and_label(training_set)
val_x, val_y = _features_and_label(validation_set)
test_x, test_y = _features_and_label(test_set)

In [None]:
%%time

# https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
# https://lightgbm.readthedocs.io/en/latest/Parallel-Learning-Guide.html#dask
# https://www.analyticsvidhya.com/blog/2021/08/complete-guide-on-how-to-use-lightgbm-in-python/
# model development



# https://lightgbm.readthedocs.io/en/latest/Parameters.html
# https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html
# https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html#for-better-accuracy



# Hyper-parameters for a single training run. Values must be scalars: they are
# unpacked straight into the estimator, which (unlike a grid-search parameter
# grid) does not accept one-element lists.
params = {
    # accuracy-related (TODO: try the "dart" boosting type?)
    "num_leaves": 31,
    "max_bin": 63,
    "learning_rate": 0.1,
    "num_iterations": 100,
    "boosting_type": "goss",

    # overfitting-related
    "max_depth": -1,
    "min_split_gain": 0,
    "min_child_samples": 20,
    "reg_alpha": 0,
    "reg_lambda": 0,
    "early_stopping": 5,
    "feature_pre_filter": True,

    # 20x weight on negative
    "scale_pos_weight": 0.05,

    # gpu
    "device_type": "gpu",
    "gpu_use_dp": True,
    "gpu_platform_id": 0,
    "gpu_device_id": 0,
}

dask_model = lgb.DaskLGBMClassifier(
    # client=client,  << used for cluster
    objective="binary",
    n_jobs=-1,
    random_state=42,
    **params
)

# The validation set comes first in eval_set; the training set is included as
# well so its metric is also evaluated during training.
dask_model.fit(
    train_x,
    train_y,
    eval_set=[(val_x, val_y), (train_x, train_y)],
    eval_metric="logloss",
    categorical_feature=categorical_feature_array,
    verbose=20
)


def _accuracy(model, features, labels):
    """Return the fraction of rows whose predicted class equals the label.

    Predictions and labels are pulled back to the client with ``.compute()``
    before being compared, so the comparison runs on in-memory arrays rather
    than on lazy Dask collections.
    """
    predicted = model.predict(features).compute()
    expected = labels.compute().to_numpy()
    return float((predicted == expected).mean())


print(f"Training accuracy: {_accuracy(dask_model, train_x, train_y)}")
print(f"Validation accuracy: {_accuracy(dask_model, val_x, val_y)}")
print(f"Test accuracy: {_accuracy(dask_model, test_x, test_y)}")

In [None]:
# tuning https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html

In [None]:
%%time
# https://www.analyticsvidhya.com/blog/2021/08/complete-guide-on-how-to-use-lightgbm-in-python/
# Additional
from sklearn.metrics import classification_report

# Feature importances and the metric curves recorded during training.
lgb.plot_importance(dask_model)
lgb.plot_metric(dask_model)

# sklearn metrics work on in-memory data, so pull the labels and the
# predictions back from the workers before building the report.
test_y_local = test_y.compute()
test_pred_local = dask_model.predict(test_x).compute()
print(classification_report(test_y_local, test_pred_local))

In [None]:
%%time
# make predictions on test_data features and reference sample_submission for format
test_features_with_ID = dd.read_csv("./originalDataset/test_data.csv", blocksize=25e6)

# changes features to match what was used to train model
test_features_with_ID = feature_engineering(test_features_with_ID)

test_features = test_features_with_ID.drop(columns=["customer_ID"])
pred_y = dask_model.predict(test_features)
final_submit_df = test_features_with_ID["customer_ID"]
final_submit_df["prediction"] = pred_y
final_submit_df.to_csv("final_submit.csv", single_file=True)

# in cli
# kaggle competitions submit -c [COMPETITION] -f [FILE] -m [MESSAGE]
!kaggle competitions submit -c amex-default-prediction -f final_submit.csv -m "first submit"

In [None]:
# pickle model when done?