In [1]:
import pandas as pd
import numpy as np
import time
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.datasets import load_boston
import re


In [2]:
columns_to_drop = [
    'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE',
    'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
    'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG',
    'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG',
    'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG',
    'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
    'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE',
    'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE',
    'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI',
    'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI',
    'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI',
    'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE', 'TOTALAREA_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE',
    'OBS_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3',
    'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8',
    'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13',
    'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18',
    'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK'
]
application_train = pd.read_csv('../../../../homecredit/application_train.csv')
# application_test = pd.read_csv('../../../../homecredit/application_test.csv')

application_train= application_train.drop(columns=columns_to_drop)
# application_test= application_test.drop(columns=columns_to_drop)

In [6]:
X_encoded = pd.get_dummies(application_train.drop('TARGET',axis=1), columns=['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','NAME_TYPE_SUITE','NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE'], drop_first=True)


X= X_encoded.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))


y = application_train.TARGET


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = lgb.Dataset(X_train, label=y_train)

params = {
            'max_bin': 63,
          'num_leaves': 255,
          'learning_rate': 0.1,
          'tree_learner': 'serial',
          'task': 'train',
          'is_training_metric': 'false',
          'min_data_in_leaf': 1,
          'min_sum_hessian_in_leaf': 100,
          'ndcg_eval_at': [1, 3, 5, 10],
          'device_type': 'cpu'
          }
print("*****************************")
t0 = time.time()
gbm = lgb.train(params, train_set=train_data, num_boost_round=10,
                valid_sets=None, valid_names=None, feval=None, init_model=None,
                feature_name='auto', categorical_feature='auto',

    
                keep_training_booster=False, callbacks=None)
t1 = time.time()

print('cpu version elapse time: {}'.format(t1-t0))
time.sleep(20)
print("*****************************")

params = {
    'max_bin': 63,
          'num_leaves': 255,
          'learning_rate': 0.1,
          'tree_learner': 'serial',
          'task': 'train',
          'is_training_metric': 'false',
          'min_data_in_leaf': 1,
          'min_sum_hessian_in_leaf': 100,
          'ndcg_eval_at': [1, 3, 5, 10],
          'device_type': 'gpu'
          }

t0 = time.time()
gbm = lgb.train(params, train_set=train_data, num_boost_round=10,
                valid_sets=None, valid_names=None, feval=None, init_model=None,
                feature_name='auto', categorical_feature='auto',
    
     
                keep_training_booster=False, callbacks=None)
t1 = time.time()

print('gpu version elapse time: {}'.format(t1-t0))

from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score

y_pred = gbm.predict(X_test)  # 使用已經訓練好的模型對測試數據進行預測
mse = mean_squared_error(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred.round())  # 分類問題的精確度
auc = roc_auc_score(y_test, y_pred)  # 二分類問題的 AUC


*****************************
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1273
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 138
[LightGBM] [Info] Start training from score 0.080794
cpu version elapse time: 0.9898197650909424
*****************************
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1273
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 138
[LightGBM] [Info] Using GPU Device: Quadro P600, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 64 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 24 dense feature groups (5.63 MB) transferred to GPU in 0.010986 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.080794
gpu version elapse time

In [18]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
import time

In [2]:
X,y = make_classification(n_samples=10000000, n_features=100, n_classes=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [19]:
t0 = time.time()
model = lgbm.LGBMClassifier(device="gpu")
model.fit(X_train, y_train)

t1 = time.time()

print('gpu version elapse time: {}'.format(t1-t0))

t0 = time.time()
model = lgbm.LGBMClassifier(device="cpu")
model.fit(X_train, y_train)

t1 = time.time()

print('cpu version elapse time: {}'.format(t1-t0))


[LightGBM] [Info] Number of positive: 3751475, number of negative: 3748525
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 7500000, number of used features: 100
[LightGBM] [Info] Using GPU Device: Quadro P600, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 100 dense feature groups (715.26 MB) transferred to GPU in 0.653109 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500197 -> initscore=0.000787
[LightGBM] [Info] Start training from score 0.000787
gpu version elapse time: 56.258347511291504
[LightGBM] [Info] Number of positive: 3751475, number of negative: 3748525
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 7