# Light GBM

LightGBM, XGBoost'un eğitim süresi performansını artırmaya yönelik geliştirilen bir GBM türüdür.

- Daha performanslı
- Level-wise büyüme stratejisi yerine leaf-wise büyüme stratejisi
- Breadth-first search(BFS) yerine depth-first search(DFS)

In [1]:
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn import model_selection
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier,MLPRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
from sklearn import neighbors
from sklearn.svm import SVR

In [2]:
# Load the Hitters data and discard every row that contains a missing value.
df = pd.read_csv("Hitters.csv").dropna()

# One-hot encode the three categorical columns into dummy indicator columns.
dms = pd.get_dummies(df[["League", "Division", "NewLeague"]])

# Dependent (target) variable.
y = df["Salary"]

# Numeric predictors: remove the target and the original categorical columns,
# then cast what is left to float64.
X_ = df.drop(columns=["Salary", "League", "Division", "NewLeague"]).astype("float64")

# Full predictor matrix: the numeric predictors side by side with one dummy
# column per categorical variable.
X = pd.concat(
    [X_, dms[["League_N", "Division_W", "NewLeague_N"]]],
    axis="columns",
)

# Hold out 25% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [3]:
from lightgbm import LGBMRegressor

### Model Ve Tahmin

In [4]:
# Train a LightGBM regressor with its default hyperparameters on the training split.
lgb_model = LGBMRegressor().fit(X=X_train, y=y_train)

In [5]:
# Predictions of the default model for the held-out test rows.
y_pred = lgb_model.predict(X=X_test)

In [6]:
# Test-set RMSE of the default model (square root of the mean squared error).
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

363.8712087611089

### Model Tuning

In [7]:
# Rebind lgb_model to a fresh, unfitted estimator; this replaces the model
# fitted earlier under the same name. It serves as the base estimator for the
# grid search below.
lgb_model = LGBMRegressor()
# Bare expression: displays the estimator so its default hyperparameters can be inspected.
lgb_model

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [9]:
# Hyperparameter grid for the search:
# 4 learning rates x 6 ensemble sizes x 10 depth limits = 240 combinations.
lgbm_params = {
    "learning_rate": [0.01, 0.1, 0.5, 1],
    "n_estimators": [20, 40, 100, 200, 500, 1000],
    "max_depth": list(range(1, 11)),
}

In [12]:
# Exhaustive search over every combination in lgbm_params, scoring each one
# with 10-fold cross-validation on the training split. n_jobs=-1 runs the fits
# in parallel; verbose=2 prints progress while the search runs.
lgbm_cv_model = GridSearchCV(
    estimator=lgb_model,
    param_grid=lgbm_params,
    cv=10,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

Fitting 10 folds for each of 240 candidates, totalling 2400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 2296 tasks      | elapsed:   24.1s




[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed:   25.5s finished


In [13]:
# Display the hyperparameter combination the grid search selected as best.
lgbm_cv_model.best_params_

{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 20}

In [15]:
# Final model: a fresh regressor configured with the hyperparameters chosen by
# the grid search and fitted on the whole training split. The values are read
# from lgbm_cv_model.best_params_ rather than copied by hand from the printed
# output, so this cell stays correct if the grid, the data or the split changes.
lgb_Tuned = LGBMRegressor(**lgbm_cv_model.best_params_).fit(X_train, y_train)

In [16]:
# Predict the held-out test rows with the tuned model; this rebinds y_pred,
# replacing the default model's predictions.
y_pred = lgb_Tuned.predict(X=X_test)
# Test-set RMSE of the tuned model.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

371.5044868943621