# LightGBM

In [1]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 3.3 MB/s eta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.5


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor

# Silence every Python warning for the rest of the notebook to keep cell
# output short. NOTE(review): this also hides deprecation notices from the
# libraries used below — consider narrowing the filter.
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
# Load the raw Hitters data and keep only complete rows
# (dropna returns a new frame, so `hit` itself is left untouched).
hit = pd.read_csv("Hitters.csv")
df = hit.dropna()

# One-hot encode the three categorical columns.
categorical_cols = ['League', 'Division', 'NewLeague']
dms = pd.get_dummies(df[categorical_cols])

# Target is the salary; numeric predictors are every remaining column, cast to float.
y = df["Salary"]
X_ = df.drop(columns=['Salary'] + categorical_cols).astype('float64')

# Keep a single indicator column per categorical variable.
dummy_cols = ['League_N', 'Division_W', 'NewLeague_N']
X = pd.concat([X_, dms[dummy_cols]], axis=1)

# 75 % / 25 % train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [4]:
# Alternative for conda environments (run in a terminal instead of the pip cell above):
#   conda install -c conda-forge lightgbm

In [5]:
# Baseline: a LightGBM regressor with all-default hyper-parameters,
# fitted on the training split. `lgbm_model` holds the value returned by fit().
lgbm = LGBMRegressor()
lgbm_model = lgbm.fit(X_train, y_train)

## Prediction

In [6]:
# Predict on the hold-out set with the default model.
# The model was fitted without early stopping, so there is no meaningful
# best iteration to pass; predict() already uses every boosted tree by default.
y_pred = lgbm_model.predict(X_test)

In [7]:
# Test-set RMSE of the default (untuned) model: square root of the mean squared error.
np.sqrt(mean_squared_error(y_test, y_pred))

363.8712087611089

## Model Tuning

In [8]:
# Display the fitted default model before tuning.
lgbm_model

In [9]:
# Hyper-parameter search space: 5 * 4 * 6 * 8 = 960 combinations.
lgbm_grid = {
    'colsample_bytree': [0.4, 0.5, 0.6, 0.9, 1],
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'n_estimators': [20, 40, 100, 200, 500, 1000],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
}

# Exhaustive grid search with 10-fold cross-validation, run on all available
# cores and logging each finished fit (verbose=2).
lgbm = LGBMRegressor()
lgbm_cv_model = GridSearchCV(
    estimator=lgbm,
    param_grid=lgbm_grid,
    cv=10,
    n_jobs=-1,
    verbose=2,
)

In [10]:
# Run the grid search on the training split
# (960 candidates x 10 folds = 9600 fits, so this is the slow cell).
lgbm_cv_model.fit(X_train, y_train)

Fitting 10 folds for each of 960 candidates, totalling 9600 fits
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=1, n_estimators=20; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=1, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=1, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=1, n_estimators=500; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=1, n_estimators=1000; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=2, n_estimators=1000; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=2, n_estimators=1000; total time=   0.1s
[CV] END colsample_bytre

In [11]:
# Hyper-parameter combination selected by the cross-validated grid search.
lgbm_cv_model.best_params_

{'colsample_bytree': 0.5,
 'learning_rate': 0.1,
 'max_depth': 6,
 'n_estimators': 20}

In [12]:
# Build the final model directly from the grid-search winner instead of
# re-typing the values by hand, so this cell stays correct whenever the
# grid, the data, or the library version changes the selected parameters.
lgbm_tuned = LGBMRegressor(**lgbm_cv_model.best_params_)

# Refit on the full training split.
lgbm_tuned = lgbm_tuned.fit(X_train, y_train)

In [13]:
# Predict on the hold-out set with the tuned model.
# Note: this reuses the name y_pred, replacing the default model's predictions.
y_pred = lgbm_tuned.predict(X_test)

In [14]:
# Test-set RMSE of the tuned model: square root of the mean squared error.
np.sqrt(mean_squared_error(y_test, y_pred))

375.6085209015434