In [2]:
# Bibliotecas

import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score

import seaborn as sns

In [3]:
import warnings
warnings.filterwarnings('ignore')


## Carregando os dados para classificação e regressão 

In [5]:
# Vou usar este dataset para fazer uma classificação

df_iris = sns.load_dataset('iris')

df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [6]:
df_iris.shape

(150, 5)

In [7]:
df_iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [8]:
df_iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [9]:
df_iris.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [10]:
df_iris['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

## Separando as bases

In [12]:
# seprando X e y

X_iris = df_iris.drop(['species'], axis=1)

y_iris = df_iris['species']

In [13]:
X_iris_train, X_iris_test, y_iris_train, y_iris_test = train_test_split(X_iris, y_iris, random_state=1991)

## Criando o objeto com o classcificador LightGBM

In [15]:
classificador_lgbm = lgb.LGBMClassifier()

In [16]:
type(classificador_lgbm)

lightgbm.sklearn.LGBMClassifier

### <a> Compatibilidade com scikit-learn </a>

O [LGBMClassifier](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html) é uma subclasse de *sklearn.base.ClassifierMixin*, ou seja, é compatível com sklearn.

In [18]:
from sklearn.model_selection import cross_val_score

score_lgbm = cross_val_score(classificador_lgbm, X_iris_train, y_iris_train).mean() * 100

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000063 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 89, number of used features: 4
[LightGBM] [Info] Start training from score -1.022900
[LightGBM] [Info] Start training from score -1.192800
[LightGBM] [Info] Start training from score -1.087439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79
[LightGBM] [Info] Number of data points in the train set: 89, number of used features: 4
[LightGBM] [Info] Start training from score -1.022900
[LightGBM] [Info] Start training from score -1.192800
[LightGBM] [Info] Start training from score -1.087439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000016 seconds.
You ca

In [19]:
score_lgbm

96.44268774703558

### <a> Pequena tunagem! </a>

O LightGBM permite alteração de vários hiperparâmetros, como learning rate, altura máxima das árvores, quantidade máxima de folhas nas árvores, número de árvores (estimadores), etc...

LightGBM permite alterar a implementação do classificador! Podemos, por exemplo, utilizar random forest (bagging) ao invés de algoritmos de boosting! Vamos ver se melhora o resultado.

In [21]:
classificador_lgbm_tunado = lgb.LGBMClassifier(max_depth=10)

score_lgbm_tunado = 100 * cross_val_score(classificador_lgbm_tunado, X_iris_train, y_iris_train).mean()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000028 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 89, number of used features: 4
[LightGBM] [Info] Start training from score -1.022900
[LightGBM] [Info] Start training from score -1.192800
[LightGBM] [Info] Start training from score -1.087439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000019 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79
[LightGBM] [Info] Number of data points in the train set: 89, number of used features: 4
[LightGBM] [Info] Start training from score -1.022900
[LightGBM] [Info] Start training from score -1.192800
[LightGBM] [Info] Start training from score -1.087439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000020 seconds.
You ca

In [22]:
score_lgbm_tunado

96.44268774703558

### <a> Tipos de classificador </a>

Direto da [documentação](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html): ‘gbdt’, traditional Gradient Boosting Decision Tree. ‘dart’, Dropouts meet Multiple Additive Regression Trees. ‘goss’, Gradient-based One-Side Sampling. ‘rf’, Random Forest.

In [24]:
classificador_lgbm_dart = lgb.LGBMClassifier(boosting_type='dart')

score_lgbm_dart = 100 * cross_val_score(classificador_lgbm_dart, X_iris_train, y_iris_train).mean()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000019 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 89, number of used features: 4
[LightGBM] [Info] Start training from score -1.022900
[LightGBM] [Info] Start training from score -1.192800
[LightGBM] [Info] Start training from score -1.087439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000020 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79
[LightGBM] [Info] Number of data points in the train set: 89, number of used features: 4
[LightGBM] [Info] Start training from score -1.022900
[LightGBM] [Info] Start training from score -1.192800
[LightGBM] [Info] Start training from score -1.087439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000021 seconds.
You ca

In [25]:
score_lgbm_dart

97.35177865612648

In [49]:
# Capricho de legibilidade
classificador_campeao = classificador_lgbm_dart

In [53]:
# Com o melhor modelo, podemos utilizar a base toda de treino
classificador_campeao.fit(X_iris_train, y_iris_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000022 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 4
[LightGBM] [Info] Start training from score -1.029619
[LightGBM] [Info] Start training from score -1.192138
[LightGBM] [Info] Start training from score -1.080913


In [56]:
# Podemos realizar a predição da base de teste!
predicoes_iris = classificador_campeao.predict(X_iris_test)

predicoes_iris[:10]

array(['setosa', 'versicolor', 'virginica', 'virginica', 'versicolor',
       'setosa', 'versicolor', 'virginica', 'setosa', 'setosa'],
      dtype=object)

In [63]:
# Calculando o número de acertos
(predicoes_iris == y_iris_test).sum()

37

In [66]:
# Mas qual o tamanho da base de teste?
len(y_iris_test)

38

In [73]:
acertos = (predicoes_iris == y_iris_test).sum()
total = len(y_iris_test)

acuracia = 100 * acertos / total

print(f'{acuracia.round(2)}%')

97.37%


## Regressão 

Tentar prever o consumo dos carros a partir de suas features

In [78]:
df_mpg = sns.load_dataset('mpg')

df_mpg.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [81]:
df_mpg.shape

(398, 9)

In [83]:
df_mpg.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [85]:
df_mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [87]:
df_mpg.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64

In [91]:
df_mpg.dropna(inplace=True)
df_mpg.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64

In [94]:
df_mpg.shape

(392, 9)

In [99]:
X_mpg = df_mpg.drop(['mpg', 'origin', 'name'], axis=1)
y_mpg = df_mpg['mpg']

In [102]:
X_mpg_train, X_mpg_test, y_mpg_train, y_mpg_test = train_test_split(X_mpg, y_mpg, random_state=1991)

In [107]:
# Criando regressor

regressor_lgbm = lgb.LGBMRegressor()

In [112]:
erro_regressor_lgbm = cross_val_score(regressor_lgbm, X_mpg_train, y_mpg_train, scoring='neg_root_mean_squared_error').mean()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000061 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223
[LightGBM] [Info] Number of data points in the train set: 235, number of used features: 6
[LightGBM] [Info] Start training from score 23.500851
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223
[LightGBM] [Info] Number of data points in the train set: 235, number of used features: 6
[LightGBM] [Info] Start training from score 23.155319
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000037 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 226
[LightGBM] [Info] Number of data points in the train set: 235, number of used features: 6
[LightGBM] [Info] Start training from

In [114]:
erro_regressor_lgbm

-2.814294219411532

In [117]:
regressor_lgbm_tunado = lgb.LGBMRegressor(max_depth=2)

erro_regressor_lgbm_tunado = cross_val_score(regressor_lgbm_tunado, X_mpg_train, y_mpg_train, scoring='neg_root_mean_squared_error').mean()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000029 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223
[LightGBM] [Info] Number of data points in the train set: 235, number of used features: 6
[LightGBM] [Info] Start training from score 23.500851
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000047 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223
[LightGBM] [Info] Number of data points in the train set: 235, number of used features: 6
[LightGBM] [Info] Start training from score 23.155319
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000032 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 226
[LightGBM] [Info] Number of data points in the train set: 235, number of used features: 6
[LightGBM] [Info] Start training from

In [121]:
erro_regressor_lgbm_tunado

-2.8625602070574754

In [124]:
regressor_lgbm_dart = lgb.LGBMRegressor(boosting_type='dart')

erro_regressor_lgbm_dart = cross_val_score(regressor_lgbm_dart, X_mpg_train, y_mpg_train, scoring='neg_root_mean_squared_error').mean()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223
[LightGBM] [Info] Number of data points in the train set: 235, number of used features: 6
[LightGBM] [Info] Start training from score 23.500851
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000029 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223
[LightGBM] [Info] Number of data points in the train set: 235, number of used features: 6
[LightGBM] [Info] Start training from score 23.155319
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000032 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 226
[LightGBM] [Info] Number of data points in the train set: 235, number of used features: 6
[LightGBM] [Info] Start training from

In [126]:
erro_regressor_lgbm_dart

-3.7968106629660374

In [129]:
# Definindo melhor modelo

regressor_campeao = regressor_lgbm

In [132]:
regressor_campeao.fit(X_mpg_train,y_mpg_train, eval_metric='root_mean_squared_error')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 260
[LightGBM] [Info] Number of data points in the train set: 294, number of used features: 6
[LightGBM] [Info] Start training from score 23.374150


In [135]:
predicoes_mpg = regressor_campeao.predict(X_mpg_test)

predicoes_mpg[:10]

array([23.69295371, 21.41433662, 15.76488679, 23.22019793, 23.61253242,
       22.08386182, 15.59812369, 38.72657512, 13.97749782, 20.63135158])

In [138]:
y_mpg_test[:10]

361    25.4
77     22.0
221    17.5
366    17.6
272    23.8
180    25.0
229    16.0
310    38.1
158    16.0
364    26.6
Name: mpg, dtype: float64

In [141]:
from sklearn.metrics import mean_squared_error
import math

mse = mean_squared_error(y_mpg_test, predicoes_mpg)

display(mse)

rmse = math.sqrt(mse)

rmse



9.38529336079936

3.063542616122609