In [None]:
!pip3 install lightgbm

In [4]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [5]:
# Load the Iris dataset and pull out the feature matrix and the target vector.
data = load_iris()
X = data.data
y = data.target

In [6]:
# Inspect the dimensions of the feature matrix (rows, columns).
X.shape

(150, 4)

In [8]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Wrap the NumPy splits in LightGBM's own Dataset container.
# The test Dataset points back at the training Dataset via `reference`.
d_train = lgb.Dataset(data=X_train, label=y_train)
d_test = lgb.Dataset(data=X_test, label=y_test, reference=d_train)

In [11]:
# Display the training Dataset object to confirm it was created.
d_train

<lightgbm.basic.Dataset at 0x7f9deb39b2e8>

In [12]:
# Booster configuration passed to `lgb.train`:
# a 3-class softmax-style objective evaluated with multiclass log loss.
params = dict(
    objective='multiclass',
    num_class=3,
    boosting_type='gbdt',
    learning_rate=0.1,
    num_leaves=31,
    max_depth=-1,
    metric='multi_logloss',
)

In [13]:
# Train the LightGBM booster on the training Dataset.
#
# Both the training and the test Dataset are passed as validation sets so the
# metric from `params` is reported for each of them during boosting.
#
# Progress logging uses the `log_evaluation` callback (one line every 10
# iterations). The older `verbose_eval=10` keyword was removed from
# `lgb.train` in LightGBM 4.0 and raises a TypeError on current releases.
model = lgb.train(
    params=params,
    train_set=d_train,
    valid_sets=[d_train, d_test],
    callbacks=[lgb.log_evaluation(period=10)],
)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 91
[LightGBM] [Info] Number of data points in the train set: 120, number of used features: 4
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.073920
[LightGBM] [Info] Start training from score -1.123930
[10]	training's multi_logloss: 0.3256	valid_1's multi_logloss: 0.277193
[20]	training's multi_logloss: 0.151212	valid_1's multi_logloss: 0.0990331
[30]	training's multi_logloss: 0.0897383	valid_1's multi_logloss: 0.0611451
[40]	training's multi_logloss: 0.0569661	valid_1's multi_logloss: 0.0634732
[50]	training's multi_logloss: 0.0368118	valid_1's multi_logloss: 0.0513861
[60]	training's multi_logloss: 0.0236404	valid_1's multi_logloss: 0.032982
[70]	training's multi_logloss: 0.0167464	valid_1's multi_logloss: 0.0296178
[80]	training's multi_logloss: 0.0122266	valid_1's multi_logloss: 0.0259877
[90]	training's multi_logloss: 0.00893676	valid_1's mult

In [14]:
# Predict on the held-out test features.
# For this multiclass model the result has one row per test sample and one
# probability column per class.

y_pred = model.predict(X_test)

In [16]:
# Show the predicted class-probability matrix.
y_pred

array([[3.76856993e-05, 9.94970142e-01, 4.99217222e-03],
       [9.99958799e-01, 4.05167599e-05, 6.84729322e-07],
       [1.06280618e-05, 2.66580419e-03, 9.97323568e-01],
       [1.53201680e-05, 9.99231849e-01, 7.52830959e-04],
       [2.04917968e-05, 9.92651411e-01, 7.32809704e-03],
       [9.99980074e-01, 1.91932053e-05, 7.32564689e-07],
       [3.55342610e-05, 9.99665394e-01, 2.99071579e-04],
       [3.17558820e-04, 8.85970268e-02, 9.11085414e-01],
       [7.04453013e-05, 9.73687055e-01, 2.62424998e-02],
       [2.32880544e-06, 9.99931204e-01, 6.64667656e-05],
       [9.39427926e-04, 5.08820367e-02, 9.48178535e-01],
       [9.99879975e-01, 1.03083674e-04, 1.69408578e-05],
       [9.99968207e-01, 3.11465822e-05, 6.46874194e-07],
       [9.99929468e-01, 6.84090273e-05, 2.12274199e-06],
       [9.99984418e-01, 1.48836672e-05, 6.97873495e-07],
       [6.90497920e-05, 9.99605697e-01, 3.25253313e-04],
       [9.44011782e-06, 1.30711933e-03, 9.98683441e-01],
       [4.47342222e-06, 9.99439

In [17]:
# Convert per-class probabilities to class labels: for every row take the
# column index with the highest probability (first index wins on ties).
# `.tolist()` keeps the result a plain list of Python ints.
y_pred_class = np.argmax(y_pred, axis=1).tolist()

In [18]:
# Show the predicted class label for each test sample.
y_pred_class

[1,
 0,
 2,
 1,
 1,
 0,
 1,
 2,
 1,
 1,
 2,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 1,
 2,
 0,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 0]

In [19]:
# Calculate accuracy: compare the predicted class labels against the true
# test labels.

accuracy_score(y_test,y_pred_class)

1.0

## Hyperparameter tuning with GridSearchCV

In [20]:
# Hyperparameter search for the scikit-learn style LightGBM classifier.
# `GridSearchCV` is imported with the other dependencies in the import cell
# at the top of the notebook.

# Base estimator whose hyperparameters are tuned below.
lgb_clf = lgb.LGBMClassifier(objective='multiclass', num_class=3, boosting_type='gbdt')

# 3 x 3 x 3 = 27 candidate combinations.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [15, 31, 50],
    'max_depth': [-1, 5, 10],
}

# Exhaustive search scored by accuracy with 3-fold cross-validation
# (27 candidates x 3 folds = 81 fits), using all available cores.
grid_search = GridSearchCV(
    estimator=lgb_clf,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the best estimator on the held-out test set.
# NOTE: this rebinds `y_pred` (class probabilities from the earlier
# `model.predict` cell) to predicted class labels.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on Test Set: {accuracy:.2f}')

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters: {'learning_rate': 0.05, 'max_depth': -1, 'num_leaves': 15}
Best Accuracy: 0.9500000000000001
Accuracy on Test Set: 1.00
