<a href="https://colab.research.google.com/github/Siddha-Regilla/Data-Science-Classroom-notebooks/blob/main/18.%20XGBM_and_LGBM_15_03_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### XGBoost

In [4]:
# installing the library
!pip install xgboost



In [5]:
# First XGBoost model for Pima Indians dataset
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pandas import read_csv


In [7]:
# Upload the dataset file "pima-indians-diabetes.data.csv" before running this cell.
# The CSV is read with the column names supplied explicitly through `names`.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.to_numpy()
X = array[:, :8]  # independent variables: the first eight columns
Y = array[:, 8]  # dependent variable: the ninth column ('class')
dataframe

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [8]:
# Split the data into training and test sets; the fixed seed makes the split reproducible.
seed = 42
test_size = 0.33  # proportion of the rows held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)


In [9]:
# Fit the model on the training data.
# Hyperparameters used for XGBoost here:
# 1. max_depth - the maximum depth a tree is allowed to grow to
# 2. n_estimators - the number of trees to build
# 3. learning_rate - the rate at which the model learns; if it is too high the model
#    may miss the optimal value, while a low rate may take longer to compute
# 4. gamma - regularization factor: the minimum loss reduction required to make a split
# 5. objective='binary:logistic' - the kind of target being predicted; here a binary
#    classifier using logistic regression
model = XGBClassifier(
    max_depth=6,
    n_estimators=500,
    learning_rate=0.50,
    gamma=0.5,
    objective='binary:logistic',
)  # build the model
model.fit(X_train, y_train)  # fit the model on the training data

In [12]:
# Make predictions for the test data.
# XGBClassifier.predict returns the predicted class labels directly (use
# predict_proba for probabilities), so no rounding or thresholding is needed.
y_pred = model.predict(X_test)
# Convert the NumPy label values to plain Python ints for display and scoring.
predictions = [int(label) for label in y_pred]

In [13]:
# Evaluate the predictions: accuracy is the fraction of test labels predicted
# correctly, printed here as a percentage with two decimals.
accuracy = accuracy_score(y_test, predictions)
accuracy_percent = accuracy * 100.0
print("Accuracy: %.2f" % accuracy_percent)

Accuracy: 74.02


In [15]:
# We can tune the above-mentioned hyperparameters to identify the highest accuracy by changing their values.
# We can add further hyperparameters like booster = gbtree -> gradient boosting tree & max_leaves = 10.
# This would further improve the model performance (fine-tuning the hyperparameters). The following link has the details of all hyperparameters for XGBM:
# https://xgboost.readthedocs.io/en/stable/parameter.html

In [16]:
# Display the list of predicted class labels for the test set.
predictions

[1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,


# ***Light GBM***

In [17]:
# Importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [18]:
# Upload the dataset - pima-indians-diabetes.data.csv
# Read the dataset, supplying the column names explicitly
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.to_numpy()
# Divide the data into independent (X) and dependent (Y) variables
X = array[:, :8]
Y = array[:, 8]
dataframe

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [19]:
# Split the dataset into a training set (75%) and a test set (25%).

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)


In [20]:
# installing lightgbm model
!pip install lightgbm



In [21]:
import lightgbm as lgb


In [22]:
# Wrap the training features and their labels in a LightGBM Dataset,
# the input object that lgb.train expects.
d_train = lgb.Dataset(x_train, label=y_train)

In [23]:
# Hyperparameters of the LightGBM model
params = {
    'learning_rate': 0.003,       # shrinkage applied at each boosting step
    'boosting_type': 'gbdt',      # gradient boosted decision trees
    'objective': 'binary',        # binary classification
    'metric': 'binary_logloss',   # evaluation metric: log loss
    'sub_feature': 0.5,           # alias of feature_fraction: share of features used per tree
    'num_leaves': 10,             # maximum number of leaves in one tree
    'min_data': 50,               # alias of min_data_in_leaf: minimum rows per leaf
    'max_depth': 10,              # limit on tree depth
}

In [24]:
# Train the model on the formatted training dataset with the hyperparameters
# defined above, running 500 boosting rounds.
clf = lgb.train(params, d_train, num_boost_round=500)

[LightGBM] [Info] Number of positive: 206, number of negative: 370
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000416 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 637
[LightGBM] [Info] Number of data points in the train set: 576, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357639 -> initscore=-0.585627
[LightGBM] [Info] Start training from score -0.585627


In [26]:
# Prediction
# With the 'binary' objective, the trained booster's predict() returns one
# predicted probability per test row rather than a class label.
y_pred=clf.predict(x_test) # predicted probabilities for the test data

In [27]:
# Convert the predicted probabilities into binary predictions (0 or 1)
# by rounding each one to the nearest integer.
predictions = list(map(round, y_pred))

In [28]:
# Measure the model's performance through accuracy: the fraction of test
# labels that were predicted correctly.
accuracy = accuracy_score(y_test, predictions)

In [29]:
# Display the accuracy as a fraction between 0 and 1.
accuracy

0.7604166666666666

In [30]:
# Display the list of binary predictions for the test set.
predictions

[1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [32]:
# Between XGBM and LGBM, XGBM is the more preferred model
# XGBM -
# 1. level-wise growth - grows horizontally
# 2. More generic model
# 3. In most projects the XGBM model works better and gives higher levels of accuracy
# 4. Uses regularization
# 5. computes faster

# LGBM -
# 1. leaf-wise growth - grows vertically
# 2. uses gradient-based one-side sampling (GOSS)
# 3. selects a leaf and tries to minimize the loss
# 4. more specialised