In [1]:
#importing standard libraries 
import numpy as np 
import pandas as pd 
from pandas import Series, DataFrame 

#import lightgbm and xgboost 
import lightgbm as lgb 
import xgboost as xgb 

In [2]:
#loading our training dataset 'adult.csv' with name 'data' using pandas 
data=pd.read_csv(r'E:\MYLEARN\2-ANALYTICS-DataScience\datasets\adult.data.csv') 

In [3]:
#Assigning names to the columns 
data.columns=['age','workclass','fnlwgt','education','education-num',
              'marital_Status','occupation','relationship','race','sex',
              'capital_gain','capital_loss','hours_per_week','native_country','Income'] 

In [4]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital_Status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Label Encoding our target variable 
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
l=LabelEncoder() 
l.fit(data.Income)

LabelEncoder()

In [6]:
l.classes_ 
data.Income=Series(l.transform(data.Income))  #label encoding our target variable 
data.Income.value_counts() 

0    24720
1     7841
Name: Income, dtype: int64

In [7]:
#One Hot Encoding of the Categorical features 
one_hot_workclass=pd.get_dummies(data.workclass) 
one_hot_education=pd.get_dummies(data.education) 
one_hot_marital_Status=pd.get_dummies(data.marital_Status) 
one_hot_occupation=pd.get_dummies(data.occupation)
one_hot_relationship=pd.get_dummies(data.relationship) 
one_hot_race=pd.get_dummies(data.race) 
one_hot_sex=pd.get_dummies(data.sex) 
one_hot_native_country=pd.get_dummies(data.native_country) 

In [8]:
#removing categorical features 
data.drop(['workclass','education','marital_Status',
           'occupation','relationship','race','sex','native_country'],axis=1,inplace=True)

In [9]:
#Merging one hot encoded features with our dataset 'data' 
data=pd.concat([data,
                one_hot_workclass,
                one_hot_education,
                one_hot_marital_Status,
                one_hot_occupation,
                one_hot_relationship,
                one_hot_race,
                one_hot_sex,
                one_hot_native_country],axis=1) 

In [10]:
data.dtypes

age                           int64
fnlwgt                        int64
education-num                 int64
capital_gain                  int64
capital_loss                  int64
hours_per_week                int64
Income                        int32
?                             uint8
Federal-gov                   uint8
Local-gov                     uint8
Never-worked                  uint8
Private                       uint8
Self-emp-inc                  uint8
Self-emp-not-inc              uint8
State-gov                     uint8
Without-pay                   uint8
10th                          uint8
11th                          uint8
12th                          uint8
1st-4th                       uint8
5th-6th                       uint8
7th-8th                       uint8
9th                           uint8
Assoc-acdm                    uint8
Assoc-voc                     uint8
Bachelors                     uint8
Doctorate                     uint8
HS-grad                     

In [11]:
data.shape

(32561, 109)

In [12]:
#removing dulpicate columns 
_, i = np.unique(data.columns, return_index=True) 
data=data.iloc[:, i] 

In [13]:
#Here our target variable is 'Income' with values as 1 or 0.  
#Separating our data into features dataset x and our target dataset y 
x=data.drop('Income',axis=1) 
y=data.Income 

In [14]:
#Now splitting our dataset into test and train 
from sklearn.model_selection import train_test_split 
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.3)

# Applying xgboost

In [15]:
#The data is stored in a DMatrix object 
#label is used to define our outcome variable
dtrain=xgb.DMatrix(x_train,label=y_train)
dtest=xgb.DMatrix(x_test)

  if getattr(data, 'base', None) is not None and \


In [17]:
#setting parameters for xgboost
parameters={'max_depth':7, 
            'eta':1, 
            'silent':1,
            'objective':'binary:logistic',
            'eval_metric':'auc',
            'learning_rate':.05}

In [18]:
#training our model 
num_round=50
from datetime import datetime 

start = datetime.now() 
xg=xgb.train(parameters, dtrain,num_round) 
stop = datetime.now()

In [19]:
#Execution time of the model 
execution_time_xgb = stop-start 
execution_time_xgb

datetime.timedelta(seconds=5, microseconds=220769)

In [None]:
#datetime.timedelta( , , ) representation => (days , seconds , microseconds) 

In [20]:
#now predicting our model on test set 
ypred=xg.predict(dtest) 
ypred

array([0.04377641, 0.35446048, 0.5010493 , ..., 0.31888038, 0.05631509,
       0.04377641], dtype=float32)

In [21]:
#Converting probabilities into 1 or 0  
for i in range(0,9769): 
    if ypred[i]>=.5:       # setting threshold to .5 
        ypred[i]=1 
    else: 
        ypred[i]=0  

In [22]:
#calculating accuracy of our model 
from sklearn.metrics import accuracy_score 
accuracy_xgb = accuracy_score(y_test,ypred) 
accuracy_xgb

0.8622172177295526

# Light GBM

In [52]:
train_data=lgb.Dataset(x_train, label=y_train)

In [53]:
#setting parameters for lightgbm
param = {'num_leaves':150, 
         'objective':'binary',
         'max_depth':7,
         'learning_rate':.05,
         'max_bin':200}

param['metric'] = ['auc', 'binary_logloss']

Here we have set max_depth in xgb and LightGBM to 7 to have a fair comparison between the two.

In [54]:
#training our model using light gbm
num_round=50

start=datetime.now()
lgbm=lgb.train(param,train_data,num_round)
stop=datetime.now()

In [55]:
#Execution time of the model
execution_time_lgbm = stop-start
execution_time_lgbm

datetime.timedelta(microseconds=331794)

In [48]:
#predicting on test set
ypred2=lgbm.predict(x_test)
ypred2[0:5]  # showing first 5 predictions

array([0.24892097, 0.14424317, 0.06978853, 0.14574077, 0.02542005])

In [49]:
#converting probabilities into 0 or 1
for i in range(0,9769):
    if ypred2[i]>=.5:       # setting threshold to .5
        ypred2[i]=1
    else:  
        ypred2[i]=0

In [50]:
#calculating accuracy
accuracy_lgbm = accuracy_score(ypred2,y_test)
accuracy_lgbm

0.8632408639574163

In [64]:
y_test.value_counts()

0    7355
1    2414
Name: Income, dtype: int64

In [59]:
from sklearn.metrics import roc_auc_score

In [60]:
#calculating roc_auc_score for xgboost
auc_xgb =  roc_auc_score(y_test,ypred)
auc_xgb

0.7722702995274

In [66]:
#calculating roc_auc_score for light gbm. 
auc_lgbm = roc_auc_score(y_test,ypred2)
auc_lgbm 
comparison_dict = {'accuracy score':(accuracy_lgbm,accuracy_xgb),'auc score':(auc_lgbm,auc_xgb),'execution time':(execution_time_lgbm,execution_time_xgb)}

In [67]:
#Creating a dataframe ‘comparison_df’ for comparing the performance of Lightgbm and xgb. 
comparison_df = DataFrame(comparison_dict) 
comparison_df.index= ['LightGBM','xgboost'] 
comparison_df

Unnamed: 0,accuracy score,auc score,execution time
LightGBM,0.865902,0.767905,00:00:00.489697
xgboost,0.867028,0.77227,00:00:05.090863


There has been only a slight increase in accuracy and auc score by applying Light GBM over XGBOOST but there is a significant difference in the execution time for the training procedure. Light GBM is almost 7 times faster than XGBOOST and is a much better approach when dealing with large datasets.