In [76]:
#importing standard libraries 
import numpy as np 
import pandas as pd 
from pandas import Series, DataFrame 

#import lightgbm and xgboost 
import lightgbm as lgbm
import xgboost as xgb 

In [97]:
#loading our training dataset 'adult.csv' with name 'data' using pandas 
df=pd.read_csv('E:\Projects folder\gbm vs xgboost\income.csv',header=None) 

In [98]:
#assigning names to the columns 
df.columns=['age','work_class','fnl_wgt','education','education-num','marital_Status',
            'occupation','relationship','race','sex','capital_gain','capital_loss','hours_per_week',
            'native_country','Income'] 

In [99]:
df.head() 

Unnamed: 0,age,work_class,fnl_wgt,education,education-num,marital_Status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [100]:
# LabelEncoding our predictor variable 
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
l=LabelEncoder() 
l.fit(df.Income) 
df.Income=Series(l.transform(df.Income))  
df.Income.value_counts() 

0    24720
1     7841
Name: Income, dtype: int64

In [101]:
#One_Hot_Encoding for the Categorical features in the dataset
one_hot_work_class=pd.get_dummies(df.work_class) 
one_hot_education=pd.get_dummies(df.education) 
one_hot_marital_Status=pd.get_dummies(df.marital_Status) 
one_hot_occupation=pd.get_dummies(df.occupation)
one_hot_relationship=pd.get_dummies(df.relationship) 
one_hot_race=pd.get_dummies(df.race) 
one_hot_sex=pd.get_dummies(df.sex) 
one_hot_native_country=pd.get_dummies(df.native_country) 

In [102]:
#removing categorical features 
df.drop(['work_class','education','marital_Status','occupation','relationship',
         'race','sex','native_country'],axis=1,inplace=True)

In [103]:
#Merging one hot encoded features with our dataset 'data' 
df=pd.concat([df,one_hot_workclass,one_hot_education,one_hot_marital_Status,
              one_hot_occupation,one_hot_relationship,
              one_hot_race,one_hot_sex,one_hot_native_country],axis=1) 

In [104]:
 _, i = np.unique(df.columns, return_index=True) #removing duplicates
df=df.iloc[:, i] 

In [105]:
#Here our target variable is 'Income' with values as 1 or 0.  
#Separating our data into features dataset x and our target dataset y 
x=df.drop('Income',axis=1) 
y=df.Income 

In [106]:
#Imputing missing values in our target variable 
y.fillna(y.mode()[0],inplace=True) 

# Model 1

In [107]:
#splitting our dataset into test and train 
from sklearn.model_selection import train_test_split 
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.3)

In [108]:
#DMatrix object 
#label is used to define our outcome variable
dtrain=xgb.DMatrix(x_train,label=y_train)
dtest=xgb.DMatrix(x_test)

  if getattr(data, 'base', None) is not None and \


In [212]:
#setting parameters for xgboost #hyperparameter-tuning
parameters={'max_depth':7, 'silent':1,'objective':'binary:logistic','eval_metric':'auc','learning_rate':.05}

In [213]:
#training our model 
num_round=50
from datetime import datetime 
start = datetime.now() 
xg=xgb.train(parameters,dtrain,num_round) 
stop = datetime.now()

In [214]:
#Execution time of the model 
execution_time_xgb = stop-start 
execution_time_xgb

datetime.timedelta(seconds=3, microseconds=243415)

In [230]:
#datetime.timedelta( , , ) representation => (days , seconds , microseconds) 
#now predicting our model on test set 
ypred=xg.predict(dtest) 
ypred

array([0.07466042, 0.17751524, 0.12756528, ..., 0.09026802, 0.38909808,
       0.05792807], dtype=float32)

In [231]:
#Converting probabilities into 1 or 0  
for i in range(0,9769): 
    if ypred[i]>=.5:       # setting threshold to .5 
       ypred[i]=1 
    else: 
       ypred[i]=0  

In [232]:
#calculating accuracy of our model 
from sklearn.metrics import accuracy_score 
accuracy_xgb = accuracy_score(y_test,ypred) 
accuracy_xgb

0.8647763332992118

In [116]:
train_data=lgbm.Dataset(x_train,label=y_train)

In [220]:
#setting parameters for lightgbm
param = {'num_leaves':150, 'objective':'binary','max_depth':7,'learning_rate':.05,'max_bin':200}
param['metric'] = ['auc', 'binary_logloss']

In [221]:
#training our model using light gbm
num_round=50
start=datetime.now()
lgbm=lgb.train(param,train_data,num_round)
stop=datetime.now()

In [222]:
#Execution time of the model
execution_time_lgbm = stop-start
execution_time_lgbm

datetime.timedelta(microseconds=239904)

In [223]:
#predicting on test set
ypred2=lgbm.predict(x_test)
ypred2[0:5]  # showing first 5 predictions

array([0.04946384, 0.162978  , 0.10434907, 0.94008616, 0.27074123])

In [224]:
#converting probabilities into 0 or 1
for i in range(0,9769):
    if ypred2[i]>=.5:       # setting threshold to .5
       ypred2[i]=1
    else:  
       ypred2[i]=0

In [225]:
#calculating accuracy
accuracy_lgbm = accuracy_score(ypred2,y_test)
accuracy_lgbm
y_test.value_counts()

0    7456
1    2313
Name: Income, dtype: int64

In [226]:
from sklearn.metrics import roc_auc_score

In [227]:
#calculating roc_auc_score for xgboost
auc_xgb =  roc_auc_score(y_test,ypred)
auc_xgb

0.769312376955035

In [228]:
#calculating roc_auc_score for light gbm. 
auc_lgbm = roc_auc_score(y_test,ypred2)
auc_lgbm_comparison_dict = {'accuracy score':(accuracy_lgbm,accuracy_xgb),'auc score':(auc_lgbm,auc_xgb),'execution time':(execution_time_lgbm,execution_time_xgb)}

In [229]:
#Creating a dataframe ‘comparison_df’ for comparing the performance of Lightgbm and xgb. 
comparison_df = DataFrame(auc_lgbm_comparison_dict) 
comparison_df.index= ['LightGBM','xgboost'] 
comparison_df

Unnamed: 0,accuracy score,auc score,execution time
LightGBM,0.86191,0.759234,00:00:00.239904
xgboost,0.864776,0.769312,00:00:03.243415


In [195]:
#calculating accuracy
accuracy_lgbm = accuracy_score(ypred2,y_test)
accuracy_lgbm

0.8613983007472618

# Model 2

In [235]:
parameters={'max_depth':8, 'silent':0,'objective':'binary:logistic','eval_metric':'rmse','learning_rate':.05}

In [236]:
#training our model 
num_round=50
from datetime import datetime 
start = datetime.now() 
xg=xgb.train(parameters,dtrain,num_round) 
stop = datetime.now()

In [237]:
#Execution time of the model 
execution_time_xgb = stop-start 
execution_time_xgb

datetime.timedelta(seconds=3, microseconds=317694)

In [238]:
#datetime.timedelta( , , ) representation => (days , seconds , microseconds) 
#now predicting our model on test set 
ypred=xg.predict(dtest) 
ypred

array([0.07404082, 0.16118208, 0.11502319, ..., 0.07803047, 0.38422543,
       0.05683424], dtype=float32)

In [239]:
#Converting probabilities into 1 or 0  
for i in range(0,9769): 
    if ypred[i]>=.5:       # setting threshold to .5 
       ypred[i]=1 
    else: 
       ypred[i]=0  

In [240]:
#calculating accuracy of our model 
from sklearn.metrics import accuracy_score 
accuracy_xgb = accuracy_score(y_test,ypred) 
accuracy_xgb

0.8652881564131436

In [241]:
#setting parameters for lightgbm
param = {'num_leaves':170, 'objective':'binary','max_depth':7,'learning_rate':.04,'max_bin':230}
param['metric'] = ['mse', 'binary_logloss']

In [242]:
#training our model using light gbm
num_round=50
start=datetime.now()
lgbm=lgb.train(param,train_data,num_round)
stop=datetime.now()

In [243]:
#Execution time of the model
execution_time_lgbm = stop-start
execution_time_lgbm

datetime.timedelta(microseconds=245997)

In [244]:
#predicting on test set
ypred2=lgbm.predict(x_test)
ypred2[0:5]  # showing first 5 predictions

array([0.06161212, 0.15670569, 0.12676215, 0.90024809, 0.2766249 ])

In [245]:
#converting probabilities into 0 or 1
for i in range(0,9769):
    if ypred2[i]>=.5:       # setting threshold to .5
       ypred2[i]=1
    else:  
       ypred2[i]=0

In [246]:
#calculating accuracy
accuracy_lgbm = accuracy_score(ypred2,y_test)
accuracy_lgbm
y_test.value_counts()

0    7456
1    2313
Name: Income, dtype: int64

In [247]:
from sklearn.metrics import mean_squared_error

In [248]:
#calculating mse_score for xgboost
mse_xgb =  mean_squared_error(y_test,ypred)
mse_xgb

0.1347118435868564

In [249]:
#calculating mse_score for light gbm. 
mse_lgbm = mean_squared_error(y_test,ypred2)
mse_lgbm_comparison_dict = {'accuracy score':(accuracy_lgbm,accuracy_xgb),'rsme score':(mse_lgbm,mse_xgb),'execution time':(execution_time_lgbm,execution_time_xgb)}

In [250]:
#Creating a dataframe ‘comparison_df’ for comparing the performance of Lightgbm and xgb. 
comparison_df = DataFrame(mse_lgbm_comparison_dict) 
comparison_df.index= ['LightGBM','xgboost'] 
comparison_df

Unnamed: 0,accuracy score,rsme score,execution time
LightGBM,0.863036,0.136964,00:00:00.245997
xgboost,0.865288,0.134712,00:00:03.317694


In [251]:
#calculating accuracy
accuracy_lgbm = accuracy_score(ypred2,y_test)
accuracy_lgbm

0.8630361347118436