# Reading the dataset

In [12]:
import pandas as pd
from easygui import fileopenbox
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
fileopenbox()

'D:\\Datasets\\income_evaluation.csv'

In [13]:
df=pd.read_csv('D:\\Datasets\\income_evaluation.csv')

# Data Pre-processing

Checking the data types of variables

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       32561 non-null  object
 2    fnlwgt          32561 non-null  int64 
 3    education       32561 non-null  object
 4    education-num   32561 non-null  int64 
 5    marital-status  32561 non-null  object
 6    occupation      32561 non-null  object
 7    relationship    32561 non-null  object
 8    race            32561 non-null  object
 9    sex             32561 non-null  object
 10   capital-gain    32561 non-null  int64 
 11   capital-loss    32561 non-null  int64 
 12   hours-per-week  32561 non-null  int64 
 13   native-country  32561 non-null  object
 14   income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [15]:
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


Dropping the insignificant features

In [16]:
df.drop([' native-country'],inplace=True,axis=1)

Segregating categorical columns from the dataset

In [17]:
cat_cols=df.select_dtypes(include='object').columns
num_cols=df.select_dtypes(exclude='object').columns

In [18]:
cat_cols

Index([' workclass', ' education', ' marital-status', ' occupation',
       ' relationship', ' race', ' sex', ' income'],
      dtype='object')

In [19]:
num_cols

Index(['age', ' fnlwgt', ' education-num', ' capital-gain', ' capital-loss',
       ' hours-per-week'],
      dtype='object')

Checking for null and unique values

In [20]:
for i in cat_cols:
    y=df[i].isna().sum()
    print(i,':',y)

 workclass : 0
 education : 0
 marital-status : 0
 occupation : 0
 relationship : 0
 race : 0
 sex : 0
 income : 0


In [21]:
for i in cat_cols:
    y=df[i].unique()
    print(i,':',y)

 workclass : [' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked']
 education : [' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']
 marital-status : [' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']
 occupation : [' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' ' ?'
 ' Protective-serv' ' Armed-Forces' ' Priv-house-serv']
 relationship : [' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative']
 race : [' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other']
 sex : [' Male' ' Female']
 income : [' <=50K' ' >50K']


In [22]:
for i in num_cols:
    y=df[i].isna().sum()
    print(i,':',y)

age : 0
 fnlwgt : 0
 education-num : 0
 capital-gain : 0
 capital-loss : 0
 hours-per-week : 0


In [23]:
import numpy as np

In [24]:
df[' workclass']=df[' workclass'].replace(' ?',np.nan)

In [25]:
df[' workclass'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', nan, ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [26]:
df[' workclass'].mode()

0     Private
dtype: object

In [27]:
df[' workclass']=df[' workclass'].replace(np.nan,' Private')

In [28]:
df[' workclass'].value_counts()

 Private             24532
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name:  workclass, dtype: int64

In [29]:
df[' occupation'].unique()

array([' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
       ' Prof-specialty', ' Other-service', ' Sales', ' Craft-repair',
       ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
       ' Tech-support', ' ?', ' Protective-serv', ' Armed-Forces',
       ' Priv-house-serv'], dtype=object)

In [30]:
df[' occupation']=df[' occupation'].replace(' ?',np.nan)

In [31]:
df[' occupation'].mode()

0     Prof-specialty
dtype: object

In [32]:
df[' occupation']=df[' occupation'].replace(np.nan,' Prof-specialty')

In [33]:
df[' income']=df[' income'].replace({' <=50K':'0', ' >50K':'1'})

Encoding the categorical variables

In [39]:
X=df.iloc[:,:-1]
Y=df.iloc[:,-1]

In [40]:
X=pd.get_dummies(X)

Check for Data imbalance in dependent variable

In [41]:
Y.value_counts()/df.shape[0]*100

 income
0          75.919044
1          24.080956
dtype: float64

Balancing the data using SMOTE

In [42]:
from imblearn.over_sampling import SMOTE

In [43]:
sm=SMOTE()

In [44]:
X,Y=sm.fit_resample(X,Y)

In [45]:
X=pd.DataFrame(X)
Y=pd.DataFrame(Y)

Splitting the data into train and test

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=466)

Scaling the data

In [49]:
from sklearn.preprocessing import StandardScaler

In [50]:
sc=StandardScaler()

In [52]:
train_sc=sc.fit_transform(x_train)

In [53]:
test_sc=sc.transform(x_test)

# Building the model using Light Gradient Boosting Machine Algorithm

In [69]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [55]:
kfold=KFold(n_splits=10)

In [70]:
lgbm=LGBMClassifier()

In [110]:
param_grid={'boosting_type':['gbdt'], 'class_weight':[None], 'colsample_bytree':[1.0],
               'importance_type':['split'], 'learning_rate':[0.1], 'max_depth':[-1],
               'min_child_samples':[20], 'min_child_weight':[0.001], 'min_split_gain':[0.0],
               'n_estimators':[100,200] ,'n_jobs':[-1], 'num_leaves':[31], 'objective':[None],
               'random_state':[8302],'silent':[True],
               'subsample':[1.0], 'subsample_for_bin':[200000], 'subsample_freq':[0]}

In [111]:
grid=GridSearchCV(estimator=lgbm,param_grid=param_grid,cv=kfold,scoring='accuracy')

In [112]:
grid.fit(train_sc,y_train)

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_s...
                         'max_depth': [-1], 'min_child_samples': [20],
                         'min_child_weight': [0.001], 'min_split_gain': [0.0],
                         'n_estimators': [100, 200], 'n_jobs': [-1],
                         'num_leaves': [31], 'objective': [None],
              

# Evaluvating LightGBM model

In [113]:
grid.score(train_sc,y_train)

0.9258263985205732

In [114]:
grid.score(test_sc,y_test)

0.9107335490830637

In [79]:
grid.best_params_

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': 8302,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [80]:
from sklearn.metrics import*

In [81]:
pred_test=grid.predict(test_sc)

In [83]:
print(classification_report(y_test,pred_test))

              precision    recall  f1-score   support

           0       0.90      0.93      0.91      7367
           1       0.93      0.89      0.91      7465

    accuracy                           0.91     14832
   macro avg       0.91      0.91      0.91     14832
weighted avg       0.91      0.91      0.91     14832



In [86]:
print(classification_report(y_test,pred_test))

              precision    recall  f1-score   support

           0       0.90      0.93      0.91      7367
           1       0.93      0.89      0.91      7465

    accuracy                           0.91     14832
   macro avg       0.91      0.91      0.91     14832
weighted avg       0.91      0.91      0.91     14832



# Building the model using XGBoost Algorithm

In [118]:
import xgboost
from xgboost import XGBClassifier

In [119]:
xgb=XGBClassifier()

In [99]:
param_grid1={'base_score':[0.5], 'booster':['gbtree'], 'colsample_bylevel':[1],
              'colsample_bynode':[1], 'colsample_bytree':[1], 'gamma':[0], 'gpu_id':[-1],
              'importance_type':['gain'],
              'learning_rate':[0.2], 'max_delta_step':[0], 'max_depth':[6],
              'min_child_weight':[1],
              'n_estimators':[100,200], 'n_jobs':[-1], 'num_parallel_tree':[1],
              'objective':['binary:logistic'], 'random_state':[8302], 'reg_alpha':[0],
              'reg_lambda':[1], 'scale_pos_weight':[1], 'subsample':[1],'tree_method':['auto'],
              'use_label_encoder':[True], 'validate_parameters':[1]}

In [100]:
grid_xgb=GridSearchCV(estimator=xgb,param_grid=param_grid1,scoring='accuracy')

In [101]:
grid_xgb.fit(train_sc,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_es...
                         'min_child_weight': [1], 'n_estimators': [100, 200],
                         'n_jobs': [-1], 'num_parallel_tree': [1],
                         'objective': ['binary:logistic'],
                         'random_state': [8302], 'reg_alpha': [0],
            

# Evaluvating XGBoost model

In [102]:
grid_xgb.score(train_sc,y_train)

0.9309697179842811

In [103]:
grid_xgb.score(test_sc,y_test)

0.9097222222222222

In [104]:
pred_test1=grid_xgb.predict(test_sc)

In [106]:
print(classification_report(y_test,pred_test1))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91      7367
           1       0.92      0.90      0.91      7465

    accuracy                           0.91     14832
   macro avg       0.91      0.91      0.91     14832
weighted avg       0.91      0.91      0.91     14832



In [117]:
print('The test accuracy for LightGBM model is :',grid.score(test_sc,y_test))
print('The test accuracy for XGBoost model is :',grid_xgb.score(test_sc,y_test))

The test accuracy for LightGBM model is : 0.9107335490830637
The test accuracy for XGBoost model is : 0.9097222222222222


# Conclusion

From the test accuracy scores, we can conclude that LightGBM model performed better with higher accuracy