In [1]:
#Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Load the cleaned adult dataset (the path is relative to the working directory)
DATA_PATH = "dataset/cleaned_adult_data.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
#Top 5 records of dataset
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
#Shape of dataset
df.shape

(32537, 15)

In [5]:
#Checking missing values in dataset
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
country           0
salary            0
dtype: int64

In [6]:
#Checking duplicated values in dataset
df.duplicated().sum()

0

In [7]:
#Drop any fully duplicated rows (the check above reports none, so this is a safeguard)
df = df.drop_duplicates()

In [8]:
df.duplicated().sum()

0

In [9]:
#Columns of dataset
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'country', 'salary'],
      dtype='object')

In [11]:
df['occupation'].unique()

array(['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners',
       'Prof-specialty', 'Other-service', 'Sales', 'Craft-repair',
       'Transport-moving', 'Farming-fishing', 'Machine-op-inspct',
       'Tech-support', 'Protective-serv', 'Armed-Forces',
       'Priv-house-serv'], dtype=object)

In [12]:
#Split the column names by dtype: object columns are treated as categorical,
#every other column as numerical
categorical_columns = df.select_dtypes(include='object').columns
numerical_columns = df.select_dtypes(exclude='object').columns

#Show both groups, separated by one blank line
print(categorical_columns, numerical_columns, sep="\n\n")

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'country', 'salary'],
      dtype='object')

Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')


In [13]:
#List the distinct values of every categorical column, one section per column
section_rule = '=' * 80
for column_name in categorical_columns:
    distinct_values = df[column_name].unique()
    print(column_name, "\n", distinct_values)
    print(section_rule)

workclass 
 ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov'
 'Self-emp-inc' 'Without-pay' 'Never-worked']
education 
 ['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th']
marital-status 
 ['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']
occupation 
 ['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' 'Protective-serv'
 'Armed-Forces' 'Priv-house-serv']
relationship 
 ['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative']
race 
 ['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']
sex 
 ['Male' 'Female']
country 
 ['United-States' 'Cuba' 'Jamaica' 'India' 'Mexico' 'South' 'Puerto-Rico'
 'Honduras' 'England' 'Canada' 'Ge

In [14]:
#The workclass, occupation and country features may contain '?' as a placeholder for missing values,
#so we replace it with np.nan.

In [15]:
#Turn the '?' placeholder into a real missing value in each affected column
for placeholder_column in ('workclass', 'occupation', 'country'):
    df[placeholder_column] = df[placeholder_column].replace("?", np.nan)

In [16]:
#Here we drop the education and fnlwgt columns from the dataset.
#The education column is already encoded numerically as the education-num column.

In [17]:
#`columns=` already names the axis, so no separate `axis` argument is needed
df = df.drop(columns=['education', 'fnlwgt'])

In [18]:
df.columns

Index(['age', 'workclass', 'education-num', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
       'hours-per-week', 'country', 'salary'],
      dtype='object')

In [19]:
df.shape

(32537, 13)

In [20]:
#Share of each salary class (shows how imbalanced the target is)
df['salary'].value_counts(normalize=True)

salary
<=50K    0.759074
>50K     0.240926
Name: proportion, dtype: float64

In [21]:
#Encode the target as 0 (<=50K) / 1 (>50K).
#Series.map turns every label that is missing from the mapping into NaN, so:
#  * the encoding only runs while the column still holds the raw labels, which
#    makes re-running this cell a no-op instead of wiping the target to NaN;
#  * any label outside the mapping raises instead of silently becoming NaN.
salary_codes = {'<=50K':0, '>50K':1}
if not pd.api.types.is_numeric_dtype(df['salary']):
    encoded_salary = df['salary'].map(salary_codes)
    unmapped_rows = encoded_salary.isna()
    if unmapped_rows.any():
        unexpected_labels = df.loc[unmapped_rows, 'salary'].unique()
        raise ValueError(f"Unexpected salary labels: {unexpected_labels}")
    df['salary'] = encoded_salary

In [22]:
df.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [23]:
#Independent and Dependent Variable
#The target is selected by name, so the split no longer depends on 'salary'
#being the last column of the frame.
X = df.drop(columns='salary')
y = df['salary']

In [24]:
X.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [25]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: salary, dtype: int64

In [26]:
X.shape, y.shape

((32537, 12), (32537,))

In [27]:
#Recompute the column groups on the feature matrix only (the target is excluded)
is_object_column = (X.dtypes == object).to_numpy()
categorical_columns = X.columns[is_object_column]
numerical_columns = X.columns[~is_object_column]

In [28]:
categorical_columns

Index(['workclass', 'marital-status', 'occupation', 'relationship', 'race',
       'sex', 'country'],
      dtype='object')

In [29]:
numerical_columns

Index(['age', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')

### Model Building

In [30]:
from sklearn.impute import SimpleImputer #Handling missing values 
from sklearn.preprocessing import StandardScaler #Feature scaling
from sklearn.preprocessing import OneHotEncoder #Feature Encoding 
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer

In [31]:
#Numerical pipeline: fill missing values with the column median, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

#Categorical pipeline: fill missing values with the most frequent category,
#one-hot encode into a dense array, then standardise the indicator columns.
#NOTE(review): scaling one-hot indicators is unusual; confirm it is intended.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ('scaler', StandardScaler()),
]
cat_pipeline = Pipeline(steps=categorical_steps)

#Route each column group through its own pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_columns),
        ('cat_pipeline', cat_pipeline, categorical_columns),
    ]
)



In [32]:
#Train-test split
from sklearn.model_selection import train_test_split
#Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [33]:
#Fit the preprocessor on the training rows only, then apply it to both splits.
#The original row index is kept on the rebuilt frames so that x_train/x_test
#stay aligned label-wise with y_train/y_test (which still carry that index).
train_matrix = preprocessor.fit_transform(x_train)
feature_names = preprocessor.get_feature_names_out()  #only available after fitting
x_train = pd.DataFrame(train_matrix, columns=feature_names, index=x_train.index)

x_test = pd.DataFrame(preprocessor.transform(x_test), columns=feature_names, index=x_test.index)

In [34]:
x_train.shape

(22775, 88)

In [35]:
x_train.head()

Unnamed: 0,num_pipeline__age,num_pipeline__education-num,num_pipeline__capital-gain,num_pipeline__capital-loss,num_pipeline__hours-per-week,cat_pipeline__workclass_Federal-gov,cat_pipeline__workclass_Local-gov,cat_pipeline__workclass_Never-worked,cat_pipeline__workclass_Private,cat_pipeline__workclass_Self-emp-inc,...,cat_pipeline__country_Portugal,cat_pipeline__country_Puerto-Rico,cat_pipeline__country_Scotland,cat_pipeline__country_South,cat_pipeline__country_Taiwan,cat_pipeline__country_Thailand,cat_pipeline__country_Trinadad&Tobago,cat_pipeline__country_United-States,cat_pipeline__country_Vietnam,cat_pipeline__country_Yugoslavia
0,0.693443,-0.033365,0.917418,-0.219029,-0.033058,-0.174097,-0.259894,-0.013254,-1.75252,-0.190925,...,-0.03315,-0.061206,-0.018745,-0.049201,-0.041945,-0.023898,-0.023898,0.310528,-0.045475,-0.021982
1,-0.555208,1.13508,-0.14288,-0.219029,-0.033058,-0.174097,-0.259894,-0.013254,0.570607,-0.190925,...,-0.03315,-0.061206,-0.018745,-0.049201,-0.041945,-0.023898,-0.023898,0.310528,-0.045475,-0.021982
2,0.840343,2.303525,1.929169,-0.219029,0.777388,-0.174097,-0.259894,-0.013254,0.570607,-0.190925,...,-0.03315,-0.061206,-0.018745,-0.049201,-0.041945,-0.023898,-0.023898,0.310528,-0.045475,-0.021982
3,-1.216259,-0.422846,-0.14288,-0.219029,0.372165,-0.174097,-0.259894,-0.013254,0.570607,-0.190925,...,-0.03315,-0.061206,-0.018745,-0.049201,-0.041945,-0.023898,-0.023898,0.310528,-0.045475,-0.021982
4,-0.775558,-0.422846,-0.14288,-0.219029,0.372165,-0.174097,-0.259894,-0.013254,0.570607,-0.190925,...,-0.03315,-0.061206,-0.018745,-0.049201,-0.041945,-0.023898,-0.023898,0.310528,-0.045475,-0.021982


In [36]:
x_test.head()

Unnamed: 0,num_pipeline__age,num_pipeline__education-num,num_pipeline__capital-gain,num_pipeline__capital-loss,num_pipeline__hours-per-week,cat_pipeline__workclass_Federal-gov,cat_pipeline__workclass_Local-gov,cat_pipeline__workclass_Never-worked,cat_pipeline__workclass_Private,cat_pipeline__workclass_Self-emp-inc,...,cat_pipeline__country_Portugal,cat_pipeline__country_Puerto-Rico,cat_pipeline__country_Scotland,cat_pipeline__country_South,cat_pipeline__country_Taiwan,cat_pipeline__country_Thailand,cat_pipeline__country_Trinadad&Tobago,cat_pipeline__country_United-States,cat_pipeline__country_Vietnam,cat_pipeline__country_Yugoslavia
0,-0.555208,0.356117,-0.14288,-0.219029,-0.033058,-0.174097,-0.259894,-0.013254,-1.75252,-0.190925,...,-0.03315,-0.061206,-0.018745,-0.049201,-0.041945,-0.023898,-0.023898,0.310528,-0.045475,-0.021982
1,-1.069359,1.13508,-0.14288,-0.219029,-0.033058,5.743913,-0.259894,-0.013254,-1.75252,-0.190925,...,-0.03315,-0.061206,-0.018745,-0.049201,-0.041945,-0.023898,-0.023898,0.310528,-0.045475,-0.021982
2,1.574844,-0.033365,-0.14288,-0.219029,1.587834,-0.174097,3.847726,-0.013254,-1.75252,-0.190925,...,-0.03315,-0.061206,-0.018745,-0.049201,-0.041945,-0.023898,-0.023898,-3.220317,-0.045475,-0.021982
3,-0.555208,-0.033365,-0.14288,-0.219029,-0.033058,-0.174097,-0.259894,-0.013254,0.570607,-0.190925,...,-0.03315,-0.061206,-0.018745,-0.049201,-0.041945,-0.023898,-0.023898,0.310528,-0.045475,-0.021982
4,0.326192,1.13508,-0.14288,-0.219029,-0.033058,5.743913,-0.259894,-0.013254,-1.75252,-0.190925,...,-0.03315,-0.061206,-0.018745,-0.049201,-0.041945,-0.023898,-0.023898,0.310528,-0.045475,-0.021982


In [37]:
#Now create classification model 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost as xgb
import lightgbm as ltb

In [38]:
#Create an evaluate function to give all metrics after model training
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
def evaluate_model(true, predicted):
    """Compute the evaluation metrics for one set of predictions.

    Parameters
    ----------
    true
        Ground-truth labels.
    predicted
        Labels predicted by a model, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(accuracy, confusion matrix, classification report)`` as returned by
        ``accuracy_score``, ``confusion_matrix`` and ``classification_report``.
    """
    return (
        accuracy_score(true, predicted),
        confusion_matrix(true, predicted),
        classification_report(true, predicted),
    )

In [39]:
#Candidate classifiers, all with library-default hyperparameters.
#The scikit-learn estimators that draw random numbers get a fixed seed (the
#same one used for the train/test split) so a fresh run reproduces the scores.
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Support Vector Machine": SVC(),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=42),
    "XGBoost Classifier": xgb.XGBClassifier(),
    "LGBM Classifier": ltb.LGBMClassifier()
}
model_list = []  #model names, in training order
scores = []      #test-set accuracy of each model, same order as model_list

#Iterate over name/estimator pairs directly instead of indexing into rebuilt lists
for model_name, model in models.items():
    model.fit(x_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Evaluate Train and Test dataset
    model_train_score, model_train_cm, model_train_report = evaluate_model(y_train, y_train_pred)
    model_test_score, model_test_cm, model_test_report = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Accuracy Score: {:.4f}".format(model_train_score))
    print("- Confusion Matrix: \n",model_train_cm)
    print("- Classification Report: \n",model_train_report)

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Accuracy Score: {:.4f}".format(model_test_score))
    print("- Confusion Matrix: \n",model_test_cm)
    print("- Classification Report: \n",model_test_report)

    # Only the test accuracy is kept for the final comparison table
    scores.append(model_test_score)

    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
- Accuracy Score: 0.8519
- Confusion Matrix: 
 [[16088  1205]
 [ 2168  3314]]
- Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.93      0.91     17293
           1       0.73      0.60      0.66      5482

    accuracy                           0.85     22775
   macro avg       0.81      0.77      0.78     22775
weighted avg       0.85      0.85      0.85     22775

----------------------------------
Model performance for Test set
- Accuracy Score: 0.8506
- Confusion Matrix: 
 [[6863  542]
 [ 916 1441]]
- Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.93      0.90      7405
           1       0.73      0.61      0.66      2357

    accuracy                           0.85      9762
   macro avg       0.80      0.77      0.78      9762
weighted avg       0.84      0.85      0.85      9762



K-Neighbors Class

In [40]:
### Results
#Pair every model name with its test accuracy and rank from best to worst
leaderboard = pd.DataFrame({'Model Name': model_list, 'Scores': scores})
leaderboard.sort_values(by='Scores', ascending=False)

Unnamed: 0,Model Name,Scores
6,XGBoost Classifier,0.86847
7,LGBM Classifier,0.868367
5,AdaBoost Classifier,0.860889
0,Logistic Regression,0.850645
2,Support Vector Machine,0.847367
3,Random Forest Classifier,0.846548
1,K-Neighbors Classifier,0.828519
4,Decision Tree Classifier,0.813768


In [41]:
#Here we can say that the XGBoost, LGBM and AdaBoost classifiers give better accuracy than the other models.