### Problem Statement 

Prepare a classification model using Naive Bayes for salary data 

Data Description:

age -- age of a person

workclass -- Employment category of a person (e.g. Private, Self-emp-inc, Local-gov)

education -- Education level of an individual

maritalstatus -- Marital status of an individual

occupation -- Occupation of an individual

relationship -- Relationship of an individual within the household (e.g. Husband, Wife, Own-child, Not-in-family)

race --  Race of an Individual

sex --  Gender of an Individual

capitalgain --  profit received from the sale of an investment	

capitalloss	-- A decrease in the value of a capital asset

hoursperweek -- Number of hours worked per week

native -- Native country of an individual

Salary -- Salary bracket of an individual (<=50K or >50K); this is the target variable

In [109]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [5]:
# Read the test split from disk and preview its first five rows

test_data = pd.read_csv(filepath_or_buffer='SalaryData_Test.csv')
test_data.head(n=5)

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


In [8]:
# Read the training split from disk and preview its first five rows
train_data = pd.read_csv(filepath_or_buffer='SalaryData_train.csv')
train_data.head(n=5)

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [57]:
# Separate the predictors from the 'Salary' target for each split
X_train = train_data.drop(columns='Salary')
X_test = test_data.drop(columns='Salary')
y_train = train_data['Salary']
y_test = test_data['Salary']

In [58]:
# Wrap the test target in a single-column frame named 'Salary'
y_test = pd.DataFrame(data=y_test, columns=['Salary'])

In [59]:
# Raw label -> row count mapping; the printed keys show the exact label spelling
y_test['Salary'].value_counts().to_dict()

{' <=50K': 11360, ' >50K': 3700}

In [60]:
# Encode the salary bracket as a binary target: ' <=50K' -> 0, ' >50K' -> 1
# (the keys keep the leading space that the raw labels carry).
#
# The result is assigned back instead of calling
# `y_test.Salary.replace(..., inplace=True)`: an in-place replace on a column
# reached through attribute access is chained assignment, which acts on a
# temporary under pandas Copy-on-Write and would leave y_test unchanged.
# `replace` (unlike `map`) leaves already-encoded values alone, so the cell can
# be re-run safely. The explicit cast pins an integer dtype and fails loudly if
# an unexpected label is still present.
y_test = y_test.assign(
    Salary=y_test['Salary'].replace({' <=50K': 0, ' >50K': 1}).astype('int64')
)
y_test

Unnamed: 0,Salary
0,0
1,0
2,1
3,1
4,0
...,...
15055,0
15056,0
15057,0
15058,0


In [61]:
# Wrap the training target in a single-column frame named 'Salary' and show it
y_train = pd.DataFrame(data=y_train, columns=['Salary'])
y_train

Unnamed: 0,Salary
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K
...,...
30156,<=50K
30157,>50K
30158,<=50K
30159,<=50K


In [62]:
# Display the training target frame again (no transformation is applied in this cell)
y_train

Unnamed: 0,Salary
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K
...,...
30156,<=50K
30157,>50K
30158,<=50K
30159,<=50K


In [64]:
# Encode the salary bracket as a binary target: ' <=50K' -> 0, ' >50K' -> 1
# (the keys keep the leading space that the raw labels carry).
#
# The result is assigned back instead of calling
# `y_train.Salary.replace(..., inplace=True)`: an in-place replace on a column
# reached through attribute access is chained assignment, which acts on a
# temporary under pandas Copy-on-Write and would leave y_train unchanged.
# `replace` (unlike `map`) leaves already-encoded values alone, so the cell can
# be re-run safely. The explicit cast pins an integer dtype and fails loudly if
# an unexpected label is still present.
y_train = y_train.assign(
    Salary=y_train['Salary'].replace({' <=50K': 0, ' >50K': 1}).astype('int64')
)
y_train

Unnamed: 0,Salary
0,0
1,0
2,0
3,0
4,0
...,...
30156,0
30157,1
30158,0
30159,0


In [65]:
# Before encoding X_train and X_test for modelling, inspect the column dtypes:
# the columns reported as `object` are the ones that need encoding.

X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            30161 non-null  int64 
 1   workclass      30161 non-null  object
 2   education      30161 non-null  object
 3   educationno    30161 non-null  int64 
 4   maritalstatus  30161 non-null  object
 5   occupation     30161 non-null  object
 6   relationship   30161 non-null  object
 7   race           30161 non-null  object
 8   sex            30161 non-null  object
 9   capitalgain    30161 non-null  int64 
 10  capitalloss    30161 non-null  int64 
 11  hoursperweek   30161 non-null  int64 
 12  native         30161 non-null  object
dtypes: int64(5), object(8)
memory usage: 3.0+ MB


In [66]:
# Frequency of each workclass level in the test split
X_test['workclass'].value_counts()

 Private             11021
 Self-emp-not-inc     1297
 Local-gov            1033
 State-gov             667
 Self-emp-inc          572
 Federal-gov           463
 Without-pay             7
Name: workclass, dtype: int64

In [67]:
# Frequency of each education level in the test split
X_test['education'].value_counts()

 HS-grad         4943
 Some-college    3221
 Bachelors       2526
 Masters          887
 Assoc-voc        652
 11th             571
 Assoc-acdm       499
 10th             403
 7th-8th          266
 Prof-school      243
 9th              221
 12th             200
 Doctorate        169
 5th-6th          161
 1st-4th           71
 Preschool         27
Name: education, dtype: int64

In [68]:
# Frequency of each marital-status level in the test split
X_test['maritalstatus'].value_counts()

 Married-civ-spouse       6990
 Never-married            4872
 Divorced                 2083
 Separated                 472
 Widowed                   450
 Married-spouse-absent     182
 Married-AF-spouse          11
Name: maritalstatus, dtype: int64

In [69]:
# Frequency of each occupation level in the test split
X_test['occupation'].value_counts()

 Exec-managerial      1992
 Craft-repair         1990
 Prof-specialty       1970
 Sales                1824
 Adm-clerical         1819
 Other-service        1596
 Machine-op-inspct    1004
 Transport-moving      744
 Handlers-cleaners     696
 Tech-support          508
 Farming-fishing       491
 Protective-serv       332
 Priv-house-serv        89
 Armed-Forces            5
Name: occupation, dtype: int64

In [70]:
# Frequency of each relationship level in the test split
X_test['relationship'].value_counts()

 Husband           6203
 Not-in-family     3976
 Own-child         2160
 Unmarried         1576
 Wife               685
 Other-relative     460
Name: relationship, dtype: int64

In [71]:
# Frequency of each race level in the test split
X_test['race'].value_counts()

 White                 12970
 Black                  1411
 Asian-Pac-Islander      408
 Amer-Indian-Eskimo      149
 Other                   122
Name: race, dtype: int64

In [72]:
# Frequency of each sex level in the test split
X_test['sex'].value_counts()

 Male      10147
 Female     4913
Name: sex, dtype: int64

In [73]:
# Frequency of each native-country level in the test split
X_test['native'].value_counts()

 United-States                 13788
 Mexico                          293
 Philippines                      95
 Puerto-Rico                      66
 Germany                          65
 Canada                           56
 El-Salvador                      47
 India                            47
 China                            45
 Cuba                             41
 England                          33
 Italy                            32
 South                            30
 Dominican-Republic               30
 Japan                            30
 Portugal                         28
 Haiti                            27
 Columbia                         26
 Poland                           25
 Guatemala                        23
 Jamaica                          23
 Greece                           20
 Vietnam                          19
 Ecuador                          16
 Nicaragua                        15
 Peru                             15
 Iran                             14
 

In [74]:
# Display the test predictors as they stand before one-hot encoding
X_test

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15055,33,Private,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States
15056,39,Private,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States
15057,38,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States
15058,44,Private,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States


In [75]:
# One-hot encode the categorical test predictors, dropping the first level of
# each variable.
# NOTE(review): encoding the test split on its own assumes it contains the same
# category levels as the training split; otherwise the dummy columns of the two
# matrices will not line up. Confirm before modelling.
X_test = pd.get_dummies(data=X_test, drop_first=True)
X_test

Unnamed: 0,age,educationno,capitalgain,capitalloss,hoursperweek,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,native_ Portugal,native_ Puerto-Rico,native_ Scotland,native_ South,native_ Taiwan,native_ Thailand,native_ Trinadad&Tobago,native_ United-States,native_ Vietnam,native_ Yugoslavia
0,25,7,0,0,40,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,38,9,0,0,50,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,28,12,0,0,40,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,44,10,7688,0,40,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,34,6,0,0,30,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15055,33,13,0,0,40,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
15056,39,13,0,0,36,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
15057,38,13,0,0,50,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
15058,44,13,5455,0,40,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [76]:
# One-hot encode the categorical training predictors, dropping the first level
# of each variable.
X_train = pd.get_dummies(X_train, drop_first = True)

# Re-derive the test matrix against the *training* column layout.
# Encoding each split independently only gives matching columns when both
# splits contain exactly the same category levels. If a level is missing from
# one split, the column sets differ, and `drop_first` may even drop a different
# reference level in each split.
# Here the raw test predictors are fully one-hot encoded (no level dropped) and
# then reindexed to the training columns:
#   * reference levels dropped in training are dropped here too,
#   * levels never seen in training are discarded,
#   * training levels absent from the test split become all-zero columns.
X_test = (
    pd.get_dummies(test_data.drop('Salary', axis = 1))
    .reindex(columns = X_train.columns, fill_value = 0)
)
X_train

Unnamed: 0,age,educationno,capitalgain,capitalloss,hoursperweek,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,native_ Portugal,native_ Puerto-Rico,native_ Scotland,native_ South,native_ Taiwan,native_ Thailand,native_ Trinadad&Tobago,native_ United-States,native_ Vietnam,native_ Yugoslavia
0,39,13,2174,0,40,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,50,13,0,0,13,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,38,9,0,0,40,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,7,0,0,40,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,13,0,0,40,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30156,27,12,0,0,38,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
30157,40,9,0,0,40,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
30158,58,9,0,0,40,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
30159,22,9,0,0,20,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [87]:
# Stack the train and test frames row-wise to check how balanced the classes are

df = pd.concat(objs=[train_data, test_data], axis=0)
df.head(n=5)

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [103]:
# Proportion of rows in each salary bracket across the combined data
df['Salary'].value_counts(normalize=True)

 <=50K    0.752151
 >50K     0.247849
Name: Salary, dtype: float64

The data set is imbalanced (roughly 75% `<=50K` vs. 25% `>50K`), so the training data needs to be balanced before modelling.

In [90]:
from imblearn.over_sampling import SMOTE

In [91]:
# Oversample the minority class of the training split with SMOTE.
# A fixed random_state makes the synthetic samples reproducible, so the
# resampled set and every score computed from models fitted on it stay the
# same from one run of the notebook to the next.
smt = SMOTE(random_state = 42)
x_resample,y_resample = smt.fit_resample(X_train,y_train)

In [97]:
# Build the model: Gaussian Naive Bayes fitted on the resampled training data

gnb = GaussianNB().fit(X=x_resample, y=y_resample)
gnb

GaussianNB()

In [104]:
# Predicted class for every test row from the Gaussian model
y_pred_test = gnb.predict(X=X_test)
y_pred_test

array([0, 1, 1, ..., 1, 1, 1], dtype=int64)

In [105]:
# Fraction of test rows whose predicted class equals the true class
accuracy_score(y_true=y_test, y_pred=y_pred_test)

0.7915006640106241

In [106]:
# Confusion matrix: rows are the true classes, columns the predicted classes
confusion_matrix(y_true=y_test, y_pred=y_pred_test)

array([[8808, 2552],
       [ 588, 3112]], dtype=int64)

In [107]:
# Per-class precision, recall and F1 for the predictions held in y_pred_test
print(classification_report(y_true=y_test, y_pred=y_pred_test))

              precision    recall  f1-score   support

           0       0.94      0.78      0.85     11360
           1       0.55      0.84      0.66      3700

    accuracy                           0.79     15060
   macro avg       0.74      0.81      0.76     15060
weighted avg       0.84      0.79      0.80     15060



In [110]:
# Bernoulli Naive Bayes fitted on the resampled training data.
# NOTE(review): BernoulliNB binarizes features (documented default threshold
# 0.0), so strictly positive numeric columns such as age would all collapse to
# the same value. Confirm this is intended.
bnb = BernoulliNB().fit(X=x_resample, y=y_resample)
bnb

BernoulliNB()

In [111]:
# Predicted class for every test row from the Bernoulli model
# (overwrites the predictions previously stored in y_pred_test)
y_pred_test = bnb.predict(X=X_test)
y_pred_test

array([0, 0, 1, ..., 1, 0, 1], dtype=int64)

In [112]:
# Fraction of test rows whose predicted class equals the true class
accuracy_score(y_true=y_test, y_pred=y_pred_test)

0.7697875166002656

In [113]:
# Confusion matrix: rows are the true classes, columns the predicted classes
confusion_matrix(y_true=y_test, y_pred=y_pred_test)

array([[8568, 2792],
       [ 675, 3025]], dtype=int64)

In [114]:
# Per-class precision, recall and F1 for the predictions held in y_pred_test
print(classification_report(y_true=y_test, y_pred=y_pred_test))

              precision    recall  f1-score   support

           0       0.93      0.75      0.83     11360
           1       0.52      0.82      0.64      3700

    accuracy                           0.77     15060
   macro avg       0.72      0.79      0.73     15060
weighted avg       0.83      0.77      0.78     15060



In [115]:
# Multinomial Naive Bayes fitted on the resampled training data.
# NOTE(review): MultinomialNB is designed for count-like, non-negative
# features; the numeric columns and SMOTE-generated rows here are not counts.
# Confirm the model choice.
mnb = MultinomialNB().fit(X=x_resample, y=y_resample)
mnb

MultinomialNB()

In [116]:
# Predicted class for every test row from the Multinomial model
# (overwrites the predictions previously stored in y_pred_test)
y_pred_test = mnb.predict(X=X_test)
y_pred_test

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [117]:
# Fraction of test rows whose predicted class equals the true class
accuracy_score(y_true=y_test, y_pred=y_pred_test)

0.7749667994687915

In [118]:
# Confusion matrix: rows are the true classes, columns the predicted classes
confusion_matrix(y_true=y_test, y_pred=y_pred_test)

array([[10891,   469],
       [ 2920,   780]], dtype=int64)

In [119]:
# Per-class precision, recall and F1 for the predictions held in y_pred_test
print(classification_report(y_true=y_test, y_pred=y_pred_test))

              precision    recall  f1-score   support

           0       0.79      0.96      0.87     11360
           1       0.62      0.21      0.32      3700

    accuracy                           0.77     15060
   macro avg       0.71      0.58      0.59     15060
weighted avg       0.75      0.77      0.73     15060

