In [106]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Read the pre-split salary datasets (train first, then test) from the
# current working directory.
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('SalaryData_Train.csv', 'SalaryData_Test.csv')
)

In [6]:
# Features are every column except the last one; the last column is the target.
# .copy() makes x_train an independent frame, so the later column drop and the
# in-place label encoding do not operate on a slice of train_data (which would
# raise SettingWithCopyWarning and could silently fail to stick).
x_train = train_data.iloc[:, :-1].copy()
y_train = train_data.iloc[:, -1]

In [8]:
# Same split as for the training data: all but the last column are features,
# the last column is the target.
# .copy() makes x_test an independent frame, so the later column drop and the
# in-place label encoding do not operate on a slice of test_data (which would
# raise SettingWithCopyWarning and could silently fail to stick).
x_test = test_data.iloc[:, :-1].copy()
y_test = test_data.iloc[:, -1]

In [10]:
# Preview the first five rows of the raw training features.
x_train.head(n=5)

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


We have an educationno column, which is an unnecessary column. Hence, we will drop it.

In [12]:
# Rebind to a new frame instead of dropping in place: x_train was derived by
# slicing train_data, and in-place mutation of such a frame triggers
# SettingWithCopyWarning. `columns=` states the intent without `axis=1`.
x_train = x_train.drop(columns='educationno')

In [14]:
# Rebind to a new frame instead of dropping in place: x_test was derived by
# slicing test_data, and in-place mutation of such a frame triggers
# SettingWithCopyWarning. `columns=` states the intent without `axis=1`.
x_test = x_test.drop(columns='educationno')

In [16]:
# Confirm that 'educationno' is gone from the training features.
x_train.head(n=5)

Unnamed: 0,age,workclass,education,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [18]:
# (number of rows, number of feature columns) of the training features.
x_train.shape

(30161, 12)

As we have 12 features, let us reduce the number of features using a feature reduction technique. So, initially, we need to convert the categorical
data to numerical data using an encoder, then we will perform standardization, and finally we will find the high-variance components.

Let us see all unique values each feature has

In [20]:
# Print the distinct values of every feature column, one array per column,
# in column order.
for _, column_values in x_train.items():
    print(column_values.unique())

[39 50 38 53 28 37 49 52 31 42 30 23 32 34 25 43 40 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 47 46 36 79 27 18 33 76 55 61 70 64 71 66 51 58
 26 17 60 90 75 65 77 62 63 67 74 72 69 68 73 81 78 88 80 84 83 85 82 86]
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' Self-emp-inc' ' Without-pay']
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' 7th-8th' ' Doctorate' ' Assoc-voc' ' Prof-school'
 ' 5th-6th' ' 10th' ' Preschool' ' 12th' ' 1st-4th']
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']
[' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Transport-moving' ' Farming-fishing'
 ' Machine-op-inspct' ' Tech-support' ' Craft-repair' ' Protective-serv'
 ' Armed-Forces' ' Priv-house-serv']
[' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative']
[' White' ' Black' ' Asian-Pa

Now, as we can see, every categorical feature is nominal data. So, we will use LabelEncoder to encode each value.

In [22]:
# Columns from position 1 up to (but excluding) 'capitalgain' are the
# categorical ones; position 0 is 'age'.
data_columns = x_train.columns[1:x_train.columns.get_loc('capitalgain')]
for i in data_columns:
    # Learn the category -> integer mapping from the TRAINING column only and
    # reuse that exact mapping for the test column. Refitting on the test
    # column (fit_transform) could assign different integers to the same
    # category whenever the two sets of categories differ.
    # transform() raises ValueError if the test column contains a category
    # that never appeared in the training column.
    label = LabelEncoder()
    x_train[i] = label.fit_transform(x_train[i])
    x_test[i] = label.transform(x_test[i])

In [24]:
# Inspect the training features after label-encoding the categorical columns.
x_train.head(n=5)

Unnamed: 0,age,workclass,education,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native
0,39,5,9,4,0,1,4,1,2174,0,40,United-States
1,50,4,9,2,3,0,4,1,0,0,13,United-States
2,38,2,11,0,5,1,4,1,0,0,40,United-States
3,53,2,1,2,5,0,2,1,0,0,40,United-States
4,28,2,9,2,9,5,2,0,0,0,40,Cuba


In [26]:
# Encode the remaining categorical column, 'native'. The fitted encoder stays
# bound to `label` so it is still available after this cell.
label = LabelEncoder().fit(x_train['native'])
x_train['native'] = label.transform(x_train['native'])


In [28]:
# Inspect the training features now that 'native' is encoded as well.
x_train.head(n=5)

Unnamed: 0,age,workclass,education,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native
0,39,5,9,4,0,1,4,1,2174,0,40,37
1,50,4,9,2,3,0,4,1,0,0,13,37
2,38,2,11,0,5,1,4,1,0,0,40,37
3,53,2,1,2,5,0,2,1,0,0,40,37
4,28,2,9,2,9,5,2,0,0,0,40,4


Now, let us perform hyperparameter tuning to find the best number of PCA components.

In [90]:
# We will use 7 shuffled folds (fixed seed) for model validation.
kfold = KFold(n_splits=7, shuffle=True, random_state=7)
# Candidate numbers of principal components to compare.
space = {'n_components': [2, 3, 4, 8, 10, 12]}
pca = PCA()
# No `scoring` is passed, so the search ranks candidates with the estimator's
# own `score` method.
gridsearch = GridSearchCV(estimator=pca, param_grid=space, cv=kfold)
gridsearch.fit(x_train, y_train)

In [98]:
# Report the parameter setting that scored best in the grid search.
best_pca_params = gridsearch.best_params_
print("The best component for PCA is,", best_pca_params)

The best component for PCA is, {'n_components': 12}


The above value suggests that there is no need for feature reduction: as we increase the number of components, the best parameter keeps increasing, which suggests that the more components there are, the better it will be. So, we won't do any feature reduction and will directly fit a Naive Bayes model.

In [144]:
# Fit a multinomial Naive Bayes classifier on the encoded training features
# (fit() returns the fitted estimator itself), then predict on the same
# training rows to measure the training fit.
naive_bayas = MultinomialNB().fit(x_train, y_train)
traindata_pred = naive_bayas.predict(x_train)

Now, let us find the accuracy of the model

In [146]:
# Compare the true training labels with the training predictions made above.
# The original referenced `y_pred`, a name this notebook never defines; it
# only worked through leftover kernel state. The training predictions are
# stored in `traindata_pred`.
accuracy = accuracy_score(y_train, traindata_pred)
print("The accuracy of the model on training data is,", accuracy)

The accuracy of the model on training data is, 0.7729186698053778


In [122]:
# Per-class precision / recall / F1 on the training data.
report = classification_report(y_true=y_train, y_pred=traindata_pred)
print(report)

              precision    recall  f1-score   support

       <=50K       0.79      0.96      0.86     22653
        >50K       0.63      0.21      0.32      7508

    accuracy                           0.77     30161
   macro avg       0.71      0.59      0.59     30161
weighted avg       0.75      0.77      0.73     30161



Now, let us evaluate the model using the testing data.

In [135]:
# Reuse the encoder that was fitted on x_train['native'] (still bound to
# `label`) so that every country maps to the same integer in train and test.
# Calling fit_transform here would refit on the test column and shift the
# codes whenever the test set has a different set of countries.
# transform() raises ValueError if the test column contains a country that
# never appeared in the training column.
x_test['native'] = label.transform(x_test['native'])

In [137]:
# Predict the salary class for every row of the encoded test features.
testdata_pred = naive_bayas.predict(X=x_test)

In [140]:
# Fraction of test rows whose predicted class matches the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=testdata_pred)
print("The accuracy on testing data is,", accuracy)

The accuracy on testing data is, 0.7749667994687915


In [150]:
# Per-class precision / recall / F1 on the testing data.
report = classification_report(y_true=y_test, y_pred=testdata_pred)
print(report)

              precision    recall  f1-score   support

       <=50K       0.79      0.96      0.87     11360
        >50K       0.62      0.21      0.32      3700

    accuracy                           0.77     15060
   macro avg       0.71      0.58      0.59     15060
weighted avg       0.75      0.77      0.73     15060

