In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
df = pd.read_csv('census.csv')
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Forecasters
forecasters = df.iloc[:, 0:14].values
forecasters

array([[39, ' State-gov', 77516, ..., 0, 40, ' United-States'],
       [50, ' Self-emp-not-inc', 83311, ..., 0, 13, ' United-States'],
       [38, ' Private', 215646, ..., 0, 40, ' United-States'],
       ...,
       [58, ' Private', 151910, ..., 0, 40, ' United-States'],
       [22, ' Private', 201490, ..., 0, 20, ' United-States'],
       [52, ' Self-emp-inc', 287927, ..., 0, 40, ' United-States']],
      dtype=object)

In [4]:
# Classes
classes = df.iloc[:, 14].values
classes

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

In [5]:
labelencoder_forecasters = LabelEncoder()

In [6]:
forecasters[:, 1] = labelencoder_forecasters.fit_transform(forecasters[:, 1])
forecasters[:, 3] = labelencoder_forecasters.fit_transform(forecasters[:, 3])
forecasters[:, 5] = labelencoder_forecasters.fit_transform(forecasters[:, 5])
forecasters[:, 6] = labelencoder_forecasters.fit_transform(forecasters[:, 6])
forecasters[:, 7] = labelencoder_forecasters.fit_transform(forecasters[:, 7])
forecasters[:, 8] = labelencoder_forecasters.fit_transform(forecasters[:, 8])
forecasters[:, 9] = labelencoder_forecasters.fit_transform(forecasters[:, 9])
forecasters[:, 13] = labelencoder_forecasters.fit_transform(forecasters[:, 13])

forecasters

array([[39, 7, 77516, ..., 0, 40, 39],
       [50, 6, 83311, ..., 0, 13, 39],
       [38, 4, 215646, ..., 0, 40, 39],
       ...,
       [58, 4, 151910, ..., 0, 40, 39],
       [22, 4, 201490, ..., 0, 20, 39],
       [52, 5, 287927, ..., 0, 40, 39]], dtype=object)

In [7]:
labelencoder_classes = LabelEncoder()
classes = labelencoder_classes.fit_transform(classes)
classes

array([0, 0, 0, ..., 0, 0, 1])

In [8]:
one_hot_encoder = OneHotEncoder(categorical_features=[1, 3, 5, 6, 7, 8, 9, 13])
forecasters = one_hot_encoder.fit_transform(forecasters).toarray()
forecasters

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.1740e+03, 0.0000e+00,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.3000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+01],
       ...,
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5024e+04, 0.0000e+00,
        4.0000e+01]])

In [9]:
scaler = StandardScaler()
forecasters[:, 1] = scaler.fit_transform(forecasters[:, 1].reshape(1, -1))
forecasters[:, 3] = scaler.fit_transform(forecasters[:, 3].reshape(1, -1))
forecasters[:, 5] = scaler.fit_transform(forecasters[:, 5].reshape(1, -1))
forecasters[:, 11] = scaler.fit_transform(forecasters[:, 11].reshape(1, -1))
forecasters[:, 12] = scaler.fit_transform(forecasters[:, 12].reshape(1, -1))
forecasters[:, 13] = scaler.fit_transform(forecasters[:, 13].reshape(1, -1))
forecasters
# 3, 5, 11, 12, 13

array([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.1740e+03, 0.0000e+00,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.3000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+01],
       ...,
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5024e+04, 0.0000e+00,
        4.0000e+01]])

In [10]:
forecasters_train, forecasters_test, classes_train, classes_test = train_test_split(forecasters
                                                                                    , classes
                                                                                    , test_size=0.15
                                                                                    , random_state=0)

print(len(forecasters_train))
print(len(forecasters_test))
print(len(classes_train))
print(len(classes_test))

27676
4885
27676
4885


In [11]:
estimator = GaussianNB()
estimator.fit(forecasters_train, classes_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [12]:
predictions = estimator.predict(forecasters_test)
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
precision = accuracy_score(classes_test, predictions)
print('Precision: {:.2f}%'.format(precision * 100))

Precision: 79.51%


In [14]:
matrix = confusion_matrix(classes_test, predictions)
print('Confusion Matrix:'
      '\n\t0\t1\n0:\t{}\t{}'
      '\n1:\t{}\t{}'.format(matrix[0][0], matrix[0][1], matrix[1][0], matrix[1][1]))

Confusion Matrix:
	0	1
0:	3515	178
1:	823	369
