In [None]:
import numpy as np 
import pandas as pd  
import seaborn as sns  
%matplotlib inline
import warnings

# Install a blanket "ignore" filter for warnings.
# NOTE(review): this also hides any deprecation or convergence warnings
# emitted by the libraries used further down.
warnings.filterwarnings('ignore')

##### 1. Bayesian Classification + Support Vector Machine

Data analysis and data preparation

In [None]:
# Load the raw table. No header row is read from the file, and fields are
# split on a comma followed by a space (a multi-character separator, hence
# the python parsing engine).
df = pd.read_csv(
    'adult.csv',
    sep=', ',
    header=None,
    engine='python',
)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Attach descriptive names to the columns, in file order.
df.columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income',
]

# Preview again, now with named columns.
df.head()

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Collect the names of every non-numeric column; these are the ones treated
# as categorical in the rest of the analysis.
categorical = df.select_dtypes(exclude='number').columns.tolist()
print('The categorical variables are :\n\n', categorical)

In [None]:
# Preview only the categorical columns.
df[categorical].head()

In [None]:
# Print the frequency table of each categorical column, one after another.
for col in categorical:
    frequencies = df[col].value_counts()
    print(frequencies, '\n')

In [None]:
# Replace the '?' placeholder in workclass, occupation and native_country
# with NaN so that pandas recognises these entries as missing.
#
# The result is assigned back to `df` instead of calling
# `.replace(..., inplace=True)` on a single-column selection: under pandas
# Copy-on-Write that chained in-place form does not update `df`.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
placeholder_cols = ['workclass', 'occupation', 'native_country']
df[placeholder_cols] = df[placeholder_cols].replace('?', np.nan)

In [None]:
# Count the missing values in each categorical column.
df[categorical].isnull().sum()

In [None]:
# Names of the numeric columns, followed by their missing-value counts.
numerical = df.select_dtypes(include='number').columns.tolist()
df[numerical].isnull().sum()

Data splitting

In [None]:
# Separate the target column `income` from the predictor columns.
y = df['income']
X = df.drop(columns=['income'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test split; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Impute missing categorical values with the most frequent value.
#
# The modes are taken from the training split only and then applied to both
# splits, so nothing is learned from the test data.
# The filled frames are assigned back rather than calling
# `fillna(..., inplace=True)` on a column selection: that chained in-place
# form acts on a temporary object under pandas Copy-on-Write and would leave
# the NaNs untouched.
impute_cols = ['workclass', 'occupation', 'native_country']

# First row of DataFrame.mode() holds the (first) most frequent value of
# each column; the result is a Series indexed by column name.
train_modes = X_train[impute_cols].mode().iloc[0]

# fillna with a Series fills each column with the value stored under that
# column's name; columns not listed in `train_modes` are left as they are.
X_train = X_train.fillna(train_modes)
X_test = X_test.fillna(train_modes)

In [None]:
# encode remaining variables with one-hot encoding
import category_encoders as ce

encoder = ce.OneHotEncoder(cols=['workclass', 'education', 'marital_status', 'occupation', 'relationship',
                                 'race', 'sex', 'native_country'])

X_train = encoder.fit_transform(X_train)

X_test = encoder.transform(X_test)

In [None]:
# Preview the encoded training features.
X_train.head()

Feature scaling

In [None]:
from sklearn.preprocessing import RobustScaler

cols = X_train.columns
scaler = RobustScaler()

# The scaler is fitted on the training split only and then applied to both
# splits. Its output is rewrapped in DataFrames here.
#
# `cols` is passed directly: wrapping it in a list (`columns=[cols]`) makes
# pandas build a one-level MultiIndex instead of plain column labels.
# The original row index is kept as well, so the feature frames stay aligned
# with `y_train` / `y_test`.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=cols, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=cols, index=X_test.index)

In [None]:
# Display the scaled training features.
X_train

Predict the results

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the training split
# (`fit` returns the estimator itself, so the call can be chained).
gnb_model = GaussianNB().fit(X_train, y_train)

# Predicted labels for the test split.
y_pred_gnb = gnb_model.predict(X_test)

In [None]:
# train a Support Vector classifier on the training set
from sklearn.svm import LinearSVC

# Pin the seed so repeated runs give the same model: LinearSVC's dual solver
# shuffles the data with a pseudo-random generator controlled by
# `random_state`.
svm_model = LinearSVC(random_state=0)
svm_model.fit(X_train, y_train)

# Predicted labels for the test split.
y_pred_svm = svm_model.predict(X_test)

Metrics

In [None]:
from sklearn.metrics import recall_score, f1_score, accuracy_score


def print_metrics(y_pred):
    """Print accuracy, recall and F1 of ``y_pred`` against the global ``y_test``.

    Recall and F1 are computed with ``"<=50K"`` as the positive label.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels, compared element-wise with ``y_test``.
    """
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, pos_label="<=50K")
    f1 = f1_score(y_test, y_pred, pos_label="<=50K")

    print('Accuracy score: {:.4f}'.format(accuracy))
    print('Recall score: {:.4f}'.format(recall))
    print('F1 score: {:.4f}\n'.format(f1))


# Report the same set of metrics for each classifier's predictions.
for heading, predictions in (
    ('Naive Bayes\n', y_pred_gnb),
    ('Support Vector Machine\n', y_pred_svm),
):
    print(heading)
    print_metrics(predictions)

Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix


def conf_matrix(y_pred):
    """Draw a heatmap of the confusion matrix of ``y_pred`` vs. the global ``y_test``.

    ``sklearn.metrics.confusion_matrix(y_true, y_pred)`` returns a matrix whose
    rows correspond to the true labels and whose columns correspond to the
    predicted labels, so the rows are labelled "Actual" and the columns
    "Predict".

    NOTE(review): no ``labels`` argument is passed, so the classes appear in
    sorted order; presumably the first row/column is '<=50K' -- confirm that
    this is the class meant by "Positive:1".

    Parameters
    ----------
    y_pred : array-like
        Predicted labels for the test split.
    """
    cm = confusion_matrix(y_test, y_pred)
    cm_matrix = pd.DataFrame(data=cm, index=['Actual Positive:1', 'Actual Negative:0'],
                             columns=['Predict Positive:1', 'Predict Negative:0'])
    sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')


# Confusion-matrix heatmap for the Naive Bayes predictions.
print('Naive Bayes\n')
conf_matrix(y_pred_gnb)

In [None]:
# Confusion-matrix heatmap for the Support Vector Machine predictions.
print('Support Vector Machine\n')
conf_matrix(y_pred_svm)

Check for overfitting and underfitting

In [None]:
def overfit_check(model):
    """Print ``model.score`` on the training split and on the test split.

    Reads the global ``X_train``/``y_train`` and ``X_test``/``y_test``.

    Parameters
    ----------
    model : object
        Any object exposing ``score(X, y)``.
    """
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    print('Training set score: {:.4f}'.format(train_score))
    print('Test set score: {:.4f}\n'.format(test_score))
    
# Compare training and test scores for each fitted classifier.
for heading, fitted_model in (
    ('Naive Bayes\n', gnb_model),
    ('Support Vector Machine\n', svm_model),
):
    print(heading)
    overfit_check(fitted_model)

The training-set accuracy score and the test-set accuracy score are quite comparable for both classifiers. So, there is no sign of overfitting.

Null accuracy

In [None]:
# Class frequencies of the test labels.
y_test.value_counts()

In [None]:
# Null accuracy: the share of the most frequent class in the test labels,
# i.e. the accuracy of always predicting that class.
# It is derived from `y_test` itself instead of hand-copied class counts, so
# the value stays correct if the data or the split changes.
null_accuracy = y_test.value_counts(normalize=True).max()

print('Null accuracy score: {0:0.4f}'.format(null_accuracy))

We can see that our model accuracy scores are 0.8083 / 0.8316, but the null accuracy score is 0.7582. So, we can conclude that our models are doing a very good job in predicting the class labels.

Conclusion

So, it seems that the Support Vector Machine (SVM) classifier outperforms the Naive Bayes classifier in terms of both accuracy and the ability to correctly classify positive instances (higher recall).