### Importing necessary library

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


### Load the CSV file

In [2]:
CSV_DataFrame = pd.read_csv("adult.csv")

In [3]:

# Initialize LabelEncoder
label_encoder = LabelEncoder()

categorical_columns = ["work_class", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]

# Iterate over each categorical column and encode its values
for col in categorical_columns:
    CSV_DataFrame[col] = label_encoder.fit_transform(CSV_DataFrame[col])
    print(f"Encoded labels for column '{col}': {label_encoder.classes_}")

Encoded labels for column 'work_class': ['federal-gov' 'local-gov' 'never-worked' 'private' 'self-emp-inc'
 'self-emp-not-inc' 'state-gov' 'without-pay']
Encoded labels for column 'education': ['assoc-acdm' 'assoc-voc' 'bachelors' 'doctorate' 'high-school' 'masters'
 'prof-school' 'school' 'some-college']
Encoded labels for column 'marital_status': ['divorced' 'married-af-spouse' 'married-civ-spouse'
 'married-spouse-absent' 'never-married' 'separated' 'widowed']
Encoded labels for column 'occupation': ['adm-clerical' 'armed-forces' 'craft-repair' 'exec-managerial'
 'farming-fishing' 'handlers-cleaners' 'machine-op-inspct' 'other-service'
 'priv-house-serv' 'prof-specialty' 'protective-serv' 'sales'
 'tech-support' 'transport-moving']
Encoded labels for column 'relationship': ['husband' 'not-in-family' 'other-relative' 'own-child' 'unmarried' 'wife']
Encoded labels for column 'race': ['amer-indian-eskimo' 'asian-pac-islander' 'black' 'other' 'white']
Encoded labels for column 'sex': ['

### Handling the data imbalance for the target variable (Income)

In [4]:
# Printing the value count
income_counts = CSV_DataFrame['income'].value_counts()
print(income_counts)

income
0    36912
1    11237
Name: count, dtype: int64


In [5]:
from imblearn.over_sampling import SMOTE
# Separate features and target variable
X = CSV_DataFrame.drop(columns=['income'])
y = CSV_DataFrame['income']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [6]:
income_counts = y_train_resampled.value_counts()
print(income_counts)

income
0    29525
1    29525
Name: count, dtype: int64


In [7]:
# Train Naive Bayes Classifier
naive_bayes_classifier = BernoulliNB()
naive_bayes_classifier.fit(X_train_resampled, y_train_resampled)

# Make Predictions
y_pred = naive_bayes_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.7156801661474559


In [8]:

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)


Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.69      0.79      7387
           1       0.44      0.79      0.56      2243

    accuracy                           0.72      9630
   macro avg       0.68      0.74      0.68      9630
weighted avg       0.80      0.72      0.74      9630


Confusion Matrix:
[[5129 2258]
 [ 480 1763]]
