In [196]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [197]:
# Load the dataset from a CSV file located next to the notebook.
data = pd.read_csv('adult_csv.csv')

In [198]:
# Preview the first rows of the freshly loaded table.
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
0,2,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States,<=50K
1,3,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States,<=50K
2,2,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States,<=50K
3,3,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States,<=50K
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba,<=50K


# Preprocessing Step 1
1. Feature selection: remove the features ‘capitalgain’, ‘capitalloss’ and ‘native-country’.

In [199]:
# Feature selection: discard the three columns that the classifier does not use.
columns_to_remove = ['capitalgain', 'capitalloss', 'native-country']
data = data.drop(columns=columns_to_remove)

In [200]:
# Preview the table again after the column drop.
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,hoursperweek,class
0,2,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2,<=50K
1,3,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,<=50K
2,2,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,2,<=50K
3,3,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,2,<=50K
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,2,<=50K


# Preprocessing Step 2
2. Remove instances with at least one missing value ‘?’.

In [201]:
# Count the missing values in every column before cleaning.
data.isna().sum()

age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
hoursperweek         0
class                0
dtype: int64

In [202]:
# Keep only complete rows: a row is removed as soon as any of its columns is missing.
data = data.dropna(axis=0, how='any')

In [203]:
# Re-count the missing values per column to verify the clean-up.
data.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
hoursperweek      0
class             0
dtype: int64

# Preprocessing Step 3
3. Randomly split the dataset into 90% as train and 10% as test. Remember to utilize 
a random seed for sampling train and test instances.

# Splitting into training and testing

In [204]:
# Draw 90% of the rows for training (fixed seed, so the split is reproducible);
# whatever was not drawn becomes the test set.
train_dataset = data.sample(frac=0.9, random_state=42)
test_dataset = data.drop(index=train_dataset.index)

# 'class' is the label; every column except the last one is treated as a feature.
x_train, y_train = train_dataset.iloc[:, :-1], train_dataset['class']
x_test, y_test = test_dataset.iloc[:, :-1], test_dataset['class']

# Build the Classifier Step 1
1. Estimate the probabilities for continuous attributes (‘age’, ‘fnlwgt’, ‘education-num’
and ‘hoursperweek’) by fitting a Normal distribution.

# Normal distribution probabilities

In [205]:
from math import sqrt
from math import pi
from math import exp

def normal_distribution(column_value, mean, std):
    """Return the Normal(mean, std) probability density evaluated at column_value.

    Parameters:
        column_value: the observed value of the attribute.
        mean: mean of the distribution.
        std: standard deviation of the distribution.

    Returns:
        The density 1 / (sqrt(2*pi) * std) * exp(-(x - mean)^2 / (2 * std^2)).
    """
    two_variance = 2 * std**2
    exponent = -((column_value - mean)**2 / two_variance)
    normaliser = sqrt(2 * pi) * std
    return (1 / normaliser) * exp(exponent)



# Laplace Smoothing

In [206]:
def laplace_smoothing(count, N, K, alpha=1):
    """Return the additively smoothed estimate (count + alpha) / (N + alpha * K).

    Parameters:
        count: number of observations of the event.
        N: total number of observations the count is taken from.
        K: number of pseudo-categories added to the denominator.
        alpha: smoothing strength; the default of 1 is classic Laplace
            (add-one) smoothing and reproduces the previous behaviour.

    Returns:
        The smoothed probability as a float.
    """
    return (count + alpha) / (N + alpha * K)

In [207]:
# Prior probability P(class) of every label, estimated from the training labels.
# priors[k] belongs to classes[k]. Each class is counted by its label rather than
# by its position in value_counts(), whose frequency-sorted order is not
# guaranteed to match the alphabetical order returned by np.unique.
classes = np.unique(y_train)
priors = [(y_train == cls).sum() / len(y_train) for cls in classes]

In [208]:
def naive_bayes(X):
    """Predict a class label for every row of X with a Naive Bayes classifier.

    The model is estimated from the notebook-level training data
    (`train_dataset`, `x_train`) together with `classes` and `priors`:
    the four continuous attributes are scored with `normal_distribution`,
    every other feature column with `laplace_smoothing`.

    Parameters:
        X: DataFrame holding the feature columns of `x_train`.

    Returns:
        A list with one predicted label per row of X, in row order.
    """
    normal_dist_columns = ['age', 'fnlwgt', 'education-num', 'hoursperweek']
    laplace_columns = [col for col in x_train.columns if col not in normal_dist_columns]

    # Estimate the per-class statistics once, instead of once per test row.
    class_sizes = {}
    gaussian_params = {}
    category_counts = {}
    for cls in classes:
        rows_of_class = train_dataset.loc[train_dataset['class'] == cls]
        class_sizes[cls] = len(rows_of_class)
        gaussian_params[cls] = {
            col: (rows_of_class[col].mean(), rows_of_class[col].std())
            for col in normal_dist_columns
        }
        category_counts[cls] = {
            col: rows_of_class[col].value_counts().to_dict()
            for col in laplace_columns
        }

    predictions = []
    for _, row in X.iterrows():
        best_class, best_score = None, -1.0
        for k, cls in enumerate(classes):
            # Start from the prior and multiply in one likelihood per feature.
            score = priors[k]
            for col in normal_dist_columns:
                mean, std = gaussian_params[cls][col]
                score *= normal_distribution(row[col], mean, std)
            for col in laplace_columns:
                # A value never seen together with this class has a count of 0.
                x_and_class = category_counts[cls][col].get(row[col], 0)
                # NOTE(review): K is the number of classes here; textbook Laplace
                # smoothing uses the number of distinct values of the feature.
                # Left unchanged so the predictions stay the same - confirm intent.
                score *= laplace_smoothing(x_and_class, class_sizes[cls], len(classes))
            # '>=' lets a later class win an exact tie.
            if score >= best_score:
                best_class, best_score = cls, score
        predictions.append(best_class)

    return predictions

In [209]:
# Classify the held-out test features; naive_bayes returns one predicted label per row.
test_predictions = naive_bayes(x_test)

In [210]:
# Accuracy: the share of test rows whose predicted label equals the true label.
count = sum(
    1
    for actual, predicted in zip(y_test, test_predictions)
    if actual == predicted
)
accuracy = count / len(y_test)

In [211]:
# Show the fraction of test rows that were classified correctly.
print(accuracy)

0.8116445796219857
