Loading the Pandas library and reading the dataset into a variable dfs, adding the column headers from a list

In [1]:
import pandas as pds
dfs = pds.read_csv('C:\\Users\\Rahul Singh\\Downloads\\adult.data', header=None)
head = ["age", "workclass","fnlwgt","edu","edu_num","marital","occupation", "relationship","race", "sex","capital_gain","capital_loss","hours per week","nativecountry","salary"]
dfs.columns = head

Dropping the 19610th entry because of a singular attribute trait i.e. only one instance of Holland-Netherland in the entire dataset. The occurrence of a single instance in the  testing data means that this entry or value was not "trained", and hence wasn't included in the dictionary of probabilities, raising an error.

In [2]:
dfs = dfs.drop(19609)
dfs.index = range(dfs.shape[0])

Loading the required libraries like Gaussian Naive Bayes, NumPy, Statified KFold model from SciKitLearn, and creating an object of Gaussian NB and Stratified KFold

In [3]:
from sklearn.naive_bayes import GaussianNB
import numpy as np
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5)
gnb = GaussianNB()

The folllowing function, "cat_val", accepts the training and testing data with categorical variables, calculates the predicted probabilities for each testing class, and returns the list of probabilities for each entity

In [10]:
def cat_val(x_train, y_train):
    options = y_train.unique()  #Extracting Unique Test Classes
    tr_size = x_train.shape[0]  #Total no. of occurrences
    dfst = list(map(lambda x: (y_train==x).sum()/ tr_size, options))#Calculating probabilties of Each Test Class Occurrence
    stolen_probs = pds.Series(dfst, index = options)              #Storing the result in a Series for further ease of reference
    results_dicti = {}          #Initializing the Dictionary
    for i in x_train:           #For each attribute in Train Data
        sub_dicti = {}         
        for j in x_train[i].unique():    #For Unique Values of Each Attribute 
            sub_dicti2 = {}
            for k in options:            #For each Test Class
                sub_dicti2[k] = x_train[(x_train[i]== j) & (y_train == k)].shape[0]/tr_size/stolen_probs[k]#Conditional Probability
            sub_dicti[j] = sub_dicti2
        results_dicti[i] = sub_dicti
    return stolen_probs, results_dicti,options

In [11]:
def predict_proba_cat (x_test, stolen_probs, results_dicti,options):
    predict_list = np.empty((0,2))                             #create an empty array with 2 as the fixed no. of columns
    for i in x_test.iterrows():                                # for all the entities in the test data
        list1 = []                                             #(alternative)list1 = np.empty((1,0))
        for k in options:                                      # For each Test Class
            val = 1
            for j in i[1].index:                               # For all the attributes of each entity
                val = val * results_dicti[j][i[1][j]][k]       # Multiply the Conditional Probabilities of all the corresponding attribute values for each entity
            list1.append(val*stolen_probs[k])                  #(alternative)append normally
        finale = np.array([list1])                             #(alternative)remove this 
        predict_list = np.append(predict_list,finale,axis = 0) # store the corresponding probability for each test class of each entity
    return predict_list

The "predi" function is used for making predictions based upon the multiplied values and test classes given.

In [6]:
def predi (predictions,gnb):
    list2 = []
    for i in range(len(predictions)):  #traverse through the entire lst of predicted values
        list2.append(gnb.classes_[np.where(predictions[i] == np.max(predictions[i]))[0]][0])  #find the max prob. of test class in each entity
    return list2

The function "accu" calculates the accuracy of the predicted series

In [7]:
def accu (pred,og):
    return (pred == og).sum()/og.shape[0]

The function "naive_baye" accepts the loaded dataset and further calls the required functions used for calculating predictions, ultimately returning the final list of predictions 

In [14]:
def naive_baye (dfs):
    x = dfs[dfs.columns[:-1]]        # Assume all the columns, except the last one, as attributes for Training Data
    y = dfs[dfs.columns[-1]]         # Assume the last column as the Output(?) required
    final_predictions_series = pds.Series(index = range(y.shape[0]) )
    for train, test in skf.split(x,y):   # Use Stratified K fold to get Training and Testing Data
        x_num_train = x[x.dtypes[x.dtypes=="int64"].index].iloc[train]  # Extract the Continuous Training Data
        x_num_test = x[x.dtypes[x.dtypes=="int64"].index].iloc[test]    # Extract the Continuous Testing Data
        y_train = y[train]
        y_test = y[test]
        gnb.fit(x_num_train, y_train)    # Fit the training Numerical data in the Gaussian  NB Object 
        num_pred_val = gnb.predict_proba(x_num_test)  # Store the Predicted Probabilities 
        x_cat_train = x[x.dtypes[x.dtypes=="object"].index].iloc[train] # Extract the Categorical Training Data
        x_cat_test = x[x.dtypes[x.dtypes=="object"].index].iloc[test]   # Extract the Categorical Testing Data
        stol,dictio,options = cat_val(x_cat_train, y_train)             # Fit the Training Data
         cat_pred_val = predict_proba_cat(x_cat_test, stol, dictio,options)  #Store the Predicted Probabilites
        predictions_values = np.multiply(num_pred_val,cat_pred_val)     #Multiply the values of Numerical and Categorical Probabilities
        final_predictions = predi(predictions_values,gnb)               #Get the predictions
        final_predictions_series.loc[test] = final_predictions          #Store these predictions in the form of a series with corresponding Test Index
    naive_baye.accuracy = (final_predictions_series == y).sum()/y.shape[0]
    return final_predictions_series 

In [15]:
op = naive_baye(dfs)

In [16]:
naive_baye.accuracy

0.75304054054054059

In [17]:
op

0          >50K
1         <=50K
2          >50K
3         <=50K
4         <=50K
5         <=50K
6          >50K
7         <=50K
8          >50K
9          >50K
10        <=50K
11        <=50K
12         >50K
13         >50K
14        <=50K
15        <=50K
16         >50K
17         >50K
18        <=50K
19        <=50K
20        <=50K
21         >50K
22        <=50K
23         >50K
24         >50K
25        <=50K
26         >50K
27        <=50K
28        <=50K
29        <=50K
          ...  
32530     <=50K
32531      >50K
32532      >50K
32533     <=50K
32534     <=50K
32535     <=50K
32536     <=50K
32537      >50K
32538     <=50K
32539     <=50K
32540     <=50K
32541     <=50K
32542     <=50K
32543     <=50K
32544     <=50K
32545     <=50K
32546     <=50K
32547     <=50K
32548     <=50K
32549     <=50K
32550     <=50K
32551     <=50K
32552     <=50K
32553     <=50K
32554     <=50K
32555     <=50K
32556     <=50K
32557     <=50K
32558     <=50K
32559      >50K
Length: 32560, dtype: ob