NAIVE BAYES

In [1]:
from sklearn import datasets
from sklearn.metrics import confusion_matrix
import numpy as np

In [2]:
def fit(x_train,y_train):
    result={}
    class_values=set(y_train)
    for curr_class in class_values:
        result[curr_class]={}
        result["total_data"]=len(y_train)
        current_class_rows=(y_train==curr_class)
        x_train_current=x_train[current_class_rows]
        y_train_current=y_train[current_class_rows]
        num_features=x_train.shape[1]
        result[curr_class]["total_count"]=len(y_train_current)
        for j in range(1,num_features+1):
            result[curr_class][j]={}
            all_possible_values=set(x_train[:,j-1])
            for current_value in all_possible_values:
                result[curr_class][j][current_value]=(x_train_current[:,j-1]==current_value).sum()
    return result


In [3]:
def probability(dictionary,x,current_class):
    output=np.log(dictionary[current_class]["total_count"])-np.log(dictionary["total_data"])
    num_features=len(dictionary[current_class].keys())-1
    for j in range(1,num_features+1):
        xj=x[j-1]
        count_current_class_with_value_xj=dictionary[current_class][j][xj]+1
        count_current_class=dictionary[current_class]["total_count"]+len(dictionary[current_class][j].keys())
        current_xj_probability=np.log(count_current_class_with_value_xj)-np.log(count_current_class)
        output=output+current_xj_probability
    return output

In [4]:
def predict_single_point(dictionary,x):
    classes=dictionary.keys()
    best_p=-1000
    best_class=-1
    firstrun=True
    for current_class in classes:
        if current_class=="total_data":
            continue
        p_current_class=probability(dictionary,x,current_class)
        if (firstrun or p_current_class)>best_p:
            best_p=p_current_class
            best_class=current_class
        firstrun=False
    return best_class

In [5]:
def predict(dictionary,x_test):
    y_pred=[]
    for x in x_test:
        x_class=predict_single_point(dictionary,x)
        y_pred.append(x_class)
    return y_pred

In [6]:
iris = datasets.load_iris()

In [7]:
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [8]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [9]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(iris.data,iris.target)

In [10]:
def makeLabeled(column):
    second_limit=column.mean()
    first_limit=0.5*second_limit
    third_limit=1.5*second_limit
    for i in range(len(column)):
        if column[i]<first_limit:
            column[i]=0
        elif column[i]<second_limit:
            column[i]=1
        elif column[i]<third_limit:
            column[i]=2
        else:
            column[i]=3
    return column


In [11]:
for i in range(0,x_train.shape[-1]):
    x_train[:,i]=makeLabeled(x_train[:,i])
for i in range(0,x_test.shape[-1]):
    x_test[:,i]=makeLabeled(x_test[:,i])
        

In [12]:
dictionary=fit(x_train,y_train)

In [13]:
dictionary

{0: {'total_count': 38,
  1: {1.0: 38, 2.0: 0},
  2: {1.0: 7, 2.0: 31},
  3: {0.0: 37, 1.0: 1, 2.0: 0, 3.0: 0},
  4: {0.0: 37, 1.0: 1, 2.0: 0, 3.0: 0}},
 'total_data': 112,
 1: {'total_count': 38,
  1: {1.0: 17, 2.0: 21},
  2: {1.0: 31, 2.0: 7},
  3: {0.0: 0, 1.0: 5, 2.0: 33, 3.0: 0},
  4: {0.0: 0, 1.0: 7, 2.0: 31, 3.0: 0}},
 2: {'total_count': 36,
  1: {1.0: 4, 2.0: 32},
  2: {1.0: 23, 2.0: 13},
  3: {0.0: 0, 1.0: 0, 2.0: 19, 3.0: 17},
  4: {0.0: 0, 1.0: 0, 2.0: 3, 3.0: 33}}}

In [14]:
y_pred=predict(dictionary,x_test)

In [15]:
y_pred

[0,
 0,
 1,
 1,
 2,
 1,
 0,
 0,
 1,
 1,
 2,
 0,
 2,
 1,
 0,
 1,
 0,
 1,
 0,
 2,
 1,
 1,
 0,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 2,
 0,
 1]

In [16]:
y_test

array([0, 0, 1, 1, 2, 2, 0, 0, 2, 1, 2, 0, 2, 1, 0, 1, 0, 1, 0, 2, 1, 2,
       0, 2, 2, 2, 2, 2, 1, 2, 0, 1, 1, 0, 1, 2, 0, 1])