In [307]:
import pandas as pd
import numpy as np

# generate data with many classes:
def generate_data(classes, method = 'hard'):
    X = np.random.uniform(-1,1,(1000,2))

    # random function
    if method == 'hard':
        Y = np.sin(X)**2 + np.tanh(X)
    else:
        Y = X
    Y = np.sum(Y,1)
    K = np.arange(classes)
    


    boundaries = [np.quantile(Y,i/classes) for i in range(1,classes)]
    boundaries = [np.min(Y)] + boundaries + [np.max(Y) + 10-5]
    Y_class = []
    for y in Y:
        for i in range(0,len(boundaries)-1):
            if (boundaries[i] <= y) and (y < boundaries[i+1]):
                Y_class.append(i)
                break
    Y = np.array(Y_class)
    return X, Y


def generate_categorical_data():
    x1 = np.random.randint(0,4,(1,1000))
    x2 = np.random.randint(0,2,(1,1000))

    X = np.concatenate([x1,x2],0).T

    Y = np.prod(X,1)
    Y = np.array(Y > 1,int)
    
    return X, Y

In [308]:
from scipy.stats import multivariate_normal
import pandas as pd

# contineous case
X,Y = generate_data(5,'easy')

N,p = X.shape

classes = np.unique(Y)
K = len(classes)


data = {}
PCk = {}

for p_i in np.arange(p):
    data[p_i] = {}
    for k in classes:
        data[p_i][k] = {'means':None,'var':None}

        
for k in classes:
    idx = np.where(Y == k)
    X_temp = X[idx]
    n_k,p_k = X_temp.shape
    means = X_temp.mean(0)
    
    for i,m in enumerate(means):
        data[i][k]['means'] = means[i]
        
    var = X_temp.var(0)
    
    for i,m in enumerate(var):
        data[i][k]['var'] = var[i]
        
    PCk[k] = n_k/N
        
# prediction

result = np.zeros((n,K))
for k in classes:
    prob = PCk[k]

    n,p = X.shape
    res = np.zeros((n,p))

    for feature in range(p):
        res[:,feature] = multivariate_normal.pdf(X[:,feature], data[feature][k]['means'],data[feature][k]['var'])
    result[:,k] = np.prod(res,1)
    result[:,k]*=prob
prediction = np.argmax(result, 1)

In [309]:
np.sum(prediction == Y)/len(Y)

0.749

In [328]:
X,Y = generate_categorical_data()

from collections import defaultdict
from collections import Counter

feature = defaultdict(lambda: np.zeros(4))
feature[0]
feature

defaultdict(<function __main__.<lambda>()>, {0: array([0., 0., 0., 0.])})

In [365]:
classes = np.unique(Y)
feature = 0


idx = Y == classes[0]
X_temp = X[idx]


counter_per_class = {}

for x in classes:
    idx = Y == x
    X_temp = X[idx]
    counter_per_class[x] = {p:Counter(X_temp[:,p]) for p in range(0,2)}

counter_per_class[0][0].values()

dict_values([275, 253, 130, 110])

In [360]:
sum(counter_per_class[0][1].values())

768

In [320]:
# categorical case

from collections import defaultdict

X,Y = generate_categorical_data()

N,n_features = X.shape
classes = np.unique(Y)
K = len(classes)

df = pd.DataFrame(X, columns = np.arange(n_features))
df['target'] = Y

tables = {}

# we create a counter table for each feature
for feature in range(0,n_features):
    df_temp = df[[feature,'target']]
    unique_features = np.sort(df_temp[feature].unique())

    tables[feature] = pd.DataFrame(columns = classes, index = unique_features)
    for feat in unique_features:
        for k in classes:
            tables[feature][k].loc[feat] = df_temp[(df_temp[feature] == feat) & (df_temp['target'] == k)].count().values[0]
            
# tables is a dictionary

#tables[k] are dataframe for each feature of the X matrix
# k is the feature name

# each dataframe has columns which are the classes and the rows which are the unique categorical entries of the 
# X column features sapces
PCk = {}
for k in classes:
    nk,pk = df[(df['target'] == k)].shape
    PCk[k] = nk

In [321]:
# so what are the probabilities
def P(feature, class_choice, tables):
    total = tables[feature][kk].sum()
    probs = tables[feature][kk]/total
    return probs


# probability 
result = np.zeros((N,K))
for k in classes:
    res = np.zeros((N,n_features))
    for j in range(0,n_features):
        res[:,j] = P(j,k,tables).values[X[:,j]]
    result[:,k] = PCk[k]*np.prod(res,1)
prediction = np.argmax(result, 1)

In [324]:
res

array([[0.34547908, 0.65721997],
       [0.34547908, 0.34278003],
       [0.34547908, 0.65721997],
       ...,
       [0.17948718, 0.65721997],
       [0.34547908, 0.34278003],
       [0.17948718, 0.65721997]])

In [96]:
X[0]

array([ 0.63756911, -0.71879198])

In [49]:
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]


tuples = list(zip(*arrays))






In [52]:
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [43]:
df = pd.read_csv('banknotes.txt', header = None)
df.columns = ['X_0','X_1','X_2','X_3','Y']
X = df[['X_0','X_1','X_2','X_3']].values
y = df['Y'].values

In [44]:
np.pi

3.141592653589793

In [45]:
class Normal:
    def __init__(self, mu, sigma):
        self.mu = mu
        self.sigma = sigma
    
    def prob(self, x):
        prefactor = 1/np.sqrt(2*np.pi*self.sigma*self.sigma)
        exponent = -(1/(2*self.sigma*self.sigma))*(x - self.mu)*(x-self.mu)
        result = prefactor*np.exp(exponent)
        return result

In [54]:
def get_features(X,y):
    feature_parameters = {}
    classes_indices = {}
    pCk = {}
    class_set = list(set(y))
    for c in class_set:
        pCk[c] = (y == c).sum()/len(y)
        classes_indices[c] = np.where(y == c)[0]

    for c,v in classes_indices.items():
        x = X[v]
        temp = {}
        for i,col in enumerate(['X_0','X_1','X_2','X_3']):
            temp[col] = [x[:,i].mean(),x[:,i].std()]
            feature_parameters[c] = temp
    
    return feature_parameters, pCk

In [55]:
class p_x_ck:
    def __init__(self, feature_parameter):
        self.feature_parameter = feature_parameter
        
    def _get_probs(self, ck):
        parameters = list(feature_parameters[ck].values())
        dists = [Normal(p[0],p[1]) for p in parameters]
        return dists
    
    def get(self,datapoint, ck):
        res = 1
        for pairs in list(zip(self._get_probs(ck),datapoint)):
            res*=pairs[0].prob(pairs[1])
        return res

In [69]:
class NaiveBayes:
    def __init__(self):
        self.feature_parameters = None
        self.pCk = None
        self.pck = None
    
    def train(self, X,y):
        self.feature_parameters, self.pCk = get_features(X,y)
        self.pck = p_x_ck(self.feature_parameters)
        
    def predict(self, datapoint, c):
        return pck.get(datapoint,c)*pCk[c]
    

In [70]:
nb = NaiveBayes()
nb.train(X,y)


In [74]:
y_pred = []
for x in X:
    y_pred.append(np.argmax([nb.predict(x,0),nb.predict(x,1)]))

In [77]:
np.array(np.array(y_pred) == y).sum()/len(y)

0.84110787172011658