# Naive Bayes

In [1]:
from builtins import enumerate, len
import pandas as pd
from ast import literal_eval
from copy import copy

In [2]:
dataset_paths = [ "../data/df_test_1.csv", "../data/df_test_2.csv"]
ngrams = [2,3]
repl_prod_name = [True, False]
tokens = ["common_tokens", "tokens"]

In [3]:
def load_dataset(path):
    df = pd.read_csv(path)

    train_idxes = df.test_idx.unique()
    print(train_idxes)

    res = []

    for train_idx in train_idxes:
        mask = df.test_idx == train_idx

        df_train = pd.read_csv(path)
        df_test = copy(df_train)

        df_train = df_train[~mask]
        df_test = df_test[mask]

        res += [(df_train, df_test)]

    return res

# Test 1

In [4]:
datasets = load_dataset(dataset_paths[0])

[0 1 2 3 4 5 6]


## Create Feature Vector

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=literal_eval)
x_train, y_train, x_test, y_test = [], [], [], []

for dataset in datasets:
    df_train = dataset[0]
    df_test = dataset[1]
    
    temp_train = vectorizer.fit_transform(df_train["tokens"])
    x_train.append(temp_train.toarray())
    y_train.append(df_train.bool_rating.tolist())

    temp_test = vectorizer.transform(df_test["tokens"])
    x_test.append(temp_test.toarray())
    y_test.append(df_test.bool_rating.tolist())

    print(len(x_train[-1]), len(x_test[-1]), len(y_train[-1]), len(y_test[-1]))

5792 189 5792 189
4591 1390 4591 1390
4960 1021 4960 1021
4320 1661 4320 1661
5283 698 5283 698
5764 217 5764 217
5176 805 5176 805


## Naive Bayes

In [6]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0)

tps, tns, fps, fns = [],[],[],[]

for it,_ in enumerate(x_train):
    clf.fit(x_train[it], y_train[it])
    predicted = clf.predict(x_test[it])

    tp, tn, fp, fn = 0,0,0,0
    for itx, x in enumerate(predicted):
        if x and y_test[it][itx]:
            tp += 1
        elif x and not y_test[it][itx]:
            fp += 1
        elif not x and y_test[it][itx]:
            fn += 1
        elif not x and not y_test[it][itx]:
            tn += 1

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    print("scoring=", sum([tp, tn])/sum([tp, tn, fp, fn]))
    
    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 145 tn= 19 fp= 14 fn= 11
scoring= 0.8677248677248677


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 1165 tn= 86 fp= 123 fn= 16
scoring= 0.9


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 813 tn= 35 fp= 152 fn= 21
scoring= 0.8305582761998042


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 1386 tn= 85 fp= 141 fn= 49
scoring= 0.8856110776640578


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 561 tn= 43 fp= 76 fn= 18
scoring= 0.8653295128939829


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 183 tn= 9 fp= 10 fn= 15
scoring= 0.8847926267281107
tp= 618 tn= 68 fp= 112 fn= 7
scoring= 0.8521739130434782


  'setting alpha = %.1e' % _ALPHA_MIN)


In [7]:
print("tp=", sum(tps),"tn=", sum(tns),"fp=", sum(fps),"fn=", sum(fns))
print("scoring=", sum([sum(tps), sum(tns)])/sum([sum(tps), sum(tns), sum(fps), sum(fns)]))

tp= 4871 tn= 345 fp= 628 fn= 137
scoring= 0.8720949673967564


## Naive Bayes with Laplace smoothing

In [8]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=1)

tps, tns, fps, fns = [],[],[],[]

for it,_ in enumerate(x_train):
    clf.fit(x_train[it], y_train[it])
    predicted = clf.predict(x_test[it])

    tp, tn, fp, fn = 0,0,0,0
    for itx, x in enumerate(predicted):
        if x and y_test[it][itx]:
            tp += 1
        elif x and not y_test[it][itx]:
            fp += 1
        elif not x and y_test[it][itx]:
            fn += 1
        elif not x and not y_test[it][itx]:
            tn += 1

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    print("scoring=", sum([tp, tn])/sum([tp, tn, fp, fn]))
    
    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

tp= 145 tn= 28 fp= 5 fn= 11
scoring= 0.9153439153439153
tp= 1161 tn= 143 fp= 66 fn= 20
scoring= 0.9381294964028777
tp= 806 tn= 88 fp= 99 fn= 28
scoring= 0.8756121449559255
tp= 1361 tn= 172 fp= 54 fn= 74
scoring= 0.9229379891631547
tp= 561 tn= 92 fp= 27 fn= 18
scoring= 0.9355300859598854
tp= 193 tn= 12 fp= 7 fn= 5
scoring= 0.9447004608294931
tp= 617 tn= 106 fp= 74 fn= 8
scoring= 0.8981366459627329


In [9]:
print("tp=", sum(tps),"tn=", sum(tns),"fp=", sum(fps),"fn=", sum(fns))
print("scoring=", sum([sum(tps), sum(tns)])/sum([sum(tps), sum(tns), sum(fps), sum(fns)]))

tp= 4844 tn= 641 fp= 332 fn= 164
scoring= 0.9170707239592042


## Naive Bayes with Witten Bell

In [10]:
def apply_witten_bell(x):
    print("start")
    df_x = pd.DataFrame(x)
    print("iterating")
    res = []

    for it, row in df_x.iterrows():
        
        if it %500 == 0:
            print(it, x.shape)
        N = sum(row)
        #print("N", N)
        M = sum([1 for x in row if x != 0])
        #print("M", M)
        row = row/row.sum()
        nval = 1/(N+M)
        #print("applying", nval)
        row = row.replace(0, nval)
        #print("set value")
        res.append(row)
        #print("ready with row")
    df_res = pd.DataFrame(res)

    return df_res
    

In [11]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0)

tps, tns, fps, fns = [],[],[],[]

for it,_ in enumerate(x_train):
    wb_x_train = apply_witten_bell(x_train[it])
    wb_x_test = apply_witten_bell(x_test[it])




    clf.fit(wb_x_train, y_train[it])
    predicted = clf.predict(wb_x_test)

    tp, tn, fp, fn = 0,0,0,0
    for itx, x in enumerate(predicted):
        if x and y_test[it][itx]:
            tp += 1
        elif x and not y_test[it][itx]:
            fp += 1
        elif not x and y_test[it][itx]:
            fn += 1
        elif not x and not y_test[it][itx]:
            tn += 1

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    print("scoring=", sum([tp, tn])/sum([tp, tn, fp, fn]))
    
    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

start
iterating
0 (5792, 9137)
500 (5792, 9137)
1000 (5792, 9137)
1500 (5792, 9137)
2000 (5792, 9137)
2500 (5792, 9137)
3000 (5792, 9137)
3500 (5792, 9137)
4000 (5792, 9137)
4500 (5792, 9137)
5000 (5792, 9137)
5500 (5792, 9137)
start
iterating
0 (189, 9137)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 156 tn= 0 fp= 33 fn= 0
scoring= 0.8253968253968254
start
iterating
0 (4591, 7935)
500 (4591, 7935)
1000 (4591, 7935)
1500 (4591, 7935)
2000 (4591, 7935)
2500 (4591, 7935)
3000 (4591, 7935)
3500 (4591, 7935)
4000 (4591, 7935)
4500 (4591, 7935)
start
iterating
0 (1390, 7935)
500 (1390, 7935)
1000 (1390, 7935)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 1181 tn= 0 fp= 209 fn= 0
scoring= 0.8496402877697842
start
iterating
0 (4960, 8068)
500 (4960, 8068)
1000 (4960, 8068)
1500 (4960, 8068)
2000 (4960, 8068)
2500 (4960, 8068)
3000 (4960, 8068)
3500 (4960, 8068)
4000 (4960, 8068)
4500 (4960, 8068)
start
iterating
0 (1021, 8068)
500 (1021, 8068)
1000 (1021, 8068)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 834 tn= 0 fp= 187 fn= 0
scoring= 0.8168462291870715
start
iterating
0 (4320, 7986)
500 (4320, 7986)
1000 (4320, 7986)
1500 (4320, 7986)
2000 (4320, 7986)
2500 (4320, 7986)
3000 (4320, 7986)
3500 (4320, 7986)
4000 (4320, 7986)
start
iterating
0 (1661, 7986)
500 (1661, 7986)
1000 (1661, 7986)
1500 (1661, 7986)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 1435 tn= 0 fp= 226 fn= 0
scoring= 0.863937387116195
start
iterating
0 (5283, 8299)
500 (5283, 8299)
1000 (5283, 8299)
1500 (5283, 8299)
2000 (5283, 8299)
2500 (5283, 8299)
3000 (5283, 8299)
3500 (5283, 8299)
4000 (5283, 8299)
4500 (5283, 8299)
5000 (5283, 8299)
start
iterating
0 (698, 8299)
500 (698, 8299)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 579 tn= 0 fp= 119 fn= 0
scoring= 0.829512893982808
start
iterating
0 (5764, 8823)
500 (5764, 8823)
1000 (5764, 8823)
1500 (5764, 8823)
2000 (5764, 8823)
2500 (5764, 8823)
3000 (5764, 8823)
3500 (5764, 8823)
4000 (5764, 8823)
4500 (5764, 8823)
5000 (5764, 8823)
5500 (5764, 8823)
start
iterating
0 (217, 8823)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 198 tn= 0 fp= 19 fn= 0
scoring= 0.9124423963133641
start
iterating
0 (5176, 8431)
500 (5176, 8431)
1000 (5176, 8431)
1500 (5176, 8431)
2000 (5176, 8431)
2500 (5176, 8431)
3000 (5176, 8431)
3500 (5176, 8431)
4000 (5176, 8431)
4500 (5176, 8431)
5000 (5176, 8431)
start
iterating
0 (805, 8431)
500 (805, 8431)
tp= 625 tn= 0 fp= 180 fn= 0
scoring= 0.7763975155279503


  'setting alpha = %.1e' % _ALPHA_MIN)


In [12]:
print("tp=", sum(tps),"tn=", sum(tns),"fp=", sum(fps),"fn=", sum(fns))
print("scoring=", sum([sum(tps), sum(tns)])/sum([sum(tps), sum(tns), sum(fps), sum(fns)]))

tp= 5008 tn= 0 fp= 973 fn= 0
scoring= 0.8373181742183581


## Good Turing Smoothing

(without log-linear smoothing like Sampson, 1997)

In [13]:
def apply_good_turing(x):
    print("start")
    df_x = pd.DataFrame(x)
    
    df_x = df_x + 1 #add one as mentioned in paper
    
    print("iterating")
    res = []

    for it, row in df_x.iterrows():
        r_stars = {}
        if it %500 == 0:
            print(it, df_x.shape)
            
        vc = row.value_counts().to_dict()
        for it in range(0, max(vc.keys())):
            if it not in vc:
                vc[it]= 0
        #print(vc)
        
        for r in sorted(vc):
            Nr_plus_1 = 0 if r+1 not in vc else vc[r+1]
            Nr = vc[r] if r in vc else [vc[r_] for r_ in range(r, 0) if vc[r_1] > 0][0] # take next smallest value
            Nr = Nr if Nr > 0 else 1
            r_star = (r + 1) * (Nr_plus_1/Nr)
            #print("r*", r, Nr, Nr_plus_1, (Nr_plus_1/Nr) , r_star)
            r_stars[r] = r_star
        #print(vc, r_stars)
        for it in range(0, max(r_stars.keys())):
            if it not in r_stars:
                r_stars[it]= 0
        #print(vc)
        
        res.append(row.map(lambda n: r_stars[n]))
      
    df_res = pd.DataFrame(res)

    return df_res

text = [[1,0,1,0,1], [2,1,0,1,2], [5,0,0,1,0], [2,3,0,0,0]]
apply_good_turing(x_train[0])

start
iterating
0 (5792, 9137)
500 (5792, 9137)
1000 (5792, 9137)
1500 (5792, 9137)
2000 (5792, 9137)
2500 (5792, 9137)
3000 (5792, 9137)
3500 (5792, 9137)
4000 (5792, 9137)
4500 (5792, 9137)
5000 (5792, 9137)
5500 (5792, 9137)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9127,9128,9129,9130,9131,9132,9133,9134,9135,9136
0,0.001095,0.001095,0.001095,0.000000,0.001095,0.001095,0.001095,0.001095,0.001095,0.001095,...,0.001095,0.001095,0.001095,0.001095,0.001095,0.001095,0.001095,0.001095,0.001095,0.001095
1,0.003071,0.003071,0.003071,0.000000,0.003071,0.003071,0.003071,0.003071,0.003071,0.003071,...,0.003071,0.003071,0.003071,0.003071,0.003071,0.003071,0.003071,0.003071,0.003071,0.003071
2,0.004836,0.004836,0.004836,0.004836,0.004836,0.004836,0.004836,0.004836,0.004836,0.004836,...,0.004836,0.004836,0.004836,0.004836,0.004836,0.004836,0.004836,0.004836,0.004836,0.004836
3,0.003950,0.003950,0.003950,0.000000,0.003950,0.003950,0.003950,0.003950,0.003950,0.003950,...,0.003950,0.003950,0.003950,0.003950,0.003950,0.003950,0.003950,0.003950,0.003950,0.003950
4,0.001314,0.001314,0.001314,0.001314,0.001314,0.001314,0.001314,0.001314,0.001314,0.001314,...,0.001314,0.001314,0.001314,0.001314,0.001314,0.001314,0.001314,0.001314,0.001314,0.001314
5,0.020198,0.020198,0.020198,0.020198,0.020198,0.527473,0.020198,0.020198,0.020198,0.020198,...,0.020198,0.020198,0.020198,0.020198,0.020198,0.020198,0.020198,0.020198,0.020198,0.020198
6,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,...,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509
7,0.004831,0.004831,0.004831,0.000000,0.004831,0.004831,0.004831,0.004831,0.004831,0.004831,...,0.004831,0.004831,0.004831,0.004831,0.004831,0.004831,0.004831,0.004831,0.004831,0.004831
8,0.001753,0.001753,0.001753,0.001753,0.001753,0.001753,0.001753,0.001753,0.001753,0.001753,...,0.001753,0.001753,0.001753,0.001753,0.001753,0.001753,0.001753,0.001753,0.001753,0.001753
9,0.007706,0.007706,0.007706,2.857143,0.007706,0.007706,0.007706,0.007706,0.007706,0.007706,...,0.007706,0.007706,0.007706,0.007706,0.007706,0.007706,0.007706,0.007706,0.007706,0.007706


In [14]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0)

tps, tns, fps, fns = [],[],[],[]

for it,_ in enumerate(x_train):
    gt_x_train = apply_good_turing(x_train[it])
    gt_x_test = apply_good_turing(x_test[it])




    clf.fit(gt_x_train, y_train[it])
    predicted = clf.predict(gt_x_test)

    tp, tn, fp, fn = 0,0,0,0
    for itx, x in enumerate(predicted):
        if x and y_test[it][itx]:
            tp += 1
        elif x and not y_test[it][itx]:
            fp += 1
        elif not x and y_test[it][itx]:
            fn += 1
        elif not x and not y_test[it][itx]:
            tn += 1

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    print("scoring=", sum([tp, tn])/sum([tp, tn, fp, fn]))
    
    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

start
iterating
0 (5792, 9137)
500 (5792, 9137)
1000 (5792, 9137)
1500 (5792, 9137)
2000 (5792, 9137)
2500 (5792, 9137)
3000 (5792, 9137)
3500 (5792, 9137)
4000 (5792, 9137)
4500 (5792, 9137)
5000 (5792, 9137)
5500 (5792, 9137)
start
iterating
0 (189, 9137)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 137 tn= 17 fp= 16 fn= 19
scoring= 0.8148148148148148
start
iterating
0 (4591, 7935)
500 (4591, 7935)
1000 (4591, 7935)
1500 (4591, 7935)
2000 (4591, 7935)
2500 (4591, 7935)
3000 (4591, 7935)
3500 (4591, 7935)
4000 (4591, 7935)
4500 (4591, 7935)
start
iterating
0 (1390, 7935)
500 (1390, 7935)
1000 (1390, 7935)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 1110 tn= 90 fp= 119 fn= 71
scoring= 0.8633093525179856
start
iterating
0 (4960, 8068)
500 (4960, 8068)
1000 (4960, 8068)
1500 (4960, 8068)
2000 (4960, 8068)
2500 (4960, 8068)
3000 (4960, 8068)
3500 (4960, 8068)
4000 (4960, 8068)
4500 (4960, 8068)
start
iterating
0 (1021, 8068)
500 (1021, 8068)
1000 (1021, 8068)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 774 tn= 80 fp= 107 fn= 60
scoring= 0.8364348677766895
start
iterating
0 (4320, 7986)
500 (4320, 7986)
1000 (4320, 7986)
1500 (4320, 7986)
2000 (4320, 7986)
2500 (4320, 7986)
3000 (4320, 7986)
3500 (4320, 7986)
4000 (4320, 7986)
start
iterating
0 (1661, 7986)
500 (1661, 7986)
1000 (1661, 7986)
1500 (1661, 7986)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 1299 tn= 101 fp= 125 fn= 136
scoring= 0.8428657435279951
start
iterating
0 (5283, 8299)
500 (5283, 8299)
1000 (5283, 8299)
1500 (5283, 8299)
2000 (5283, 8299)
2500 (5283, 8299)
3000 (5283, 8299)
3500 (5283, 8299)
4000 (5283, 8299)
4500 (5283, 8299)
5000 (5283, 8299)
start
iterating
0 (698, 8299)
500 (698, 8299)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 509 tn= 52 fp= 67 fn= 70
scoring= 0.8037249283667621
start
iterating
0 (5764, 8823)
500 (5764, 8823)
1000 (5764, 8823)
1500 (5764, 8823)
2000 (5764, 8823)
2500 (5764, 8823)
3000 (5764, 8823)
3500 (5764, 8823)
4000 (5764, 8823)
4500 (5764, 8823)
5000 (5764, 8823)
5500 (5764, 8823)
start
iterating
0 (217, 8823)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 181 tn= 9 fp= 10 fn= 17
scoring= 0.8755760368663594
start
iterating
0 (5176, 8431)
500 (5176, 8431)
1000 (5176, 8431)
1500 (5176, 8431)
2000 (5176, 8431)
2500 (5176, 8431)
3000 (5176, 8431)
3500 (5176, 8431)
4000 (5176, 8431)
4500 (5176, 8431)
5000 (5176, 8431)
start
iterating
0 (805, 8431)
500 (805, 8431)
tp= 559 tn= 74 fp= 106 fn= 66
scoring= 0.7863354037267081


  'setting alpha = %.1e' % _ALPHA_MIN)


In [15]:
print("tp=", sum(tps),"tn=", sum(tns),"fp=", sum(fps),"fn=", sum(fns))
print("scoring=", sum([sum(tps), sum(tns)])/sum([sum(tps), sum(tns), sum(fps), sum(fns)]))

tp= 4569 tn= 423 fp= 550 fn= 439
scoring= 0.8346430362815582


## Bigrams with Laplace smoothing

In [16]:
def create_n_grams(tokens, n):
    res = []
    for it in range(0, len(tokens) -(n-1)):
        res.append(" ". join(tokens[it: it +n]))
    return res

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

datasets = load_dataset(dataset_paths[0])

vectorizer = CountVectorizer(tokenizer=literal_eval)
x_train, y_train, x_test, y_test = [], [], [], []

for dataset in datasets:
    df_train = dataset[0]
    df_test = dataset[1]
    df_train["tokens"] = df_train.tokens.map(lambda tokens: str(create_n_grams(literal_eval(tokens), 2)))
    df_test["tokens"] = df_test.tokens.map(lambda tokens: str(create_n_grams(literal_eval(tokens), 2)))
    
    temp_train = vectorizer.fit_transform(df_train["tokens"])
    x_train.append(temp_train.toarray())
    y_train.append(df_train.bool_rating.tolist())

    temp_test = vectorizer.transform(df_test["tokens"])
    x_test.append(temp_test.toarray())
    y_test.append(df_test.bool_rating.tolist())

    print(len(x_train[-1]), len(x_test[-1]), len(y_train[-1]), len(y_test[-1]))

[0 1 2 3 4 5 6]
5792 189 5792 189
4591 1390 4591 1390
4960 1021 4960 1021
4320 1661 4320 1661
5283 698 5283 698
5764 217 5764 217
5176 805 5176 805


In [18]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=1)

tps, tns, fps, fns = [],[],[],[]

for it,_ in enumerate(x_train):
    clf.fit(x_train[it], y_train[it])
    predicted = clf.predict(x_test[it])

    tp, tn, fp, fn = 0,0,0,0
    for itx, x in enumerate(predicted):
        if x and y_test[it][itx]:
            tp += 1
        elif x and not y_test[it][itx]:
            fp += 1
        elif not x and y_test[it][itx]:
            fn += 1
        elif not x and not y_test[it][itx]:
            tn += 1

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    print("scoring=", sum([tp, tn])/sum([tp, tn, fp, fn]))
    
    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

tp= 148 tn= 25 fp= 8 fn= 8
scoring= 0.9153439153439153
tp= 1137 tn= 142 fp= 67 fn= 44
scoring= 0.9201438848920863
tp= 812 tn= 107 fp= 80 fn= 22
scoring= 0.9000979431929481
tp= 1419 tn= 128 fp= 98 fn= 16
scoring= 0.9313666465984347
tp= 566 tn= 82 fp= 37 fn= 13
scoring= 0.9283667621776505
tp= 191 tn= 13 fp= 6 fn= 7
scoring= 0.9400921658986175
tp= 622 tn= 106 fp= 74 fn= 3
scoring= 0.9043478260869565


In [19]:
print("tp=", sum(tps),"tn=", sum(tns),"fp=", sum(fps),"fn=", sum(fns))
print("scoring=", sum([sum(tps), sum(tns)])/sum([sum(tps), sum(tns), sum(fps), sum(fns)]))

tp= 4895 tn= 603 fp= 370 fn= 113
scoring= 0.919244273532854


# Test 2

In [20]:
datasets = load_dataset(dataset_paths[1])

[0 1 2 3 4 5 6 7 8 9]


## Create Feature Vector

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=literal_eval)
x_train, y_train, x_test, y_test = [], [], [], []

for dataset in datasets:
    df_train = dataset[0]
    df_test = dataset[1]
    
    temp_train = vectorizer.fit_transform(df_train["tokens"])
    x_train.append(temp_train.toarray())
    y_train.append(df_train.bool_rating.tolist())

    temp_test = vectorizer.transform(df_test["tokens"])
    x_test.append(temp_test.toarray())
    y_test.append(df_test.bool_rating.tolist())

    print(len(x_train[-1]), len(x_test[-1]), len(y_train[-1]), len(y_test[-1]))

4032 448 4032 448
4032 448 4032 448
4032 448 4032 448
4032 448 4032 448
4032 448 4032 448
4032 448 4032 448
4032 448 4032 448
4032 448 4032 448
4032 448 4032 448
4032 448 4032 448


## Naive Bayes

In [22]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0)

tps, tns, fps, fns = [],[],[],[]

for it,_ in enumerate(x_train):
    clf.fit(x_train[it], y_train[it])
    predicted = clf.predict(x_test[it])

    tp, tn, fp, fn = 0,0,0,0
    for itx, x in enumerate(predicted):
        if x and y_test[it][itx]:
            tp += 1
        elif x and not y_test[it][itx]:
            fp += 1
        elif not x and y_test[it][itx]:
            fn += 1
        elif not x and not y_test[it][itx]:
            tn += 1

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    print("scoring=", sum([tp, tn])/sum([tp, tn, fp, fn]))
    
    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 208 tn= 218 fp= 6 fn= 16
scoring= 0.9508928571428571


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 221 tn= 221 fp= 3 fn= 3
scoring= 0.9866071428571429


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 217 tn= 213 fp= 11 fn= 7
scoring= 0.9598214285714286


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 216 tn= 217 fp= 7 fn= 8
scoring= 0.9665178571428571


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 213 tn= 218 fp= 6 fn= 11
scoring= 0.9620535714285714


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 212 tn= 213 fp= 11 fn= 12
scoring= 0.9486607142857143


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 214 tn= 214 fp= 10 fn= 10
scoring= 0.9553571428571429


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 214 tn= 216 fp= 8 fn= 10
scoring= 0.9598214285714286


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 217 tn= 218 fp= 6 fn= 7
scoring= 0.9709821428571429
tp= 213 tn= 214 fp= 10 fn= 11
scoring= 0.953125


  'setting alpha = %.1e' % _ALPHA_MIN)


In [23]:
print("tp=", sum(tps),"tn=", sum(tns),"fp=", sum(fps),"fn=", sum(fns))
print("scoring=", sum([sum(tps), sum(tns)])/sum([sum(tps), sum(tns), sum(fps), sum(fns)]))

tp= 2145 tn= 2162 fp= 78 fn= 95
scoring= 0.9613839285714286


## Naive Bayes with Laplace smoothing

In [24]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=1)

tps, tns, fps, fns = [],[],[],[]

for it,_ in enumerate(x_train):
    clf.fit(x_train[it], y_train[it])
    predicted = clf.predict(x_test[it])

    tp, tn, fp, fn = 0,0,0,0
    for itx, x in enumerate(predicted):
        if x and y_test[it][itx]:
            tp += 1
        elif x and not y_test[it][itx]:
            fp += 1
        elif not x and y_test[it][itx]:
            fn += 1
        elif not x and not y_test[it][itx]:
            tn += 1

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    print("scoring=", sum([tp, tn])/sum([tp, tn, fp, fn]))
    
    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

tp= 202 tn= 217 fp= 7 fn= 22
scoring= 0.9352678571428571
tp= 211 tn= 218 fp= 6 fn= 13
scoring= 0.9575892857142857
tp= 214 tn= 215 fp= 9 fn= 10
scoring= 0.9575892857142857
tp= 210 tn= 219 fp= 5 fn= 14
scoring= 0.9575892857142857
tp= 214 tn= 218 fp= 6 fn= 10
scoring= 0.9642857142857143
tp= 210 tn= 211 fp= 13 fn= 14
scoring= 0.9397321428571429
tp= 203 tn= 212 fp= 12 fn= 21
scoring= 0.9263392857142857
tp= 210 tn= 219 fp= 5 fn= 14
scoring= 0.9575892857142857
tp= 212 tn= 221 fp= 3 fn= 12
scoring= 0.9665178571428571
tp= 209 tn= 212 fp= 12 fn= 15
scoring= 0.9397321428571429


In [25]:
print("tp=", sum(tps),"tn=", sum(tns),"fp=", sum(fps),"fn=", sum(fns))
print("scoring=", sum([sum(tps), sum(tns)])/sum([sum(tps), sum(tns), sum(fps), sum(fns)]))

tp= 2095 tn= 2162 fp= 78 fn= 145
scoring= 0.9502232142857143


## Naive Bayes with Witten Bell

In [26]:
def apply_witten_bell(x):
    print("start")
    df_x = pd.DataFrame(x)
    print("iterating")
    res = []

    for it, row in df_x.iterrows():
        
        if it %500 == 0:
            print(it, x.shape)
        N = sum(row)
        #print("N", N)
        M = sum([1 for x in row if x != 0])
        #print("M", M)
        row = row/row.sum()
        nval = 1/(N+M)
        #print("applying", nval)
        row = row.replace(0, nval)
        #print("set value")
        res.append(row)
        #print("ready with row")
    df_res = pd.DataFrame(res)

    return df_res
    

In [27]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0)

tps, tns, fps, fns = [],[],[],[]

for it,_ in enumerate(x_train):
    wb_x_train = apply_witten_bell(x_train[it])
    wb_x_test = apply_witten_bell(x_test[it])




    clf.fit(wb_x_train, y_train[it])
    predicted = clf.predict(wb_x_test)

    tp, tn, fp, fn = 0,0,0,0
    for itx, x in enumerate(predicted):
        if x and y_test[it][itx]:
            tp += 1
        elif x and not y_test[it][itx]:
            fp += 1
        elif not x and y_test[it][itx]:
            fn += 1
        elif not x and not y_test[it][itx]:
            tn += 1

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    print("scoring=", sum([tp, tn])/sum([tp, tn, fp, fn]))
    
    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

start
iterating
0 (4032, 7418)
500 (4032, 7418)
1000 (4032, 7418)
1500 (4032, 7418)
2000 (4032, 7418)
2500 (4032, 7418)
3000 (4032, 7418)
3500 (4032, 7418)
4000 (4032, 7418)
start
iterating
0 (448, 7418)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 160 tn= 189 fp= 35 fn= 64
scoring= 0.7790178571428571
start
iterating
0 (4032, 7408)
500 (4032, 7408)
1000 (4032, 7408)
1500 (4032, 7408)
2000 (4032, 7408)
2500 (4032, 7408)
3000 (4032, 7408)
3500 (4032, 7408)
4000 (4032, 7408)
start
iterating
0 (448, 7408)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 168 tn= 189 fp= 35 fn= 56
scoring= 0.796875
start
iterating
0 (4032, 7376)
500 (4032, 7376)
1000 (4032, 7376)
1500 (4032, 7376)
2000 (4032, 7376)
2500 (4032, 7376)
3000 (4032, 7376)
3500 (4032, 7376)
4000 (4032, 7376)
start
iterating
0 (448, 7376)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 165 tn= 192 fp= 32 fn= 59
scoring= 0.796875
start
iterating
0 (4032, 7410)
500 (4032, 7410)
1000 (4032, 7410)
1500 (4032, 7410)
2000 (4032, 7410)
2500 (4032, 7410)
3000 (4032, 7410)
3500 (4032, 7410)
4000 (4032, 7410)
start
iterating
0 (448, 7410)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 161 tn= 188 fp= 36 fn= 63
scoring= 0.7790178571428571
start
iterating
0 (4032, 7436)
500 (4032, 7436)
1000 (4032, 7436)
1500 (4032, 7436)
2000 (4032, 7436)
2500 (4032, 7436)
3000 (4032, 7436)
3500 (4032, 7436)
4000 (4032, 7436)
start
iterating
0 (448, 7436)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 159 tn= 199 fp= 25 fn= 65
scoring= 0.7991071428571429
start
iterating
0 (4032, 7366)
500 (4032, 7366)
1000 (4032, 7366)
1500 (4032, 7366)
2000 (4032, 7366)
2500 (4032, 7366)
3000 (4032, 7366)
3500 (4032, 7366)
4000 (4032, 7366)
start
iterating
0 (448, 7366)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 155 tn= 189 fp= 35 fn= 69
scoring= 0.7678571428571429
start
iterating
0 (4032, 7458)
500 (4032, 7458)
1000 (4032, 7458)
1500 (4032, 7458)
2000 (4032, 7458)
2500 (4032, 7458)
3000 (4032, 7458)
3500 (4032, 7458)
4000 (4032, 7458)
start
iterating
0 (448, 7458)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 154 tn= 195 fp= 29 fn= 70
scoring= 0.7790178571428571
start
iterating
0 (4032, 7191)
500 (4032, 7191)
1000 (4032, 7191)
1500 (4032, 7191)
2000 (4032, 7191)
2500 (4032, 7191)
3000 (4032, 7191)
3500 (4032, 7191)
4000 (4032, 7191)
start
iterating
0 (448, 7191)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 154 tn= 191 fp= 33 fn= 70
scoring= 0.7700892857142857
start
iterating
0 (4032, 7414)
500 (4032, 7414)
1000 (4032, 7414)
1500 (4032, 7414)
2000 (4032, 7414)
2500 (4032, 7414)
3000 (4032, 7414)
3500 (4032, 7414)
4000 (4032, 7414)
start
iterating
0 (448, 7414)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 160 tn= 187 fp= 37 fn= 64
scoring= 0.7745535714285714
start
iterating
0 (4032, 7356)
500 (4032, 7356)
1000 (4032, 7356)
1500 (4032, 7356)
2000 (4032, 7356)
2500 (4032, 7356)
3000 (4032, 7356)
3500 (4032, 7356)
4000 (4032, 7356)
start
iterating
0 (448, 7356)
tp= 161 tn= 193 fp= 31 fn= 63
scoring= 0.7901785714285714


  'setting alpha = %.1e' % _ALPHA_MIN)


In [28]:
print("tp=", sum(tps),"tn=", sum(tns),"fp=", sum(fps),"fn=", sum(fns))
print("scoring=", sum([sum(tps), sum(tns)])/sum([sum(tps), sum(tns), sum(fps), sum(fns)]))

tp= 1597 tn= 1912 fp= 328 fn= 643
scoring= 0.7832589285714285


## Good Turing Smoothing

(without log-linear smoothing like Sampson, 1997)

In [29]:
def apply_good_turing(x):
    print("start")
    df_x = pd.DataFrame(x)
    
    df_x = df_x + 1 #add one as mentioned in paper
    
    print("iterating")
    res = []

    for it, row in df_x.iterrows():
        r_stars = {}
        if it %500 == 0:
            print(it, df_x.shape)
            
        vc = row.value_counts().to_dict()
        for it in range(0, max(vc.keys())):
            if it not in vc:
                vc[it]= 0
        #print(vc)
        
        for r in sorted(vc):
            Nr_plus_1 = 0 if r+1 not in vc else vc[r+1]
            Nr = vc[r] if r in vc else [vc[r_] for r_ in range(r, 0) if vc[r_1] > 0][0] # take next smallest value
            Nr = Nr if Nr > 0 else 1
            r_star = (r + 1) * (Nr_plus_1/Nr)
            #print("r*", r, Nr, Nr_plus_1, (Nr_plus_1/Nr) , r_star)
            r_stars[r] = r_star
        #print(vc, r_stars)
        for it in range(0, max(r_stars.keys())):
            if it not in r_stars:
                r_stars[it]= 0
        #print(vc)
        
        res.append(row.map(lambda n: r_stars[n]))
      
    df_res = pd.DataFrame(res)

    return df_res

text = [[1,0,1,0,1], [2,1,0,1,2], [5,0,0,1,0], [2,3,0,0,0]]
apply_good_turing(x_train[0])

start
iterating
0 (4032, 7418)
500 (4032, 7418)
1000 (4032, 7418)
1500 (4032, 7418)
2000 (4032, 7418)
2500 (4032, 7418)
3000 (4032, 7418)
3500 (4032, 7418)
4000 (4032, 7418)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7408,7409,7410,7411,7412,7413,7414,7415,7416,7417
0,0.005139,0.005139,0.005139,0.005139,0.005139,0.005139,0.005139,0.005139,0.005139,0.005139,...,0.005139,0.005139,0.005139,0.005139,0.005139,0.005139,0.005139,0.005139,0.005139,0.005139
1,0.002160,0.002160,0.002160,0.750000,0.002160,0.002160,0.002160,0.002160,0.002160,0.002160,...,0.002160,0.002160,0.002160,0.002160,0.002160,0.002160,0.002160,0.002160,0.002160,0.002160
2,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785,...,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785
3,0.003785,0.003785,0.003785,0.000000,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785,...,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785,0.003785
4,0.005960,0.005960,0.005960,0.005960,0.005960,0.005960,0.005960,0.005960,0.005960,0.005960,...,0.005960,0.005960,0.005960,0.005960,0.005960,0.005960,0.005960,0.005960,0.005960,0.005960
5,0.000810,0.000810,0.000810,1.000000,0.000810,0.000810,0.000810,0.000810,0.000810,0.000810,...,0.000810,0.000810,0.000810,0.000810,0.000810,0.000810,0.000810,0.000810,0.000810,0.000810
6,0.002430,0.002430,0.002430,0.666667,0.002430,0.002430,0.002430,0.002430,0.002430,0.002430,...,0.002430,0.002430,0.002430,0.002430,0.002430,0.002430,0.002430,0.002430,0.002430,0.002430
7,0.004060,0.004060,0.004060,0.004060,0.004060,0.004060,0.004060,0.004060,0.004060,0.004060,...,0.004060,0.004060,0.004060,0.004060,0.004060,0.004060,0.004060,0.004060,0.004060,0.004060
8,0.001079,0.001079,0.001079,0.001079,0.001079,0.001079,0.001079,0.001079,0.001079,0.001079,...,0.001079,0.001079,0.001079,0.001079,0.001079,0.001079,0.001079,0.001079,0.001079,0.001079
9,0.001890,0.001890,0.001890,0.001890,0.001890,0.001890,0.001890,0.001890,0.001890,0.001890,...,0.001890,0.001890,0.001890,0.001890,0.001890,0.001890,0.001890,0.001890,0.001890,0.001890


In [30]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0)

tps, tns, fps, fns = [],[],[],[]

for it,_ in enumerate(x_train):
    gt_x_train = apply_good_turing(x_train[it])
    gt_x_test = apply_good_turing(x_test[it])




    clf.fit(gt_x_train, y_train[it])
    predicted = clf.predict(gt_x_test)

    tp, tn, fp, fn = 0,0,0,0
    for itx, x in enumerate(predicted):
        if x and y_test[it][itx]:
            tp += 1
        elif x and not y_test[it][itx]:
            fp += 1
        elif not x and y_test[it][itx]:
            fn += 1
        elif not x and not y_test[it][itx]:
            tn += 1

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    print("scoring=", sum([tp, tn])/sum([tp, tn, fp, fn]))
    
    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

start
iterating
0 (4032, 7418)
500 (4032, 7418)
1000 (4032, 7418)
1500 (4032, 7418)
2000 (4032, 7418)
2500 (4032, 7418)
3000 (4032, 7418)
3500 (4032, 7418)
4000 (4032, 7418)
start
iterating
0 (448, 7418)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 188 tn= 194 fp= 30 fn= 36
scoring= 0.8526785714285714
start
iterating
0 (4032, 7408)
500 (4032, 7408)
1000 (4032, 7408)
1500 (4032, 7408)
2000 (4032, 7408)
2500 (4032, 7408)
3000 (4032, 7408)
3500 (4032, 7408)
4000 (4032, 7408)
start
iterating
0 (448, 7408)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 183 tn= 188 fp= 36 fn= 41
scoring= 0.828125
start
iterating
0 (4032, 7376)
500 (4032, 7376)
1000 (4032, 7376)
1500 (4032, 7376)
2000 (4032, 7376)
2500 (4032, 7376)
3000 (4032, 7376)
3500 (4032, 7376)
4000 (4032, 7376)
start
iterating
0 (448, 7376)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 197 tn= 185 fp= 39 fn= 27
scoring= 0.8526785714285714
start
iterating
0 (4032, 7410)
500 (4032, 7410)
1000 (4032, 7410)
1500 (4032, 7410)
2000 (4032, 7410)
2500 (4032, 7410)
3000 (4032, 7410)
3500 (4032, 7410)
4000 (4032, 7410)
start
iterating
0 (448, 7410)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 191 tn= 198 fp= 26 fn= 33
scoring= 0.8683035714285714
start
iterating
0 (4032, 7436)
500 (4032, 7436)
1000 (4032, 7436)
1500 (4032, 7436)
2000 (4032, 7436)
2500 (4032, 7436)
3000 (4032, 7436)
3500 (4032, 7436)
4000 (4032, 7436)
start
iterating
0 (448, 7436)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 195 tn= 192 fp= 32 fn= 29
scoring= 0.8638392857142857
start
iterating
0 (4032, 7366)
500 (4032, 7366)
1000 (4032, 7366)
1500 (4032, 7366)
2000 (4032, 7366)
2500 (4032, 7366)
3000 (4032, 7366)
3500 (4032, 7366)
4000 (4032, 7366)
start
iterating
0 (448, 7366)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 188 tn= 179 fp= 45 fn= 36
scoring= 0.8191964285714286
start
iterating
0 (4032, 7458)
500 (4032, 7458)
1000 (4032, 7458)
1500 (4032, 7458)
2000 (4032, 7458)
2500 (4032, 7458)
3000 (4032, 7458)
3500 (4032, 7458)
4000 (4032, 7458)
start
iterating
0 (448, 7458)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 182 tn= 186 fp= 38 fn= 42
scoring= 0.8214285714285714
start
iterating
0 (4032, 7191)
500 (4032, 7191)
1000 (4032, 7191)
1500 (4032, 7191)
2000 (4032, 7191)
2500 (4032, 7191)
3000 (4032, 7191)
3500 (4032, 7191)
4000 (4032, 7191)
start
iterating
0 (448, 7191)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 190 tn= 189 fp= 35 fn= 34
scoring= 0.8459821428571429
start
iterating
0 (4032, 7414)
500 (4032, 7414)
1000 (4032, 7414)
1500 (4032, 7414)
2000 (4032, 7414)
2500 (4032, 7414)
3000 (4032, 7414)
3500 (4032, 7414)
4000 (4032, 7414)
start
iterating
0 (448, 7414)


  'setting alpha = %.1e' % _ALPHA_MIN)


tp= 195 tn= 187 fp= 37 fn= 29
scoring= 0.8526785714285714
start
iterating
0 (4032, 7356)
500 (4032, 7356)
1000 (4032, 7356)
1500 (4032, 7356)
2000 (4032, 7356)
2500 (4032, 7356)
3000 (4032, 7356)
3500 (4032, 7356)
4000 (4032, 7356)
start
iterating
0 (448, 7356)
tp= 189 tn= 185 fp= 39 fn= 35
scoring= 0.8348214285714286


  'setting alpha = %.1e' % _ALPHA_MIN)


In [31]:
print("tp=", sum(tps),"tn=", sum(tns),"fp=", sum(fps),"fn=", sum(fns))
print("scoring=", sum([sum(tps), sum(tns)])/sum([sum(tps), sum(tns), sum(fps), sum(fns)]))

tp= 1898 tn= 1883 fp= 357 fn= 342
scoring= 0.8439732142857143


## Bigrams with Laplace smoothing

In [32]:
def create_n_grams(tokens, n):
    res = []
    for it in range(0, len(tokens) -(n-1)):
        res.append(" ". join(tokens[it: it +n]))
    return res

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

datasets = load_dataset(dataset_paths[1])

vectorizer = CountVectorizer(tokenizer=literal_eval)
x_train, y_train, x_test, y_test = [], [], [], []

for dataset in datasets:
    df_train = dataset[0]
    df_test = dataset[1]
    df_train["tokens"] = df_train.tokens.map(lambda tokens: str(create_n_grams(literal_eval(tokens), 2)))
    df_test["tokens"] = df_test.tokens.map(lambda tokens: str(create_n_grams(literal_eval(tokens), 2)))
    
    temp_train = vectorizer.fit_transform(df_train["tokens"])
    x_train.append(temp_train.toarray())
    y_train.append(df_train.bool_rating.tolist())

    temp_test = vectorizer.transform(df_test["tokens"])
    x_test.append(temp_test.toarray())
    y_test.append(df_test.bool_rating.tolist())

    print(len(x_train[-1]), len(x_test[-1]), len(y_train[-1]), len(y_test[-1]))

[0 1 2 3 4 5 6 7 8 9]
4032 448 4032 448
4032 448 4032 448
4032 448 4032 448
4032 448 4032 448
4032 448 4032 448
4032 448 4032 448
4032 448 4032 448
4032 448 4032 448
4032 448 4032 448
4032 448 4032 448


In [34]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=1)

tps, tns, fps, fns = [],[],[],[]

for it,_ in enumerate(x_train):
    clf.fit(x_train[it], y_train[it])
    predicted = clf.predict(x_test[it])

    tp, tn, fp, fn = 0,0,0,0
    for itx, x in enumerate(predicted):
        if x and y_test[it][itx]:
            tp += 1
        elif x and not y_test[it][itx]:
            fp += 1
        elif not x and y_test[it][itx]:
            fn += 1
        elif not x and not y_test[it][itx]:
            tn += 1

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    print("scoring=", sum([tp, tn])/sum([tp, tn, fp, fn]))
    
    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

tp= 217 tn= 224 fp= 0 fn= 7
scoring= 0.984375
tp= 219 tn= 220 fp= 4 fn= 5
scoring= 0.9799107142857143
tp= 218 tn= 220 fp= 4 fn= 6
scoring= 0.9776785714285714
tp= 217 tn= 223 fp= 1 fn= 7
scoring= 0.9821428571428571
tp= 218 tn= 223 fp= 1 fn= 6
scoring= 0.984375
tp= 218 tn= 221 fp= 3 fn= 6
scoring= 0.9799107142857143
tp= 220 tn= 220 fp= 4 fn= 4
scoring= 0.9821428571428571
tp= 215 tn= 222 fp= 2 fn= 9
scoring= 0.9754464285714286
tp= 216 tn= 222 fp= 2 fn= 8
scoring= 0.9776785714285714
tp= 220 tn= 219 fp= 5 fn= 4
scoring= 0.9799107142857143


In [35]:
print("tp=", sum(tps),"tn=", sum(tns),"fp=", sum(fps),"fn=", sum(fns))
print("scoring=", sum([sum(tps), sum(tns)])/sum([sum(tps), sum(tns), sum(fps), sum(fns)]))

tp= 2178 tn= 2214 fp= 26 fn= 62
scoring= 0.9803571428571428
