# Naive Bayes

In [None]:
from builtins import enumerate, len
import pandas as pd
from ast import literal_eval
from copy import copy

In [None]:
# CSV files used by the "Test 1" and "Test 2" sections (index 0 and 1).
dataset_paths = [ "../data/df_test_1.csv", "../data/df_test_2.csv"]
# NOTE(review): the three settings below are not referenced by any cell
# visible in this notebook (the bigram cells hard-code n=2) — confirm
# whether they are still needed.
ngrams = [2,3]
repl_prod_name = [True, False]
tokens = ["common_tokens", "tokens"]

In [None]:
def load_dataset(path):
    """Split a CSV into one (train, test) pair per distinct ``test_idx``.

    The file is read once. For every distinct value of the ``test_idx``
    column, the rows carrying that value form the test frame and all
    remaining rows form the training frame.

    Args:
        path: Location of the CSV file; it must contain a ``test_idx`` column.

    Returns:
        A list of ``(df_train, df_test)`` tuples, in the order in which the
        ``test_idx`` values first appear. Every frame is an independent copy,
        so callers may add or overwrite columns without affecting other folds
        and without pandas chained-assignment warnings.
    """
    df = pd.read_csv(path)

    train_idxes = df.test_idx.unique()
    print(train_idxes)

    res = []
    for train_idx in train_idxes:
        mask = df.test_idx == train_idx
        # Explicit copies: boolean indexing alone yields frames that pandas
        # treats as slices of ``df`` when they are modified later.
        res.append((df[~mask].copy(), df[mask].copy()))

    return res

# Test 1

In [None]:
# Build the list of (train, test) folds from the first dataset file.
datasets = load_dataset(dataset_paths[0])

## Create Feature Vector

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Token-count features for every fold. literal_eval is the tokenizer, i.e.
# each "tokens" cell is parsed as a Python literal (presumably a list of
# token strings — verify against the CSV).
vectorizer = CountVectorizer(tokenizer=literal_eval)
x_train = []
y_train = []
x_test = []
y_test = []

for dataset in datasets:
    df_train, df_test = dataset

    # The vectorizer is re-fitted on the training part of each fold ...
    x_train.append(vectorizer.fit_transform(df_train["tokens"]).toarray())
    y_train.append(df_train.bool_rating.tolist())

    # ... and the test part is encoded with that fitted vectorizer.
    x_test.append(vectorizer.transform(df_test["tokens"]).toarray())
    y_test.append(df_test.bool_rating.tolist())

    # Row counts per fold: train/test features, then train/test labels.
    print(len(x_train[-1]), len(x_test[-1]), len(y_train[-1]), len(y_test[-1]))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with the smoothing parameter set to zero.
# NOTE(review): how scikit-learn treats alpha=0 (clipping vs. using it
# literally) depends on the installed version — confirm.
clf = MultinomialNB(alpha=0)

tps, tns, fps, fns = [], [], [], []

for it, train_features in enumerate(x_train):
    clf.fit(train_features, y_train[it])
    predicted = clf.predict(x_test[it])

    # Confusion-matrix counts for this fold.
    tp = tn = fp = fn = 0
    for guess, truth in zip(predicted, y_test[it]):
        if guess:
            if truth:
                tp += 1
            else:
                fp += 1
        elif truth:
            fn += 1
        else:
            tn += 1

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    # Accuracy of this fold.
    print("scoring=", (tp + tn) / (tp + tn + fp + fn))

    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

In [None]:
# Confusion-matrix totals and accuracy pooled over all folds.
print("tp=", sum(tps), "tn=", sum(tns), "fp=", sum(fps), "fn=", sum(fns))
print("scoring=", sum(tps + tns) / sum(tps + tns + fps + fns))

## Naive Bayes with Laplace smoothing

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with add-one (alpha=1) smoothing.
clf = MultinomialNB(alpha=1)

tps, tns, fps, fns = [], [], [], []

for it, train_features in enumerate(x_train):
    clf.fit(train_features, y_train[it])
    predicted = clf.predict(x_test[it])

    # Confusion-matrix counts for this fold.
    tp = tn = fp = fn = 0
    for guess, truth in zip(predicted, y_test[it]):
        if guess:
            if truth:
                tp += 1
            else:
                fp += 1
        elif truth:
            fn += 1
        else:
            tn += 1

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    # Accuracy of this fold.
    print("scoring=", (tp + tn) / (tp + tn + fp + fn))

    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

In [None]:
# Confusion-matrix totals and accuracy pooled over all folds.
print("tp=", sum(tps), "tn=", sum(tns), "fp=", sum(fps), "fn=", sum(fns))
print("scoring=", sum(tps + tns) / sum(tps + tns + fps + fns))

## Naive Bayes with Witten-Bell smoothing

In [None]:
def apply_witten_bell(x):
    """Re-weight every row of a count matrix with a Witten-Bell-style rule.

    For a row with total count ``N`` and ``M`` non-zero entries, each
    non-zero entry becomes ``count / N`` and each zero entry becomes
    ``1 / (N + M)``.

    Rows whose counts sum to zero carry no information and are returned as
    all-zero float rows; without that guard the formula divides by zero and
    produces a NaN row.

    Args:
        x: Two-dimensional count data accepted by ``pd.DataFrame`` (array,
            list of lists, DataFrame).

    Returns:
        A float DataFrame with the same shape as ``x``.
    """
    print("start")
    df_x = pd.DataFrame(x)
    print("iterating")
    res = []

    for it, row in df_x.iterrows():
        # Progress output; uses the frame's shape so list input works too.
        if it % 500 == 0:
            print(it, df_x.shape)

        total = row.sum()
        if total == 0:
            res.append(row.astype(float))
            continue

        seen_types = int((row != 0).sum())
        unseen_value = 1 / (total + seen_types)
        # Normalise first, then fill the entries that were zero counts.
        res.append((row / total).replace(0, unseen_value))

    if not res:
        # No rows at all: keep the (empty) layout of the input.
        return df_x.astype(float)

    return pd.DataFrame(res)
    

In [None]:
from sklearn.naive_bayes import MultinomialNB

# No additive smoothing inside the classifier; the features themselves are
# re-weighted by apply_witten_bell before fitting and predicting.
# NOTE(review): how scikit-learn treats alpha=0 depends on the installed
# version — confirm.
clf = MultinomialNB(alpha=0)

tps, tns, fps, fns = [], [], [], []

for it, raw_train in enumerate(x_train):
    # Train and test matrices are transformed independently of each other.
    wb_x_train = apply_witten_bell(raw_train)
    wb_x_test = apply_witten_bell(x_test[it])

    clf.fit(wb_x_train, y_train[it])
    predicted = clf.predict(wb_x_test)

    # Confusion-matrix counts for this fold.
    tp = tn = fp = fn = 0
    for guess, truth in zip(predicted, y_test[it]):
        if guess:
            if truth:
                tp += 1
            else:
                fp += 1
        elif truth:
            fn += 1
        else:
            tn += 1

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    # Accuracy of this fold.
    print("scoring=", (tp + tn) / (tp + tn + fp + fn))

    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

In [None]:
# Confusion-matrix totals and accuracy pooled over all folds.
print("tp=", sum(tps), "tn=", sum(tns), "fp=", sum(fps), "fn=", sum(fns))
print("scoring=", sum(tps + tns) / sum(tps + tns + fps + fns))

## Good-Turing Smoothing

(without the log-linear smoothing of the frequency counts used by Sampson, 1997)

In [None]:
def apply_good_turing(x):
    """Replace every count by its per-row Good-Turing adjusted count.

    All counts are first shifted by one. Then, separately for every row,
    ``N_r`` is the number of entries whose shifted count equals ``r`` and
    each entry with shifted count ``r`` is replaced by

        r* = (r + 1) * N_{r+1} / N_r

    where an ``N_r`` of zero is treated as one. No smoothing of the ``N_r``
    values is applied, so the largest count of a row always maps to 0
    (its ``N_{r+1}`` is zero).

    Args:
        x: Two-dimensional matrix of non-negative integer counts accepted by
            ``pd.DataFrame``.

    Returns:
        A float DataFrame with the same shape as ``x``.
    """
    print("start")
    df_x = pd.DataFrame(x)

    df_x = df_x + 1  # add one as mentioned in paper

    print("iterating")
    res = []

    for it, row in df_x.iterrows():
        # Progress output.
        if it % 500 == 0:
            print(it, df_x.shape)

        # Frequency of frequencies: shifted count -> number of entries.
        vc = row.value_counts().to_dict()

        r_stars = {}
        for r in range(0, int(max(vc)) + 1):
            n_r = vc.get(r, 0) or 1  # avoid dividing by an empty class
            n_r_plus_1 = vc.get(r + 1, 0)
            r_stars[r] = (r + 1) * (n_r_plus_1 / n_r)

        res.append(row.map(r_stars))

    df_res = pd.DataFrame(res)

    return df_res

# Small hand-written count matrix; it is not passed to the call below.
# NOTE(review): possibly intended as a sanity-check input for
# apply_good_turing — confirm, or remove.
text = [[1,0,1,0,1], [2,1,0,1,2], [5,0,0,1,0], [2,3,0,0,0]]
# Transform the first fold's training matrix; as the cell's last expression
# the result is what the notebook displays.
apply_good_turing(x_train[0])

In [None]:
from sklearn.naive_bayes import MultinomialNB

# No additive smoothing inside the classifier; the features are replaced by
# their Good-Turing adjusted counts before fitting and predicting.
# NOTE(review): how scikit-learn treats alpha=0 depends on the installed
# version — confirm.
clf = MultinomialNB(alpha=0)

tps, tns, fps, fns = [], [], [], []

for it, raw_train in enumerate(x_train):
    # Train and test matrices are transformed independently of each other.
    gt_x_train = apply_good_turing(raw_train)
    gt_x_test = apply_good_turing(x_test[it])

    clf.fit(gt_x_train, y_train[it])
    predicted = clf.predict(gt_x_test)

    # Confusion-matrix counts for this fold.
    tp = tn = fp = fn = 0
    for guess, truth in zip(predicted, y_test[it]):
        if guess:
            if truth:
                tp += 1
            else:
                fp += 1
        elif truth:
            fn += 1
        else:
            tn += 1

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    # Accuracy of this fold.
    print("scoring=", (tp + tn) / (tp + tn + fp + fn))

    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

In [None]:
# Confusion-matrix totals and accuracy pooled over all folds.
print("tp=", sum(tps), "tn=", sum(tns), "fp=", sum(fps), "fn=", sum(fns))
print("scoring=", sum(tps + tns) / sum(tps + tns + fps + fns))

## Bigrams with Laplace smoothing

In [None]:
def create_n_grams(tokens, n):
    """Return the space-joined n-grams of ``tokens`` in order of occurrence.

    Args:
        tokens: Sequence of strings.
        n: Window length.

    Returns:
        A list with one string per window of ``n`` consecutive tokens; empty
        when ``tokens`` has fewer than ``n`` items.
    """
    window_count = len(tokens) - (n - 1)
    return [" ".join(tokens[start:start + n]) for start in range(window_count)]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

datasets = load_dataset(dataset_paths[0])

vectorizer = CountVectorizer(tokenizer=literal_eval)
x_train, y_train, x_test, y_test = [], [], [], []


def _bigram_repr(tokens):
    """Turn a stringified token list into a stringified list of bigrams."""
    return str(create_n_grams(literal_eval(tokens), 2))


for dataset in datasets:
    # Work on private copies: the frames in `datasets` may be filtered
    # slices, and writing a column into such a slice triggers pandas'
    # chained-assignment warning. The tuples in `datasets` stay untouched.
    df_train = dataset[0].copy()
    df_test = dataset[1].copy()
    df_train["tokens"] = df_train.tokens.map(_bigram_repr)
    df_test["tokens"] = df_test.tokens.map(_bigram_repr)

    # The vectorizer is re-fitted on the training part of each fold ...
    temp_train = vectorizer.fit_transform(df_train["tokens"])
    x_train.append(temp_train.toarray())
    y_train.append(df_train.bool_rating.tolist())

    # ... and the test part is encoded with that fitted vectorizer.
    temp_test = vectorizer.transform(df_test["tokens"])
    x_test.append(temp_test.toarray())
    y_test.append(df_test.bool_rating.tolist())

    # Row counts per fold: train/test features, then train/test labels.
    print(len(x_train[-1]), len(x_test[-1]), len(y_train[-1]), len(y_test[-1]))

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with add-one (alpha=1) smoothing, fitted on
# whatever x_train/x_test currently hold (the bigram features when the
# preceding cell has been run).
clf = MultinomialNB(alpha=1)

tps, tns, fps, fns = [], [], [], []

for it, train_features in enumerate(x_train):
    clf.fit(train_features, y_train[it])
    predicted = clf.predict(x_test[it])

    # Confusion-matrix counts for this fold.
    tp = tn = fp = fn = 0
    for guess, truth in zip(predicted, y_test[it]):
        if guess:
            if truth:
                tp += 1
            else:
                fp += 1
        elif truth:
            fn += 1
        else:
            tn += 1

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    # Accuracy of this fold.
    print("scoring=", (tp + tn) / (tp + tn + fp + fn))

    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

In [None]:
# Confusion-matrix totals and accuracy pooled over all folds.
print("tp=", sum(tps), "tn=", sum(tns), "fp=", sum(fps), "fn=", sum(fns))
print("scoring=", sum(tps + tns) / sum(tps + tns + fps + fns))

# Test 2

In [None]:
# Build the list of (train, test) folds from the second dataset file.
datasets = load_dataset(dataset_paths[1])

## Create Feature Vector

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Token-count features for every fold. literal_eval is the tokenizer, i.e.
# each "tokens" cell is parsed as a Python literal (presumably a list of
# token strings — verify against the CSV).
vectorizer = CountVectorizer(tokenizer=literal_eval)
x_train = []
y_train = []
x_test = []
y_test = []

for dataset in datasets:
    df_train, df_test = dataset

    # The vectorizer is re-fitted on the training part of each fold ...
    x_train.append(vectorizer.fit_transform(df_train["tokens"]).toarray())
    y_train.append(df_train.bool_rating.tolist())

    # ... and the test part is encoded with that fitted vectorizer.
    x_test.append(vectorizer.transform(df_test["tokens"]).toarray())
    y_test.append(df_test.bool_rating.tolist())

    # Row counts per fold: train/test features, then train/test labels.
    print(len(x_train[-1]), len(x_test[-1]), len(y_train[-1]), len(y_test[-1]))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with the smoothing parameter set to zero.
# NOTE(review): how scikit-learn treats alpha=0 (clipping vs. using it
# literally) depends on the installed version — confirm.
clf = MultinomialNB(alpha=0)

tps, tns, fps, fns = [], [], [], []

for it, train_features in enumerate(x_train):
    clf.fit(train_features, y_train[it])
    predicted = clf.predict(x_test[it])

    # Tally (prediction, truth) truthiness pairs for this fold.
    tally = {(True, True): 0, (True, False): 0, (False, True): 0, (False, False): 0}
    for guess, truth in zip(predicted, y_test[it]):
        tally[bool(guess), bool(truth)] += 1
    tp = tally[True, True]
    fp = tally[True, False]
    fn = tally[False, True]
    tn = tally[False, False]

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    # Accuracy of this fold.
    print("scoring=", (tp + tn) / (tp + tn + fp + fn))

    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

In [None]:
# Confusion-matrix totals and accuracy pooled over all folds.
print("tp=", sum(tps), "tn=", sum(tns), "fp=", sum(fps), "fn=", sum(fns))
print("scoring=", sum(tps + tns) / sum(tps + tns + fps + fns))

## Naive Bayes with Laplace smoothing

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with add-one (alpha=1) smoothing.
clf = MultinomialNB(alpha=1)

tps, tns, fps, fns = [], [], [], []

for it, train_features in enumerate(x_train):
    clf.fit(train_features, y_train[it])
    predicted = clf.predict(x_test[it])

    # Tally (prediction, truth) truthiness pairs for this fold.
    tally = {(True, True): 0, (True, False): 0, (False, True): 0, (False, False): 0}
    for guess, truth in zip(predicted, y_test[it]):
        tally[bool(guess), bool(truth)] += 1
    tp = tally[True, True]
    fp = tally[True, False]
    fn = tally[False, True]
    tn = tally[False, False]

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    # Accuracy of this fold.
    print("scoring=", (tp + tn) / (tp + tn + fp + fn))

    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

In [None]:
# Confusion-matrix totals and accuracy pooled over all folds.
print("tp=", sum(tps), "tn=", sum(tns), "fp=", sum(fps), "fn=", sum(fns))
print("scoring=", sum(tps + tns) / sum(tps + tns + fps + fns))

## Naive Bayes with Witten-Bell smoothing

In [None]:
def apply_witten_bell(x):
    """Re-weight every row of a count matrix with a Witten-Bell-style rule.

    For a row with total count ``N`` and ``M`` non-zero entries, each
    non-zero entry becomes ``count / N`` and each zero entry becomes
    ``1 / (N + M)``.

    Rows whose counts sum to zero carry no information and are returned as
    all-zero float rows; without that guard the formula divides by zero and
    produces a NaN row.

    Args:
        x: Two-dimensional count data accepted by ``pd.DataFrame`` (array,
            list of lists, DataFrame).

    Returns:
        A float DataFrame with the same shape as ``x``.
    """
    print("start")
    df_x = pd.DataFrame(x)
    print("iterating")
    res = []

    for it, row in df_x.iterrows():
        # Progress output; uses the frame's shape so list input works too.
        if it % 500 == 0:
            print(it, df_x.shape)

        total = row.sum()
        if total == 0:
            res.append(row.astype(float))
            continue

        seen_types = int((row != 0).sum())
        unseen_value = 1 / (total + seen_types)
        # Normalise first, then fill the entries that were zero counts.
        res.append((row / total).replace(0, unseen_value))

    if not res:
        # No rows at all: keep the (empty) layout of the input.
        return df_x.astype(float)

    return pd.DataFrame(res)
    

In [None]:
from sklearn.naive_bayes import MultinomialNB

# No additive smoothing inside the classifier; the features themselves are
# re-weighted by apply_witten_bell before fitting and predicting.
# NOTE(review): how scikit-learn treats alpha=0 depends on the installed
# version — confirm.
clf = MultinomialNB(alpha=0)

tps, tns, fps, fns = [], [], [], []

for it, raw_train in enumerate(x_train):
    # Train and test matrices are transformed independently of each other.
    wb_x_train = apply_witten_bell(raw_train)
    wb_x_test = apply_witten_bell(x_test[it])

    clf.fit(wb_x_train, y_train[it])
    predicted = clf.predict(wb_x_test)

    # Tally (prediction, truth) truthiness pairs for this fold.
    tally = {(True, True): 0, (True, False): 0, (False, True): 0, (False, False): 0}
    for guess, truth in zip(predicted, y_test[it]):
        tally[bool(guess), bool(truth)] += 1
    tp = tally[True, True]
    fp = tally[True, False]
    fn = tally[False, True]
    tn = tally[False, False]

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    # Accuracy of this fold.
    print("scoring=", (tp + tn) / (tp + tn + fp + fn))

    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

In [None]:
# Confusion-matrix totals and accuracy pooled over all folds.
print("tp=", sum(tps), "tn=", sum(tns), "fp=", sum(fps), "fn=", sum(fns))
print("scoring=", sum(tps + tns) / sum(tps + tns + fps + fns))

## Good-Turing Smoothing

(without the log-linear smoothing of the frequency counts used by Sampson, 1997)

In [None]:
def apply_good_turing(x):
    """Replace every count by its per-row Good-Turing adjusted count.

    All counts are first shifted by one. Then, separately for every row,
    ``N_r`` is the number of entries whose shifted count equals ``r`` and
    each entry with shifted count ``r`` is replaced by

        r* = (r + 1) * N_{r+1} / N_r

    where an ``N_r`` of zero is treated as one. No smoothing of the ``N_r``
    values is applied, so the largest count of a row always maps to 0
    (its ``N_{r+1}`` is zero).

    Args:
        x: Two-dimensional matrix of non-negative integer counts accepted by
            ``pd.DataFrame``.

    Returns:
        A float DataFrame with the same shape as ``x``.
    """
    print("start")
    df_x = pd.DataFrame(x)

    df_x = df_x + 1  # add one as mentioned in paper

    print("iterating")
    res = []

    for it, row in df_x.iterrows():
        # Progress output.
        if it % 500 == 0:
            print(it, df_x.shape)

        # Frequency of frequencies: shifted count -> number of entries.
        vc = row.value_counts().to_dict()

        r_stars = {}
        for r in range(0, int(max(vc)) + 1):
            n_r = vc.get(r, 0) or 1  # avoid dividing by an empty class
            n_r_plus_1 = vc.get(r + 1, 0)
            r_stars[r] = (r + 1) * (n_r_plus_1 / n_r)

        res.append(row.map(r_stars))

    df_res = pd.DataFrame(res)

    return df_res

# Small hand-written count matrix; it is not passed to the call below.
# NOTE(review): possibly intended as a sanity-check input for
# apply_good_turing — confirm, or remove.
text = [[1,0,1,0,1], [2,1,0,1,2], [5,0,0,1,0], [2,3,0,0,0]]
# Transform the first fold's training matrix; as the cell's last expression
# the result is what the notebook displays.
apply_good_turing(x_train[0])

In [None]:
from sklearn.naive_bayes import MultinomialNB

# No additive smoothing inside the classifier; the features are replaced by
# their Good-Turing adjusted counts before fitting and predicting.
# NOTE(review): how scikit-learn treats alpha=0 depends on the installed
# version — confirm.
clf = MultinomialNB(alpha=0)

tps, tns, fps, fns = [], [], [], []

for it, raw_train in enumerate(x_train):
    # Train and test matrices are transformed independently of each other.
    gt_x_train = apply_good_turing(raw_train)
    gt_x_test = apply_good_turing(x_test[it])

    clf.fit(gt_x_train, y_train[it])
    predicted = clf.predict(gt_x_test)

    # Tally (prediction, truth) truthiness pairs for this fold.
    tally = {(True, True): 0, (True, False): 0, (False, True): 0, (False, False): 0}
    for guess, truth in zip(predicted, y_test[it]):
        tally[bool(guess), bool(truth)] += 1
    tp = tally[True, True]
    fp = tally[True, False]
    fn = tally[False, True]
    tn = tally[False, False]

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    # Accuracy of this fold.
    print("scoring=", (tp + tn) / (tp + tn + fp + fn))

    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

In [None]:
# Confusion-matrix totals and accuracy pooled over all folds.
print("tp=", sum(tps), "tn=", sum(tns), "fp=", sum(fps), "fn=", sum(fns))
print("scoring=", sum(tps + tns) / sum(tps + tns + fps + fns))

## Bigrams with Laplace smoothing

In [None]:
def create_n_grams(tokens, n):
    """Build all n-grams of ``tokens`` as space-separated strings.

    Args:
        tokens: Sequence of strings.
        n: Number of consecutive tokens per n-gram.

    Returns:
        The n-grams in order of their start position; an empty list when
        ``tokens`` is shorter than ``n``.
    """
    starts = range(len(tokens) - n + 1)
    windows = (tokens[first:first + n] for first in starts)
    return [" ".join(window) for window in windows]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

datasets = load_dataset(dataset_paths[1])

vectorizer = CountVectorizer(tokenizer=literal_eval)
x_train, y_train, x_test, y_test = [], [], [], []


def _bigram_repr(tokens):
    """Turn a stringified token list into a stringified list of bigrams."""
    return str(create_n_grams(literal_eval(tokens), 2))


for dataset in datasets:
    # Work on private copies: the frames in `datasets` may be filtered
    # slices, and writing a column into such a slice triggers pandas'
    # chained-assignment warning. The tuples in `datasets` stay untouched.
    df_train = dataset[0].copy()
    df_test = dataset[1].copy()
    df_train["tokens"] = df_train.tokens.map(_bigram_repr)
    df_test["tokens"] = df_test.tokens.map(_bigram_repr)

    # The vectorizer is re-fitted on the training part of each fold ...
    temp_train = vectorizer.fit_transform(df_train["tokens"])
    x_train.append(temp_train.toarray())
    y_train.append(df_train.bool_rating.tolist())

    # ... and the test part is encoded with that fitted vectorizer.
    temp_test = vectorizer.transform(df_test["tokens"])
    x_test.append(temp_test.toarray())
    y_test.append(df_test.bool_rating.tolist())

    # Row counts per fold: train/test features, then train/test labels.
    print(len(x_train[-1]), len(x_test[-1]), len(y_train[-1]), len(y_test[-1]))

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with add-one (alpha=1) smoothing, fitted on
# whatever x_train/x_test currently hold (the bigram features when the
# preceding cell has been run).
clf = MultinomialNB(alpha=1)

tps, tns, fps, fns = [], [], [], []

for it, train_features in enumerate(x_train):
    clf.fit(train_features, y_train[it])
    predicted = clf.predict(x_test[it])

    # Tally (prediction, truth) truthiness pairs for this fold.
    tally = {(True, True): 0, (True, False): 0, (False, True): 0, (False, False): 0}
    for guess, truth in zip(predicted, y_test[it]):
        tally[bool(guess), bool(truth)] += 1
    tp = tally[True, True]
    fp = tally[True, False]
    fn = tally[False, True]
    tn = tally[False, False]

    print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    # Accuracy of this fold.
    print("scoring=", (tp + tn) / (tp + tn + fp + fn))

    tps.append(tp)
    fps.append(fp)
    tns.append(tn)
    fns.append(fn)

In [None]:
# Confusion-matrix totals and accuracy pooled over all folds.
print("tp=", sum(tps), "tn=", sum(tns), "fp=", sum(fps), "fn=", sum(fns))
print("scoring=", sum(tps + tns) / sum(tps + tns + fps + fns))