# Bayes

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm, tnrange
from math import log

## Preparing data

In [None]:
def get_data_from_file(filename):
    """Read one message file and return its token ids and class label.

    The file is read as three lines: a subject line whose first nine
    characters are skipped (presumably a ``"Subject: "`` prefix -- confirm
    against the data set), a separator line that is ignored, and the message
    body. Tokens are whitespace-separated integers.

    Args:
        filename: Path of the message file. The message is labelled legit
            when the substring ``"legit"`` occurs anywhere in this path.

    Returns:
        A ``(subject_words, message_words, legit)`` tuple: two lists of
        ``int`` token ids, and ``1`` for a legit message or ``0`` otherwise.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
    """
    legit = int("legit" in filename)

    with open(filename, "r") as file:
        # str.split() already discards the trailing newline, so the lines are
        # not sliced with [:-1]: that slice would eat the last digit whenever
        # the final line of the file has no terminating newline.
        subject_words = file.readline()[9:].split()
        file.readline()  # separator line between subject and body
        message_words = file.readline().split()

    subject_ids = [int(word) for word in subject_words]
    message_ids = [int(word) for word in message_words]
    return subject_ids, message_ids, legit

In [None]:
def get_ngrams_from_lists(subject_words, message_words, n):
    """Return all contiguous n-grams of the subject followed by those of the body.

    Args:
        subject_words: Sequence of subject tokens.
        message_words: Sequence of body tokens.
        n: Length of each n-gram.

    Returns:
        A list of tuples, each holding ``n`` consecutive tokens. N-grams never
        span the subject/body boundary; a sequence shorter than ``n``
        contributes nothing.
    """
    def windows(tokens):
        # One window per valid start position; the range is empty when the
        # sequence is shorter than n.
        last_start = len(tokens) - n
        return [tuple(tokens[start:start + n]) for start in range(last_start + 1)]

    ngrams = windows(subject_words)
    ngrams.extend(windows(message_words))
    return ngrams

In [None]:
def get_vector_from_ngrams(letter, ndict):
    """Encode a letter as a binary presence vector over the n-gram vocabulary.

    Args:
        letter: Iterable of n-grams occurring in the letter.
        ndict: Mapping from n-gram to its position in the vector.

    Returns:
        A float NumPy array of length ``len(ndict)`` holding 1 at the position
        of every n-gram in ``letter`` and 0 elsewhere.

    Raises:
        KeyError: If the letter contains an n-gram missing from ``ndict``.
    """
    # Repeated n-grams map to the same column, so resolve each column once.
    columns = {ndict[ngram] for ngram in letter}

    vec = np.zeros(len(ndict))
    for column in columns:
        vec[column] = 1
    return vec

In [None]:
def get_prepared_data_from_dir(n, data_dirname='messages/'):
    """Load every message under ``data_dirname`` as binary n-gram vectors.

    Each sub-directory of ``data_dirname`` is treated as one part (fold) and
    each file inside it as one letter. All n-grams seen in any letter form the
    vocabulary; a letter becomes a vector with one column per vocabulary entry.

    Args:
        n: Length of the n-grams to extract.
        data_dirname: Root directory holding one sub-directory per part. A
            trailing path separator is optional.

    Returns:
        A pair ``(X, y)``. ``X[p][k]`` is the feature vector of letter ``k``
        in part ``p`` and ``y[p][k]`` is its label as returned by
        ``get_data_from_file``.
    """
    X_ngrams = []
    y = []
    vocabulary = set()

    # os.listdir() returns entries in arbitrary order; sorting keeps the fold
    # order (and therefore every cross-validation split) reproducible.
    for dirname in sorted(os.listdir(data_dirname)):
        part_dir = os.path.join(data_dirname, dirname)
        if not os.path.isdir(part_dir):
            # Stray files next to the part directories are not folds.
            continue

        part_x = []
        part_y = []
        for filename in sorted(os.listdir(part_dir)):
            name = os.path.join(part_dir, filename)
            subject_words, message_words, legit = get_data_from_file(name)
            cur_ngrams = get_ngrams_from_lists(subject_words, message_words, n)
            part_x.append(cur_ngrams)
            part_y.append(legit)
            vocabulary.update(cur_ngrams)

        X_ngrams.append(part_x)
        y.append(part_y)

    # Sorted vocabulary gives every n-gram a stable column index.
    ngram_dict = {ngram: index for index, ngram in enumerate(sorted(vocabulary))}

    X = [
        [get_vector_from_ngrams(letter, ngram_dict) for letter in part]
        for part in X_ngrams
    ]

    return X, y

## Naive Bayes classifier

In [None]:
def get_classes_a_priori_proba(labels):
    """Return the empirical class priors of a label collection.

    Args:
        labels: Sequence of labels; non-zero entries count as legit.

    Returns:
        A ``(legit_share, spam_share)`` tuple: the fraction of non-zero labels
        and the fraction of the remaining labels.

    Raises:
        ZeroDivisionError: If ``labels`` is empty.
    """
    total = len(labels)
    legit_count = np.count_nonzero(labels)
    spam_count = total - legit_count
    return legit_count / total, spam_count / total

In [None]:
def get_laplas_proba(wcount, all_count, alpha):
    """Return an additively smoothed estimate of a binary event's probability.

    Args:
        wcount: Number of observations in which the event occurred.
        all_count: Total number of observations.
        alpha: Smoothing constant added once to the numerator and twice to
            the denominator (once per outcome of the binary event).

    Returns:
        ``(wcount + alpha) / (all_count + 2 * alpha)``.
    """
    smoothed_hits = wcount + alpha
    smoothed_total = all_count + alpha * 2
    return smoothed_hits / smoothed_total

In [None]:
def get_all_words_cond_proba(X, y, alpha):
    """Estimate, for every feature, the probability that it is present per class.

    Args:
        X: 2-D array-like with one feature vector per letter; a truthy entry
            means the feature is present in that letter.
        y: Labels aligned with the rows of ``X``; ``1`` marks a legit letter,
            any other value a spam letter.
        alpha: Smoothing constant forwarded to ``get_laplas_proba``.

    Returns:
        A ``(vlegit, vspam)`` pair of float arrays with one entry per feature:
        the smoothed probability that the feature is present in a legit
        letter and in a spam letter respectively.

    Raises:
        ValueError: If ``X`` is not a non-empty 2-D collection.
    """
    features = np.asarray(X)
    if features.ndim != 2:
        raise ValueError(
            "X must be a non-empty 2-D collection of equally long feature vectors"
        )

    is_legit = np.asarray(y) == 1
    legit_rows = features[is_legit]
    spam_rows = features[~is_legit]

    # Count, per column, the letters in which the feature is PRESENT. Both
    # classes must use the same convention because get_letter_proba reads
    # each entry as P(present | class); the earlier spam count used the
    # letters where the feature was absent.
    present_in_legit = np.count_nonzero(legit_rows, axis=0).tolist()
    present_in_spam = np.count_nonzero(spam_rows, axis=0).tolist()

    legit_total = len(legit_rows)
    spam_total = len(spam_rows)

    vlegit = np.array(
        [get_laplas_proba(count, legit_total, alpha) for count in present_in_legit],
        dtype=float,
    )
    vspam = np.array(
        [get_laplas_proba(count, spam_total, alpha) for count in present_in_spam],
        dtype=float,
    )

    return vlegit, vspam

In [None]:
def get_letter_proba(vletter, apri, vproba, lambda_):
    """Return the weighted log-score of a letter for one class.

    Args:
        vletter: Feature vector of the letter; truthy entries mark features
            that are present.
        apri: Prior probability of the class.
        vproba: Per-feature probability that the feature is present in
            the class.
        lambda_: Weight multiplied into the prior before taking the logarithm.

    Returns:
        ``log(lambda_ * apri)`` plus the sum over all features of
        ``log(p)`` when the feature is present and ``log(1 - p)`` when it is not.

    Raises:
        ValueError: If any logarithm argument is not positive.
    """
    def feature_log_proba(index):
        p_present = vproba[index]
        return log(p_present if vletter[index] else 1 - p_present)

    log_likelihood = sum([feature_log_proba(index) for index in range(len(vletter))])
    log_prior = log(lambda_ * apri)
    return log_prior + log_likelihood

In [None]:
def get_letter_class(vletter, apri_legit, apri_spam, vlegit, vspam, lambda_legit, lambda_spam):
    """Classify a letter by comparing its legit and spam log-scores.

    Args:
        vletter: Feature vector of the letter.
        apri_legit: Prior probability of the legit class.
        apri_spam: Prior probability of the spam class.
        vlegit: Per-feature presence probabilities for legit letters.
        vspam: Per-feature presence probabilities for spam letters.
        lambda_legit: Weight applied to the legit prior.
        lambda_spam: Weight applied to the spam prior.

    Returns:
        ``(1, legit_score)`` when the legit score is strictly greater,
        otherwise ``(0, spam_score)``. Both scores are also printed.
    """
    score_legit = get_letter_proba(vletter, apri_legit, vlegit, lambda_legit)
    score_spam = get_letter_proba(vletter, apri_spam, vspam, lambda_spam)

    print(score_legit, score_spam)

    # Ties go to spam: legit wins only on a strictly larger score.
    return (1, score_legit) if score_legit > score_spam else (0, score_spam)

## Cross-validation and finding hyper-params

### Evaluation

In [None]:
def get_accuracy_and_legit_neg(y_actual, y_pred):
    """Return the accuracy and the number of misclassifications not predicted as 1.

    Args:
        y_actual: True labels.
        y_pred: Predicted labels, paired position by position with
            ``y_actual`` (extra entries of the longer one are ignored).

    Returns:
        An ``(accuracy, fn)`` tuple: the share of pairs whose labels agree,
        and the count of disagreeing pairs whose prediction is not ``1``
        (with 0/1 labels, legit letters classified as spam).

    Raises:
        ZeroDivisionError: If there are no pairs to compare.
    """
    pairs = list(zip(y_actual, y_pred))

    hits = sum(1 for actual, predicted in pairs if actual == predicted)
    fn = sum(
        1
        for actual, predicted in pairs
        if not actual == predicted and not predicted == 1
    )

    accuracy = hits / len(pairs)

    return accuracy, fn

### Splits

In [None]:
def get_train_test_data(n):
    """Build leave-one-part-out cross-validation splits for n-gram size ``n``.

    Every part returned by ``get_prepared_data_from_dir`` is held out once as
    the test set while the remaining parts are concatenated into the
    training set.

    Args:
        n: Length of the n-grams used as features.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)`` of lists with one entry
        per part. ``train_x[i]`` and ``train_y[i]`` are NumPy arrays built
        from all parts except part ``i``; ``test_x[i]`` and ``test_y[i]`` are
        the vectors and labels of part ``i`` itself.

    Raises:
        ValueError: If there are fewer than two parts, since no training
            data would remain to concatenate.
    """
    X, y = get_prepared_data_from_dir(n)
    train_x = []
    train_y = []
    test_x = []
    test_y = []

    # Use the real number of parts instead of assuming ten, and join the
    # remaining parts directly: np.delete on the nested lists converted the
    # whole data set to one array per iteration and required equal part sizes.
    for held_out in range(len(X)):
        other_x = [part for index, part in enumerate(X) if index != held_out]
        other_y = [part for index, part in enumerate(y) if index != held_out]
        train_x.append(np.concatenate(other_x))
        train_y.append(np.concatenate(other_y))
        test_x.append(X[held_out])
        test_y.append(y[held_out])

    return train_x, train_y, test_x, test_y

### Params

In [None]:
# Hyper-parameter grid searched by the classification cell.
ns = [1]  # n-gram lengths to try
alphas = [0.0001]  # smoothing constants to try
# Weights multiplied into the legit and spam class priors when scoring a letter.
lambda_legit, lambda_spam = 1, 100

### Classification with a search for the best parameters

In [None]:
# Grid search over n-gram length and smoothing constant. A combination wins
# when it rejects fewer legit letters (lower sum of the per-fold "fn" counts),
# or ties on that count with a higher average accuracy.
best_acc = 0
min_ln = 100000
best_params = []
best_pred = []
legit_neg = []
accs = []

with tqdm(total=len(ns)*len(alphas)*1, desc="clfprogress", leave=False) as trp:
    for n in ns:
        # The splits depend only on n, so build them once per n instead of
        # once per (n, alpha) pair.
        train_x, train_y, test_x, test_y = get_train_test_data(n)
        for alpha in alphas:
            accs = []
            legit_neg = []
            pred = []
            # Only the first fold is evaluated; the progress-bar total above
            # uses the same factor of 1.
            for i in range(1):
                vlegit, vspam = get_all_words_cond_proba(train_x[i], train_y[i], alpha)
                apri_legit, apri_spam = get_classes_a_priori_proba(train_y[i])
                test_pred_y = [
                    get_letter_class(x, apri_legit, apri_spam, vlegit, vspam, lambda_legit, lambda_spam)
                    for x in test_x[i]
                ]
                # get_letter_class returns (label, score); score only the labels.
                pred_labels = [label for label, _ in test_pred_y]
                accuracy, ln = get_accuracy_and_legit_neg(test_y[i], pred_labels)
                accs.append(accuracy)
                legit_neg.append(ln)
                pred.append(test_pred_y)

                trp.update(1)
            average_acc = sum(accs)/len(accs)
            sum_ln = sum(legit_neg)
            print(f"{average_acc} {sum_ln}")
            if sum_ln < min_ln or (sum_ln == min_ln and average_acc > best_acc):
                best_acc = average_acc
                # Was assigned to a misspelled name, so the threshold never
                # tightened and any later combination could replace the best.
                min_ln = sum_ln
                best_params = [n, alpha, lambda_legit]
                best_pred = pred

## Graphs

In [None]:
# Notebook display cell: show the per-fold (label, score) predictions stored
# for the best parameter combination.
best_pred

In [None]:
# Winning parameters, stored as [n, alpha, lambda_legit].
print(best_params)

In [None]:
def build_roc(pred):
    """Build ROC curve data from classifier predictions (not implemented yet).

    The definition was left without a body, which made the cell a syntax
    error. It now fails explicitly instead.

    Args:
        pred: Classifier predictions. Presumably the per-fold lists of
            ``(label, score)`` tuples collected during the parameter search --
            confirm; the true labels needed for a ROC curve are not part of
            this argument.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError(
        "build_roc is a placeholder: ROC computation has not been written"
    )

In [None]:
# Draw the ROC curve: best_roc[0] goes on the x axis (false positive rate) and
# best_roc[1] on the y axis (true positive rate).
# NOTE(review): best_roc is not assigned anywhere in the visible notebook; it
# presumably should be produced by build_roc() -- confirm before running.
plt.title("ROC Curve")
plt.plot(best_roc[0], best_roc[1])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.show()

In [None]:
# Sweep the weight of the legit prior and record the cross-validated accuracy
# for each value, using the best n and alpha found by the parameter search.
average_accs = []

lambdas_legits = [1, 100, 1000, 10000]
# Same list under the second spelling that appears in this notebook.
lambdas_legit = lambdas_legits

# Neither the splits nor the fitted probabilities depend on the prior weight,
# so compute them once per fold instead of once per weight.
train_x, train_y, test_x, test_y = get_train_test_data(best_params[0])
fold_models = []
for fold_train_x, fold_train_y in zip(train_x, train_y):
    vlegit, vspam = get_all_words_cond_proba(fold_train_x, fold_train_y, best_params[1])
    apri_legit, apri_spam = get_classes_a_priori_proba(fold_train_y)
    fold_models.append((vlegit, vspam, apri_legit, apri_spam))

for ll in lambdas_legits:
    accs = []
    for i, (vlegit, vspam, apri_legit, apri_spam) in enumerate(fold_models):
        test_pred_y = [
            get_letter_class(x, apri_legit, apri_spam, vlegit, vspam, ll, lambda_spam)
            for x in test_x[i]
        ]
        # get_letter_class returns (label, score); accuracy needs labels only.
        pred_labels = [label for label, _ in test_pred_y]
        accuracy, ln = get_accuracy_and_legit_neg(test_y[i], pred_labels)
        accs.append(accuracy)
    average_accs.append(sum(accs)/len(accs))

In [None]:
# Plot the average accuracy recorded for each legit-prior weight of the sweep.
# The weights live in `lambdas_legits`; the name read here before
# (`lambdas_legit`) was never assigned.
plt.plot(lambdas_legits, average_accs)
plt.xlabel("lambda legit")
plt.ylabel("accuracy")
plt.show()