In [1]:
import os
from tqdm import tqdm

## Reading data

In [2]:
# Load the corpus into `messages`: a list with one entry per directory
# '../data/part1' ... '../data/part10'; each entry is a list of
# [subject + text, is_spam] pairs, one per file found in that directory.
messages_dir = '../data/part'
messages = []

for i in range(1, 11):
    part_dir = messages_dir + str(i)
    messages_part = []
    for message_filename in os.listdir(part_dir):
        # The context manager closes each file as soon as it has been parsed;
        # a bare open() would leave every handle open until garbage collection.
        with open(os.path.join(part_dir, message_filename), "r") as message_file:
            # First line: drop the first 9 characters (presumably the
            # "Subject: " prefix -- verify against the data) and the last
            # character (presumably the trailing newline).
            subject = message_file.readline()[9:-1]
            # The second line is read and discarded.
            message_file.readline()
            # Third line is the message text, again minus its last character.
            text = message_file.readline()[:-1]
        # A file counts as spam when its name contains 'spmsg'.
        is_spam = 1 if 'spmsg' in message_filename else 0
        # NOTE(review): subject and text are concatenated without a separator,
        # so the last subject token and the first text token become one token
        # after .split(). Kept as-is to avoid changing model inputs -- confirm
        # whether a space was intended.
        messages_part.append([subject + text, is_spam])
    messages.append(messages_part)

## Accuracy score 

In [3]:
def get_accuracy_score(y_true, y_predicted):
    """Return the share of positions where the prediction equals the true label.

    Args:
        y_true: Sized collection of reference labels; its length is the
            denominator.
        y_predicted: Iterable of predicted labels, compared pairwise with
            ``y_true`` (pairing stops at the shorter of the two).

    Returns:
        Number of matching pairs divided by ``len(y_true)``.

    Raises:
        ZeroDivisionError: If ``y_true`` is empty.
    """
    hits = 0
    for expected, actual in zip(y_true, y_predicted):
        if expected == actual:
            hits += 1
    return hits / len(y_true)

## Cross Validation

In [4]:
# Extend the module search path before importing BayesClassifier from bayes_F.
# NOTE(review): presumably bayes_F.py lives in '../../../cf'; the path is
# relative, so it depends on the kernel's working directory -- confirm.
import sys
sys.path.append('../../../cf')
from bayes_F import BayesClassifier 

In [5]:
def cross_validate(messages, classifier_params):
    """Score one parameter set by holding out each part of ``messages`` in turn.

    For every index ``k`` the part ``messages[k]`` is the test set and all the
    other parts together form the training set. A fresh ``BayesClassifier`` is
    built from ``classifier_params``, fitted on the whitespace-tokenised
    training texts, and scored on the held-out part with
    ``get_accuracy_score``.

    Args:
        messages: List of parts; each part is a collection of messages where
            item ``[0]`` is the text (a string) and item ``[1]`` is the label.
        classifier_params: Positional arguments unpacked into the
            ``BayesClassifier`` constructor.

    Returns:
        Tuple ``(mean accuracy over all parts, classifier_params)``.
    """
    fold_scores = []
    for held_out_idx in tqdm(range(len(messages))):
        held_out = messages[held_out_idx]
        training_parts = messages[:held_out_idx] + messages[held_out_idx + 1:]

        # Flatten the remaining parts into one training set.
        training_messages = [msg for part in training_parts for msg in part]
        X_train = [msg[0].split() for msg in training_messages]
        y_train = [msg[1] for msg in training_messages]

        model = BayesClassifier(*classifier_params)
        model.fit(X_train, y_train)

        # Pair each true label with the prediction for the same message.
        labelled = [(msg[1], model.predict(msg[0].split())) for msg in held_out]
        y_true = [truth for truth, _ in labelled]
        y_predicted = [guess for _, guess in labelled]
        fold_scores.append(get_accuracy_score(y_true, y_predicted))

    mean_score = sum(fold_scores) / len(fold_scores)
    return mean_score, classifier_params

## Hyperparameter Optimization

In [6]:
import multiprocessing as mp

def hyperparameter_optimization():
    """Grid-search classifier parameters by cross-validated accuracy.

    Builds every combination of ``alpha`` in ``[1e-5, 1e-2]`` and ``gram_len``
    in ``[1, 2, 3]`` (the third parameter is fixed to ``[1, 1]``), evaluates
    each one with ``cross_validate`` on the module-level ``messages`` in a
    process pool, prints all results and the winner, and returns the winner.

    Returns:
        Tuple ``(best_accuracy_score, best_params)``; on ties the first
        candidate in grid order wins.
    """
    all_params = [
        [alpha, gram_len, [1, 1]]
        for alpha in [1e-5, 1e-2]
        for gram_len in [1, 2, 3]
    ]

    # Pool.apply blocks until its single task has finished, so calling it in a
    # loop evaluates the candidates one after another. starmap hands all
    # candidates to the pool at once and still returns results in input order.
    # The with-block also guarantees the workers are cleaned up if a task raises.
    with mp.Pool(mp.cpu_count()) as pool:
        results = pool.starmap(
            cross_validate,
            [(messages, params) for params in all_params],
        )
    print(results)

    best_accuracy_score = -1
    best_params = None
    for result in results:
        # Strict '>' keeps the earliest candidate when scores tie.
        if result[0] > best_accuracy_score:
            best_accuracy_score = result[0]
            best_params = result[1]
    print('Best accuracy score is {0} for params {1}'.format(best_accuracy_score, best_params))
    return best_accuracy_score, best_params

## Model analysis

In [None]:
# Run the grid search; as the cell's last expression, the returned
# (best_accuracy_score, best_params) tuple is shown as the cell output.
hyperparameter_optimization()