In this notebook I compare the smoothing models ("mle", "sb", "lp", "kn") across n-gram orders N = 1 to 5 to find which combination produces the best results on the development set.

In [1]:
from data_processor import DataProcessor
import ngram_authorship_classifier
import os
from dotenv import load_dotenv

Load Data

In [3]:
# Read variables from a local .env file (if present) into the process environment.
load_dotenv()

# Directory containing the per-author text files, taken from TRAIN_DATA_PATH.
data_dir = os.getenv('TRAIN_DATA_PATH')
if data_dir is None:
    # Fail here with an actionable message instead of letting a later
    # os.path.join(None, ...) raise an opaque TypeError.
    raise RuntimeError(
        "TRAIN_DATA_PATH is not set; define it in the environment or in a .env file"
    )

In [4]:
# Text file for each author and, in the same order, the label used for that author.
author_files = [
    "austen_utf8.txt",
    "dickens_utf8.txt",
    "tolstoy_utf8.txt",
    "wilde_utf8.txt",
]
author_names = [
    "Austen",
    "Dickens",
    "Tolstoy",
    "Wilde",
]

data_proc = DataProcessor()

# Author label -> split returned by DataProcessor.process_split_file.
authors_train_data = {}
authors_test_data = {}  # holds the development split, despite the name
for author, filename in zip(author_names, author_files):
    print(f"Processing data for author: {author}")
    author_path = os.path.join(data_dir, filename)
    trainset, devset = data_proc.process_split_file(author_path)
    authors_train_data[author] = trainset
    authors_test_data[author] = devset


  from .autonotebook import tqdm as notebook_tqdm


Processing data for author: Austen
Splitting into training and development...
Processing data for author: Dickens
Splitting into training and development...
Processing data for author: Tolstoy
Splitting into training and development...
Processing data for author: Wilde
Splitting into training and development...


Trying different combinations

In [44]:
# Smoothing identifiers; each is passed as the classifier's `smoothing` argument.
models = [
    "mle",
    "sb",
    "lp",
    "kn",
]
# N-gram orders to evaluate: 1 through 5 inclusive.
ngrams = list(range(1, 6))

In [35]:
# Sanity check: show which author labels have a development split loaded.
print(authors_test_data.keys())

dict_keys(['Austen', 'Dickens', 'Tolstoy', 'Wilde'])


Evaluating the Different Combinations

In [53]:
# Re-import the classifier module inside the running kernel; presumably done
# so that edits to ngram_authorship_classifier.py take effect without a
# kernel restart. The reload call is the cell's last expression, so the
# reloaded module object is displayed as the cell output.
import importlib
importlib.reload(ngram_authorship_classifier)

<module 'ngram_authorship_classifier' from 'c:\\Users\\Yassin\\Desktop\\NLP\\Homeworks\\HW3\\ngram_authorship_classifier.py'>

In [54]:
# Grid search over every (smoothing model, n-gram order) pair: train a fresh
# classifier on the training split, score it on the development split, and
# remember the highest-scoring configuration.
# NOTE(review): assumes evaluate_devset returns a single comparable number
# (the name `avg_acc` suggests average accuracy across authors) — confirm.
dev_scores = {}  # (model, n) -> score, kept so every combination can be compared afterwards
best_acc = float("-inf")  # lower than any real score, so the first result is always selected
best_model = None
best_n = None
for model in models:
    for n in ngrams:
        print(f"Training model: {model} with ngram: {n}")
        classifier = ngram_authorship_classifier.NgramAuthorshipClassifier(smoothing=model, n=n)
        classifier.train(authors_train_data)
        print(f"Evaluating model: {model} with ngram: {n}")
        avg_acc = classifier.evaluate_devset(authors_test_data, show_accuracy=True)
        dev_scores[(model, n)] = avg_acc
        # Strict '>' keeps the earliest configuration when scores tie.
        if avg_acc > best_acc:
            best_acc = avg_acc
            best_model = model
            best_n = n

Training model: mle with ngram: 1
Training LMs... (this may take a while)
Evaluating model: mle with ngram: 1
Results on dev set:
Austen 	 0.93 correct
Dickens 	 0.37 correct
Tolstoy 	 0.45 correct
Wilde 	 0.40 correct
Training model: mle with ngram: 2
Training LMs... (this may take a while)
Evaluating model: mle with ngram: 2
Results on dev set:
Austen 	 0.99 correct
Dickens 	 0.05 correct
Tolstoy 	 0.02 correct
Wilde 	 0.04 correct
Training model: mle with ngram: 3
Training LMs... (this may take a while)
Evaluating model: mle with ngram: 3
Results on dev set:
Austen 	 1.00 correct
Dickens 	 0.02 correct
Tolstoy 	 0.01 correct
Wilde 	 0.01 correct
Training model: mle with ngram: 4
Training LMs... (this may take a while)
Evaluating model: mle with ngram: 4
Results on dev set:
Austen 	 1.00 correct
Dickens 	 0.01 correct
Tolstoy 	 0.01 correct
Wilde 	 0.01 correct
Training model: mle with ngram: 5
Training LMs... (this may take a while)
Evaluating model: mle with ngram: 5
Results on dev

In [55]:
# Report the selected configuration. An f-string produces the same text as the
# previous string concatenation, but still prints (showing "None") when no
# configuration was selected instead of raising TypeError on str + None.
print(f"Best model: {best_model} with ngram: {best_n} with accuracy: {best_acc}")

Best model: lp with ngram: 1 with accuracy: 0.9289940828402367
