In this notebook I test different smoothing models with different n-gram orders (N) to check which combination produces the best results.

In [2]:
from data_processor import DataProcessor
import ngram_authorship_classifier
import os
from dotenv import load_dotenv

Load Data

In [3]:
# Read settings from a local .env file (if any) into the process environment,
# then resolve the directory that holds the per-author training texts.
load_dotenv()
data_dir = os.getenv('TRAIN_DATA_PATH')
# os.getenv returns None when the variable is missing; fail here with a clear
# message instead of a TypeError later when the path is joined.
if not data_dir:
    raise RuntimeError(
        "TRAIN_DATA_PATH is not set; define it in the environment or in a .env file"
    )

data_dir: ./author_texts
Contents of data_dir: ['dickens_utf8.txt', 'austen_utf8.txt', 'tolstoy_utf8.txt', 'wilde_utf8.txt']


In [4]:
# Corpus file for each author, paired position-by-position with its label.
author_files = [
    "austen_utf8.txt",
    "dickens_utf8.txt",
    "tolstoy_utf8.txt",
    "wilde_utf8.txt",
]
author_names = ["Austen", "Dickens", "Tolstoy", "Wilde"]

data_proc = DataProcessor()

# Author label -> split returned by process_split_file (train part / dev part).
authors_train_data = {}
authors_test_data = {}
for author_name, author_file in zip(author_names, author_files):
    print("Processing data for author: " + author_name)
    corpus_path = os.path.join(data_dir, author_file)
    trainset, devset = data_proc.process_split_file(corpus_path)
    authors_train_data[author_name] = trainset
    authors_test_data[author_name] = devset


Processing data for author: Austen
Splitting into training and development...
Processing data for author: Dickens
Splitting into training and development...
Processing data for author: Tolstoy
Splitting into training and development...
Processing data for author: Wilde
Splitting into training and development...


Trying different combinations

In [5]:
# Identifiers passed as `smoothing=` to the classifier
# (presumably MLE, stupid backoff, Laplace and Kneser-Ney -- confirm in ngram_authorship_classifier).
models = [
    "mle",
    "sb",
    "lp",
    "kn",
]
# n-gram orders to evaluate: 1 through 5 inclusive.
ngrams = list(range(1, 6))

In [6]:
# Sanity check: show which author labels are present in the held-out (dev) split.
print(authors_test_data.keys())

dict_keys(['Austen', 'Dickens', 'Tolstoy', 'Wilde'])


Evaluating the Different Combinations

In [15]:
# Re-import the classifier module so that edits made to
# ngram_authorship_classifier.py take effect without restarting the kernel.
# Objects created before the reload keep using the old class definitions.
import importlib
importlib.reload(ngram_authorship_classifier)

<module 'ngram_authorship_classifier' from '/content/ngram_authorship_classifier.py'>

In [8]:
# Grid search: train and evaluate one classifier for every
# (smoothing, n) combination and remember the best-scoring one.
best_acc = 0
best_model = None
best_n = None
# (smoothing, n) -> value returned by evaluate_devset, kept so every
# configuration can be compared afterwards instead of only the maximum.
accuracy_by_config = {}
for model in models:
    for n in ngrams:
        print(f"Training model: {model} with ngram: {n}")
        classifier = ngram_authorship_classifier.NgramAuthorshipClassifier(smoothing=model, n=n)
        classifier.train(authors_train_data)
        print(f"Evaluating model: {model} with ngram: {n}")
        avg_acc = classifier.evaluate_devset(authors_test_data, show_accuracy=True)
        accuracy_by_config[(model, n)] = avg_acc
        # Strict '>' keeps the first configuration seen when scores tie.
        if avg_acc > best_acc:
            best_acc = avg_acc
            best_model = model
            best_n = n

Training model: mle with ngram: 1
Training LMs... (this may take a while)
Evaluating model: mle with ngram: 1
Results on dev set:
Austen 	 88.77% correct
Dickens 	 87.11% correct
Tolstoy 	 93.09% correct
Wilde 	 89.94% correct
Training model: mle with ngram: 2
Training LMs... (this may take a while)
Evaluating model: mle with ngram: 2
Results on dev set:
Austen 	 89.84% correct
Dickens 	 84.77% correct
Tolstoy 	 92.82% correct
Wilde 	 79.88% correct
Training model: mle with ngram: 3
Training LMs... (this may take a while)
Evaluating model: mle with ngram: 3
Results on dev set:
Austen 	 86.63% correct
Dickens 	 79.69% correct
Tolstoy 	 92.54% correct
Wilde 	 71.60% correct
Training model: mle with ngram: 4
Training LMs... (this may take a while)
Evaluating model: mle with ngram: 4
Results on dev set:
Austen 	 79.68% correct
Dickens 	 75.78% correct
Tolstoy 	 85.91% correct
Wilde 	 63.91% correct
Training model: mle with ngram: 5
Training LMs... (this may take a while)
Evaluating model: 

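The best-performing configuration is n = 1, with an accuracy of 93%. Note that there is a problem in the accuracy function, so we computed this figure manually from the per-author results printed above for each model and n; the "best model" value printed by the next cell comes from that same function and may therefore not be reliable.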

In [23]:
# Report the winner of the grid search. An f-string is used so the cell still
# prints (showing "None") when no configuration beat the initial best_acc of 0,
# instead of raising TypeError on str + None.
print(f"Best model: {best_model} with ngram: {best_n} with accuracy: {best_acc}")

Best model: lp with ngram: 5 with accuracy: 0.9822485207100592


In [25]:
# Refit a unigram (n=1) classifier with the "lp" smoothing option on the
# training split, then list the five most frequent n-grams per author.
classifier = ngram_authorship_classifier.NgramAuthorshipClassifier(
    smoothing="lp",
    n=1,
)
classifier.train(authors_train_data)

top_features = classifier.get_top_features(top_k=5)

print("\n")
print("Top Five Most common features for authors \n")

# One header line per author followed by one line per (ngram, count) pair.
for author, features in top_features.items():
    report_lines = [f"Top features for {author}:"]
    report_lines.extend(
        f"  {ngram} -> {count} times" for ngram, count in features
    )
    print("\n".join(report_lines))

Training LMs... (this may take a while)


Top Five Most common features for authors 

Top features for Austen:
  (',',) -> 8683 times
  ('.',) -> 4244 times
  ('the',) -> 3752 times
  ('to',) -> 3645 times
  ('of',) -> 3244 times
Top features for Dickens:
  (',',) -> 10806 times
  ('.',) -> 4472 times
  ('the',) -> 4403 times
  ('and',) -> 3482 times
  ('I',) -> 3231 times
Top features for Tolstoy:
  (',',) -> 13075 times
  ('the',) -> 8187 times
  ('.',) -> 7255 times
  ('and',) -> 5671 times
  ('to',) -> 4578 times
Top features for Wilde:
  (',',) -> 6289 times
  ('.',) -> 5067 times
  ('the',) -> 3972 times
  ('and',) -> 2492 times
  ('of',) -> 2389 times
