In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from data_processor import DataProcessor
import hf_sequence_classifier as hf
import os
from dotenv import load_dotenv

### Loading Data

In [6]:
# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Directory that holds the per-author text files read further down.
data_dir = os.getenv('TRAIN_DATA_PATH')
if data_dir is None:
    # Fail here with an actionable message: an unset variable would otherwise
    # only surface later as a TypeError inside os.path.join(data_dir, ...).
    raise RuntimeError(
        "TRAIN_DATA_PATH is not set; define it in the environment or in a .env file."
    )

In [7]:
# Source text files and the author label for each one, kept in matching order.
author_files = ["austen_utf8.txt", "dickens_utf8.txt", "tolstoy_utf8.txt", "wilde_utf8.txt"]
author_names = ["Austen", "Dickens", "Tolstoy", "Wilde"]

data_proc = DataProcessor()

# Both dicts are keyed by author name. process_split_file returns a pair per
# file: the first element goes to authors_train_data, the second (the
# development split) goes to authors_test_data.
authors_train_data = {}
authors_test_data = {}
for author_name, author_file in zip(author_names, author_files):
    print("Processing data for author: " + author_name)
    author_path = os.path.join(data_dir, author_file)
    trainset, devset = data_proc.process_split_file(author_path)
    authors_train_data[author_name] = trainset
    authors_test_data[author_name] = devset


Processing data for author: Austen
Splitting into training and development...
Processing data for author: Dickens
Splitting into training and development...
Processing data for author: Tolstoy
Splitting into training and development...
Processing data for author: Wilde
Splitting into training and development...


In [27]:
import random

In [None]:
# Fixed seed so the evaluation subset is identical on every run.
SAMPLE_SEED = 42
# Upper bound on how many held-out items are evaluated per author.
MAX_SAMPLES_PER_AUTHOR = 50

# A dedicated generator (rather than the module-level random state) makes this
# cell idempotent: re-running it always draws the same items.
sample_rng = random.Random(SAMPLE_SEED)

# Author name -> random subset of that author's held-out data. Authors with
# fewer than MAX_SAMPLES_PER_AUTHOR items contribute all of them.
authors_test_data_sample = dict()
for author_name, held_out in authors_test_data.items():
    sample_size = min(MAX_SAMPLES_PER_AUTHOR, len(held_out))
    authors_test_data_sample[author_name] = sample_rng.sample(held_out, sample_size)

# Report only the sample sizes; printing every sampled text floods the output.
print("Authors test data sample:")
for author_name, sampled in authors_test_data_sample.items():
    print(f"  {author_name}: {len(sampled)} of {len(authors_test_data[author_name])} items")

### Load the Model

In [46]:
# Development aid: re-execute the hf_sequence_classifier module so that edits
# made to its source file take effect without restarting the kernel.
# NOTE(review): objects created before the reload keep referring to the old
# class definitions, so anything built from `hf` has to be re-created afterwards.
import importlib
# Left as the last expression so the cell displays the reloaded module.
importlib.reload(hf)

<module 'hf_sequence_classifier' from 'c:\\Users\\Yassin\\Desktop\\NLP\\Homeworks\\HW3\\hf_sequence_classifier.py'>

In [47]:
# Directory holding the saved model and tokenizer.
checkpoint_dir = "./results_backup"
# Author labels, in the insertion order of authors_test_data.
label_names = list(authors_test_data)

# Build the classifier wrapper with one output label per author.
classifier = hf.SequenceClassifier(num_labels=len(label_names))

# Attach the model and tokenizer saved on disk.
classifier.model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
classifier.tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

# Rebuild the index -> author mapping from the label order above.
# NOTE(review): this assumes the same label order that was used when the saved
# model was trained -- confirm against the training notebook.
classifier.id2label = dict(enumerate(label_names))


In [48]:
# Sanity check: show the restored label mapping and the classifier's label count.
for caption, value in (
    ("id2label mapping:", classifier.id2label),
    ("Model num_labels:", classifier.num_labels),
):
    print(caption, value)


id2label mapping: {0: 'Austen', 1: 'Dickens', 2: 'Tolstoy', 3: 'Wilde'}
Model num_labels: 4


In [49]:
# Now evaluate
pred = classifier.evaluate_devset(authors_test_data_sample, show_accuracy=True)

Results on dev set:
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actual: Austen
predicted: Austen actua