# SHAKESPEAREAN PRONOUN TAGGER  
A project by Maria Irena Szawerna for Machine learning for statistical NLP: Advanced LT2326 at the University of Gothenburg.  

The aim of this project is testing whether it is possible to build a tagger for "you" pronouns in Shakespeare's works (whether they are singular, plural, or unknown) using modern tools (BERT embeddings and LSTM models) and annotated plays ("Hamlet" and "As You Like It") as the training data.

### IMPORTING THE NECESSARY LIBRARIES

In [1]:
# uncomment once if running on mltgpu
# pip install nltk

In [2]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import random
import pandas as pd
import math
import transformers
from transformers import BertTokenizerFast
from transformers import BertModel
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.optim as optim
import pickle
import sklearn

### DATA-PROCESSING FUNCTIONS

In [3]:
# defining the files the data will be sourced from
shakespeare_plays = ["trimmed-hamlet.txt", "trimmed-as-you-like-it.txt"]

In [4]:
def extract_you_sents(filename, double=False):
# this function is intended for opening a file with trimmed (character names and didaskalia removed), annotated play data; it
# takes as an argument the name of the file and returns a list of sentences from the file that include any form of a "you" 
# pronoun, with all newline, tabs, and other such stuff removed.
    with open(filename) as f: 
        # reading from the file
        text = f.read()
        # splitting the text into sentences using NLTK's sentence tokenizer
        sents = sent_tokenize(text)
        # the sentences provided by NLTK still have newline characters in them, inside the lines (as they come from a play, 
        # not a book) which need to be removed
        # we also only need the sentences with "you" pronouns, and an option to only pick out sentences with one such pronoun
        # present.
    double_sents = []  # for more than one occurrence of "you"
    single_sents = []  # for exactly one occurrence of "you"
    for sent in sents:
        tracker = 0  # to track how many "you" forms there are in the sentence
        for element in word_tokenize(sent):
            if '_SG' in element or '_PL' in element or '_UNK' in element:
                tracker += 1
        if tracker > 1:
            double_sents.append(sent.replace('\n', ' ').replace('\t', '').replace('\\', ''))
        elif tracker == 1:
            single_sents.append(sent.replace('\n', ' ').replace('\t', '').replace('\\', ''))
        else:  # if there are no "you" forms
            continue
    
    # deciding which sentences will be returned by the function, the ones with only one "you" or also the ones with
    # multiple ones
    if double == False:
        you_sents = single_sents
    else:
        you_sents = single_sents + double_sents

    return you_sents

In [5]:
def extract_from_files(file_list, double=False):
# this function allows for obtaining you-sentences from many files simultaneously
    all_sents = []
    for filename in file_list:
        file_sents = extract_you_sents(filename, double=double)
        all_sents += file_sents
    
    return all_sents

In [6]:
def create_samples(sentence_list, tokenizer):
    # this function takes in a list of sentences with "you" pronoun(s) in them and creates samples of the shape (tokenized_sent,
    # class, pronoun_index, sentence_length)
    # the "you" pronouns are defined as the following variants: you -> [ye] [your] [yours] [yourself] [yourselves], as used
    # in the original study in the corpus analysis software AntConc
    # maximum_length is set to 200 as this is a bit above what the longest sentence is in this dataset; however, since we are
    # not feeding this to the tokenizer in batches, we cannot just pad 
    prons = ['you', 'ye', 'your', 'yours', 'yourself', 'yourselves']  # all the "you" forms
    converted_prons = tokenizer.convert_tokens_to_ids(prons)
    conv_underscore = tokenizer.convert_tokens_to_ids('_')
    
    all_samples = []
    for sent in sentence_list:
        sent_cls = []
        tokenized = tokenizer(sent)  # tokenizing the sentence using BERT tokenizer
        for i in range(0, len(tokenized['input_ids'])):
            # iterating through the tokenized sentence to find all the classes
            if tokenized['input_ids'][i] == conv_underscore:
                cls = tokenizer.convert_ids_to_tokens(tokenized['input_ids'][i+1])
                sent_cls.append(cls)

        # removing the annotation (as now we have that information ordered in sent_clss)
        new_sent = sent.replace('_SG', '').replace('_PL', '').replace('_UNK', '')
        
        # creating and appending a sample for each "you" pronoun in the sentence; it is necessary to use the tokenizer on
        # it to retrieve the appropriate index (and this is important as we have samples where there may be more than one
        # "you" token); we do not save the tokenized sentence as we actually want those to be padded to the length of the
        # longest sentence in the batch, which is also why we are saving the length here
        full_sent = tokenizer(new_sent)
        for i in range(0, len(full_sent['input_ids'])):
            if full_sent['input_ids'][i] in converted_prons:
                cls = sent_cls.pop(0)
                idx = i  
                all_samples.append([new_sent, cls, idx])
    
    return all_samples

In [7]:
def samples_splits(samples, train_split=0.8, test_split=0.2, equalize=True):
    # creating a function to mix and split the data stored in a list according to parameters introduced by the user
    # avoiding potential errors
    if (train_split + test_split) != 1:
        print('Invalid data split proportions!')
        return
    
    if equalize == False:
        random.shuffle(samples)  # shuffling the data 
        test_size = math.ceil(len(samples) * test_split)  # deciding the point at which to split the samples
        test_samples = samples[:test_size]
        train_samples = samples[test_size:]
    
    # implementing "equalizing" so that all classes are equally represented, removing UNK as that one is significantly smaller
    # than the other classes, effectively turning this into a binary classification problem
    else:
        sg_samples = []
        pl_samples = []
        for sample in samples:
            if sample[1] == 'sg':
                sg_samples.append(sample)
            elif sample[1] == 'pl':
                pl_samples.append(sample)
            else:
                continue
        
        if len(sg_samples) > len(pl_samples):
            max_len = len(pl_samples)
        else:
            max_len = len(sg_samples)
        
        test_size = math.ceil(max_len * test_split)
            
        random.shuffle(sg_samples)
        random.shuffle(pl_samples)
        
        selected_sg = sg_samples[:max_len]
        selected_pl = pl_samples[:max_len]
        
        test_sg = selected_sg[:test_size]
        train_sg = selected_sg[test_size:]
        
        test_pl = selected_pl[:test_size]
        train_pl = selected_pl[test_size:]
        
        test_samples = test_sg + test_pl
        train_samples = train_sg + train_pl
        
        random.shuffle(test_samples)
        random.shuffle(train_samples)
    
    return train_samples, test_samples

### DATA-ENCODING AND BATCHING FUNCTIONS

In [8]:
# BERT is imported and set in eval mode so that we can retrieve embeddings from it
bert_model = BertModel.from_pretrained(model_name, return_dict=True, output_hidden_states=True)
bert_model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [9]:
def get_embeddings(input_ids, attention_mask, bert_device):
    # a function to retrieve the word embeddings from BERT, required BERT to be initialized as bert_model beforehand and put
    # in eval mode (see above), all of that with return_hidden_states=True
    # inspired by https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#3-extracting-embeddings
    # the same function was used in my Assignment 1 for this course
    with torch.no_grad():
        # getting the parts of the batch that are relevant
        bert_model.to(bert_device)
        output = bert_model(input_ids, attention_mask)
        # hidden states for every layer are stored here
        hidden_states = output[2]
        # there are 13 elements in the hidden states, meaning the initial embeddings and hidden states from 12 layers;
        # according to the link I provided above, there are mant approaches for what elements of these to use as word
        # embeddings; I will use the second-to-last hidden layer
        penultimate_layer = hidden_states[11]
        # the penultimate layer now has the shape of batch * max_len of the sentence * 768 (BERT embedding size)
    return penultimate_layer

In [10]:
class ShakespeareCollate():
    # custom collate class that processes the data from the batched list of samples as they are accessed 
    def __init__(self, tokenizer, device, bert_device):
        # defining the tokenizer and the devices on which BERT and the actual model will run
        self.tokenizer = tokenizer
        self.device = device
        self.bert_device = bert_device
        
    def __call__(self, batch):
        # performing the necessary operations on every batch
        sentences = []
        classes = []
        indices = []
        
        for element in batch:
            sentences.append(element[0]) 
            classes.append(element[1])
            indices.append(torch.tensor([element[2]]))
        
        # one-hot encoding classes    
        for i in range(0, len(classes)):
            if classes[i] == 'sg':
                classes[i] = torch.FloatTensor([1])
            elif classes[i] == 'pl':
                classes[i] = torch.FloatTensor([0])

        # calling the BERT tokenizer on the sentences
        tokens = self.tokenizer(sentences, 
                                            add_special_tokens=True, 
                                            return_tensors='pt',
                                            padding=True, 
                                            return_attention_mask=True,
                                            is_split_into_words=False)
         
        input_ids=tokens['input_ids'].to(self.bert_device)
        attention_masks=tokens['attention_mask'].to(self.bert_device)
        # calling the embeddings function since we want to feed the network the BERT embeddings; make sure that BERT is
        # on the same device as the rest before that
        # all of the output is moved to the model device, classes and indices are stacked since it has to be a tensor, 
        # not a list
        bert_embeddings = get_embeddings(input_ids, attention_masks, self.bert_device).to(self.device)  
        classes = torch.stack(classes).to(self.device)
        indices = torch.stack(indices).to(self.device)
        
        return bert_embeddings, classes, indices

In [11]:
def shakespeare_dataloader(data, tokenizer, device, bert_device, batch_size=32, shuffle=True): 
    # little function to easily obtain dataloaders using our dataset and custom ShakespeareCollate class which does all the
    # hard work of transforming sentences into BERT embedding representations, one-hot-encoding the classes and turning what
    # is needed to be turned into tensors.
    loader = DataLoader(
        data,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=True,
        collate_fn=ShakespeareCollate(tokenizer, device, bert_device) )
    return loader

### TRAINING LOOP AND MODEL SAVING

In [12]:
class ShakespeareanClassifier(nn.Module):
    # class of the model itself; 
    def __init__(self, hidden_size):
        super(ShakespeareanClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.output_dim = 1  # for the binary classification 0/1
        self.embedding_dim = 768  # BERT embedding size


        # the text processing layer is a bidirectional LSTM; the input size here is the size of BERT embeddings, the hidden
        # size is set by the user; it is a bidirectional LSTM 
        self.LSTM = nn.LSTM(
                            input_size=self.embedding_dim, 
                            hidden_size=self.hidden_size, 
                            num_layers=1, 
                            batch_first=True, 
                            bidirectional=True
                            )
        
        self.classification = nn.Sequential( 
                            nn.Dropout(0.05),
                            nn.Linear(self.hidden_size*2, self.hidden_size),
                            nn.LeakyReLU(),
                            nn.Linear(self.hidden_size, self.output_dim),
                            nn.Sigmoid()
        )


    def forward(self, bert_embeddings, indices, device):
        # for calling the model we need to input the BERT embeddings from the batch 
        converted_embeddings = bert_embeddings.type(torch.FloatTensor).to(device)

        # the captions are fed through the LSTM, and the hidden state at the index of the pronoun is saved, as per the 
        # documentation "When bidirectional=True, output will contain a concatenation of the forward and reverse hidden 
        # states at each time step in the sequence." (https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html), so
        # I do not need to concatenate anything myself and selecting an indexed timestep representation already gives
        # me something that a) is double the size of the hidden layer b) contains information from both directions
        timestep_representation, (final_hidden, final_cell) = self.LSTM(converted_embeddings)
        processed_embeddings = torch.stack([tp[i] for i, tp in zip(indices, timestep_representation)])
        # the data is fed to the classification layer
        output = self.classification(processed_embeddings)

        return output

In [13]:
def train(model, params, train_samples):
    # this function is intended for training an instance of the ShakespeareanClassifier
    model.to(params['device'])
    loss_function = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=params['lr'])

    for epoch in range(1,params['epochs']+1):  # so that it prints out nicely
    
        train_iter = shakespeare_dataloader(
            train_samples, tokenizer, params['device'], params['bert_device'], batch_size=params['batch_size'], shuffle=True
        )
        
        total_loss = 0
        
        for i, batch in enumerate(train_iter):

            if True:  # i < 20: For testing
                sentences = batch[0]
                classes = batch[1]
                indices = batch[2]

                # send your batch of sentences to the model
                output = model(sentences, indices, params['device'])
                
                loss = loss_function(torch.squeeze(output, dim=1), classes)  # output is squeezed to fit the original classes
                total_loss += loss.item()

                if i%5==0:
                    print(f' Batch {i} : Average Loss = {round(total_loss/(i+1),5)}')#, end='\r')

                # calculate gradients
                loss.backward()
                # update model weights
                optimizer.step()
                # reset gradients
                optimizer.zero_grad()


        print(f'Epoch {epoch} : Average Training Loss = {round(total_loss/(i+1),5)}')#, end='\r')
        
    return model

In [14]:
def save(item, file_name):
    # a small function intended to be used to save a trained model
    pickle.dump(item, open(file_name, 'wb'))

def load(file_name):
    # a small function intended to be used to load a trained model
    item = pickle.load(open(file_name, 'rb'))

    return item

### TESTING AND EVALUATING THE MODEL

In [15]:
def test(model, params, test_samples):
    model.to(params['device'])
    model.eval()
    all_predictions = []
    all_classes = []
    with torch.no_grad():
        test_iter = shakespeare_dataloader(
            test_samples, tokenizer, params['device'], params['bert_device'], batch_size=params['batch_size'], shuffle=False
        )    
        for j, batch in enumerate(test_iter):
            sentences = batch[0]
            indices = batch[2]

            o = model(sentences, indices, params['device'])
            predictions = torch.squeeze(o).tolist()  # making sure that we get predictions in the correct format
            # encoding the predictions to reflect not probabilities, but classes; 
            for i in range(0, len(predictions)):
                if predictions[i] > 0.5:
                    predictions[i] = 1.0
                else:
                    predictions[i] = 0.0
            all_predictions += predictions
            all_classes += torch.squeeze(batch[1]).tolist()

    print('Testing complete!')

    return all_predictions, all_classes              

In [16]:
def measures(predicted_classes, true_classes):
    # this function goes over the output of the testing function and provides some basic evaluation measures, such as
    # accuracy, recall, precision, and f1, using sklearn's functions (although I did manually implement it in assignment 1)
    print('The following measures have been recorded for this model:')
    
    accuracy = sklearn.metrics.accuracy_score(true_classes, predicted_classes)
    recall = sklearn.metrics.recall_score(true_classes, predicted_classes)
    precision = sklearn.metrics.precision_score(true_classes, predicted_classes)
    f1 = sklearn.metrics.f1_score(true_classes, predicted_classes)
    
    print(f'\tAccuracy = {accuracy}')
    print(f'\tRecall = {recall}')
    print(f'\tPrecision = {precision}')
    print(f'\tF1 = {f1}')  

In [17]:
def make_dataframe(predicted_classes, test_samples):
    # a function for easy making of a dataframe to display the results together with the sentences
    rows = []
    for i in range(0, len(predicted_classes)):
        prediction = predicted_classes[i]
        if prediction == 1.0:
            prediction = 'sg'
        else:
            prediction = 'pl'
        truth = test_samples[i][1]
        sentence = test_samples[i][0]
        rows.append([sentence, prediction, truth])

    model_df = pd.DataFrame(rows)
    model_df.columns = ["sentence", "predicted class", "true class"]

    return model_df

### ANNOTATING NEW DATA

In [18]:
def annotate_sents(filename, tokenizer, model, params):
    # this function 
    full_play = []
    prons = ['you', 'ye', 'your', 'yours', 'yourself', 'yourselves']  # all the "you" forms
    pron_set = set(prons)
    converted_prons = tokenizer.convert_tokens_to_ids(prons)
    model.to(params['device'])
    model.eval
    with open(filename) as f: 
        # reading from the file
        text = f.read()
        # splitting the text into sentences using NLTK's sentence tokenizer
        sents = sent_tokenize(text)

    for sent in sents:
        if '[' in sent:
            full_play.append(sent)
        elif ']' in sent:
            full_play.append(sent)
        elif '=' in sent:
            full_play.append(sent)
        else:
            split_sent = sent.split()
            sent_set = set(split_sent)
            if not (pron_set & sent_set):
                full_play.append(sent)
            else:
                new_sent = " ".join([word for word in split_sent if not word.isupper()])
                
                sent_samples = []
                tokenized = tokenizer(new_sent)  # tokenizing the sentence using BERT tokenizer
                for i in range(0, len(tokenized['input_ids'])):
                # iterating through the tokenized sentence to find all the classes
                    if tokenized['input_ids'][i] in converted_prons:
                        pron_samples = []
                        idx = i
                        cls = 'sg'  # need to add something for it to work
                        pron_samples.append([new_sent, cls, idx])
                        for j in range(0, params['batch_size']):  # so we have a full batch and it can drop one "last" batch
                            pron_samples.append(['dummy sentence', 'pl', 1])
                        sent_samples.append(pron_samples)
                        
                preds = []    
                for sample in sent_samples:
                    with torch.no_grad():
                        sent_iter = shakespeare_dataloader(
                            sample, tokenizer, params['device'], params['bert_device'], batch_size=params['batch_size'], shuffle=False
                        ) 
                        for j, batch in enumerate(sent_iter):
                            sentences = batch[0]
                            indices = batch[2]

                            o = model(sentences, indices, params['device'])
                            predictions = torch.squeeze(o).tolist()
                            
                            if predictions[0] > 0.5:
                                preds.append('_SG')
                            else:
                                preds.append('_PL')
                new_sent = []
                for word in split_sent:
                    if word.lower() in prons:
                        pred = preds.pop(0)
                        new_word = word + pred
                        new_sent.append(new_word)
                    else:
                        new_sent.append(word)
                        
                annotated_sent = " ".join(new_sent)
                full_play.append(annotated_sent)
                
    return full_play

In [19]:
def save_play(text, filename):
    #
    with open(filename,'w') as file:
        for line in text:
            file.write(line)
            file.write('\n')  

### RUNNING THE CODE

In [18]:
you_sents = extract_from_files(shakespeare_plays, double=True)
samples = create_samples(you_sents, tokenizer)

In [19]:
train_samples, test_samples = samples_splits(samples)

In [20]:
save(train_samples, 'train_samples_full.pickle')
save(test_samples, 'test_samples_full.pickle')

In [54]:
params = {'lr':0.00005, 'batch_size':8, 'hidden_size':1024, 'epochs':10, 'device':'cuda:2', 'bert_device':'cuda:3'} 
model = ShakespeareanClassifier(params['hidden_size'])
trained_model = train(model, params, train_samples)

 Batch 0 : Average Loss = 0.69039
 Batch 5 : Average Loss = 0.69695
 Batch 10 : Average Loss = 0.68914
 Batch 15 : Average Loss = 0.6905
 Batch 20 : Average Loss = 0.68573
 Batch 25 : Average Loss = 0.68191
 Batch 30 : Average Loss = 0.67996
 Batch 35 : Average Loss = 0.67833
 Batch 40 : Average Loss = 0.67587
Epoch 1 : Average Training Loss = 0.67619
 Batch 0 : Average Loss = 0.61459
 Batch 5 : Average Loss = 0.59897
 Batch 10 : Average Loss = 0.61753
 Batch 15 : Average Loss = 0.62454
 Batch 20 : Average Loss = 0.60617
 Batch 25 : Average Loss = 0.60339
 Batch 30 : Average Loss = 0.59976
 Batch 35 : Average Loss = 0.59833
 Batch 40 : Average Loss = 0.59476
Epoch 2 : Average Training Loss = 0.59147
 Batch 0 : Average Loss = 0.45151
 Batch 5 : Average Loss = 0.49626
 Batch 10 : Average Loss = 0.45687
 Batch 15 : Average Loss = 0.486
 Batch 20 : Average Loss = 0.48431
 Batch 25 : Average Loss = 0.48427
 Batch 30 : Average Loss = 0.45084
 Batch 35 : Average Loss = 0.45861
 Batch 40 : Ave

In [55]:
save(trained_model, 'trained_model_acc80.pickle')

In [56]:
predicted_classes, true_classes = test(trained_model, params, test_samples)

Testing complete!


In [57]:
measures(predicted_classes, true_classes)

The following measures have been recorded for this model:
	Accuracy = 0.8
	Recall = 0.7692307692307693
	Precision = 0.8108108108108109
	F1 = 0.7894736842105263


In [58]:
pd.set_option('display.max_rows', 250)
pd.set_option('display.max_colwidth', None)
model_df = make_dataframe(predicted_classes, test_samples)
display(model_df)

Unnamed: 0,sentence,predicted class,true class
0,Call you this railing?,pl,sg
1,"My good friends, I'll leave you till night.",pl,pl
2,"Ay.--Fare you well, fair gentleman.",sg,sg
3,"Orlando doth commend him to you both, And to that youth he calls his Rosalind He sends this bloody napkin.",pl,pl
4,"Beggar that I am, I am even poor in thanks; but I thank you, and sure, dear friends, my thanks are too dear a halfpenny.",pl,pl
5,"Therefore put you in your best array, bid your friends; for if you will be married tomorrow, you shall, and to Rosalind, if you will.",sg,sg
6,"So please you, he is here at the door and importunes access to you.",pl,sg
7,"Not out of your apparel, and yet out of your suit.",pl,sg
8,"Moreover that we much did long to see you, The need we have to use you did provoke Our hasty sending.",pl,pl
9,"And I charge you, O men, for the love you bear to women--as I perceive by your simpering, none of you hates them--that between you and the women the play may please.",pl,pl


In [20]:
trained_model = load('trained_model_acc80.pickle')

In [22]:
params = {'lr':0.00005, 'batch_size':8, 'hidden_size':1024, 'epochs':10, 'device':'cuda:2', 'bert_device':'cuda:1'} 
full_play = annotate_sents('macbeth.txt', tokenizer, trained_model, params)

In [23]:
for sentence in full_play:
    print(sentence)

Macbeth
by William Shakespeare
Edited by Barbara A. Mowat and Paul Werstine
  with Michael Poston and Rebecca Niles
Folger Shakespeare Library
https://shakespeare.folger.edu/shakespeares-works/macbeth/
Created on Jul 31, 2015, from FDT version 0.9.2

Characters in the Play
Three Witches, the Weird Sisters
DUNCAN, king of Scotland
MALCOLM, his elder son
DONALBAIN, Duncan's younger son
MACBETH, thane of Glamis
LADY MACBETH
SEYTON, attendant to Macbeth
Three Murderers in Macbeth's service
Both attending upon Lady Macbeth:
  A Doctor
  A Gentlewoman
A Porter
BANQUO, commander, with Macbeth, of Duncan's army
FLEANCE, his son
MACDUFF, a Scottish noble
LADY MACDUFF
Their son
Scottish Nobles:
  LENNOX
  ROSS
  ANGUS
  MENTEITH
  CAITHNESS
SIWARD, commander of the English forces
YOUNG SIWARD, Siward's son
A Captain in Duncan's army
An Old Man
A Doctor at the English court
HECATE
Apparitions: an Armed Head, a Bloody Child, a Crowned Child, and eight nonspeaking kings
Three Messengers, Three Serv

In [24]:
save_play(full_play, 'macbeth_annotated.txt')