In [1]:
from collections import Counter, defaultdict
import re
import string

# Non-speaking lines don't start with spaces, all speaking lines have some spaces. (2 or 4 usually)
non_speaking_pattern = '^\S'

# Stage directions appear inside speaking lines in square braces.
stage_direction_pattern = '\[.*?\]'

# Stage directions also sometimes indicated by more than 4 spaces at the start
alt_stage_direction = '^ {5,}'

# The start of the line, then two spaces, then a shortened version of the name, then a period, then a space
new_speaker_pattern = '^  ([A-Z1-9][a-z\.]* ?[A-Za-z]*?)\. '
current_speaker = ''
lines_by_speaker = Counter()
word_counts_by_speaker = defaultdict(Counter)

with open('book-texts/romeo-and-juliet-no-header-footer.txt', 'r') as rj_reader:
    line = 'will be ignored...'

    while line != '':
        line = rj_reader.readline()
        # Don't bother counting non-speaking lines
        if re.search(non_speaking_pattern, line) or re.search(alt_stage_direction, line):
            print("skipped", line)
            continue

        # There are other non-speaking lines, if the first nonwhitespace is "Enter" or "Exit"
        # these are stage directions.
        stripped_line = line.strip()
        if stripped_line.startswith("Enter") or stripped_line.startswith("Exit"):
            print("skipped", line)
            continue

        # Now we want to check if the current speaker has changed.
        match = re.search(new_speaker_pattern, line)
        if match:
            current_speaker = match.group(1)
            
            # We also don't want to count the names as spoken words.
            line_without_speaker = re.sub(new_speaker_pattern, '', line)
            print("speaker change: ", current_speaker, '\n', line_without_speaker)
        else:
            line_without_speaker = line # bit of a hack honestly.
            
           
        # After all these checks, we believe we have a speaking line.
        # But we want to remove any stage directions from it.
        if re.search(stage_direction_pattern, line_without_speaker):
            minus_stage_directions = re.sub(stage_direction_pattern, '', line_without_speaker)
            print("removed stage dir", line_without_speaker, minus_stage_directions)
        else:
            minus_stage_directions = line_without_speaker # another similar hack
        
        # If we are here, we know it's a speaking line, so lets update the count and words
        lines_by_speaker[current_speaker] += 1
        
        # Now we'll remove punctuation and lowercase everything:
        minus_stage_directions = minus_stage_directions.lower()
        minus_stage_directions = minus_stage_directions.translate(str.maketrans('-—', '  ', string.punctuation + '“' + '”'))
        words_in_this_line = minus_stage_directions.strip().split()
        for word in words_in_this_line:
            word_counts_by_speaker[current_speaker][word] += 1

skipped Dramatis Personae

speaker change:  Chorus 
 

skipped                             SCENE.--Verona; Mantua.

skipped                         THE PROLOGUE

skipped                         Enter Chorus.

speaker change:  Chor 
 Two households, both alike in dignity,

skipped                                                          [Exit.]

skipped ACT I. Scene I.

skipped Verona. A public place.

skipped Enter Sampson and Gregory (with swords and bucklers) of the house

skipped of Capulet.

speaker change:  Samp 
 Gregory, on my word, we'll not carry coals.

speaker change:  Greg 
 No, for then we should be colliers.

speaker change:  Samp 
 I mean, an we be in choler, we'll draw.

speaker change:  Greg 
 Ay, while you live, draw your neck out of collar.

speaker change:  Samp 
 I strike quickly, being moved.

speaker change:  Greg 
 But thou art not quickly moved to strike.

speaker change:  Samp 
 A dog of the house of Montague moves me.

speaker change:  Greg 
 To move is to st

In [2]:
# The unique speakers detected. 
for speaker, lines in lines_by_speaker.most_common():
    print(speaker, lines)

Rom 809
Jul 699
Friar 407
Nurse 384
Cap 346
Mer 308
Ben 244
Prince 100
Par 94
Wife 66
Chorus 63
Lady 63
Tyb 55
Mon 53
Samp 52
Chor 42
Serv 40
Greg 37
Chief Watch 33
Mother 31
Bal 30
Laur 27
Pet 23
1. Mus 20
John 19
Man 14
Cap. Wife 13
2. Mus 13
Apoth 11
Abr 10
Peter 9
Citizen 9
1. Serv 8
Boy 8
3. Serv 5
2. Cap 5
Fellow 5
Page 5
Citizens 4
3. Watch 4
2. Serv 3
Officer 2
Father 2
3. Mus 2
2. Watch 2
 1


In [3]:
for speaker, lines in lines_by_speaker.most_common():
    print(f'{speaker} had {lines} lines.')
    print('  In those lines, they said these words...')
    for word, count in word_counts_by_speaker[speaker].most_common():
        print('    ', word, count)
        

Rom had 809 lines.
  In those lines, they said these words...
     i 132
     and 128
     the 112
     to 93
     that 84
     my 84
     a 80
     of 76
     me 73
     in 72
     is 71
     thou 64
     not 56
     with 51
     this 46
     love 45
     it 44
     for 42
     thy 40
     but 39
     thee 38
     what 36
     o 35
     be 34
     so 32
     her 32
     do 26
     as 26
     more 25
     she 25
     will 22
     have 21
     death 21
     no 20
     from 20
     by 19
     am 17
     ill 17
     how 17
     than 16
     shall 15
     then 15
     on 15
     his 14
     here 14
     too 14
     fair 14
     juliet 14
     eyes 13
     mine 13
     doth 13
     you 13
     hath 13
     when 13
     may 13
     upon 13
     dear 13
     tell 12
     good 12
     man 12
     he 12
     let 12
     an 12
     was 11
     all 11
     farewell 11
     if 11
     hand 11
     there 11
     yet 10
     at 10
     beauty 10
     now 10
     they 10
     come 10
     our 10
    