# DATA CLEANING AND PREPROCESSING

In [2]:
import nltk
import re

## General Functions
##### These functions are gonna be apllied at the whole play level, either by processing the first corpus input, or to describe the further details of the different characters

In [3]:
# Regular expresions which match with different editions ways of pointing out actions outside the speech, 
# which are gonna be applied in the paragraphs_cleaner function
squared_braquets = "\[.*?]"
normal_braquets = "\(.*?\)"
squared_to_point = "\\[.*?\."
normal_to_point = "\\(.*?\."
nothing = "xxxxxx"

In [4]:
def paragraphs_cleaner(file, regex1, regex2):
    '''It takes a text file and returns a list of word tokenized paragraphs, 
    each one starting with the name of the character and being followed by 
    the free speech, free from the internal descriptions'''
    f = open(file, "r")
    contents = f.read()         
    f.close()     
    paragraph_cutter = re.sub('\n{2,}', '\n\n', contents) #Paragraph identifier
    paragraphs = paragraph_cutter.split('\n\n') #It cuts the paragraphs by the identified points
    clean_result = []
    for par in paragraphs:
        output = re.sub("\\n", " ", par) # Removes the single new-line characters within the paragraphs
        pseudo_clean = re.sub(regex1 , "", output) # Removes the additional text 
        clean = re.sub(regex2 , "", pseudo_clean)
        clean_result.append(clean)
    result = []
    for par in clean_result:
        output = nltk.word_tokenize(par) #Word tokenizer for each paragraph
        result.append(output)
    
    return result

In [5]:
def characters_details (play_characters):
    '''It takes a diccionary in the form of character:speech lines, 
    and counts the sentences and words for every character'''
    for key in play_characters:
        word_count = 0
        for sen in play_characters[key]:
            output = len(re.findall(r"\w+", sen))
            word_count += output
        print ('%s contains %d sentences and %d words'%(key, len(play_characters[key]), word_count))

## Partial Functions

##### These functions are gonna be applied at the character level, with a more partial aproach aimed at dealing with the different writting conventions and the irregularities found in different editions

In [6]:
def character_cleaner (character_raw, n):
    '''It takes a list of characters tokenized by words paragraphs,
    and returns a list of sentences, where n is the point
    where ends the character's name'''
    no_name = []
    for par in character_raw: #It cuts by the previusly identified point
        output = par[n:]
        no_name.append(output)
    output = []
    for sen in no_name: #It joins all the words into one string
        for word in sen:
            output.append(word)
    one_string = " ".join(output)
    result = nltk.sent_tokenize(one_string) #It tokenizes the string into sentences
    for sen in  result: #Removes the single points sentences for a best estimation of the sentneces lenght
        if sen == '.':
            result.remove(sen)
    return result

In [7]:
def string_joiner(strings):
    '''It take a list of word_tokenized paragraphs,
    and returns a sentence tokenized string'''
    output = []
    for string in strings:
        for word in string:
            output.append(word)
    one_string = " ".join(output)
    result = nltk.sent_tokenize(one_string)
    return result

In [8]:
def character_cleaner_by_name (character_raw, name1, name2, name3):
    '''It removes the different names and ways of introducting the character,
    when neither the name nor the ending position of the expression are regular'''
    for sen in character_raw:
        if name1 in sen:
            sen.remove(name1)
    for sen in character_raw:
        if name2 in sen:
            sen.remove(name2)
    for sen in character_raw:
        if name3 in sen:
            sen.remove(name3)
    output = string_joiner(character_raw)
    for sen in  output: #Removes the single points sentences for a best estimation of the lenghth
        if sen == '.':
            output.remove(sen)
    return output

In [9]:
def speech_filter (character):
    '''Removes the remaining sentneces which 
    have less than one character lenght'''
    output = []
    for sen in character:
        if len(sen) > 1:
            output.append(sen)
    result = string_joiner(output)
    return result

## English Plays


#### An Ideal Husband - Oscar Wilde


In [10]:
token_paragraphs_aih = paragraphs_cleaner("an_ideal_husband_wilde.txt", squared_braquets, nothing)

In [11]:
#It gets rid of the introducting paragraphs
token_paragraphs_aih = token_paragraphs_aih[48:]

In [12]:
# An empty dictionary was created for every character with a minimum presence in the play for the further filtering.
caversham_raw = []
goring_raw = []
chiltern_raw = []
ladychiltern_raw = []
mabel_raw = []
cheveley_raw = []
for par in token_paragraphs_aih:
    if 'CAVERSHAM' in par:
        caversham_raw.append(par)
    if 'GORING' in par:
        goring_raw.append(par)
    if 'ROBERT' in par:
        chiltern_raw.append(par)
    if 'LADY' and 'CHILTERN' in par and not 'MABEL' in par and not 'SIR' in par:
        ladychiltern_raw.append(par)
    if 'MABEL' in par:
        mabel_raw.append(par)
    if 'CHEVELEY' in par:
        cheveley_raw.append(par)
        

In [13]:
# Individualized cleaning by character, in this case with the positional cleaning function.
caversham = character_cleaner(caversham_raw, 3)
goring = character_cleaner(goring_raw, 3)
chiltern = character_cleaner(chiltern_raw, 4)
ladychiltern = character_cleaner(ladychiltern_raw, 3)
mabel = character_cleaner(mabel_raw, 3)
cheveley = character_cleaner(cheveley_raw, 3)


In [14]:
# We create a dict of the plays's characters and their speech lines, thst we will store later
an_ideal_husband_characters = {'Caversham':caversham, 'Goring':goring, 'Chiltern':chiltern, 'Lady Chiltern':ladychiltern, 'Mabel':mabel, 'Cheveley':cheveley}

In [15]:
characters_details(an_ideal_husband_characters)

Caversham contains 201 sentences and 1507 words
Goring contains 775 sentences and 6233 words
Chiltern contains 534 sentences and 4937 words
Lady Chiltern contains 369 sentences and 2917 words
Mabel contains 235 sentences and 1937 words
Cheveley contains 522 sentences and 4380 words


In [16]:
# We are gonna store every dict for a more convenient handling of the data in the subsequent analysis phases
%store an_ideal_husband_characters

Stored 'an_ideal_husband_characters' (dict)


#### A Woman of No Importance - Oscar Wilde

In [17]:
token_paragraphs_awoni = paragraphs_cleaner("a_woman_of_no_importance_wilde.txt", squared_braquets, nothing)

In [18]:
token_paragraphs_awoni = token_paragraphs_awoni[50:]

In [19]:
illingworth_raw = []
allonby_raw = []
gerald_raw = []
mrsarbuthnot_raw = []
hunstanton_raw = []
hester_raw = []

for par in token_paragraphs_awoni:
    if 'ILLINGWORTH' in par:
        illingworth_raw.append(par)
    if 'ALLONBY' in par:
        allonby_raw.append(par)
    if 'GERALD' in par:
        gerald_raw.append(par)
    if 'MRS' and 'ARBUTHNOT' in par:
        mrsarbuthnot_raw.append(par)
    if 'HUNSTANTON' in par:
        hunstanton_raw.append(par)
    if 'HESTER' in par:
        hester_raw.append(par)



In [20]:
illingworth = character_cleaner(illingworth_raw, 3)
allonby = character_cleaner(allonby_raw, 3)
gerald = character_cleaner(gerald_raw, 2)
mrsarbuthnot = character_cleaner(mrsarbuthnot_raw, 2)
hunstanton = character_cleaner(hunstanton_raw, 2)
hester = character_cleaner(hester_raw, 2)

In [200]:
a_woman_of_no_importance_characters = {'Illingorth':illingworth, 'Allonby':allonby, 'Gerald':gerald, 'Mrs Artbuthnot':mrsarbuthnot, 'Hunstanton':hunstanton, 'Hester':hester}

In [201]:
characters_details(a_woman_of_no_importance_characters)

Illingorth contains 426 sentences and 4183 words
Allonby contains 213 sentences and 2091 words
Gerald contains 255 sentences and 2427 words
Mrs Artbuthnot contains 356 sentences and 3079 words
Hunstanton contains 367 sentences and 3658 words
Hester contains 141 sentences and 1342 words


In [202]:
# Removing characters with less than 1500 words
del a_woman_of_no_importance_characters['Hester']

In [23]:
%store a_woman_of_no_importance_characters

Stored 'a_woman_of_no_importance_characters' (dict)


 #### Lady Windermer's Fan - Oscar Wilde

In [24]:
token_paragraphs_lwf = paragraphs_cleaner("lady_windermeres_fan_wilde.txt", squared_braquets, nothing)

In [25]:
token_paragraphs_lwf = token_paragraphs_lwf[47:]

In [26]:
lordwindermere_raw = []
erlynne_raw = []
augustus_raw = []
windermere_raw = []
darlington_raw = []
berwick_raw = []
for par in token_paragraphs_lwf:
    if 'LORD' and 'WINDERMERE' in par and not 'LADY' in par:
        lordwindermere_raw.append(par)
    if 'ERLYNNE' in par:
        erlynne_raw.append(par)
    if 'AUGUSTUS' in par:
        augustus_raw.append(par)
    if 'LADY' and 'WINDERMERE' in par:
        windermere_raw.append(par)
    if 'DARLINGTON' in par:
        darlington_raw.append(par)
    if 'BERWICK' in par:
        berwick_raw.append(par)
  

In [27]:
lordwindermere = character_cleaner(lordwindermere_raw, 3)
erlynne = character_cleaner(erlynne_raw, 3)
augustus = character_cleaner(augustus_raw, 3)
windermere = character_cleaner(windermere_raw, 3)
darlington = character_cleaner(darlington_raw,3)
berwick = character_cleaner(berwick_raw, 3)

In [28]:
lady_windermeres_fan_characters = {'Lord Windermere':lordwindermere, 'Erlynne':erlynne, 'Augustus':augustus, 'Lady Windermere':windermere, 'Darlington':darlington, 'Berwick':berwick}

In [29]:
characters_details(lady_windermeres_fan_characters)

Lord Windermere contains 233 sentences and 2138 words
Erlynne contains 395 sentences and 3521 words
Augustus contains 91 sentences and 628 words
Lady Windermere contains 727 sentences and 6027 words
Darlington contains 190 sentences and 1704 words
Berwick contains 189 sentences and 2037 words


In [203]:
del lady_windermeres_fan_characters['Augustus']

In [204]:
%store lady_windermeres_fan_characters

Stored 'lady_windermeres_fan_characters' (dict)


#### The Importance of Being Earnest - Oscar Wilde

In [31]:
token_paragraphs_tiobe = paragraphs_cleaner("the_importance_of_being_earnest_wilde.txt", squared_braquets, nothing)

In [32]:
token_paragraphs_tiobe = token_paragraphs_tiobe[25:]

In [33]:
jack_raw = []
algernon_raw = []
gwendolen_raw = []
bracknell_raw = []
cecily_raw = []
prism_raw = []
chasuble_raw = []
for par in token_paragraphs_tiobe:
    if 'Jack' in par[0:1]:
        jack_raw.append(par)
    if 'Algernon' in par[0:1]:
        algernon_raw.append(par)
    if 'Gwendolen' in par[0:1]:
        gwendolen_raw.append(par)
    if 'Bracknell' in par[0:2]:
        bracknell_raw.append(par)
    if 'Cecily' in par[0:1]:
        cecily_raw.append(par)
    if 'Prism' in par[0:2]:
        prism_raw.append(par)
    if 'Chasuble' in par[0:1]:
        chasuble_raw.append(par)
    
  

In [34]:
jack = character_cleaner(jack_raw, 2)
algernon = character_cleaner(algernon_raw, 2)
gwendolen = character_cleaner(gwendolen_raw, 2)
bracknell = character_cleaner(bracknell_raw, 3)
cecily = character_cleaner(cecily_raw, 3)
prism = character_cleaner(prism_raw, 3)
chasuble = character_cleaner(chasuble_raw, 2)

In [35]:
the_importance_of_being_earnest_characters = {'Jack':jack, 'Algernon':algernon, 'Gwendolen':gwendolen, 'Bracknell':bracknell, 'Cecily':cecily, 'Prism':prism, 'Chasuble':chasuble}

In [36]:
characters_details(the_importance_of_being_earnest_characters)

Jack contains 473 sentences and 4161 words
Algernon contains 439 sentences and 4116 words
Gwendolen contains 244 sentences and 2242 words
Bracknell contains 280 sentences and 2936 words
Cecily contains 291 sentences and 2638 words
Prism contains 102 sentences and 960 words
Chasuble contains 82 sentences and 778 words


In [205]:
del the_importance_of_being_earnest_characters['Prism']
del the_importance_of_being_earnest_characters['Chasuble']

In [206]:
%store the_importance_of_being_earnest_characters

Stored 'the_importance_of_being_earnest_characters' (dict)


#### Pygmalion - George Bernard Shaw

In [38]:
token_paragraphs_p = paragraphs_cleaner("pygmalion_shaw.txt", squared_braquets, nothing)

In [39]:
token_paragraphs_p = token_paragraphs_p[21:]

In [40]:
pickering_raw = []
higgins_raw = []
liza_raw = []
gentleman_raw = []
taker_raw = []
girl_raw = []
mrshiggins_raw = []
doolittle_raw = []
for par in token_paragraphs_p:
    if 'PICKERING' in par:
        pickering_raw.append(par)
    if 'HIGGINS' in par and not 'MRS.'in par:
        higgins_raw.append(par)
    if 'LIZA' in par:
        liza_raw.append(par)
    if 'GENTLEMAN' in par:
        gentleman_raw.append(par)
    if 'TAKER' in par:
        taker_raw.append(par)
    if 'GIRL' in par:
        girl_raw.append(par)
    if 'MRS.' in par and 'HIGGINS' in par[0:2]:
        mrshiggins_raw.append(par)
    if 'DOOLITTLE' in par:
        doolittle_raw.append(par)

In [41]:
pickering = character_cleaner_by_name(pickering_raw, 'PICKERING', 'xxxxxx', 'xxxxxx')
higgins = character_cleaner_by_name(higgins_raw, 'HIGGINS', 'xxxxx', 'xxxxx')
liza = character_cleaner_by_name(liza_raw, 'LIZA', 'xxxxx', 'xxxxx')
gentelman = character_cleaner_by_name(gentleman_raw, 'THE', 'GENTLEMAN', 'xxxxx')
taker = character_cleaner_by_name(taker_raw, 'THE', 'NOTE', 'TAKER')
girl = character_cleaner_by_name(girl_raw, 'THE', 'FLOWER', 'GIRL')
mrshiggins = character_cleaner_by_name(mrshiggins_raw, 'MRS', 'MRS.', 'HIGGINS')
doolittle = character_cleaner_by_name(doolittle_raw, 'DOOLITTLE', 'xxxxxx', 'xxxxxx')

In [42]:
#Same characters, just been joined afterwards due to computational convenience
pickering = pickering + gentelman
higgins = higgins + taker
liza = liza + girl

In [43]:
pygmalion_characters = {'Pickering':pickering, 'Higgins':higgins, 'Liza':liza, 'Mrs Higgins':mrshiggins, 'Doolittle':doolittle}

In [44]:
characters_details(pygmalion_characters)

Pickering contains 251 sentences and 1774 words
Higgins contains 869 sentences and 7394 words
Liza contains 495 sentences and 4820 words
Mrs Higgins contains 178 sentences and 1466 words
Doolittle contains 285 sentences and 2950 words


In [207]:
del pygmalion_characters['Mrs Higgins']

In [208]:
%store pygmalion_characters

Stored 'pygmalion_characters' (dict)


#### Androcles and the Lion - George Bernard Shaw

In [46]:
token_paragraphs_aatl = paragraphs_cleaner("androcles_and_the_lion_shaw.txt", normal_braquets, nothing)

In [47]:
token_paragraphs_aatl = token_paragraphs_aatl[16:]

In [48]:
lavinia_raw = []
captain_raw = []
androcles_raw = []
megaera_raw = []
centurion_raw = []
spintho_raw = []
ferrovius_raw = []
for par in token_paragraphs_aatl:
    if 'LAVINIA' in par:
        lavinia_raw.append(par)
    if 'CAPTAIN' in par:
        captain_raw.append(par)
    if 'ANDROCLES' in par:
        androcles_raw.append(par)
    if 'MEGAERA' in par:
        megaera_raw.append(par)
    if 'CENTURION' in par:
        centurion_raw.append(par)
    if 'SPINTHO' in par:
        spintho_raw.append(par)
    if 'FERROVIUS' in par:
        ferrovius_raw.append(par)
   

In [49]:
lavinia = character_cleaner_by_name(lavinia_raw, 'LAVINIA', 'xxxx', 'xxxx')
captain = character_cleaner_by_name(captain_raw, 'CAPTAIN', 'xxxx', 'xxxx')
androcles = character_cleaner_by_name(androcles_raw, 'ANDROCLES', 'xxxx', 'xxxx')
megaera = character_cleaner_by_name(megaera_raw, 'MEGAERA', 'xxxx', 'xxxx')
centurion = character_cleaner_by_name(centurion_raw, 'CENTURION', 'xxxx', 'xxxx')
spintho = character_cleaner_by_name(spintho_raw, 'SPINTHO', 'xxxx', 'xxxx')
ferrovius = character_cleaner_by_name(ferrovius_raw, 'FERROVIUS', 'xxxx', 'xxxx')

In [50]:
androcles_and_the_lion_characters = {'Lavinia':lavinia, 'Captain':captain, 'Androcles':androcles, 'Megaera':megaera, 'Centurion':centurion, 'Spintho':spintho, 'Ferrovius':ferrovius}

In [51]:
characters_details(androcles_and_the_lion_characters)

Lavinia contains 173 sentences and 1637 words
Captain contains 155 sentences and 1548 words
Androcles contains 219 sentences and 1864 words
Megaera contains 68 sentences and 671 words
Centurion contains 93 sentences and 590 words
Spintho contains 57 sentences and 390 words
Ferrovius contains 184 sentences and 1454 words


In [209]:
del androcles_and_the_lion_characters['Megaera']
del androcles_and_the_lion_characters['Centurion']
del androcles_and_the_lion_characters['Spintho']
del androcles_and_the_lion_characters['Ferrovius']

In [210]:
%store androcles_and_the_lion_characters

Stored 'androcles_and_the_lion_characters' (dict)


#### Caesar and Cleopatra - George Bernard Shaw    

In [53]:
token_paragraphs_cac = paragraphs_cleaner("caesar_and_cleopatra_shaw.txt", normal_braquets, nothing)

In [54]:
token_paragraphs_cac = token_paragraphs_cac[18:]

In [55]:
caesar_raw = []
cleopatra_raw = []
pothinus_raw = []
rufio_raw = []
ftatateeta_raw =[]
apollodorus_raw = []
for par in token_paragraphs_cac:
    if 'CAESAR' in par:
        caesar_raw.append(par)
    if 'CLEOPATRA' in par:
        cleopatra_raw.append(par)
    if 'POTHINUS' in par:
        pothinus_raw.append(par)
    if 'RUFIO' in par:
        rufio_raw.append(par)
    if 'FTATATEETA' in par:
        ftatateeta_raw.append(par)
    if 'APOLLODORUS' in par:
        apollodorus_raw.append(par)

In [56]:
caesar = character_cleaner_by_name(caesar_raw, 'CAESAR', 'xxxx', 'xxxx')
cleopatra = character_cleaner_by_name(cleopatra_raw, 'CLEOPATRA', 'xxxx', 'xxxx')
pothinus = character_cleaner_by_name(pothinus_raw, 'POTHINUS', 'xxxx', 'xxxx')
rufio = character_cleaner_by_name(rufio_raw, 'RUFIO', 'xxxx', 'xxxx')
ftatateeta = character_cleaner_by_name(ftatateeta_raw, 'FTATATEETA', 'xxxx', 'xxxx')
apollodorus = character_cleaner_by_name(apollodorus_raw, 'APOLLODORUS', 'xxxx', 'xxxx')

In [57]:
caesar_and_cleopatra_characters = {'Caesar':caesar, 'Cleopatra':cleopatra, 'Pothinus':pothinus, 'Rufio':rufio, 'Ftatateeta':ftatateeta, 'Apollodorus':apollodorus}

In [58]:
characters_details(caesar_and_cleopatra_characters)

Caesar contains 756 sentences and 6016 words
Cleopatra contains 554 sentences and 4494 words
Pothinus contains 118 sentences and 1163 words
Rufio contains 268 sentences and 2067 words
Ftatateeta contains 110 sentences and 1012 words
Apollodorus contains 188 sentences and 1653 words


In [211]:
del caesar_and_cleopatra_characters['Pothinus']
del caesar_and_cleopatra_characters['Ftatateeta']

In [212]:
%store caesar_and_cleopatra_characters

Stored 'caesar_and_cleopatra_characters' (dict)


#### Candida - George Bernard Shaw

In [60]:
token_paragraphs_can = paragraphs_cleaner("candida_shaw.txt", normal_braquets, nothing)

In [61]:
token_paragraphs_can = token_paragraphs_can[17:]

In [62]:
candida_raw = []
marchbanks_raw =[]
morell_raw = []
burgess_raw = []
proserpine_raw = []
for par in token_paragraphs_can:
    if 'CANDIDA' in par:
        candida_raw.append(par)
    if 'MARCHBANKS' in par:
        marchbanks_raw.append(par)
    if 'MORELL' in par:
        morell_raw.append(par)
    if 'BURGESS' in par:
        burgess_raw.append(par)
    if 'PROSERPINE' in par:
        proserpine_raw.append(par)

In [63]:
candida = character_cleaner_by_name(candida_raw, 'CANDIDA', 'xxxx', 'xxxx')
marchbanks = character_cleaner_by_name(marchbanks_raw, 'MARCHBANKS', 'xxxx', 'xxxx')
morell = character_cleaner_by_name(morell_raw, 'MORELL', 'xxxx', 'xxxx')
burgess = character_cleaner_by_name(burgess_raw, 'BURGESS', 'xxxx', 'xxxx')
proserpine = character_cleaner_by_name(proserpine_raw, 'PROSERPINE', 'xxxx', 'xxxx')

In [64]:
candida_characters = {'Candida':candida, 'Marchbanks':marchbanks, 'Morell':morell, 'Burgess':burgess, 'Proserpine':proserpine}

In [65]:
characters_details(candida_characters)

Candida contains 354 sentences and 3208 words
Marchbanks contains 388 sentences and 3832 words
Morell contains 443 sentences and 4022 words
Burgess contains 238 sentences and 2237 words
Proserpine contains 147 sentences and 1195 words


In [213]:
del candida_characters['Proserpine']

In [214]:
%store candida_characters

Stored 'candida_characters' (dict)


#### Man And Superman - George Bernard Shaw

In [67]:
token_paragraphs_mas = paragraphs_cleaner("man_and_superman_shaw.txt", squared_braquets, nothing)

In [68]:
token_paragraphs_mas = token_paragraphs_mas[54:]

In [69]:
ramsden_raw = []
octavius_raw = []
tanner_raw = []
ann_raw = []
whitefield_raw = []
missramsden_raw = []
violet_raw = []
hector_raw = []
straker_raw = []
mendoza_raw = []
juan_raw = []
devil_raw = []
ana_raw = []
for par in token_paragraphs_mas:
    if  'RAMSDEN' in par:
         ramsden_raw.append(par)
    if  'OCTAVIUS' in par:
         octavius_raw.append(par)
    if  'TANNER' in par:
         tanner_raw.append(par)
    if  'ANN' in par:
         ann_raw.append(par)
    if  'WHITEFIELD' in par and 'MRS' or 'MRS.' in par:
         whitefield_raw.append(par)
    if  'RAMSDEN' in par and 'MISS' in par:
         missramsden_raw.append(par)
    if  'VIOLET' in par:
         violet_raw.append(par)
    if  'HECTOR' in par:
         hector_raw.append(par)
    if  'STRAKER' in par:
         straker_raw.append(par)
    if  'MENDOZA' in par:
         mendoza_raw.append(par)
    if  'JUAN' in par:
         juan_raw.append(par)
    if  'DEVIL' in par:
         devil_raw.append(par)
    if  'ANA' in par:
         ana_raw.append(par)        
    

In [70]:
ramsden = character_cleaner_by_name(ramsden_raw, 'RAMSDEN', 'xxxx', 'xxxx')
octavius = character_cleaner_by_name(octavius_raw, 'OCTAVIUS', 'xxxx', 'xxxx')
tanner = character_cleaner_by_name(tanner_raw, 'TANNER', 'xxxx', 'xxxx')
ann = character_cleaner_by_name(ann_raw, 'ANN', 'xxxx', 'xxxx')
missramsden = character_cleaner_by_name(whitefield_raw, 'WHITEFIELD', 'MRS', 'MRS.')
violet = character_cleaner_by_name(violet_raw, 'VIOLET', 'xxxx', 'xxxx')
hector = character_cleaner_by_name(hector_raw, 'HECTOR', 'xxxx', 'xxxx')
straker = character_cleaner_by_name(straker_raw, 'STRAKER', 'xxxx', 'xxxx')
mendoza = character_cleaner_by_name(mendoza_raw, 'MENDOZA', 'xxxx', 'xxxx')
juan = character_cleaner_by_name(juan_raw, 'JUAN', 'DON', 'xxxx')
devil = character_cleaner_by_name(devil_raw, 'DEVIL', 'THE', 'xxxx')
ana = character_cleaner_by_name(ana_raw, 'ANA', 'xxxx', 'xxxx')

In [71]:
man_and_superman_characters = {'Ramsden':ramsden, 'Octavius':octavius, 'Tanner':tanner, 'Ann':ann, 'Miss Ramsden':missramsden, 'Violet':violet, 'Hector':hector, 'Straker':straker, 'Mendoza':mendoza, 'Don Juan':juan, 'The Devil':devil, 'Dona Ana':ana}

In [72]:
characters_details(man_and_superman_characters)

Ramsden contains 293 sentences and 2706 words
Octavius contains 267 sentences and 2402 words
Tanner contains 970 sentences and 10945 words
Ann contains 419 sentences and 3675 words
Miss Ramsden contains 97 sentences and 1056 words
Violet contains 162 sentences and 1526 words
Hector contains 128 sentences and 1258 words
Straker contains 159 sentences and 1359 words
Mendoza contains 192 sentences and 1943 words
Don Juan contains 516 sentences and 8995 words
The Devil contains 237 sentences and 3511 words
Dona Ana contains 131 sentences and 1062 words


In [215]:
del man_and_superman_characters['Miss Ramsden']
del man_and_superman_characters['Hector']
del man_and_superman_characters['Straker']
del man_and_superman_characters['Dona Ana']

In [216]:
%store man_and_superman_characters

Stored 'man_and_superman_characters' (dict)


#### Cyntia's Revels - Ben Jonson


In [74]:
token_paragraphs_cr = paragraphs_cleaner("cynthias_revels_jonson.txt", squared_braquets, nothing)

In [75]:
token_paragraphs_cr = token_paragraphs_cr[153:]

In [76]:
mercury_raw = []
echo_raw = []
cupid_raw = []
amorphus_raw = []
asotus_raw = []
crites_raw = []
anaides_raw = []
hedon_raw = []
arete_raw = []
phantase_raw = []
philautia_raw = []
cyntia_raw = []
for par in token_paragraphs_cr:
    if  'MER' in par:
         mercury_raw.append(par)
    if  'ECHO' in par:
         echo_raw.append(par)
    if  'CUP' in par:
         cupid_raw.append(par)
    if  'AMO' in par:
         amorphus_raw.append(par)
    if  'ASO' in par:
         asotus_raw.append(par)
    if  'CRI' in par:
         crites_raw.append(par)
    if  'ANA' in par:
         anaides_raw.append(par)
    if  'HED' in par:
         hedon_raw.append(par)
    if  'ARE' in par:
         arete_raw.append(par)
    if  'PHA' in par:
         phantase_raw.append(par)
    if  'PHI' in par:
         philautia_raw.append(par)
    if  'CYN' in par:
         cyntia_raw.append(par)
    

In [77]:
mercury = character_cleaner_by_name(mercury_raw, 'MER', 'xxxx', 'xxxx')
echo = character_cleaner_by_name(echo_raw, 'ECHO', 'xxxx', 'xxxx')
cupid = character_cleaner_by_name(cupid_raw, 'CUP', 'xxxx', 'xxxx')
amorphus = character_cleaner_by_name(amorphus_raw, 'AMO', 'xxxx', 'xxxx')
asotus = character_cleaner_by_name(asotus_raw, 'ASO', 'xxxx', 'xxxx')
crites = character_cleaner_by_name(crites_raw, 'CRI', 'xxxx', 'xxxx')
anaides = character_cleaner_by_name(anaides_raw, 'ANA', 'xxxx', 'xxxx')
hedon = character_cleaner_by_name(hedon_raw, 'HED', 'xxxx', 'xxxx')
arete = character_cleaner_by_name(arete_raw, 'ARE', 'xxxx', 'xxxx')
phantase = character_cleaner_by_name(phantase_raw, 'PHA', 'xxxx', 'xxxx')
philautia = character_cleaner_by_name(philautia_raw, 'PHI', 'xxxx', 'xxxx')
cyntia = character_cleaner_by_name(cyntia_raw, 'CYN', 'xxxx', 'xxxx')

In [78]:
cynthias_revels_characters = {'Mercury':mercury, 'Echo':echo, 'Cupid':cupid, 'Amorphus':amorphus, 'Asotus':asotus, 'Crites':crites, 'Anaides':anaides, 'Hedon':hedon, 'Arete':arete, 'Phantase':phantase, 'Philautia':philautia, 'Cyntia':cyntia}

In [79]:
characters_details(cynthias_revels_characters)

Mercury contains 293 sentences and 4467 words
Echo contains 22 sentences and 487 words
Cupid contains 148 sentences and 2502 words
Amorphus contains 389 sentences and 5878 words
Asotus contains 216 sentences and 2257 words
Crites contains 175 sentences and 4003 words
Anaides contains 164 sentences and 1743 words
Hedon contains 147 sentences and 1484 words
Arete contains 37 sentences and 928 words
Phantase contains 149 sentences and 1801 words
Philautia contains 91 sentences and 984 words
Cyntia contains 46 sentences and 1073 words


In [217]:
del cynthias_revels_characters['Echo']
del cynthias_revels_characters['Hedon']
del cynthias_revels_characters['Arete']
del cynthias_revels_characters['Philautia']
del cynthias_revels_characters['Cyntia']

In [218]:
%store cynthias_revels_characters

Stored 'cynthias_revels_characters' (dict)


#### Every Man On His Humor - Ben Jonson

In [81]:
token_paragraphs_emohh = paragraphs_cleaner("every_man_on_his_humour_jonson.txt", squared_braquets, squared_to_point)


In [82]:
token_paragraphs_emohh = token_paragraphs_emohh[82:]


In [83]:
edknowell_raw = []
stephen_raw = []
mathew_raw = []
cob_raw = []
bobadill_raw = []
tib_raw = []
kitely_raw = []
cash_raw = []
downright_raw = []
damekitely_raw = []
brainworm_raw = []
knowell_raw = []
wellbred_raw = []
for par in token_paragraphs_emohh:
    if  'Know' in par and 'E.' in par[:1]:
         edknowell_raw.append(par)
    if  'Step' in par[:2]:
         stephen_raw.append(par)
    if  'Mat' in par[:2]:
         mathew_raw.append(par)
    if  'Cob' in par[:2]:
         cob_raw.append(par)
    if  'Bob' in par[:2]:
         bobadill_raw.append(par)
    if  'Tib' in par[:2]:
         tib_raw.append(par)
    if  'Kit' in par[:2]:
         kitely_raw.append(par)
    if  'Cash' in par[:2]:
         cash_raw.append(par)
    if  'Dow' in par[:2]:
         downright_raw.append(par)
    if  'Dame' in par[:2] and 'K.' in par[:3]:
         damekitely_raw.append(par)
    if  'Brai' in par[:2]:
         brainworm_raw.append(par)
    if  'Know' in par[:2] and not 'E.' in par:
         knowell_raw.append(par)
    if  'Wel' in par[:2]:
         wellbred_raw.append(par)
    

In [84]:
stephen = character_cleaner_by_name(stephen_raw, 'Step', 'xxxx', 'xxxx')
edknowell = character_cleaner_by_name(edknowell_raw, 'E.', 'Know', 'xxxx')
mathew = character_cleaner_by_name(mathew_raw, 'Mat', 'xxxx', 'xxxx')
cob = character_cleaner_by_name(cob_raw, 'Cob', 'xxxx', 'xxxx')
bobadill = character_cleaner_by_name(bobadill_raw, 'Bob', 'xxxx', 'xxxx')
tib = character_cleaner_by_name(tib_raw, 'Tib', 'xxxx', 'xxxx')
kitely = character_cleaner_by_name(kitely_raw, 'Kit', 'xxxx', 'xxxx')
cash = character_cleaner_by_name(cash_raw, 'Cash', 'xxxx', 'xxxx')
downright = character_cleaner_by_name(downright_raw, 'Dow', 'xxxx', 'xxxx')
damekitely = character_cleaner_by_name(damekitely_raw, 'Dame', 'K', 'xxxx')
brainworm = character_cleaner_by_name(brainworm_raw, 'Brai', 'xxxx', 'xxxx')
knowell = character_cleaner_by_name(knowell_raw, 'Know', 'xxxx', 'xxxx')
wellbred = character_cleaner_by_name(wellbred_raw, 'Wel', 'xxxx', 'xxxx')

In [85]:
every_man_on_his_humour_characters = {'Stephen':stephen, 'Ed Knowell':edknowell, 'Mathew':mathew, 'Cob':cob, 'Bobadill':bobadill, 'Tib':tib, 'Kitely':kitely, 'Cash':cash, 'Downright':downright, 'Dame Kitely':damekitely, 'Brainworm':brainworm, 'Knowell':knowell, 'Wellbred':wellbred}

In [86]:
characters_details(every_man_on_his_humour_characters)

Stephen contains 170 sentences and 1753 words
Ed Knowell contains 186 sentences and 2064 words
Mathew contains 142 sentences and 1486 words
Cob contains 148 sentences and 1987 words
Bobadill contains 195 sentences and 3122 words
Tib contains 31 sentences and 230 words
Kitely contains 268 sentences and 3620 words
Cash contains 75 sentences and 646 words
Downright contains 107 sentences and 1152 words
Dame Kitely contains 54 sentences and 560 words
Brainworm contains 171 sentences and 2998 words
Knowell contains 133 sentences and 2121 words
Wellbred contains 145 sentences and 1718 words


In [219]:
del every_man_on_his_humour_characters['Mathew']
del every_man_on_his_humour_characters['Tib']
del every_man_on_his_humour_characters['Cash']
del every_man_on_his_humour_characters['Downright']
del every_man_on_his_humour_characters['Dame Kitely']

In [220]:
%store every_man_on_his_humour_characters

Stored 'every_man_on_his_humour_characters' (dict)


#### Volpone, Or The Fox - Ben jonson

In [88]:
token_paragraphs_votf = paragraphs_cleaner("volpone_or_the_fox_jonson.txt", squared_braquets, nothing)

In [89]:
token_paragraphs_votf = token_paragraphs_votf[110:]

In [90]:
volpone_raw = []
mosca_raw = []
nano_raw = []
androgyno_raw = []
voltore_raw = []
corbaccio_raw = []
corvino_raw = []
peregrine_raw = []
sirpolitick_raw = []
bonario_raw = []
ladywouldbe_raw = []
for par in token_paragraphs_votf:
    if  'VOLP' in par:
         volpone_raw.append(par)
    if  'MOS' in par:
         mosca_raw.append(par)
    if  'NAN' in par:
         nano_raw.append(par)
    if  'AND' in par and ':' in par[0:2]:
         androgyno_raw.append(par)
    if  'VOLT' in par:
         voltore_raw.append(par)
    if  'CORB' in par:
         corbaccio_raw.append(par)
    if  'CORV' in par:
         corvino_raw.append(par)
    if  'PER' in par:
         peregrine_raw.append(par)
    if  'SIR' in par and 'P' in par:
         sirpolitick_raw.append(par)
    if  'BON' in par:
         bonario_raw.append(par)
    if  'LADY' in par and 'P' in par:
         ladywouldbe_raw.append(par)

In [91]:
volpone = character_cleaner(volpone_raw, 1)
mosca = character_cleaner(mosca_raw, 2)
nano = character_cleaner(nano_raw, 2)
androgyno = character_cleaner(androgyno_raw, 2)
voltore = character_cleaner(voltore_raw, 2)
corbaccio = character_cleaner(corbaccio_raw, 2)
corvino = character_cleaner(corvino_raw, 2)
peregrine = character_cleaner(peregrine_raw, 2)
sirpolitick = character_cleaner(sirpolitick_raw, 3)
bonario = character_cleaner(bonario_raw, 2)
ladywouldbe = character_cleaner(ladywouldbe_raw, 3)

In [92]:
volpone_or_the_fox_characters = {'Volpone':volpone, 'Mosca':mosca, 'Nano':nano, 'Androgyno':androgyno, 'Voltore':voltore, 'Corbaccio':corbaccio, 'Peregrine':peregrine, 'Sir Politick': sirpolitick, 'Bonario':bonario, 'Lady Would-be':ladywouldbe}

In [93]:
characters_details(volpone_or_the_fox_characters)

Volpone contains 444 sentences and 6690 words
Mosca contains 498 sentences and 6904 words
Nano contains 33 sentences and 670 words
Androgyno contains 12 sentences and 143 words
Voltore contains 113 sentences and 1486 words
Corbaccio contains 136 sentences and 733 words
Peregrine contains 148 sentences and 1322 words
Sir Politick contains 148 sentences and 2335 words
Bonario contains 38 sentences and 369 words
Lady Would-be contains 108 sentences and 1428 words


In [221]:
del volpone_or_the_fox_characters['Nano']
del volpone_or_the_fox_characters['Androgyno']
del volpone_or_the_fox_characters['Voltore']
del volpone_or_the_fox_characters['Corbaccio']
del volpone_or_the_fox_characters['Peregrine']
del volpone_or_the_fox_characters['Bonario']
del volpone_or_the_fox_characters['Lady Would-be']

In [222]:
%store volpone_or_the_fox_characters

Stored 'volpone_or_the_fox_characters' (dict)


#### The Alchemist - Ben Jonson

In [95]:
token_paragraphs_ta = paragraphs_cleaner("the_alchemist_jonson.txt", squared_braquets, nothing)

In [96]:
token_paragraphs_ta = token_paragraphs_ta[97:]

In [97]:
face_raw = []
subtle_raw = []
dol_raw = []
dapper_raw = []
drugger_raw = []
mammon_raw = []
surly_raw = []
ananias_raw = []
tribulation_raw = []
kastril_raw = []
for par in token_paragraphs_ta:
    if  'FACE' in par:
         face_raw.append(par)
    if  'SUB' in par:
         subtle_raw.append(par)
    if  'DOL' in par:
         dol_raw.append(par)
    if  'DAP' in par:
         dapper_raw.append(par)
    if  'DRUG' in par:
         drugger_raw.append(par)
    if  'MAM' in par:
         mammon_raw.append(par)
    if  'SUR' in par:
         surly_raw.append(par)
    if  'ANA' in par:
         ananias_raw.append(par)
    if  'TRI' in par:
         tribulation_raw.append(par)
    if  'KAS' in par:
         kastril_raw.append(par)
    

In [98]:
face = character_cleaner(face_raw, 2)
subtle = character_cleaner(subtle_raw, 2)
dol = character_cleaner(dol_raw, 2)
dapper = character_cleaner(dapper_raw, 2)
drugger = character_cleaner(drugger_raw, 2)
mammon = character_cleaner(mammon_raw, 2)
surly = character_cleaner(surly_raw, 2)
ananias = character_cleaner(ananias_raw, 2)
tribulation = character_cleaner(tribulation_raw, 2)
kastril = character_cleaner(kastril_raw, 2)

In [99]:
the_alchemist_characters = {'Face':face, 'Subtle':subtle, 'Dol':dol, 'Dapper':dapper, 'Drugger':drugger, 'Mammon':mammon, 'Surly': surly, 'Ananias':ananias, 'Tribulation':tribulation, 'Kastril':kastril}

In [100]:
characters_details(the_alchemist_characters)

Face contains 814 sentences and 8561 words
Subtle contains 669 sentences and 7226 words
Dol contains 143 sentences and 1464 words
Dapper contains 86 sentences and 573 words
Drugger contains 44 sentences and 589 words
Mammon contains 276 sentences and 3410 words
Surly contains 139 sentences and 1568 words
Ananias contains 64 sentences and 749 words
Tribulation contains 45 sentences and 631 words
Kastril contains 110 sentences and 860 words


In [223]:
del the_alchemist_characters['Dol']
del the_alchemist_characters['Dapper']
del the_alchemist_characters['Drugger']
del the_alchemist_characters['Ananias']
del the_alchemist_characters['Tribulation']
del the_alchemist_characters['Kastril']

In [224]:
%store the_alchemist_characters

Stored 'the_alchemist_characters' (dict)


#### Macbeth - William Shakespeare

In [102]:
token_paragraphs_m = paragraphs_cleaner("macbeth_characters_shakespeare.txt", squared_braquets, nothing)

In [103]:
# Due to the structural configuration of the file, the split has been made by simple slicing.
macbeth_raw = token_paragraphs_m[:440]
banquo_raw = token_paragraphs_m[440:541]
malcom_raw = token_paragraphs_m[541:663]
ladymacbeth_raw = token_paragraphs_m[663:842]
macduff_raw = token_paragraphs_m[842:1021]
ross_raw = token_paragraphs_m[1021:]

In [104]:
macbeth = speech_filter(macbeth_raw)
banquo = speech_filter(banquo_raw)
malcom = speech_filter(malcom_raw)
ladymacbeth = speech_filter(ladymacbeth_raw)
macduff = speech_filter(macduff_raw)
ross = speech_filter(ross_raw)

In [105]:
macbeth_characters = {'Macbeth':macbeth, 'Banquo':banquo, 'Malcom':malcom, 'Lady Macbeth':ladymacbeth, 'Macduff': macduff, 'Ross':ross}

In [106]:
characters_details(macbeth_characters)

Macbeth contains 350 sentences and 5515 words
Banquo contains 61 sentences and 805 words
Malcom contains 73 sentences and 1543 words
Lady Macbeth contains 135 sentences and 1965 words
Macduff contains 109 sentences and 1209 words
Ross contains 55 sentences and 946 words


In [225]:
del macbeth_characters['Banquo']
del macbeth_characters['Macduff']
del macbeth_characters['Ross']

In [226]:
%store macbeth_characters

Stored 'macbeth_characters' (dict)


#### Romeo And Juliet - William Shakespeare

In [108]:
token_paragraphs_raj = paragraphs_cleaner("romeo_and_juliet_characters_shakespeare.txt", squared_braquets, nothing)

In [109]:
romeo_raw = token_paragraphs_raj[:492]
juliet_raw = token_paragraphs_raj[492:848]
benvolio_raw = token_paragraphs_raj[848:1042]
mercutio_raw = token_paragraphs_raj[1042:1230]
nurse_raw = token_paragraphs_raj[1230:1502]
capulet_raw = token_paragraphs_raj[1502:1657]
ladycapulet_raw = token_paragraphs_raj[1657:]

In [110]:
romeo = speech_filter(romeo_raw)
juliet = speech_filter(juliet_raw)
benvolio = speech_filter(benvolio_raw)
mercutio = speech_filter(mercutio_raw)
nurse = speech_filter(nurse_raw)
capulet = speech_filter(capulet_raw)
ladycapulet = speech_filter(ladycapulet_raw)

In [111]:
romeo_and_juliet_characters = {'Romeo':romeo, 'Juliet':juliet, 'Benvolio': benvolio, 'Nurse':nurse, 'Capulet':capulet, 'Lady Capulet':ladycapulet}

In [112]:
characters_details(romeo_and_juliet_characters)

Romeo contains 335 sentences and 4839 words
Juliet contains 298 sentences and 4414 words
Benvolio contains 77 sentences and 1190 words
Nurse contains 213 sentences and 2300 words
Capulet contains 168 sentences and 2237 words
Lady Capulet contains 76 sentences and 907 words


In [227]:
del romeo_and_juliet_characters['Benvolio']
del romeo_and_juliet_characters['Lady Capulet']

In [228]:
%store romeo_and_juliet_characters

Stored 'romeo_and_juliet_characters' (dict)


#### Othello - William Shakespeare

In [114]:
token_paragraphs_o = paragraphs_cleaner("othello_characters_shakespeare.txt", squared_braquets, nothing)

In [115]:
othello_raw = token_paragraphs_o[:825]
roderigo_raw = token_paragraphs_o[825:1004]
iago_raw = token_paragraphs_o[1004:1822]
cassio_raw = token_paragraphs_o[1822:2154]
desdemona_raw = token_paragraphs_o[2154:2651]
emilia_raw = token_paragraphs_o[2651:]

In [116]:
othello = speech_filter(othello_raw)
roderigo = speech_filter(roderigo_raw)
iago = speech_filter(iago_raw)
cassio = speech_filter(cassio_raw)
desdemona = speech_filter(desdemona_raw)
emilia = speech_filter(emilia_raw)

In [117]:
othello_characters = {'Othello':othello, 'Roderigo':roderigo, 'Iago':iago, 'Cassio':cassio, 'Desdemona':desdemona, 'Emilia':emilia}

In [118]:
characters_details(othello_characters)

Othello contains 539 sentences and 6473 words
Roderigo contains 76 sentences and 881 words
Iago contains 560 sentences and 8623 words
Cassio contains 178 sentences and 2013 words
Desdemona contains 238 sentences and 2830 words
Emilia contains 178 sentences and 1865 words


In [229]:
del othello_characters['Roderigo']

In [230]:
%store othello_characters

Stored 'othello_characters' (dict)


#### Hamlet - William Sahakespeare

In [120]:
token_paragraphs_h =  paragraphs_cleaner("hamlet_characters_shakespeare.txt", squared_braquets, nothing)

In [121]:
hamlet_raw = token_paragraphs_h[:500] + token_paragraphs_h[500:1077]
ophelia_raw = token_paragraphs_h[1077:1253]
polonius_raw = token_paragraphs_h[1253:1513]
claudius_raw = token_paragraphs_h[1513:1821]
horatio_raw = token_paragraphs_h[1821:2150]
laertes_raw = token_paragraphs_h[2150:2338]
gertrude_raw = token_paragraphs_h[2338:]

In [122]:
hamlet = speech_filter(hamlet_raw)
ophelia = speech_filter(ophelia_raw)
polonius = speech_filter(polonius_raw)
claudius = speech_filter(claudius_raw)
horatio = speech_filter(horatio_raw)
laertes = speech_filter(laertes_raw)
gertrude = speech_filter(gertrude_raw)

In [123]:
hamlet_characters = {'Hamlet':hamlet, 'Ophelia':ophelia, 'Polonius': polonius, 'Claudius': claudius, 'Horatio': horatio, 'Laertes': laertes, 'Gertrude':gertrude}

In [124]:
characters_details(hamlet_characters)

Hamlet contains 1020 sentences and 12233 words
Ophelia contains 106 sentences and 1261 words
Polonius contains 218 sentences and 2750 words
Claudius contains 299 sentences and 4234 words
Horatio contains 184 sentences and 2127 words
Laertes contains 131 sentences and 1493 words
Gertrude contains 107 sentences and 1091 words


In [231]:
del hamlet_characters['Ophelia']
del hamlet_characters['Laertes']
del hamlet_characters['Gertrude']

In [232]:
%store hamlet_characters

Stored 'hamlet_characters' (dict)


#### King Lear - William Shakespeare

In [126]:
token_paragraphs_kl = paragraphs_cleaner("king_lear_characters_shakespeare.txt", squared_braquets, nothing)

In [127]:
goneril_raw = token_paragraphs_kl[:161]
edmund_raw = token_paragraphs_kl[161:400]
regan_raw = token_paragraphs_kl[400:621]
lear_raw = token_paragraphs_kl[621:1187]
fool_raw = token_paragraphs_kl[1187:1363]
earlofkent_raw = token_paragraphs_kl[1363:]

In [128]:
goneril = speech_filter(goneril_raw)
edmund = speech_filter(edmund_raw)
regan = speech_filter(regan_raw)
lear = speech_filter(lear_raw)
fool = speech_filter(fool_raw)
earlofkent = speech_filter(earlofkent_raw)

In [129]:
king_lear_characters = {'Goneril':goneril, 'Edmund':edmund, 'Regan':regan,'Lear': lear, 'Fool': fool, 'Earl Of Kent':earlofkent}

In [130]:
characters_details(king_lear_characters)

Goneril contains 128 sentences and 1514 words
Edmund contains 208 sentences and 2441 words
Regan contains 146 sentences and 1460 words
Lear contains 645 sentences and 5875 words
Fool contains 131 sentences and 1827 words
Earl Of Kent contains 237 sentences and 2708 words


In [233]:
del king_lear_characters['Regan']

In [234]:
%store king_lear_characters

Stored 'king_lear_characters' (dict)


## German Plays

#### Kabale und Liebe - Friedrich Schiller

In [132]:
token_paragraphs_kul = paragraphs_cleaner('kabale_und_liebe_schiller.txt', normal_braquets, nothing)

In [133]:
token_paragraphs_kul = token_paragraphs_kul[22:]

In [134]:
präsident_raw = []
ferdinand_raw = []
hofmarschall_raw = []
ladymilford_raw = []
wurm_raw = []
miller_raw = []
frau_raw = []
luise_raw = []
sophie_raw = []
for par in token_paragraphs_kul:
    if  'Präsident' in par[:1] and '.' in par[:2]:
         präsident_raw.append(par)
    if  'Ferdinand' in par[:1] and '.' in par[:2]:
         ferdinand_raw.append(par)
    if  'Hofmarschall' in par[:1] and '.' in par[:2]:
         hofmarschall_raw.append(par)
    if  'Lady' in par[:1] and '.' in par [:2]:
         ladymilford_raw.append(par)
    if  'Wurm' in par[:1] and '.' in par[:2]:
         wurm_raw.append(par)
    if  'Miller' in par[:1] and '.' in par[:2]:
         miller_raw.append(par)
    if  'Frau' in par[:1] and '.' in par[:2]:
         frau_raw.append(par)
    if  'Luise' in par[:1] and '.' in par[:2]:
         luise_raw.append(par)
    if  'Sophie' in par[:1] and '.' in par[:2]:
         sophie_raw.append(par)


In [135]:
präsident = character_cleaner(präsident_raw, 2)
ferdinand = character_cleaner(ferdinand_raw, 2)
hofmarschall = character_cleaner(hofmarschall_raw, 2)
ladymilford = character_cleaner(ladymilford_raw, 2)
wurm = character_cleaner(wurm_raw, 2)
miller = character_cleaner(miller_raw, 2)
frau = character_cleaner(frau_raw, 2)
luise = character_cleaner(luise_raw, 2)
sophie = character_cleaner(sophie_raw, 2)

In [136]:
kabale_und_liebe_characters = {'Präsident':präsident, 'Ferdinand':ferdinand, 'Hofmarschall':hofmarschall, 'Lady Milford':ladymilford, 'Wurm':wurm, 'Miller':miller, 'Frau':frau, 'Luise':luise, 'Sophie':sophie}

In [137]:
characters_details(kabale_und_liebe_characters)

Präsident contains 307 sentences and 2716 words
Ferdinand contains 607 sentences and 5730 words
Hofmarschall contains 155 sentences and 1132 words
Lady Milford contains 271 sentences and 3418 words
Wurm contains 206 sentences and 2221 words
Miller contains 389 sentences and 3586 words
Frau contains 78 sentences and 633 words
Luise contains 537 sentences and 4756 words
Sophie contains 36 sentences and 356 words


In [235]:
del kabale_und_liebe_characters['Hofmarschall']
del kabale_und_liebe_characters['Frau']
del kabale_und_liebe_characters['Sophie']

In [236]:
%store kabale_und_liebe_characters

Stored 'kabale_und_liebe_characters' (dict)


#### Die Verschwoerung des Fiesco zu Genua - Friedrich Schiller

In [139]:
token_paragraphs_dvdfzg = paragraphs_cleaner('die_verschwoerung_des_fiesco_zu_genua_schiller.txt', normal_braquets, nothing)

In [140]:
token_paragraphs_dvdfzg = token_paragraphs_dvdfzg[48:]

In [141]:
andreas_raw = []
gianettino_raw = []
fiesco_raw = []
verrina_raw = []
bourgognino_raw = []
calcagno_raw = []
sacco_raw = []
lomellin_raw = []
mohr_raw = []
for par in token_paragraphs_dvdfzg:
    if  'Andreas' in par[:1] and '.' in par[:2]:
         andreas_raw.append(par)
    if  'Gianettino' in par[:1] and '.' in par[:2]:
         gianettino_raw.append(par)
    if  'Fiesco' in par[:1] and '.' in par[:2]:
         fiesco_raw.append(par)
    if  'Verrina' in par[:1] and '.' in par [:2]:
         verrina_raw.append(par)
    if  'Bourgognino' in par[:1] and '.' in par[:2]:
         bourgognino_raw.append(par)
    if  'Calcagno' in par[:1] and '.' in par[:2]:
         calcagno_raw.append(par)
    if  'Sacco' in par[:1] and '.' in par[:2]:
         sacco_raw.append(par)
    if  'Lomellin' in par[:1] and '.' in par[:2]:
         lomellin_raw.append(par)
    if  'Mohr' in par[:1] and '.' in par[:2]:
         mohr_raw.append(par)
    

In [142]:
andreas = character_cleaner(andreas_raw, 2)
gianettino = character_cleaner(gianettino_raw, 2)
fiesco = character_cleaner(fiesco_raw, 2)
verrina = character_cleaner(verrina_raw, 2)
bourgognino = character_cleaner(bourgognino_raw, 2)
calcagno = character_cleaner(calcagno_raw, 2)
sacco = character_cleaner(sacco_raw, 2)
lomellin  = character_cleaner(lomellin_raw, 2)
mohr = character_cleaner(mohr_raw, 2)

In [143]:
die_verschwoerung_des_fiesco_zu_genua_characters = {'Andreas':andreas, 'Gianettino':gianettino, 'Fiesco':fiesco, 'Verrina':verrina, 'Bourgognino':bourgognino, 'Clacagno':calcagno, 'Sacco':sacco, 'Lomellon':lomellin, 'Mohr':mohr}

In [144]:
characters_details(die_verschwoerung_des_fiesco_zu_genua_characters)

Andreas contains 68 sentences and 569 words
Gianettino contains 172 sentences and 1251 words
Fiesco contains 916 sentences and 7307 words
Verrina contains 307 sentences and 2575 words
Bourgognino contains 135 sentences and 843 words
Clacagno contains 145 sentences and 1186 words
Sacco contains 57 sentences and 484 words
Lomellon contains 99 sentences and 713 words
Mohr contains 260 sentences and 2057 words


In [237]:
del die_verschwoerung_des_fiesco_zu_genua_characters['Andreas']
del die_verschwoerung_des_fiesco_zu_genua_characters['Gianettino']
del die_verschwoerung_des_fiesco_zu_genua_characters['Bourgognino']
del die_verschwoerung_des_fiesco_zu_genua_characters['Clacagno']
del die_verschwoerung_des_fiesco_zu_genua_characters['Sacco']
del die_verschwoerung_des_fiesco_zu_genua_characters['Lomellon']

In [238]:
%store die_verschwoerung_des_fiesco_zu_genua_characters

Stored 'die_verschwoerung_des_fiesco_zu_genua_characters' (dict)


#### Die Räuber - Friedrich Schiller

In [146]:
token_paragraphs_dr = paragraphs_cleaner('die_räuber_schiller.txt', normal_braquets, nothing)

In [147]:
token_paragraphs_dr = token_paragraphs_dr[40:]

In [148]:
karl_raw = []
franz_raw = []
amalia_raw = []
spiegelberg_raw = []
schweizer_raw = []
grimm_raw = []
razmann_raw = []
schufterle_raw = []
roller_raw = []
kosisnsky_raw = []
schwarz_raw = []
moor_raw = []
for par in token_paragraphs_dr:
    if  '~Franz~' in par or '~Franz.~' in par:
         franz_raw.append(par)
    if  '~Amalia~' in par or '~Amalia.~' in par:
         amalia_raw.append(par)
    if  '~Spiegelberg~' in par or '~Spiegelberg.~' in par:
         spiegelberg_raw.append(par)
    if  '~Schweizer~' in par or '~Schweizer.~' in par:
         schweizer_raw.append(par)
    if  '~Grimm~' in par or '~Grimm.~' in par:
         grimm_raw.append(par)
    if  '~Razmann~' in par or '~Razmann.~' in par:
         razmann_raw.append(par)
    if  '~Schufterle~' in par or '~Schufterle.~' in par:
         schufterle_raw.append(par)
    if  '~Roller~' in par or '~Roller.~' in par:
         roller_raw.append(par)
    if  '~Kosinsky~' in par or '~Kosinsky.~' in par:
         kosisnsky_raw.append(par)
    if  '~Schwarz~' in par or '~Schwarz.~' in par:
         schwarz_raw.append(par)
    if  '~Moor~' in par or '~Moor.~' in par:
         moor_raw.append(par)
    

In [149]:
franz = character_cleaner(franz_raw, 1)
amalia = character_cleaner(amalia_raw, 1)
spiegelberg = character_cleaner(spiegelberg_raw, 1)
schweizer = character_cleaner(schweizer_raw, 1)
grimm = character_cleaner(grimm_raw, 1)
razmann = character_cleaner(razmann_raw, 1)
schufterle = character_cleaner(schufterle_raw, 1)
roller = character_cleaner(roller_raw, 1)
kosisnsky = character_cleaner(kosisnsky_raw, 1)
schwarz = character_cleaner(schwarz_raw, 1)
moor = character_cleaner(moor_raw, 1)

In [150]:
die_räuber_characters = { 'Franz': franz, 'Amalia':amalia, 'Spiegelberg':spiegelberg, 'Schweizer':schweizer, 'Grimm':grimm, 'Razmann':razmann, 'Schufterle':schufterle, 'Roller':roller, 'Kosisnky':kosisnsky, 'Schwartz':schwarz, 'Moor':moor}

In [151]:
characters_details(die_räuber_characters)

Franz contains 670 sentences and 7937 words
Amalia contains 269 sentences and 2183 words
Spiegelberg contains 269 sentences and 3863 words
Schweizer contains 166 sentences and 1550 words
Grimm contains 50 sentences and 291 words
Razmann contains 91 sentences and 867 words
Schufterle contains 21 sentences and 303 words
Roller contains 82 sentences and 877 words
Kosisnky contains 69 sentences and 960 words
Schwartz contains 73 sentences and 548 words
Moor contains 621 sentences and 6569 words


In [239]:
del die_räuber_characters['Grimm']
del die_räuber_characters['Razmann']
del die_räuber_characters['Schufterle']
del die_räuber_characters['Roller']
del die_räuber_characters['Kosisnky']
del die_räuber_characters['Schwartz']

In [240]:
%store die_räuber_characters

Stored 'die_räuber_characters' (dict)


#### Die Jungfrau von Orleans - Friedrich Schiller

In [153]:
token_paragraphs_djvo = paragraphs_cleaner('die_jungfrau_von_orleans_schiller.txt', normal_braquets, nothing)

In [154]:
token_paragraphs_djvo = token_paragraphs_djvo[84:]

In [155]:
karl_raw = []
lahire_raw = []
duchatel_raw = []
dunois_raw = []
johanna_raw = []
talbot_raw = []
lionel_raw = []
burgund_raw = []
raimund_raw = []
sorel_raw = []
for par in token_paragraphs_djvo:
    if  'KARL' in par:
         karl_raw.append(par)
    if  'LA' in par:
         lahire_raw.append(par)
    if  'CHATEL' in par:
         duchatel_raw.append(par)
    if  'DUNOIS' in par:
         dunois_raw.append(par)
    if  'JOHANNA' in par:
         johanna_raw.append(par)
    if  'TALBOT' in par:
         talbot_raw.append(par)
    if  'LIONEL' in par:
         lionel_raw.append(par)
    if  'BURGUND' in par:
         burgund_raw.append(par)
    if  'RAIMOND' in par:
         raimund_raw.append(par)
    if  'SOREL' in par:
         sorel_raw.append(par)

In [156]:
karl = character_cleaner(karl_raw, 2)
lahire = character_cleaner(lahire_raw, 3)
dunois = character_cleaner(dunois_raw, 2)
johanna = character_cleaner(johanna_raw, 2)
lionel = character_cleaner(lionel_raw, 2)
burgund = character_cleaner(burgund_raw, 2)
sorel = character_cleaner(sorel_raw, 2)

In [157]:
die_jungfrau_von_orleans_characters = {'Karl':karl, 'La Hire':lahire, 'Dunois':dunois, 'Johanna':johanna, 'Lionel':lionel, 'Burgund':burgund, 'Sorel':sorel}

In [158]:
characters_details(die_jungfrau_von_orleans_characters)

Karl contains 249 sentences and 2556 words
La Hire contains 72 sentences and 794 words
Dunois contains 160 sentences and 1787 words
Johanna contains 474 sentences and 5687 words
Lionel contains 105 sentences and 914 words
Burgund contains 133 sentences and 1179 words
Sorel contains 120 sentences and 1343 words


In [241]:
del die_jungfrau_von_orleans_characters['La Hire']
del die_jungfrau_von_orleans_characters['Lionel']
del die_jungfrau_von_orleans_characters['Burgund']
del die_jungfrau_von_orleans_characters['Sorel']

In [242]:
%store die_jungfrau_von_orleans_characters

Stored 'die_jungfrau_von_orleans_characters' (dict)


#### Faust I - Johann Wolfgang von Goethe

In [160]:
token_paragraphs_f1 = paragraphs_cleaner('faust_1_goethe.txt', normal_braquets, nothing)

In [161]:
token_paragraphs_f1 = token_paragraphs_f1[17:]

In [162]:
faust_raw = []
mephistopheles_raw = []
margarete_raw = []
diehexe_raw = []
marthe_raw = []
wagner_raw = []
for par in token_paragraphs_f1:
    if  'FAUST' in par:
         faust_raw.append(par)
    if  'MEPHISTOPHELES' in par:
         mephistopheles_raw.append(par)
    if  'MARGARETE' in par:
         margarete_raw.append(par)
    if  'DIE' in par:
         diehexe_raw.append(par)


In [163]:
faust = character_cleaner(faust_raw, 2)
mephistopheles = character_cleaner(mephistopheles_raw, 2)
margarete = character_cleaner(margarete_raw, 2)
diehexe = character_cleaner(diehexe_raw, 3)

In [164]:
faust_1_characters = {'Faust':faust, 'Mephistopheles':mephistopheles, 'Margarete':margarete, 'Die Hexe':diehexe}

In [165]:
characters_details(faust_1_characters)

Faust contains 784 sentences and 8550 words
Mephistopheles contains 703 sentences and 7973 words
Margarete contains 234 sentences and 2041 words
Die Hexe contains 59 sentences and 462 words


In [243]:
del faust_1_characters['Die Hexe']

In [244]:
%store faust_1_characters

Stored 'faust_1_characters' (dict)


#### Faust II - Johann Wolfgang von Goethe

In [167]:
token_paragraphs_f2 = paragraphs_cleaner('faust_2_goethe.txt', nothing, nothing)

In [168]:
token_paragraphs_f2 = token_paragraphs_f2[45:]

In [169]:
mephistopheles2_raw = []
kaiser_raw = []
kanzler_raw = []
gemurmel_raw = []
herold_raw = []
plutus_raw = []
heermeister_raw = []
marshalk_raw = []
faust2_raw = []
homunculus_raw = []
phorkyas_raw = []
sirenen_raw = []
helena_raw = []
for par in token_paragraphs_f2:
    if  'MEPHISTOPHELES' in par:
         mephistopheles2_raw.append(par)
    if  'KAISER' in par:
         kaiser_raw.append(par)
    if  'KANZLER' in par:
         kanzler_raw.append(par)
    if  'GEMURMEL' in par:
         gemurmel_raw.append(par)
    if  'HEROLD' in par:
         herold_raw.append(par)
    if  'PLUTUS' in par:
         plutus_raw.append(par)
    if  'HEERMEISTER' in par:
         heermeister_raw.append(par)
    if  'MARSCHALK' in par:
         marshalk_raw.append(par)
    if  'FAUST' in par:
         faust2_raw.append(par)
    if  'HOMUNCULUS' in par:
         homunculus_raw.append(par)
    if  'PHORKYAS' in par:
         phorkyas_raw.append(par)
    if  'SIRENEN' in par:
         sirenen_raw.append(par)
    if  'HELENA' in par:
         helena_raw.append(par)

In [170]:
mephistopheles2 = character_cleaner(mephistopheles2_raw, 2) 
kaiser = character_cleaner(kaiser_raw, 2)
kanzler = character_cleaner(kanzler_raw, 2)
gemurmel = character_cleaner(gemurmel_raw, 2)
herold = character_cleaner(herold_raw, 2)
plutus = character_cleaner(plutus_raw, 2)
heermeister = character_cleaner(heermeister_raw, 2)
marshalk = character_cleaner(marshalk_raw, 2)
faust2 = character_cleaner(faust2_raw, 2)
homunculus = character_cleaner(homunculus_raw, 2)
phorkyas = character_cleaner(phorkyas_raw, 2)
sirenen = character_cleaner(sirenen_raw, 2)
helena = character_cleaner(helena_raw, 2)

In [247]:
faust2_characters = {'Mephistopheles II':mephistopheles2, 'Kaiser':kaiser, 'Kanzler':kanzler, 'Gemurmel':gemurmel, 'Herold':herold, 'Plutus':plutus, 'Heer Meister':heermeister, 'Marcshalk':marshalk, 'Faust II':faust2, 'Homunculus':homunculus, 'Phorkyas':phorkyas, 'Sirenen':sirenen, 'Helena':helena}

In [248]:
characters_details(faust2_characters)

Mephistopheles II contains 655 sentences and 8073 words
Kaiser contains 161 sentences and 2147 words
Kanzler contains 28 sentences and 419 words
Gemurmel contains 9 sentences and 196 words
Herold contains 103 sentences and 1455 words
Plutus contains 40 sentences and 444 words
Heer Meister contains 8 sentences and 148 words
Marcshalk contains 19 sentences and 308 words
Faust II contains 429 sentences and 5306 words
Homunculus contains 75 sentences and 876 words
Phorkyas contains 155 sentences and 2122 words
Sirenen contains 52 sentences and 616 words
Helena contains 149 sentences and 2165 words


In [249]:
del faust2_characters['Kanzler']
del faust2_characters['Gemurmel']
del faust2_characters['Herold']
del faust2_characters['Plutus']
del faust2_characters['Heer Meister']
del faust2_characters['Marcshalk']
del faust2_characters['Homunculus']
del faust2_characters['Sirenen']

In [250]:
%store faust2_characters

Stored 'faust2_characters' (dict)


#### Egmont - Johann Wolfgang von Goethe

In [174]:
token_paragraphs_e = paragraphs_cleaner('egmont_goethe.txt', normal_braquets, nothing)

In [175]:
token_paragraphs_e = token_paragraphs_e[66:]

In [176]:
egmont_raw = []
klärchen_raw = []
mutter_raw = []
machiavell_raw = []
regentin_raw = []
klare_raw = []
brackenburg_raw = []
soest_raw = []
sekretär_raw = []
for par in token_paragraphs_e:
    if  'Egmont' in par[:1] and '.' in par[:2]:
         egmont_raw.append(par)
    if  'Klärchen' in par[:1] and '.' in par[:2]:
         klärchen_raw.append(par)
    if  'Mutter' in par[:1] and '.' in par[:2]:
         mutter_raw.append(par)
    if  'Machiavell' in par[:1] and '.' in par [:2]:
         machiavell_raw.append(par)
    if  'Regentin' in par[:1] and '.' in par[:2]:
         regentin_raw.append(par)
    if  'Klare' in par[:1] and '.' in par[:2]:
         klare_raw.append(par)
    if  'Brackenburg' in par[:1] and '.' in par[:2]:
         brackenburg_raw.append(par)
    if  'Soest' in par[:1] and '.' in par[:2]:
         soest_raw.append(par)
    if  'Sekretär' in par[:1] and '.' in par[:2]:
         sekretär_raw.append(par)

In [177]:
egmont = character_cleaner(egmont_raw,2)
klärchen = character_cleaner(klärchen_raw,2)
mutter = character_cleaner(mutter_raw,2)
machiavell = character_cleaner(machiavell_raw,2)
regentin = character_cleaner(regentin_raw,2)
klare = character_cleaner(klare_raw,2)
brackenburg = character_cleaner(brackenburg_raw,2)
soest = character_cleaner(soest_raw,2)
sekretär = character_cleaner(sekretär_raw,2)

In [178]:
egmont_characters = {'Egmont':egmont, 'Klärchen':klärchen, 'Mutter':mutter, 'Machiavel':machiavell, 'Regentin':regentin, 'Klare':klare, 'Brackenburg':brackenburg, 'Soest':soest, 'Sekretär':sekretär}

In [179]:
characters_details(egmont_characters)

Egmont contains 492 sentences and 5974 words
Klärchen contains 292 sentences and 2440 words
Mutter contains 51 sentences and 386 words
Machiavel contains 60 sentences and 697 words
Regentin contains 133 sentences and 2073 words
Klare contains 83 sentences and 815 words
Brackenburg contains 131 sentences and 1203 words
Soest contains 90 sentences and 645 words
Sekretär contains 62 sentences and 594 words


In [251]:
del egmont_characters['Mutter']
del egmont_characters['Machiavel']
del egmont_characters['Klare']
del egmont_characters['Brackenburg']
del egmont_characters['Soest']
del egmont_characters['Sekretär']

In [252]:
%store egmont_characters

Stored 'egmont_characters' (dict)


#### Iphigenie auf Tauris - Johann Wolfgang von Goethe

In [181]:
token_paragraphs_iat = paragraphs_cleaner('iphigenie_auf_tauris_goethe.txt', normal_braquets, nothing)

In [182]:
token_paragraphs_iat = token_paragraphs_iat[14:]

In [183]:
iphigenie_raw = []
orest_raw = []
thoas_raw = []
pylades_raw = []
arkas_raw = []
for par in token_paragraphs_iat:
    if  'Iphigenie' in par[:1] and '.' in par[:2]:
         iphigenie_raw.append(par)
    if  'Orest' in par[:1] and '.' in par[:2]:
         orest_raw.append(par)
    if  'Thoas' in par[:1] and '.' in par[:2]:
         thoas_raw.append(par)
    if  'Pylades' in par[:1] and '.' in par [:2]:
         pylades_raw.append(par)
    if  'Arkas' in par[:1] and '.' in par[:2]:
         arkas_raw.append(par)
    

In [184]:
iphigenie = character_cleaner(iphigenie_raw, 2)
orest = character_cleaner(orest_raw, 2)
thoas = character_cleaner(thoas_raw, 2)
pylades = character_cleaner(pylades_raw, 2)
arkas = character_cleaner(arkas_raw, 2)

In [185]:
iphigenie_auf_tauris_characters = {'Iphigenie':iphigenie, 'Orest':orest, 'Thoas':thoas, 'Pylades':pylades, 'Arkas':arkas}

In [186]:
characters_details(iphigenie_auf_tauris_characters)

Iphigenie contains 432 sentences and 6371 words
Orest contains 211 sentences and 3032 words
Thoas contains 97 sentences and 1347 words
Pylades contains 174 sentences and 2457 words
Arkas contains 80 sentences and 1240 words


In [253]:
del iphigenie_auf_tauris_characters['Thoas']
del iphigenie_auf_tauris_characters['Arkas']

In [254]:
%store iphigenie_auf_tauris_characters

Stored 'iphigenie_auf_tauris_characters' (dict)


#### Die Laune des Verliebten - Johann Wolfgang von Goethe

In [188]:
token_paragraphs_dldv = paragraphs_cleaner('die_laune_des_verliebten_goethe.txt', squared_braquets, nothing)

In [189]:
token_paragraphs_dldv = token_paragraphs_dldv[14:]

In [190]:
egle_raw = []
amine_raw = []
eridon_raw = []
lamon_raw = []
for par in token_paragraphs_dldv:
    if  'Egle' in par[:1]: 
         egle_raw.append(par)
    if  'Amine' in par[:1]: 
         amine_raw.append(par)
    if  'Eridon' in par[:1]: 
         eridon_raw.append(par)
    if  'Lamon' in par[:1]: 
         lamon_raw.append(par)
   

In [191]:
egle = character_cleaner(egle_raw, 2)
amine = character_cleaner(amine_raw, 2)
eridon = character_cleaner(eridon_raw, 2)
lamon = character_cleaner(lamon_raw, 2)

In [192]:
die_laune_des_verliebten_characters = {'Egle':egle, 'Amine':amine, 'Eridon':eridon, 'Lamon':lamon}

In [193]:
characters_details(die_laune_des_verliebten_characters)

Egle contains 256 sentences and 2434 words
Amine contains 182 sentences and 1309 words
Eridon contains 113 sentences and 910 words
Lamon contains 42 sentences and 302 words


In [255]:
del die_laune_des_verliebten_characters['Amine']
del die_laune_des_verliebten_characters['Eridon']
del die_laune_des_verliebten_characters['Lamon']

In [256]:
%store die_laune_des_verliebten_characters

Stored 'die_laune_des_verliebten_characters' (dict)
