In [58]:
import nltk
from nltk import ngrams
import re
import pandas as pd
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
%matplotlib inline 

In [59]:
%store -r

In [60]:
pd.set_option('display.max_columns', None)

#  Functions

In [61]:
def partition (char_sens, n):
    return [char_sens[i::n] for i in range(n)]

In [62]:
def corpus_split (play_characters, n):
    play_characters_part = {}
    for char in play_characters.keys():
        play_characters_part[char] = partition(play_characters[char], n)
    return play_characters_part  

In [63]:
def split_partitions (play_part):
    part_1 = {}
    part_2 = {}
    part_3 = {}
    part_4 = {}
    part_5 = {}
    for char in play_part:
        for part in char:
            part_1[char] = play_part[char][0] + play_part[char][2] + play_part[char][3] + play_part[char][4]
            part_2[char] = play_part[char][0] + play_part[char][1] + play_part[char][3] + play_part[char][4]
            part_3[char] = play_part[char][0] + play_part[char][1] + play_part[char][2] + play_part[char][4]
            part_4[char] = play_part[char][0] + play_part[char][1] + play_part[char][2] + play_part[char][3]
            part_5[char] = play_part[char][1] + play_part[char][2] + play_part[char][3] + play_part[char][4]
    return part_1, part_2, part_3, part_4, part_5

In [64]:
def split_test (play_part):
    test_1 = {}
    test_2 = {}
    test_3 = {}
    test_4 = {}
    test_5 = {}
    for char in play_part:
        for part in char:
            test_1[char] = play_part[char][1]
            test_2[char] = play_part[char][2]
            test_3[char] = play_part[char][3]
            test_4[char] = play_part[char][4]
            test_5[char] = play_part[char][0]
    return test_1, test_2, test_3, test_4, test_5

In [65]:
def word_tok_no_punct (char):
    result = []
    for sen in char:
        sen = sen.lower()
        output = tokenizer.tokenize(sen)
        result.append(output)
    return result

In [66]:
def sentences_unifier (character):
    output = []
    for sen in character:
        for word in sen:
            output.append(word)
    one_string = " ".join(output)
    return output

In [67]:
def dict_word_tokenizer (play_characters):
    chars = list(play_characters.keys())
    char_tokens = {}
    for char in chars:
        output = word_tok_no_punct(play_characters[char])
        result = sentences_unifier(output)
        char_tokens[char] = result
    return char_tokens

In [68]:
def ngrams_tokenizer (play_characters, n):
    chars = list(play_characters.keys())
    char_bigram_tokens = {}
    for char in chars:
        output = ngrams(play_characters[char],n)
        result = list(output)
        char_bigram_tokens[char] = result
    return char_bigram_tokens

In [69]:
def whole_corpus_generator (partition):
    whole_corpus = []
    for char in partition.keys():
        for word in partition[char]:
            whole_corpus.append(word)
    return whole_corpus

In [13]:
def features_generator (whole_corpus_freq, partition):
    features = [word for word,freq in whole_corpus_freq]
    feature_freqs = {}
    for char in partition:
        feature_freqs[char] = {} 
        overall = len(partition[char])
        for feature in features:
            presence = partition[char].count(feature)
            feature_freqs[char][feature] = presence / overall
    return  feature_freqs

In [14]:
def zscores (df):
    cols = list(df.columns)
    for col in cols:
        if type(col) == tuple:
            join_col = col[0] + '-' + col[1]
            col_zscore = join_col + '_zscore'
        else:        
            col_zscore = col + '_zscore'
        df[col_zscore] = (df[col] - df[col].mean())/df[col].std(ddof=0)
    df = df.drop(cols, axis = 1)
    return df

In [15]:
def delta_distance (play_characters, test_zscores, part_zscores, character):    
    chars = play_characters.keys()
    delta = {}
    for char in chars:
        delta[char] = (abs(test_zscores.loc[character] - part_zscores.loc[char])).sum()/50
    return delta

In [16]:
def get_deltas (writer_characters, test_zscores, train_zscores):
    deltas = {}
    for char in writer_characters.keys():
        result = delta_distance(writer_characters, test_zscores, train_zscores, char)
        deltas[char]= result
        df = pd.DataFrame.from_dict(deltas)
        df = df.reindex(sorted(df.columns), axis=1)
    return df

In [17]:
def model_predictions (plays_data, plays_test, n, plays_characters, column_name):
   
    plays_data_corpus = whole_corpus_generator(plays_data)
    plays_data_corpus_freq = list(nltk.FreqDist(plays_data_corpus).most_common(n))
    plays_data_features = features_generator(plays_data_corpus_freq, plays_data)
    df_plays_data = pd.DataFrame.from_dict(plays_data_features, orient = 'index')
    plays_data_zscores = zscores(df_plays_data)
    plays_test_features = features_generator(plays_data_corpus_freq, plays_test)
    df_plays_test = pd.DataFrame.from_dict(plays_test_features, orient = 'index')
    plays_test_zscores = zscores(df_plays_test)
    deltas = get_deltas(plays_characters, plays_test_zscores, plays_data_zscores)
    predictions = deltas.idxmin()
    df_predictions = pd.DataFrame(predictions, columns = [column_name])
    
    return df_predictions

In [18]:
def success_rate (author_results):
    result = []
    for char in author_results.index:
        output = sum(list(author_results.loc[char] == char))/5
        result.append(output)
    author_results['Results'] = result
    
    return author_results

# Burrows Delta Method 

# Oscar Wilde 

### Characters (+1500 words)

In [19]:
wilde_characters = {**an_ideal_husband_characters, **a_woman_of_no_importance_characters, **lady_windermeres_fan_characters, **the_importance_of_being_earnest_characters}

In [20]:
del wilde_characters['Chasuble']
del wilde_characters['Prism']
del wilde_characters['Augustus']
del wilde_characters['Hester']

In [21]:
len(wilde_characters.keys())

21

In [22]:
wilde_characters_split = corpus_split(wilde_characters, 5)

In [23]:
wilde_characters_partition = split_partitions(wilde_characters_split)
wilde_characters_test = split_test(wilde_characters_split)

In [24]:
wilde_part_1 = dict_word_tokenizer(wilde_characters_partition[0])
wilde_part_2 = dict_word_tokenizer(wilde_characters_partition[1])
wilde_part_3 = dict_word_tokenizer(wilde_characters_partition[2])
wilde_part_4 = dict_word_tokenizer(wilde_characters_partition[3])
wilde_part_5 = dict_word_tokenizer(wilde_characters_partition[4])

In [25]:
wilde_test_1 = dict_word_tokenizer(wilde_characters_test[0])
wilde_test_2 = dict_word_tokenizer(wilde_characters_test[1])
wilde_test_3 = dict_word_tokenizer(wilde_characters_test[2])
wilde_test_4 = dict_word_tokenizer(wilde_characters_test[3])
wilde_test_5 = dict_word_tokenizer(wilde_characters_test[4])

In [26]:
wilde_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_partition[0]), 2)
wilde_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_partition[1]), 2)
wilde_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_partition[2]), 2)
wilde_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_partition[3]), 2)
wilde_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_partition[4]), 2)

In [27]:
wilde_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_test[0]), 2)
wilde_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_test[1]), 2)
wilde_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_test[2]), 2)
wilde_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_test[3]), 2)
wilde_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_test[4]), 2)

### Top 7 characters (+4000 words)

In [244]:
wilde_top_7_characters = ['Goring', 'Chiltern', 'Cheveley', 'Illingorth', 'Lady Windermere', 'Algernon', 'Jack']

In [281]:
wilde_top_7 = {}
for key in wilde_characters:
    if key in wilde_top_7_characters:
        wilde_top_7[key] = wilde_characters[key]

In [282]:
wilde_top_7.keys()

dict_keys(['Goring', 'Chiltern', 'Cheveley', 'Illingorth', 'Lady Windermere', 'Jack', 'Algernon'])

## Partition 1

#### Train z - scores

In [31]:
wilde_part_1_corpus = whole_corpus_generator(wilde_part_1)
wilde_part_1_corpus_freq = list(nltk.FreqDist(wilde_part_1_corpus).most_common(50))
wilde_part_1_features = features_generator(wilde_part_1_corpus_freq, wilde_part_1)
df_wilde_1 = pd.DataFrame.from_dict(wilde_part_1_features, orient = 'index')
wilde_zscores_1 = zscores(df_wilde_1)
wilde_zscores_1;

#### Test z - scores

In [32]:
wilde_test_1_features = features_generator(wilde_part_1_corpus_freq, wilde_test_1)
df_wilde_test_1 = pd.DataFrame.from_dict(wilde_test_1_features, orient = 'index')
wilde_zscores_test_1 = zscores(df_wilde_test_1)
wilde_zscores_test_1;

### Delta Distances

In [33]:
wilde_1_deltas = get_deltas(wilde_characters, wilde_zscores_test_1, wilde_zscores_1)
predictions_wilde_1 = wilde_1_deltas.idxmin()
wilde_df_1 = pd.DataFrame(predictions_wilde_1, columns = ['Part_1'])
wilde_df_1;

In [34]:
wilde_bigram_df_1 = model_predictions(wilde_bigrams_part_1, wilde_bigrams_test_1, 50, wilde_characters, 'Part_1')

## Partition 2

#### Train z- scores

In [35]:
wilde_part_2_corpus = whole_corpus_generator(wilde_part_2)
wilde_part_2_corpus_freq = list(nltk.FreqDist(wilde_part_2_corpus).most_common(50))
wilde_part_2_features = features_generator(wilde_part_2_corpus_freq, wilde_part_2)
df_wilde_2 = pd.DataFrame.from_dict(wilde_part_2_features, orient = 'index')
wilde_zscores_2 = zscores(df_wilde_2)
wilde_zscores_2;

#### Test z - scores

In [36]:
wilde_test_2_features = features_generator(wilde_part_2_corpus_freq, wilde_test_2)
df_wilde_test_2 = pd.DataFrame.from_dict(wilde_test_2_features, orient = 'index')
wilde_zscores_test_2 = zscores(df_wilde_test_2)
wilde_zscores_test_2;

### Delta Distances

In [37]:
wilde_2_deltas = get_deltas(wilde_characters, wilde_zscores_test_2, wilde_zscores_2)
predictions_wilde_2 = wilde_2_deltas.idxmin()
wilde_df_2 = pd.DataFrame(predictions_wilde_2, columns = ['Part_2'])
wilde_df_2;

In [38]:
wilde_bigram_df_2 = model_predictions(wilde_bigrams_part_2, wilde_bigrams_test_2, 50, wilde_characters, 'Part_2')

## Partition 3

#### Train z - scores

In [39]:
wilde_part_3_corpus = whole_corpus_generator(wilde_part_3)
wilde_part_3_corpus_freq = list(nltk.FreqDist(wilde_part_3_corpus).most_common(50))
wilde_part_3_features = features_generator(wilde_part_3_corpus_freq, wilde_part_3)
df_wilde_3 = pd.DataFrame.from_dict(wilde_part_3_features, orient = 'index')
wilde_zscores_3 = zscores(df_wilde_3)
wilde_zscores_3;

#### Test z - scores

In [40]:
wilde_test_3_features = features_generator(wilde_part_3_corpus_freq, wilde_test_3)
df_wilde_test_3 = pd.DataFrame.from_dict(wilde_test_3_features, orient = 'index')
wilde_zscores_test_3 = zscores(df_wilde_test_3)
wilde_zscores_test_3;

### Delta Distances

In [41]:
wilde_3_deltas = get_deltas(wilde_characters, wilde_zscores_test_3, wilde_zscores_3)
predictions_wilde_3 = wilde_3_deltas.idxmin()
wilde_df_3 = pd.DataFrame(predictions_wilde_3, columns = ['Part_3'])
wilde_df_3;

In [42]:
wilde_bigram_df_3 = model_predictions(wilde_bigrams_part_3, wilde_bigrams_test_3, 50, wilde_characters, 'Part_3')

## Partition 4

#### Train z - scores

In [43]:
wilde_part_4_corpus = whole_corpus_generator(wilde_part_4)
wilde_part_4_corpus_freq = list(nltk.FreqDist(wilde_part_4_corpus).most_common(50))
wilde_part_4_features = features_generator(wilde_part_4_corpus_freq, wilde_part_4)
df_wilde_4 = pd.DataFrame.from_dict(wilde_part_4_features, orient = 'index')
wilde_zscores_4 = zscores(df_wilde_4)
wilde_zscores_4;

#### Test z - scores

In [44]:
wilde_test_4_features = features_generator(wilde_part_4_corpus_freq, wilde_test_4)
df_wilde_test_4 = pd.DataFrame.from_dict(wilde_test_4_features, orient = 'index')
wilde_zscores_test_4 = zscores(df_wilde_test_4)
wilde_zscores_test_4;

### Delta Distances

In [45]:
wilde_4_deltas = get_deltas(wilde_characters, wilde_zscores_test_4, wilde_zscores_4)
predictions_wilde_4 = wilde_4_deltas.idxmin()
wilde_df_4 = pd.DataFrame(predictions_wilde_4, columns = ['Part_4'])
wilde_df_4;

In [46]:
wilde_bigram_df_4 = model_predictions(wilde_bigrams_part_4, wilde_bigrams_test_4, 50, wilde_characters, 'Part_4')

## Partition 5

#### Train z - scores

In [47]:
wilde_part_5_corpus = whole_corpus_generator(wilde_part_5)
wilde_part_5_corpus_freq = list(nltk.FreqDist(wilde_part_5_corpus).most_common(50))
wilde_part_5_features = features_generator(wilde_part_5_corpus_freq, wilde_part_5)
df_wilde_5 = pd.DataFrame.from_dict(wilde_part_5_features, orient = 'index')
wilde_zscores_5 = zscores(df_wilde_5)
wilde_zscores_5;

#### Test z - scores

In [48]:
wilde_test_5_features = features_generator(wilde_part_5_corpus_freq, wilde_test_5)
df_wilde_test_5 = pd.DataFrame.from_dict(wilde_test_5_features, orient = 'index')
wilde_zscores_test_5 = zscores(df_wilde_test_5)
wilde_zscores_test_5;

### Delta distances

In [49]:
wilde_5_deltas = get_deltas(wilde_characters, wilde_zscores_test_5, wilde_zscores_5)
predictions_wilde_5 = wilde_5_deltas.idxmin()
wilde_df_5 = pd.DataFrame(predictions_wilde_5, columns = ['Part_5'])
wilde_df_5;

In [50]:
wilde_bigram_df_5 = model_predictions(wilde_bigrams_part_5, wilde_bigrams_test_5, 50, wilde_characters, 'Part_5')

## Oscar Wilde Characters Results

In [179]:
wilde_results = pd.concat([wilde_df_1, wilde_df_2, wilde_df_3, wilde_df_4, wilde_df_5], axis=1)
wilde_results = success_rate(wilde_results)
wilde_results

Unnamed: 0,Part_1,Part_2,Part_3,Part_4,Part_5,Results
Algernon,Jack,Gwendolen,Algernon,Jack,Algernon,0.4
Allonby,Goring,Illingorth,Chiltern,Allonby,Allonby,0.4
Berwick,Hunstanton,Berwick,Hunstanton,Hunstanton,Berwick,0.4
Bracknell,Bracknell,Bracknell,Goring,Bracknell,Hunstanton,0.6
Caversham,Goring,Goring,Caversham,Lady Windermere,Cheveley,0.2
Cecily,Cecily,Algernon,Cecily,Goring,Jack,0.4
Cheveley,Cheveley,Cheveley,Goring,Cheveley,Cheveley,0.8
Chiltern,Chiltern,Cecily,Chiltern,Chiltern,Chiltern,0.8
Darlington,Erlynne,Cheveley,Darlington,Darlington,Goring,0.4
Erlynne,Cecily,Erlynne,Erlynne,Lady Windermere,Goring,0.4


In [180]:
wilde_results['Results'].mean()

0.48571428571428577

In [181]:
wilde_top_7_results = wilde_results.filter(items = wilde_top_7_characters, axis=0)
wilde_top_7_results

Unnamed: 0,Part_1,Part_2,Part_3,Part_4,Part_5,Results
Goring,Illingorth,Goring,Cecily,Goring,Goring,0.6
Chiltern,Chiltern,Cecily,Chiltern,Chiltern,Chiltern,0.8
Cheveley,Cheveley,Cheveley,Goring,Cheveley,Cheveley,0.8
Illingorth,Goring,Goring,Goring,Illingorth,Illingorth,0.4
Lady Windermere,Lady Windermere,Lady Windermere,Lady Windermere,Lady Windermere,Lady Windermere,1.0
Algernon,Jack,Gwendolen,Algernon,Jack,Algernon,0.4
Jack,Goring,Jack,Jack,Jack,Algernon,0.6


In [182]:
wilde_top_7_results['Results'].mean()

0.6571428571428571

In [52]:
wilde_bigram_results = pd.concat([wilde_bigram_df_1, wilde_bigram_df_2, wilde_bigram_df_3, wilde_bigram_df_4, wilde_bigram_df_5], axis =1)
wilde_bigram_results = success_rate(wilde_bigram_results)
wilde_bigram_results['Results'].mean()

0.41904761904761906

# George Bernard Shaw

### Characters (+1500)

In [53]:
shaw_characters = {**pygmalion_characters, **androcles_and_the_lion_characters, **caesar_and_cleopatra_characters, **candida_characters, **man_and_superman_characters}

In [54]:
del shaw_characters['Mrs Higgins']
del shaw_characters['Megaera']
del shaw_characters['Centurion']
del shaw_characters['Spintho']
del shaw_characters['Ferrovius']
del shaw_characters['Pothinus']
del shaw_characters['Ftatateeta']
del shaw_characters['Proserpine']
del shaw_characters['Miss Ramsden']
del shaw_characters['Hector']
del shaw_characters['Straker']
del shaw_characters['Dona Ana']

In [55]:
len(shaw_characters.keys())

23

In [56]:
shaw_characters_split = corpus_split(shaw_characters, 5)

In [57]:
shaw_characters_partition = split_partitions(shaw_characters_split)
shaw_characters_test = split_test(shaw_characters_split)

In [58]:
shaw_part_1 = dict_word_tokenizer(shaw_characters_partition[0])
shaw_part_2 = dict_word_tokenizer(shaw_characters_partition[1])
shaw_part_3 = dict_word_tokenizer(shaw_characters_partition[2])
shaw_part_4 = dict_word_tokenizer(shaw_characters_partition[3])
shaw_part_5 = dict_word_tokenizer(shaw_characters_partition[4])

In [59]:
shaw_test_1 = dict_word_tokenizer(shaw_characters_test[0])
shaw_test_2 = dict_word_tokenizer(shaw_characters_test[1])
shaw_test_3 = dict_word_tokenizer(shaw_characters_test[2])
shaw_test_4 = dict_word_tokenizer(shaw_characters_test[3])
shaw_test_5 = dict_word_tokenizer(shaw_characters_test[4])

In [60]:
shaw_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_partition[0]), 2)
shaw_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_partition[1]), 2)
shaw_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_partition[2]), 2)
shaw_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_partition[3]), 2)
shaw_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_partition[4]), 2)

In [61]:
shaw_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_test[0]), 2)
shaw_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_test[1]), 2)
shaw_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_test[2]), 2)
shaw_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_test[3]), 2)
shaw_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_test[4]), 2)

### Top 7 characters (+4000 words)

In [62]:
shaw_top_7_characters = ['Higgins', 'Liza', 'Caesar', 'Cleopatra', 'Morell', 'Tanner', 'Don Juan']

In [279]:
shaw_top_7 = {}
for key in shaw_characters:
    if key in shaw_top_7_characters:
        shaw_top_7[key] = shaw_characters[key]

## Partition 1

#### Train z - scores

In [65]:
shaw_part_1_corpus = whole_corpus_generator(shaw_part_1)
shaw_part_1_corpus_freq = list(nltk.FreqDist(shaw_part_1_corpus).most_common(50))
shaw_part_1_features = features_generator(shaw_part_1_corpus_freq, shaw_part_1)
df_shaw_1 = pd.DataFrame.from_dict(shaw_part_1_features, orient = 'index')
shaw_zscores_1 = zscores(df_shaw_1)
shaw_zscores_1;

#### Tests z - scores

In [66]:
shaw_test_1_features = features_generator(shaw_part_1_corpus_freq, shaw_test_1)
df_shaw_test_1 = pd.DataFrame.from_dict(shaw_test_1_features, orient = 'index')
shaw_zscores_test_1 = zscores(df_shaw_test_1)
shaw_zscores_test_1;

### Delta Distances

In [67]:
shaw_1_deltas = get_deltas(shaw_characters, shaw_zscores_test_1, shaw_zscores_1)
predictions_shaw_1 = shaw_1_deltas.idxmin()
shaw_df_1 = pd.DataFrame(predictions_shaw_1, columns = ['Part_1'])
shaw_df_1;

In [68]:
shaw_bigram_df_1 = model_predictions(shaw_bigrams_part_1, shaw_bigrams_test_1, 50, shaw_characters, 'Part_1')
shaw_bigram_df_1;

## Partition 2

#### Train z - scores

In [69]:
shaw_part_2_corpus = whole_corpus_generator(shaw_part_2)
shaw_part_2_corpus_freq = list(nltk.FreqDist(shaw_part_2_corpus).most_common(50))
shaw_part_2_features = features_generator(shaw_part_2_corpus_freq, shaw_part_2)
df_shaw_2 = pd.DataFrame.from_dict(shaw_part_2_features, orient = 'index')
shaw_zscores_2 = zscores(df_shaw_2)
shaw_zscores_2;

#### Test z - scores

In [70]:
shaw_test_2_features = features_generator(shaw_part_2_corpus_freq, shaw_test_2)
df_shaw_test_2 = pd.DataFrame.from_dict(shaw_test_2_features, orient = 'index')
shaw_zscores_test_2 = zscores(df_shaw_test_2)
shaw_zscores_test_2;

### Delta Distances

In [71]:
shaw_2_deltas = get_deltas(shaw_characters, shaw_zscores_test_2, shaw_zscores_2)
predictions_shaw_2 = shaw_2_deltas.idxmin()
shaw_df_2 = pd.DataFrame(predictions_shaw_2, columns = ['Part_2'])
shaw_df_2;

In [72]:
shaw_bigram_df_2 = model_predictions(shaw_bigrams_part_2, shaw_bigrams_test_2, 50, shaw_characters, 'Part_2')
shaw_bigram_df_2;

## Partition 3

#### Train z - scores

In [73]:
shaw_part_3_corpus = whole_corpus_generator(shaw_part_3)
shaw_part_3_corpus_freq = list(nltk.FreqDist(shaw_part_3_corpus).most_common(50))
shaw_part_3_features = features_generator(shaw_part_3_corpus_freq, shaw_part_3)
df_shaw_3 = pd.DataFrame.from_dict(shaw_part_3_features, orient = 'index')
shaw_zscores_3 = zscores(df_shaw_3)
shaw_zscores_3;

#### Test z - scores

In [74]:
shaw_test_3_features = features_generator(shaw_part_3_corpus_freq, shaw_test_3)
df_shaw_test_3 = pd.DataFrame.from_dict(shaw_test_3_features, orient = 'index')
shaw_zscores_test_3 = zscores(df_shaw_test_3)
shaw_zscores_test_3;

### Delta Distances

In [75]:
shaw_3_deltas = get_deltas(shaw_characters, shaw_zscores_test_3, shaw_zscores_3)
predictions_shaw_3 = shaw_3_deltas.idxmin()
shaw_df_3 = pd.DataFrame(predictions_shaw_3, columns = ['Part_3'])
shaw_df_3;

In [76]:
shaw_bigram_df_3 = model_predictions(shaw_bigrams_part_3, shaw_bigrams_test_3, 50, shaw_characters, 'Part_3')
shaw_bigram_df_3;

## Partition 4

#### Train z - scores

In [77]:
shaw_part_4_corpus = whole_corpus_generator(shaw_part_4)
shaw_part_4_corpus_freq = list(nltk.FreqDist(shaw_part_4_corpus).most_common(50))
shaw_part_4_features = features_generator(shaw_part_4_corpus_freq, shaw_part_4)
df_shaw_4 = pd.DataFrame.from_dict(shaw_part_4_features, orient = 'index')
shaw_zscores_4 = zscores(df_shaw_4)
shaw_zscores_4;

#### Test z - scores

In [78]:
shaw_test_4_features = features_generator(shaw_part_4_corpus_freq, shaw_test_4)
df_shaw_test_4 = pd.DataFrame.from_dict(shaw_test_4_features, orient = 'index')
shaw_zscores_test_4 = zscores(df_shaw_test_4)
shaw_zscores_test_4;

### Delta Distances

In [79]:
shaw_4_deltas = get_deltas(shaw_characters, shaw_zscores_test_4, shaw_zscores_4)
predictions_shaw_4 = shaw_4_deltas.idxmin()
shaw_df_4 = pd.DataFrame(predictions_shaw_4, columns = ['Part_4'])
shaw_df_4;

In [80]:
shaw_bigram_df_4 = model_predictions(shaw_bigrams_part_4, shaw_bigrams_test_4, 50, shaw_characters, 'Part_4')
shaw_bigram_df_4;

## Partition 5

#### Train z - scores

In [81]:
shaw_part_5_corpus = whole_corpus_generator(shaw_part_5)
shaw_part_5_corpus_freq = list(nltk.FreqDist(shaw_part_5_corpus).most_common(50))
shaw_part_5_features = features_generator(shaw_part_5_corpus_freq, shaw_part_5)
df_shaw_5 = pd.DataFrame.from_dict(shaw_part_5_features, orient = 'index')
shaw_zscores_5 = zscores(df_shaw_5)
shaw_zscores_5;

#### Test z - scores

In [82]:
shaw_test_5_features = features_generator(shaw_part_5_corpus_freq, shaw_test_5)
df_shaw_test_5 = pd.DataFrame.from_dict(shaw_test_5_features, orient = 'index')
shaw_zscores_test_5 = zscores(df_shaw_test_5)
shaw_zscores_test_5;

### Delta Distances

In [83]:
shaw_5_deltas = get_deltas(shaw_characters, shaw_zscores_test_5, shaw_zscores_5)
predictions_shaw_5 = shaw_5_deltas.idxmin()
shaw_df_5 = pd.DataFrame(predictions_shaw_5, columns = ['Part_5'])
shaw_df_5;

In [84]:
shaw_bigram_df_5 = model_predictions(shaw_bigrams_part_5, shaw_bigrams_test_5, 50, shaw_characters, 'Part_5')
shaw_bigram_df_5;

## George Bernard Shaw Characters Results

In [185]:
shaw_results = pd.concat([shaw_df_1, shaw_df_2, shaw_df_3, shaw_df_4, shaw_df_5], axis=1)
shaw_results = success_rate(shaw_results)
shaw_results

Unnamed: 0,Part_1,Part_2,Part_3,Part_4,Part_5,Results
Androcles,Tanner,Androcles,Lavinia,Morell,Androcles,0.4
Ann,Ann,Ann,Ann,Ann,Ann,1.0
Apollodorus,Don Juan,Mendoza,Apollodorus,Mendoza,Apollodorus,0.4
Burgess,Morell,Burgess,Burgess,Burgess,Burgess,0.8
Caesar,Caesar,Caesar,Caesar,Caesar,Caesar,1.0
Candida,Morell,Candida,Candida,Candida,Candida,0.8
Captain,Captain,Caesar,Captain,Caesar,Captain,0.6
Cleopatra,Cleopatra,Cleopatra,Cleopatra,Cleopatra,Cleopatra,1.0
Don Juan,Don Juan,Don Juan,Don Juan,Don Juan,Don Juan,1.0
Doolittle,Doolittle,Doolittle,Doolittle,Doolittle,Tanner,0.8


In [184]:
shaw_results['Results'].mean()

0.6695652173913043

In [183]:
shaw_top_7_results = shaw_results.filter(items = shaw_top_7_characters, axis=0)
shaw_top_7_results

Unnamed: 0,Part_1,Part_2,Part_3,Part_4,Part_5,Results
Higgins,Higgins,Higgins,Higgins,Higgins,Higgins,1.0
Liza,Liza,Liza,Liza,Liza,Liza,1.0
Caesar,Caesar,Caesar,Caesar,Caesar,Caesar,1.0
Cleopatra,Cleopatra,Cleopatra,Cleopatra,Cleopatra,Cleopatra,1.0
Morell,Tanner,Morell,Morell,Morell,Morell,0.8
Tanner,Tanner,Tanner,Tanner,Tanner,Tanner,1.0
Don Juan,Don Juan,Don Juan,Don Juan,Don Juan,Don Juan,1.0


In [186]:
shaw_top_7_results['Results'].mean()

0.9714285714285714

In [86]:
shaw_bigram_results = pd.concat([shaw_bigram_df_1, shaw_bigram_df_2, shaw_bigram_df_3, shaw_bigram_df_4, shaw_bigram_df_5], axis =1)
shaw_bigram_results = success_rate(shaw_bigram_results)
shaw_bigram_results['Results'].mean()

0.32173913043478264

# Ben Jonson

### Characters (+1500 words)

In [87]:
jonson_characters = {**cynthias_revels_characters, **every_man_on_his_humour_characters, **volpone_or_the_fox_characters, **the_alchemist_characters}

In [88]:
del jonson_characters['Echo']
del jonson_characters['Hedon']
del jonson_characters['Arete']
del jonson_characters['Philautia']
del jonson_characters['Cyntia']
del jonson_characters['Mathew']
del jonson_characters['Tib']
del jonson_characters['Cash']
del jonson_characters['Downright']
del jonson_characters['Dame Kitely']
del jonson_characters['Nano']
del jonson_characters['Androgyno']
del jonson_characters['Voltore']
del jonson_characters['Corbaccio']
del jonson_characters['Peregrine']
del jonson_characters['Bonario']
del jonson_characters['Lady Would-be']
del jonson_characters['Dol']
del jonson_characters['Dapper']
del jonson_characters['Drugger']
del jonson_characters['Ananias']
del jonson_characters['Tribulation']
del jonson_characters['Kastril']

In [89]:
len(jonson_characters.keys())

22

In [90]:
jonson_characters_split = corpus_split(jonson_characters, 5)

In [91]:
jonson_characters_partition = split_partitions(jonson_characters_split)
jonson_characters_test = split_test(jonson_characters_split)

In [92]:
jonson_part_1 = dict_word_tokenizer(jonson_characters_partition[0])
jonson_part_2 = dict_word_tokenizer(jonson_characters_partition[1])
jonson_part_3 = dict_word_tokenizer(jonson_characters_partition[2])
jonson_part_4 = dict_word_tokenizer(jonson_characters_partition[3])
jonson_part_5 = dict_word_tokenizer(jonson_characters_partition[4])

In [93]:
jonson_test_1 = dict_word_tokenizer(jonson_characters_test[0])
jonson_test_2 = dict_word_tokenizer(jonson_characters_test[1])
jonson_test_3 = dict_word_tokenizer(jonson_characters_test[2])
jonson_test_4 = dict_word_tokenizer(jonson_characters_test[3])
jonson_test_5 = dict_word_tokenizer(jonson_characters_test[4])

In [94]:
jonson_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_partition[0]), 2)
jonson_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_partition[1]), 2)
jonson_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_partition[2]), 2)
jonson_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_partition[3]), 2)
jonson_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_partition[4]), 2)

In [95]:
jonson_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_test[0]), 2)
jonson_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_test[1]), 2)
jonson_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_test[2]), 2)
jonson_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_test[3]), 2)
jonson_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_test[4]), 2)

###  Top 7 characters (+4000 words)

In [96]:
jonson_top_7_characters = ['Mercury', 'Amorphus', 'Crites', 'Volpone', 'Mosca', 'Face', 'Subtle']

In [270]:
jonson_top_7 = {}
for key in jonson_characters:
    if key in jonson_top_7_characters:
        jonson_top_7[key] = jonson_characters[key]

## Partition 1

#### Train z - sccores

In [99]:
jonson_part_1_corpus = whole_corpus_generator(jonson_part_1)
jonson_part_1_corpus_freq = list(nltk.FreqDist(jonson_part_1_corpus).most_common(50))
jonson_part_1_features = features_generator(jonson_part_1_corpus_freq, jonson_part_1)
df_jonson_1 = pd.DataFrame.from_dict(jonson_part_1_features, orient = 'index')
jonson_zscores_1 = zscores(df_jonson_1)
jonson_zscores_1;

#### Test z - scores

In [100]:
jonson_test_1_features = features_generator(jonson_part_1_corpus_freq, jonson_test_1)
df_jonson_test_1 = pd.DataFrame.from_dict(jonson_test_1_features, orient = 'index')
jonson_zscores_test_1 = zscores(df_jonson_test_1)
jonson_zscores_test_1;

### Delta Distances

In [101]:
jonson_1_deltas = get_deltas(jonson_characters, jonson_zscores_test_1, jonson_zscores_1)
predictions_jonson_1 = jonson_1_deltas.idxmin()
jonson_df_1 = pd.DataFrame(predictions_jonson_1, columns = ['Part_1'])
jonson_df_1;

In [102]:
jonson_bigram_df_1 = model_predictions(jonson_bigrams_part_1, jonson_bigrams_test_1, 50, jonson_characters, 'Part_1')
jonson_bigram_df_1;

## Partition 2

#### Train z - scores

In [103]:
jonson_part_2_corpus = whole_corpus_generator(jonson_part_2)
jonson_part_2_corpus_freq = list(nltk.FreqDist(jonson_part_2_corpus).most_common(50))
jonson_part_2_features = features_generator(jonson_part_2_corpus_freq, jonson_part_2)
df_jonson_2 = pd.DataFrame.from_dict(jonson_part_2_features, orient = 'index')
jonson_zscores_2 = zscores(df_jonson_2)
jonson_zscores_2;

#### Test z - scores

In [104]:
jonson_test_2_features = features_generator(jonson_part_2_corpus_freq, jonson_test_2)
df_jonson_test_2 = pd.DataFrame.from_dict(jonson_test_2_features, orient = 'index')
jonson_zscores_test_2 = zscores(df_jonson_test_2)
jonson_zscores_test_2;

### Delta Distances

In [105]:
jonson_2_deltas = get_deltas(jonson_characters, jonson_zscores_test_2, jonson_zscores_2)
predictions_jonson_2 = jonson_2_deltas.idxmin()
jonson_df_2 = pd.DataFrame(predictions_jonson_2, columns = ['Part_2'])
jonson_df_2;

In [106]:
jonson_bigram_df_2 = model_predictions(jonson_bigrams_part_2, jonson_bigrams_test_2, 50, jonson_characters, 'Part_2')
jonson_bigram_df_2;

## Partition 3

#### Train z - scores

In [107]:
jonson_part_3_corpus = whole_corpus_generator(jonson_part_3)
jonson_part_3_corpus_freq = list(nltk.FreqDist(jonson_part_3_corpus).most_common(50))
jonson_part_3_features = features_generator(jonson_part_3_corpus_freq, jonson_part_3)
df_jonson_3 = pd.DataFrame.from_dict(jonson_part_3_features, orient = 'index')
jonson_zscores_3 = zscores(df_jonson_3)
jonson_zscores_3;

#### Test z - scores

In [108]:
jonson_test_3_features = features_generator(jonson_part_3_corpus_freq, jonson_test_3)
df_jonson_test_3 = pd.DataFrame.from_dict(jonson_test_3_features, orient = 'index')
jonson_zscores_test_3 = zscores(df_jonson_test_3)
jonson_zscores_test_3;

### Delta Distances

In [109]:
jonson_3_deltas = get_deltas(jonson_characters, jonson_zscores_test_3, jonson_zscores_3)
predictions_jonson_3 = jonson_3_deltas.idxmin()
jonson_df_3 = pd.DataFrame(predictions_jonson_3, columns = ['Part_3'])
jonson_df_3;

In [110]:
jonson_bigram_df_3 = model_predictions(jonson_bigrams_part_3, jonson_bigrams_test_3, 50, jonson_characters, 'Part_3')
jonson_bigram_df_3;

## Partition 4

#### Train z - scores

In [111]:
jonson_part_4_corpus = whole_corpus_generator(jonson_part_4)
jonson_part_4_corpus_freq = list(nltk.FreqDist(jonson_part_4_corpus).most_common(50))
jonson_part_4_features = features_generator(jonson_part_4_corpus_freq, jonson_part_4)
df_jonson_4 = pd.DataFrame.from_dict(jonson_part_4_features, orient = 'index')
jonson_zscores_4 = zscores(df_jonson_4)
jonson_zscores_4;

#### Test z - scores

In [112]:
jonson_test_4_features = features_generator(jonson_part_4_corpus_freq, jonson_test_4)
df_jonson_test_4 = pd.DataFrame.from_dict(jonson_test_4_features, orient = 'index')
jonson_zscores_test_4 = zscores(df_jonson_test_4)
jonson_zscores_test_4;

### Delta Distances

In [113]:
jonson_4_deltas = get_deltas(jonson_characters, jonson_zscores_test_4, jonson_zscores_4)
predictions_jonson_4 = jonson_4_deltas.idxmin()
jonson_df_4 = pd.DataFrame(predictions_jonson_4, columns = ['Part_4'])
jonson_df_4;

In [114]:
jonson_bigram_df_4 = model_predictions(jonson_bigrams_part_4, jonson_bigrams_test_4, 50, jonson_characters, 'Part_4')
jonson_bigram_df_4;

## Partition 5

#### Train z - scores

In [115]:
jonson_part_5_corpus = whole_corpus_generator(jonson_part_5)
jonson_part_5_corpus_freq = list(nltk.FreqDist(jonson_part_5_corpus).most_common(50))
jonson_part_5_features = features_generator(jonson_part_5_corpus_freq, jonson_part_5)
df_jonson_5 = pd.DataFrame.from_dict(jonson_part_5_features, orient = 'index')
jonson_zscores_5 = zscores(df_jonson_5)
jonson_zscores_5;

#### Test z - scores

In [116]:
jonson_test_5_features = features_generator(jonson_part_5_corpus_freq, jonson_test_5)
df_jonson_test_5 = pd.DataFrame.from_dict(jonson_test_5_features, orient = 'index')
jonson_zscores_test_5 = zscores(df_jonson_test_5)
jonson_zscores_test_5;

### Delta Distances

In [117]:
jonson_5_deltas = get_deltas(jonson_characters, jonson_zscores_test_5, jonson_zscores_5)
predictions_jonson_5 = jonson_5_deltas.idxmin()
jonson_df_5 = pd.DataFrame(predictions_jonson_5, columns = ['Part_5'])
jonson_df_5;

In [118]:
jonson_bigram_df_5 = model_predictions(jonson_bigrams_part_5, jonson_bigrams_test_5, 50, jonson_characters, 'Part_5')
jonson_bigram_df_5;

## Ben Jonson Characters Results

In [188]:
jonson_results = pd.concat([jonson_df_1, jonson_df_2, jonson_df_3, jonson_df_4, jonson_df_5], axis=1)
jonson_results = success_rate(jonson_results)
jonson_results;

In [189]:
jonson_results['Results'].mean()

0.49090909090909096

In [190]:
jonson_top_7_results = jonson_results.filter(items = jonson_top_7_characters, axis=0)
jonson_top_7_results

Unnamed: 0,Part_1,Part_2,Part_3,Part_4,Part_5,Results
Mercury,Mercury,Mercury,Mercury,Mercury,Mercury,1.0
Amorphus,Amorphus,Amorphus,Amorphus,Amorphus,Subtle,0.8
Crites,Crites,Volpone,Mercury,Crites,Mercury,0.4
Volpone,Volpone,Volpone,Volpone,Volpone,Volpone,1.0
Mosca,Mosca,Mosca,Mosca,Mosca,Mosca,1.0
Face,Face,Face,Face,Face,Face,1.0
Subtle,Subtle,Subtle,Subtle,Subtle,Subtle,1.0


In [191]:
jonson_top_7_results['Results'].mean()

0.8857142857142858

In [120]:
jonson_bigram_results = pd.concat([jonson_bigram_df_1, jonson_bigram_df_2, jonson_bigram_df_3, jonson_bigram_df_4, jonson_bigram_df_5], axis =1)
jonson_bigram_results = success_rate(jonson_bigram_results)
jonson_bigram_results['Results'].mean()

0.21818181818181817

# William Shakespeare

### Characters (+1500 words)

In [121]:
shakespeare_characters = {**macbeth_characters, **romeo_and_juliet_characters, **othello_characters, **hamlet_characters, **king_lear_characters}

In [122]:
del shakespeare_characters['Banquo']
del shakespeare_characters['Macduff']
del shakespeare_characters['Ross']
del shakespeare_characters['Benvolio']
del shakespeare_characters['Lady Capulet']
del shakespeare_characters['Roderigo']
del shakespeare_characters['Ophelia']
del shakespeare_characters['Laertes']
del shakespeare_characters['Gertrude']
del shakespeare_characters['Regan']

In [123]:
len(shakespeare_characters.keys())

21

In [124]:
shakespeare_characters_split = corpus_split(shakespeare_characters, 5)

In [125]:
shakespeare_characters_partition = split_partitions(shakespeare_characters_split)
shakespeare_characters_test = split_test(shakespeare_characters_split)

In [126]:
shakespeare_part_1 = dict_word_tokenizer(shakespeare_characters_partition[0])
shakespeare_part_2 = dict_word_tokenizer(shakespeare_characters_partition[1])
shakespeare_part_3 = dict_word_tokenizer(shakespeare_characters_partition[2])
shakespeare_part_4 = dict_word_tokenizer(shakespeare_characters_partition[3])
shakespeare_part_5 = dict_word_tokenizer(shakespeare_characters_partition[4])

In [127]:
shakespeare_test_1 = dict_word_tokenizer(shakespeare_characters_test[0])
shakespeare_test_2 = dict_word_tokenizer(shakespeare_characters_test[1])
shakespeare_test_3 = dict_word_tokenizer(shakespeare_characters_test[2])
shakespeare_test_4 = dict_word_tokenizer(shakespeare_characters_test[3])
shakespeare_test_5 = dict_word_tokenizer(shakespeare_characters_test[4])

In [128]:
shakespeare_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_partition[0]), 2)
shakespeare_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_partition[1]), 2)
shakespeare_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_partition[2]), 2)
shakespeare_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_partition[3]), 2)
shakespeare_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_partition[4]), 2)

In [129]:
shakespeare_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_test[0]), 2)
shakespeare_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_test[1]), 2)
shakespeare_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_test[2]), 2)
shakespeare_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_test[3]), 2)
shakespeare_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_test[4]), 2)

### Top 7 characters (+4000)

In [130]:
shakespeare_top_7_characters = ['Macbeth', 'Romeo', 'Juliet', 'Othello', 'Iago', 'Hamlet', 'Lear']

In [276]:
shakespeare_top_7 = {}
for key in shakespeare_characters:
    if key in shakespeare_top_7_characters:
        shakespeare_top_7[key] = shakespeare_characters[key]

## Partition 1

#### Train z - scores

In [133]:
shakespeare_part_1_corpus = whole_corpus_generator(shakespeare_part_1)
shakespeare_part_1_corpus_freq = list(nltk.FreqDist(shakespeare_part_1_corpus).most_common(50))
shakespeare_part_1_features = features_generator(shakespeare_part_1_corpus_freq, shakespeare_part_1)
df_shakespeare_1 = pd.DataFrame.from_dict(shakespeare_part_1_features, orient = 'index')
shakespeare_zscores_1 = zscores(df_shakespeare_1)
shakespeare_zscores_1;

#### Test z - scores

In [134]:
shakespeare_test_1_features = features_generator(shakespeare_part_1_corpus_freq, shakespeare_test_1)
df_shakespeare_test_1 = pd.DataFrame.from_dict(shakespeare_test_1_features, orient = 'index')
shakespeare_zscores_test_1 = zscores(df_shakespeare_test_1)
shakespeare_zscores_test_1;

### Delta Distances

In [135]:
shakespeare_1_deltas = get_deltas(shakespeare_characters, shakespeare_zscores_test_1, shakespeare_zscores_1)
predictions_shakespeare_1 = shakespeare_1_deltas.idxmin()
shakespeare_df_1 = pd.DataFrame(predictions_shakespeare_1, columns = ['Part_1'])
shakespeare_df_1;

In [136]:
shakespeare_bigram_df_1 = model_predictions(shakespeare_bigrams_part_5, shakespeare_bigrams_test_5, 50, shakespeare_characters, 'Part_1')
shakespeare_bigram_df_1;

## Partition 2

#### Train z - scores

In [137]:
shakespeare_part_2_corpus = whole_corpus_generator(shakespeare_part_2)
shakespeare_part_2_corpus_freq = list(nltk.FreqDist(shakespeare_part_2_corpus).most_common(50))
shakespeare_part_2_features = features_generator(shakespeare_part_2_corpus_freq, shakespeare_part_2)
df_shakespeare_2 = pd.DataFrame.from_dict(shakespeare_part_2_features, orient = 'index')
shakespeare_zscores_2 = zscores(df_shakespeare_2)
shakespeare_zscores_2;

#### Test z - scores

In [138]:
shakespeare_test_2_features = features_generator(shakespeare_part_2_corpus_freq, shakespeare_test_2)
df_shakespeare_test_2 = pd.DataFrame.from_dict(shakespeare_test_2_features, orient = 'index')
shakespeare_zscores_test_2 = zscores(df_shakespeare_test_2)
shakespeare_zscores_test_2;

### Delta Distances

In [139]:
shakespeare_2_deltas = get_deltas(shakespeare_characters, shakespeare_zscores_test_2, shakespeare_zscores_2)
predictions_shakespeare_2 = shakespeare_2_deltas.idxmin()
shakespeare_df_2 = pd.DataFrame(predictions_shakespeare_2, columns = ['Part_2'])
shakespeare_df_2;

In [140]:
shakespeare_bigram_df_2 = model_predictions(shakespeare_bigrams_part_2, shakespeare_bigrams_test_2, 50, shakespeare_characters, 'Part_2')
shakespeare_bigram_df_2;

## Partition 3

#### Train z - scores

In [141]:
shakespeare_part_3_corpus = whole_corpus_generator(shakespeare_part_3)
shakespeare_part_3_corpus_freq = list(nltk.FreqDist(shakespeare_part_3_corpus).most_common(50))
shakespeare_part_3_features = features_generator(shakespeare_part_3_corpus_freq, shakespeare_part_3)
df_shakespeare_3 = pd.DataFrame.from_dict(shakespeare_part_3_features, orient = 'index')
shakespeare_zscores_3 = zscores(df_shakespeare_3)
shakespeare_zscores_3;

#### Test z - scores

In [142]:
shakespeare_test_3_features = features_generator(shakespeare_part_3_corpus_freq, shakespeare_test_3)
df_shakespeare_test_3 = pd.DataFrame.from_dict(shakespeare_test_3_features, orient = 'index')
shakespeare_zscores_test_3 = zscores(df_shakespeare_test_3)
shakespeare_zscores_test_3;

### Delta Distances

In [143]:
shakespeare_3_deltas = get_deltas(shakespeare_characters, shakespeare_zscores_test_3, shakespeare_zscores_3)
predictions_shakespeare_3 = shakespeare_3_deltas.idxmin()
shakespeare_df_3 = pd.DataFrame(predictions_shakespeare_3, columns = ['Part_3'])
shakespeare_df_3;

In [144]:
shakespeare_bigram_df_3 = model_predictions(shakespeare_bigrams_part_3, shakespeare_bigrams_test_3, 50, shakespeare_characters, 'Part_3')
shakespeare_bigram_df_3;

## Partition 4

#### Train z - scores

In [145]:
shakespeare_part_4_corpus = whole_corpus_generator(shakespeare_part_4)
shakespeare_part_4_corpus_freq = list(nltk.FreqDist(shakespeare_part_4_corpus).most_common(50))
shakespeare_part_4_features = features_generator(shakespeare_part_4_corpus_freq, shakespeare_part_4)
df_shakespeare_4 = pd.DataFrame.from_dict(shakespeare_part_4_features, orient = 'index')
shakespeare_zscores_4 = zscores(df_shakespeare_4)
shakespeare_zscores_4;

#### Test z - scores

In [146]:
shakespeare_test_4_features = features_generator(shakespeare_part_4_corpus_freq, shakespeare_test_4)
df_shakespeare_test_4 = pd.DataFrame.from_dict(shakespeare_test_4_features, orient = 'index')
shakespeare_zscores_test_4 = zscores(df_shakespeare_test_4)
shakespeare_zscores_test_4;

### Delta Distances

In [147]:
shakespeare_4_deltas = get_deltas(shakespeare_characters, shakespeare_zscores_test_4, shakespeare_zscores_4)
predictions_shakespeare_4 = shakespeare_4_deltas.idxmin()
shakespeare_df_4 = pd.DataFrame(predictions_shakespeare_4, columns = ['Part_4'])
shakespeare_df_4;

In [148]:
shakespeare_bigram_df_4 = model_predictions(shakespeare_bigrams_part_4, shakespeare_bigrams_test_4, 50, shakespeare_characters, 'Part_4')
shakespeare_bigram_df_4;

## Partition 5

#### Train z - scores

In [149]:
shakespeare_part_5_corpus = whole_corpus_generator(shakespeare_part_5)
shakespeare_part_5_corpus_freq = list(nltk.FreqDist(shakespeare_part_5_corpus).most_common(50))
shakespeare_part_5_features = features_generator(shakespeare_part_5_corpus_freq, shakespeare_part_5)
df_shakespeare_5 = pd.DataFrame.from_dict(shakespeare_part_5_features, orient = 'index')
shakespeare_zscores_5 = zscores(df_shakespeare_5)
shakespeare_zscores_5;

#### Test z - scores

In [150]:
shakespeare_test_5_features = features_generator(shakespeare_part_5_corpus_freq, shakespeare_test_5)
df_shakespeare_test_5 = pd.DataFrame.from_dict(shakespeare_test_5_features, orient = 'index')
shakespeare_zscores_test_5 = zscores(df_shakespeare_test_5)
shakespeare_zscores_test_5;

### Delta Distances

In [151]:
shakespeare_5_deltas = get_deltas(shakespeare_characters, shakespeare_zscores_test_5, shakespeare_zscores_5)
predictions_shakespeare_5 = shakespeare_5_deltas.idxmin()
shakespeare_df_5 = pd.DataFrame(predictions_shakespeare_5, columns = ['Part_5'])
shakespeare_df_5;

In [152]:
shakespeare_bigram_df_5 = model_predictions(shakespeare_bigrams_part_5, shakespeare_bigrams_test_5, 50, shakespeare_characters, 'Part_5')
shakespeare_bigram_df_5;

## William Shakespeare Characters Results

In [193]:
shakespeare_results = pd.concat([shakespeare_df_1, shakespeare_df_2, shakespeare_df_3, shakespeare_df_4, shakespeare_df_5], axis=1)
shakespeare_results = success_rate(shakespeare_results)
shakespeare_results;

In [194]:
shakespeare_results['Results'].mean()

0.5142857142857143

In [358]:
shakespeare_top_7_results = shakespeare_results.filter(items = shakespeare_top_7_characters, axis=0)
shakespeare_top_7_results;

In [196]:
shakespeare_top_7_results['Results'].mean()

0.8

In [154]:
shakespeare_bigram_results = pd.concat([shakespeare_bigram_df_1, shakespeare_bigram_df_2, shakespeare_bigram_df_3, shakespeare_bigram_df_4, shakespeare_bigram_df_5], axis =1)
shakespeare_bigram_results = success_rate(shakespeare_bigram_results)
shakespeare_bigram_results['Results'].mean()

0.21904761904761907

## All top 7 characters

In [283]:
all_top_7 = {**wilde_top_7, **shaw_top_7, **jonson_top_7, **shakespeare_top_7}

In [285]:
all_top = corpus_split(all_top_7, 5)

In [286]:
all_top_partition = split_partitions(all_top)
all_top_test = split_test(all_top)

In [287]:
all_top_part_1 = dict_word_tokenizer(all_top_partition[0])
all_top_part_2 = dict_word_tokenizer(all_top_partition[1])
all_top_part_3 = dict_word_tokenizer(all_top_partition[2])
all_top_part_4 = dict_word_tokenizer(all_top_partition[3])
all_top_part_5 = dict_word_tokenizer(all_top_partition[4])

In [288]:
all_top_test_1 = dict_word_tokenizer(all_top_test[0])
all_top_test_2 = dict_word_tokenizer(all_top_test[1])
all_top_test_3 = dict_word_tokenizer(all_top_test[2])
all_top_test_4 = dict_word_tokenizer(all_top_test[3])
all_top_test_5 = dict_word_tokenizer(all_top_test[4])

In [291]:
all_top_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(all_top_partition[0]), 2)
all_top_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(all_top_partition[1]), 2)
all_top_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(all_top_partition[2]), 2)
all_top_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(all_top_partition[3]), 2)
all_top_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(all_top_partition[4]), 2)

In [292]:
all_top_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(all_top_test[0]), 2)
all_top_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(all_top_test[1]), 2)
all_top_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(all_top_test[2]), 2)
all_top_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(all_top_test[3]), 2)
all_top_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(all_top_test[4]), 2)

## Deltas Distances

### Burrows Delta Method 

In [325]:
all_df_1 = model_predictions(all_top_part_1, all_top_test_1, 100, all_top_7, 'Part_1')
all_df_2 = model_predictions(all_top_part_2, all_top_test_2, 100, all_top_7, 'Part_2')
all_df_3 = model_predictions(all_top_part_3, all_top_test_3, 100, all_top_7, 'Part_3')
all_df_4 = model_predictions(all_top_part_4, all_top_test_4, 100, all_top_7, 'Part_4')
all_df_5 = model_predictions(all_top_part_5, all_top_test_5, 100, all_top_7, 'Part_5')

In [326]:
all_top_df = pd.concat([all_df_1, all_df_2, all_df_3, all_df_4, all_df_5], axis = 1)
all_top_df = success_rate(all_top_df)

In [332]:
all_top_df['Results'].mean()

0.8214285714285715

#### Per author

In [327]:
wilde_in_all_results = all_top_df.filter(items = wilde_top_7_characters, axis=0)
shaw_in_all_results = all_top_df.filter(items = shaw_top_7_characters, axis=0)
jonson_in_all_results = all_top_df.filter(items = jonson_top_7_characters, axis=0)
shakespeare_in_all_results = all_top_df.filter(items = shakespeare_top_7_characters, axis=0)

In [328]:
wilde_in_all_results['Results'].mean()

0.7714285714285714

In [329]:
shaw_in_all_results['Results'].mean()

0.9142857142857143

In [330]:
jonson_in_all_results['Results'].mean()

0.7999999999999999

In [331]:
shakespeare_in_all_results['Results'].mean()

0.7999999999999999

### Burrows Delta Method Bigrams

In [341]:
all_bigrams_df_1 = model_predictions(all_top_bigrams_part_1, all_top_bigrams_test_1, 100, all_top_7, 'Part_1')
all_bigrams_df_2 = model_predictions(all_top_bigrams_part_2, all_top_bigrams_test_2, 100, all_top_7, 'Part_2')
all_bigrams_df_3 = model_predictions(all_top_bigrams_part_3, all_top_bigrams_test_3, 100, all_top_7, 'Part_3')
all_bigrams_df_4 = model_predictions(all_top_bigrams_part_4, all_top_bigrams_test_4, 100, all_top_7, 'Part_4')
all_bigrams_df_5 = model_predictions(all_top_bigrams_part_5, all_top_bigrams_test_5, 100, all_top_7, 'Part_5')


In [342]:
all_top_bigrams_df = pd.concat([all_bigrams_df_1, all_bigrams_df_2, all_bigrams_df_3, all_bigrams_df_4, all_bigrams_df_5], axis = 1)
all_top_bigrams_df = success_rate(all_top_bigrams_df)

In [343]:
all_top_bigrams_df['Results'].mean()

0.45000000000000007

#### Per author

In [345]:
wilde_bigrams_in_all_results = all_top_bigrams_df.filter(items = wilde_top_7_characters, axis=0)
shaw_bigrams_in_all_results = all_top_bigrams_df.filter(items = shaw_top_7_characters, axis=0)
jonson_bigrams_in_all_results = all_top_bigrams_df.filter(items = jonson_top_7_characters, axis=0)
shakespeare_bigrams_in_all_results = all_top_bigrams_df.filter(items = shakespeare_top_7_characters, axis=0)

In [346]:
wilde_bigrams_in_all_results['Results'].mean()

0.4571428571428572

In [347]:
shaw_bigrams_in_all_results['Results'].mean()

0.6571428571428571

In [348]:
jonson_bigrams_in_all_results['Results'].mean()

0.34285714285714286

In [349]:
shakespeare_bigrams_in_all_results['Results'].mean()

0.34285714285714286

# German plays

## Friedrich Schiller

### Characters (+1500 words)

In [25]:
schiller_characters = {**kabale_und_liebe_characters, **die_verschwoerung_des_fiesco_zu_genua_characters, **die_räuber_characters, **die_jungfrau_von_orleans_characters}

In [26]:
del schiller_characters['Frau']
del schiller_characters['Sophie']
del schiller_characters['Andreas']
del schiller_characters['Bourgognino']
del schiller_characters['Lomellon']
del schiller_characters['Sacco']
del schiller_characters['Grimm']
del schiller_characters['Razmann']
del schiller_characters['Schufterle']
del schiller_characters['Roller']
del schiller_characters['Kosisnky']
del schiller_characters['Schwartz']
del schiller_characters['La Hire']
del schiller_characters['Lionel']
del schiller_characters['Hofmarschall']
del schiller_characters['Gianettino']
del schiller_characters['Clacagno']
del schiller_characters['Burgund']
del schiller_characters['Sorel']

In [27]:
len(schiller_characters)

17

In [28]:
schiller_characters_split = corpus_split(schiller_characters, 5)

In [29]:
schiller_characters_partition = split_partitions(schiller_characters_split)
schiller_characters_test = split_test(schiller_characters_split)

In [30]:
schiller_part_1 = dict_word_tokenizer(schiller_characters_partition[0])
schiller_part_2 = dict_word_tokenizer(schiller_characters_partition[1])
schiller_part_3 = dict_word_tokenizer(schiller_characters_partition[2])
schiller_part_4 = dict_word_tokenizer(schiller_characters_partition[3])
schiller_part_5 = dict_word_tokenizer(schiller_characters_partition[4])

In [31]:
schiller_test_1 = dict_word_tokenizer(schiller_characters_test[0])
schiller_test_2 = dict_word_tokenizer(schiller_characters_test[1])
schiller_test_3 = dict_word_tokenizer(schiller_characters_test[2])
schiller_test_4 = dict_word_tokenizer(schiller_characters_test[3])
schiller_test_5 = dict_word_tokenizer(schiller_characters_test[4])

In [32]:
schiller_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_partition[0]), 2)
schiller_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_partition[1]), 2)
schiller_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_partition[2]), 2)
schiller_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_partition[3]), 2)
schiller_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_partition[4]), 2)

In [33]:
schiller_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_test[0]), 2)
schiller_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_test[1]), 2)
schiller_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_test[2]), 2)
schiller_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_test[3]), 2)
schiller_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_test[4]), 2)

### Top 6 characters (+4000 words)

In [40]:
schiller_top_6_characters = ['Ferdinand', 'Luise', 'Fiesco', 'Franz', 'Moor', 'Johanna']

In [41]:
schiller_top_6 = {}
for key in schiller_characters:
    if key in schiller_top_6_characters:
        schiller_top_6[key] = schiller_characters[key]

### Burrows Delta Method

In [34]:
schiller_df_1 = model_predictions(schiller_part_1, schiller_test_1, 50, schiller_characters, 'Part_1')
schiller_df_2 = model_predictions(schiller_part_2, schiller_test_2, 50, schiller_characters, 'Part_2')
schiller_df_3 = model_predictions(schiller_part_3, schiller_test_3, 50, schiller_characters, 'Part_3')
schiller_df_4 = model_predictions(schiller_part_4, schiller_test_4, 50, schiller_characters, 'Part_4')
schiller_df_5 = model_predictions(schiller_part_5, schiller_test_5, 50, schiller_characters, 'Part_5')

In [35]:
schiller_bigrams_df_1 = model_predictions(schiller_bigrams_part_1, schiller_bigrams_test_1, 50, schiller_characters, 'Part_1')
schiller_bigrams_df_2 = model_predictions(schiller_bigrams_part_2, schiller_bigrams_test_2, 50, schiller_characters, 'Part_2')
schiller_bigrams_df_3 = model_predictions(schiller_bigrams_part_3, schiller_bigrams_test_3, 50, schiller_characters, 'Part_3')
schiller_bigrams_df_4 = model_predictions(schiller_bigrams_part_4, schiller_bigrams_test_4, 50, schiller_characters, 'Part_4')
schiller_bigrams_df_5 = model_predictions(schiller_bigrams_part_5, schiller_bigrams_test_5, 50, schiller_characters, 'Part_5')

#### Results

In [36]:
schiller_results = pd.concat([schiller_df_1, schiller_df_2, schiller_df_3, schiller_df_4, schiller_df_5], axis = 1)
schiller_results = success_rate(schiller_results)
schiller_results;

In [37]:
schiller_results['Results'].mean()

0.5529411764705882

In [38]:
schiller_bigrams_results = pd.concat([schiller_bigrams_df_1, schiller_bigrams_df_2, schiller_bigrams_df_3, schiller_bigrams_df_4, schiller_bigrams_df_5], axis = 1)
schiller_bigrams_results = success_rate(schiller_bigrams_results)
schiller_bigrams_results;

In [39]:
schiller_bigrams_results['Results'].mean()

0.10588235294117647

In [43]:
schiller_top_6_results = schiller_results.filter(items = schiller_top_6_characters, axis=0)
schiller_top_6_results

Unnamed: 0,Part_1,Part_2,Part_3,Part_4,Part_5,Results
Ferdinand,Ferdinand,Ferdinand,Ferdinand,Ferdinand,Ferdinand,1.0
Luise,Luise,Luise,Luise,Luise,Luise,1.0
Fiesco,Fiesco,Fiesco,Verrina,Fiesco,Fiesco,0.8
Franz,Franz,Fiesco,Franz,Franz,Franz,0.8
Moor,Moor,Moor,Franz,Moor,Moor,0.8
Johanna,Johanna,Johanna,Johanna,Fiesco,Johanna,0.8


In [44]:
schiller_top_6_results['Results'].mean()

0.8666666666666666

## Johann Wolfgang von Goethe

### Characters (+1500 words)

In [70]:
goethe_characters = {**faust_1_characters, **faust2_characters, **egmont_characters, **iphigenie_auf_tauris_characters, **die_laune_des_verliebten_characters}

In [71]:
del goethe_characters['Die Hexe']
del goethe_characters['Kanzler']
del goethe_characters['Gemurmel']
del goethe_characters['Herold']
del goethe_characters['Plutus']
del goethe_characters['Heer Meister']
del goethe_characters['Marcshalk']
del goethe_characters['Homunculus']
del goethe_characters['Sirenen']
del goethe_characters['Mutter']
del goethe_characters['Machiavel']
del goethe_characters['Klare']
del goethe_characters['Brackenburg']
del goethe_characters['Soest']
del goethe_characters['Sekretär']
del goethe_characters['Thoas']
del goethe_characters['Arkas']
del goethe_characters['Amine']
del goethe_characters['Eridon']
del goethe_characters['Lamon']

In [72]:
len(goethe_characters)

15

In [73]:
goethe_characters_split = corpus_split(goethe_characters, 5)

In [74]:
goethe_characters_partition = split_partitions(goethe_characters_split)
goethe_characters_test = split_test(goethe_characters_split)

In [75]:
goethe_part_1 = dict_word_tokenizer(goethe_characters_partition[0])
goethe_part_2 = dict_word_tokenizer(goethe_characters_partition[1])
goethe_part_3 = dict_word_tokenizer(goethe_characters_partition[2])
goethe_part_4 = dict_word_tokenizer(goethe_characters_partition[3])
goethe_part_5 = dict_word_tokenizer(goethe_characters_partition[4])

In [77]:
goethe_test_1 = dict_word_tokenizer(goethe_characters_test[0])
goethe_test_2 = dict_word_tokenizer(goethe_characters_test[1])
goethe_test_3 = dict_word_tokenizer(goethe_characters_test[2])
goethe_test_4 = dict_word_tokenizer(goethe_characters_test[3])
goethe_test_5 = dict_word_tokenizer(goethe_characters_test[4])

In [78]:
goethe_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_partition[0]), 2)
goethe_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_partition[1]), 2)
goethe_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_partition[2]), 2)
goethe_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_partition[3]), 2)
goethe_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_partition[4]), 2)

In [83]:
goethe_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_test[0]), 2)
goethe_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_test[1]), 2)
goethe_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_test[2]), 2)
goethe_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_test[3]), 2)
goethe_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_test[4]), 2)

### Tp 6 characters (+4000 words)

In [79]:
goethe_top_6_characters = ['Faust', 'Mephistopheles', 'Faust II', 'Mephistopheles II', 'Egmont', 'Iphigenie']

In [80]:
goethe_top_6 = {}
for key in goethe_characters:
    if key in goethe_top_6_characters:
        goethe_top_6[key] = goethe_characters[key]

### Burrows Delta Method

In [81]:
goethe_df_1 = model_predictions(goethe_part_1, goethe_test_1, 50, goethe_characters, 'Part_1')
goethe_df_2 = model_predictions(goethe_part_2, goethe_test_2, 50, goethe_characters, 'Part_2')
goethe_df_3 = model_predictions(goethe_part_3, goethe_test_3, 50, goethe_characters, 'Part_3')
goethe_df_4 = model_predictions(goethe_part_4, goethe_test_4, 50, goethe_characters, 'Part_4')
goethe_df_5 = model_predictions(goethe_part_5, goethe_test_5, 50, goethe_characters, 'Part_5')

In [84]:
goethe_bigrams_df_1 = model_predictions(goethe_bigrams_part_1, goethe_bigrams_test_1, 50, goethe_characters, 'Part_1')
goethe_bigrams_df_2 = model_predictions(goethe_bigrams_part_2, goethe_bigrams_test_2, 50, goethe_characters, 'Part_2')
goethe_bigrams_df_3 = model_predictions(goethe_bigrams_part_3, goethe_bigrams_test_3, 50, goethe_characters, 'Part_3')
goethe_bigrams_df_4 = model_predictions(goethe_bigrams_part_4, goethe_bigrams_test_4, 50, goethe_characters, 'Part_4')
goethe_bigrams_df_5 = model_predictions(goethe_bigrams_part_5, goethe_bigrams_test_5, 50, goethe_characters, 'Part_5')

#### Result

In [91]:
goethe_results = pd.concat([goethe_df_1, goethe_df_2, goethe_df_3, goethe_df_4, goethe_df_5], axis = 1)
goethe_results = success_rate(goethe_results)
goethe_results;

In [87]:
goethe_results['Results'].mean()

0.6933333333333334

In [89]:
goethe_bigrams_results = pd.concat([goethe_bigrams_df_1, goethe_bigrams_df_2, goethe_bigrams_df_3, goethe_bigrams_df_4, goethe_bigrams_df_5], axis = 1)
goethe_bigrams_results = success_rate(goethe_bigrams_results)
goethe_bigrams_results;

In [90]:
goethe_bigrams_results['Results'].mean()

0.14666666666666667

In [92]:
goethe_top_6_results = goethe_results.filter(items = goethe_top_6_characters, axis=0)
goethe_top_6_results

Unnamed: 0,Part_1,Part_2,Part_3,Part_4,Part_5,Results
Faust,Faust,Faust,Faust,Faust,Faust,1.0
Mephistopheles,Mephistopheles,Mephistopheles,Mephistopheles,Mephistopheles,Mephistopheles,1.0
Faust II,Mephistopheles II,Faust II,Mephistopheles II,Faust,Faust II,0.4
Mephistopheles II,Mephistopheles II,Mephistopheles II,Mephistopheles II,Mephistopheles II,Mephistopheles II,1.0
Egmont,Egmont,Egmont,Egmont,Egmont,Egmont,1.0
Iphigenie,Faust,Iphigenie,Iphigenie,Iphigenie,Iphigenie,0.8


In [93]:
goethe_top_6_results['Results'].mean()

0.8666666666666667

## Both top 6

In [95]:
both_top_6 = {**schiller_top_6, **goethe_top_6}

In [101]:
both_top = corpus_split(both_top_6, 5)

In [102]:
both_top_partition = split_partitions(both_top)
both_top_test = split_test(both_top)

In [103]:
both_top_part_1 = dict_word_tokenizer(both_top_partition[0])
both_top_part_2 = dict_word_tokenizer(both_top_partition[1])
both_top_part_3 = dict_word_tokenizer(both_top_partition[2])
both_top_part_4 = dict_word_tokenizer(both_top_partition[3])
both_top_part_5 = dict_word_tokenizer(both_top_partition[4])

In [108]:
both_top_test_1 = dict_word_tokenizer(both_top_test[0])
both_top_test_2 = dict_word_tokenizer(both_top_test[1])
both_top_test_3 = dict_word_tokenizer(both_top_test[2])
both_top_test_4 = dict_word_tokenizer(both_top_test[3])
both_top_test_5 = dict_word_tokenizer(both_top_test[4])

In [122]:
both_top_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(both_top_partition[0]), 2)
both_top_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(both_top_partition[1]), 2)
both_top_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(both_top_partition[2]), 2)
both_top_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(both_top_partition[3]), 2)
both_top_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(both_top_partition[4]), 2)

In [109]:
both_top_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(both_top_test[0]), 2)
both_top_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(both_top_test[1]), 2)
both_top_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(both_top_test[2]), 2)
both_top_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(both_top_test[3]), 2)
both_top_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(both_top_test[4]), 2)

## Delta Distances

### Burrows Delta Method

In [112]:
both_df_1 = model_predictions(both_top_part_1, both_top_test_1, 100, both_top_6, 'Part_1')
both_df_2 = model_predictions(both_top_part_2, both_top_test_2, 100, both_top_6, 'Part_2')
both_df_3 = model_predictions(both_top_part_3, both_top_test_3, 100, both_top_6, 'Part_3')
both_df_4 = model_predictions(both_top_part_4, both_top_test_4, 100, both_top_6, 'Part_4')
both_df_5 = model_predictions(both_top_part_5, both_top_test_5, 100, both_top_6, 'Part_5')

In [114]:
both_top_df = pd.concat([both_df_1, both_df_2, both_df_3, both_df_4, both_df_5], axis = 1)
both_top_df = success_rate(both_top_df)
both_top_df;

In [115]:
both_top_df['Results'].mean()

0.9666666666666668

#### Per author

In [116]:
schiller_in_all_results = both_top_df.filter(items = schiller_top_6_characters, axis=0)
goethe_in_all_results = both_top_df.filter(items = goethe_top_6_characters, axis=0)

In [119]:
schiller_in_all_results['Results'].mean()

0.9666666666666667

In [120]:
goethe_in_all_results['Results'].mean()

0.9666666666666667

### Burrows Delta Method Bigrams

In [123]:
both_bigrams_df_1 = model_predictions(both_top_bigrams_part_1, both_top_bigrams_test_1, 100, both_top_6, 'Part_1')
both_bigrams_df_2 = model_predictions(both_top_bigrams_part_2, both_top_bigrams_test_2, 100, both_top_6, 'Part_2')
both_bigrams_df_3 = model_predictions(both_top_bigrams_part_3, both_top_bigrams_test_3, 100, both_top_6, 'Part_3')
both_bigrams_df_4 = model_predictions(both_top_bigrams_part_4, both_top_bigrams_test_4, 100, both_top_6, 'Part_4')
both_bigrams_df_5 = model_predictions(both_top_bigrams_part_5, both_top_bigrams_test_5, 100, both_top_6, 'Part_5')

In [126]:
both_top_bigrams_df = pd.concat([both_bigrams_df_1, both_bigrams_df_2, both_bigrams_df_3, both_bigrams_df_4, both_bigrams_df_5], axis = 1)
both_top_bigrams_df = success_rate(both_top_bigrams_df)
both_top_bigrams_df;

In [127]:
both_top_bigrams_df['Results'].mean()

0.23333333333333336