# BURROWS' DELTA METHOD

In [1]:
import nltk
from nltk import ngrams
import re
import random
import numpy as np
import pandas as pd
%matplotlib inline 

In [2]:
# Tokenizer works as an instance, which will tokenize a string into words
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [3]:
# It recovers the previously stored dicts
%store -r

In [4]:
# Allows to visuallize all colomns in a data frame
pd.set_option('display.max_columns', None)

#  Functions

## Data modeling functions

In [5]:
def partition (char_sens, n):
    '''Splits a list of sentences into 
    n parts, of equall, or close to equal
    number of sentences'''
    return [char_sens[i::n] for i in range(n)]

In [6]:
def corpus_split (play_characters, n):
    '''Takes a dict as input and performs n partitions,
    as list of sentences, in every value for every key'''
    play_characters_part = {}
    for char in play_characters.keys():
        play_characters_part[char] = partition(play_characters[char], n)
    return play_characters_part  

In [7]:
def split_partitions (play_part):
    '''It takes a 5 partition splitted dict, and returns 5 dicts with train sets 
    as values, gathering 4 out of 5 parts everytime. It always excludes a different 
    ungathered partition for testing'''
    part_1 = {}
    part_2 = {}
    part_3 = {}
    part_4 = {}
    part_5 = {}
    for char in play_part:
        for part in char:
            part_1[char] = play_part[char][0] + play_part[char][2] + play_part[char][3] + play_part[char][4]
            part_2[char] = play_part[char][0] + play_part[char][1] + play_part[char][3] + play_part[char][4]
            part_3[char] = play_part[char][0] + play_part[char][1] + play_part[char][2] + play_part[char][4]
            part_4[char] = play_part[char][0] + play_part[char][1] + play_part[char][2] + play_part[char][3]
            part_5[char] = play_part[char][1] + play_part[char][2] + play_part[char][3] + play_part[char][4]
    return part_1, part_2, part_3, part_4, part_5

In [8]:
def split_test (play_part):
    '''It takes a 5 partition splitted dict, and returns a dict where the values are those
    partitions which where excluded as part of the training sets'''
    test_1 = {}
    test_2 = {}
    test_3 = {}
    test_4 = {}
    test_5 = {}
    for char in play_part:
        for part in char:
            test_1[char] = play_part[char][1]
            test_2[char] = play_part[char][2]
            test_3[char] = play_part[char][3]
            test_4[char] = play_part[char][4]
            test_5[char] = play_part[char][0]
    return test_1, test_2, test_3, test_4, test_5

In [9]:
def word_tok_no_punct (char):
    '''It takes characters sentnces and returns them in lower
    case and tokenized by words'''
    result = []
    for sen in char:
        sen = sen.lower()
        output = tokenizer.tokenize(sen)
        result.append(output)
    return result

In [10]:
def sentences_unifier (character):
    '''It joins all the words of all 
    sentences into a string'''
    output = []
    for sen in character:
        for word in sen:
            output.append(word)
    one_string = " ".join(output)
    return output

In [11]:
def dict_word_tokenizer (play_characters):
    '''It takes a characters dict as input and returns 
    another dict as result, after applying word_tok_no_punct, 
    and snetneces_unifier functions'''
    chars = list(play_characters.keys())
    char_tokens = {}
    for char in chars:
        output = word_tok_no_punct(play_characters[char])
        result = sentences_unifier(output)
        char_tokens[char] = result
    return char_tokens

In [12]:
def ngrams_tokenizer (play_characters, n):
    '''It takes a dict, and return another dict,
    where the returned values are lists of 
    tokenized tuples of n words'''
    chars = list(play_characters.keys())
    char_ngrams_tokens = {}
    for char in chars:
        output = ngrams(play_characters[char],n)
        result = list(output)
        char_ngrams_tokens[char] = result
    return char_ngrams_tokens

## Model building functions

In [13]:
def whole_corpus_generator (partition):
    '''It takes one partition in a dict from,
    and unifies all speech lines from all 
    characters into a one unified single corpus'''
    whole_corpus = []
    for char in partition.keys():
        for word in partition[char]:
            whole_corpus.append(word)
    return whole_corpus

In [14]:
def features_generator (whole_corpus_freq, partition):
    '''It takes a list of tuples, in the form of term and frequency, or tuple 
    of terms (in the case of multiple n grams) and frecuency, and the partition
    from which it comes from as second input argument. It returns a dict with 
    characters as keys and a subordinated dict as value, where keys are the features
    previously extracted from the whole corpus, and values its frequency of appearance
    along the character subcorpus level'''
    features = [word for word,freq in whole_corpus_freq]
    feature_freqs = {}
    for char in partition:
        feature_freqs[char] = {} 
        overall = len(partition[char])
        for feature in features:
            presence = partition[char].count(feature)
            feature_freqs[char][feature] = presence / overall
    return  feature_freqs

In [15]:
def zscores (df):
    '''Converts the feature frequencies into z scores,
    geting them out of the frequencies columns, and geting
    rid of the latter ones afterwards'''
    cols = list(df.columns)
    for col in cols:
        if type(col) == tuple:
            join_col = col[0] + '-' + col[1]
            col_zscore = join_col + '_zscore'
        else:        
            col_zscore = col + '_zscore'
        df[col_zscore] = (df[col] - df[col].mean())/df[col].std(ddof=0)
    df = df.drop(cols, axis = 1)
    return df

## Evaluation functions

In [16]:
# This function is included into the get_deltas function
def delta_distance (play_characters, test_zscores, part_zscores, character):    
    '''It gets the delta distances between one character vector, 
    in the form of test z scores, and those corresponding to all 
    characters training partition individually'''
    chars = play_characters.keys()
    delta = {}
    for char in chars:
        delta[char] = (abs(test_zscores.loc[character] - part_zscores.loc[char])).sum()/50
    return delta

In [17]:
def get_deltas (writer_characters, test_zscores, train_zscores):
    '''It generates a data frame with characters as rows and columns,
    where the intersection is the delta distance between them'''
    deltas = {}
    for char in writer_characters.keys():
        result = delta_distance(writer_characters, test_zscores, train_zscores, char)
        deltas[char]= result
        df = pd.DataFrame.from_dict(deltas)
        df = df.reindex(sorted(df.columns), axis=1)
    return df

## General functions

In [18]:
# This function was developed as an automatized heuristic path for the ultherior n grams analysis
def model_predictions (plays_data, plays_test, n, plays_characters, column_name):
    '''It takes as argumentes the trainig partition, the test partition,
    n as the number of extracted features, the original dict, and a string 
    with the name of the resulting data frame column'''
    plays_data_corpus = whole_corpus_generator(plays_data)
    plays_data_corpus_freq = list(nltk.FreqDist(plays_data_corpus).most_common(n))
    plays_data_features = features_generator(plays_data_corpus_freq, plays_data)
    df_plays_data = pd.DataFrame.from_dict(plays_data_features, orient = 'index')
    plays_data_zscores = zscores(df_plays_data)
    plays_test_features = features_generator(plays_data_corpus_freq, plays_test)
    df_plays_test = pd.DataFrame.from_dict(plays_test_features, orient = 'index')
    plays_test_zscores = zscores(df_plays_test)
    deltas = get_deltas(plays_characters, plays_test_zscores, plays_data_zscores)
    predictions = deltas.idxmin()
    df_predictions = pd.DataFrame(predictions, columns = [column_name])
    
    return df_predictions

In [19]:
def success_rate (author_results):
    ''' It creates a column in the whole partitions predictions
    data frame, with the character accuracy scores as result'''
    result = []
    for char in author_results.index:
        output = sum(list(author_results.loc[char] == char))/5
        result.append(output)
    author_results['Results'] = result
    
    return author_results

# English Plays

# Oscar Wilde 

### Characters (+1500 words)

In [20]:
# Gathering all plays characters into one dict
wilde_characters = {**an_ideal_husband_characters, **a_woman_of_no_importance_characters, **lady_windermeres_fan_characters, **the_importance_of_being_earnest_characters}

In [21]:
len(wilde_characters.keys())

21

#### Split partitions generation

In [22]:
wilde_characters_split = corpus_split(wilde_characters, 5)

In [23]:
wilde_characters_partition = split_partitions(wilde_characters_split)
wilde_characters_test = split_test(wilde_characters_split)

#### Train and test corpuses generation

###### Original Deltha Method

In [24]:
wilde_part_1 = dict_word_tokenizer(wilde_characters_partition[0])
wilde_part_2 = dict_word_tokenizer(wilde_characters_partition[1])
wilde_part_3 = dict_word_tokenizer(wilde_characters_partition[2])
wilde_part_4 = dict_word_tokenizer(wilde_characters_partition[3])
wilde_part_5 = dict_word_tokenizer(wilde_characters_partition[4])

In [25]:
wilde_test_1 = dict_word_tokenizer(wilde_characters_test[0])
wilde_test_2 = dict_word_tokenizer(wilde_characters_test[1])
wilde_test_3 = dict_word_tokenizer(wilde_characters_test[2])
wilde_test_4 = dict_word_tokenizer(wilde_characters_test[3])
wilde_test_5 = dict_word_tokenizer(wilde_characters_test[4])

###### Deltha Methd with word bi-grams

In [26]:
wilde_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_partition[0]), 2)
wilde_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_partition[1]), 2)
wilde_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_partition[2]), 2)
wilde_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_partition[3]), 2)
wilde_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_partition[4]), 2)

In [27]:
wilde_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_test[0]), 2)
wilde_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_test[1]), 2)
wilde_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_test[2]), 2)
wilde_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_test[3]), 2)
wilde_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(wilde_characters_test[4]), 2)

### Top 7 characters (+4000 words)

In [28]:
# Geting the top 7 characters, over 4000 words each
wilde_top_7_characters = ['Goring', 'Chiltern', 'Cheveley', 'Illingorth', 'Lady Windermere', 'Algernon', 'Jack']

In [29]:
# Making a dict of the top 7 characters
wilde_top_7 = {}
for key in wilde_characters:
    if key in wilde_top_7_characters:
        wilde_top_7[key] = wilde_characters[key]

In [30]:
wilde_top_7.keys()

dict_keys(['Goring', 'Chiltern', 'Cheveley', 'Illingorth', 'Lady Windermere', 'Jack', 'Algernon'])

In [31]:
%store wilde_top_7

Stored 'wilde_top_7' (dict)


## Partition 1

#### Train z - scores

In [32]:
wilde_part_1_corpus = whole_corpus_generator(wilde_part_1) 

In [33]:
# Getting the top 50 frequency terms from the whole partition corpus
wilde_part_1_corpus_freq = list(nltk.FreqDist(wilde_part_1_corpus).most_common(50))

In [34]:
wilde_part_1_features = features_generator(wilde_part_1_corpus_freq, wilde_part_1)

In [35]:
# We make a pandas Data Frame from the resulting dict of dicts
df_wilde_1 = pd.DataFrame.from_dict(wilde_part_1_features, orient = 'index')

In [36]:
# Getting the training z scores of the first partition in a Data Frame
wilde_zscores_1 = zscores(df_wilde_1)
wilde_zscores_1;

#### Test z - scores

In [37]:
# We repeat the same process to generate the test z scores
wilde_test_1_features = features_generator(wilde_part_1_corpus_freq, wilde_test_1)
df_wilde_test_1 = pd.DataFrame.from_dict(wilde_test_1_features, orient = 'index')
wilde_zscores_test_1 = zscores(df_wilde_test_1)
wilde_zscores_test_1;

### Delta Distances

In [38]:
wilde_1_deltas = get_deltas(wilde_characters, wilde_zscores_test_1, wilde_zscores_1)
wilde_1_deltas

Unnamed: 0,Algernon,Allonby,Berwick,Bracknell,Caversham,Cecily,Cheveley,Chiltern,Darlington,Erlynne,Gerald,Goring,Gwendolen,Hunstanton,Illingorth,Jack,Lady Chiltern,Lady Windermere,Lord Windermere,Mabel,Mrs Artbuthnot
Caversham,1.033737,1.430973,1.367975,1.382483,1.342596,1.101039,1.044462,1.13572,1.254766,1.101859,1.408035,1.094243,1.29296,1.205952,1.284633,1.215895,1.330462,1.065195,1.445795,1.116541,1.463528
Goring,0.721299,0.853645,0.892126,0.943398,1.120469,0.877074,0.708305,0.752967,1.021297,0.955769,1.109533,0.586983,0.891591,0.870076,0.763833,0.744007,0.919483,0.678023,1.339162,1.118264,0.944648
Chiltern,0.889077,1.076256,1.127302,1.042658,1.303227,0.990817,0.807185,0.609886,1.134823,1.105166,1.127037,0.814958,0.837212,1.051107,0.985698,0.834587,0.967572,0.880793,1.367009,1.187462,1.090397
Lady Chiltern,1.053716,1.169819,1.231378,1.38129,1.249683,1.065979,0.870374,0.952331,1.099986,1.09948,1.113581,0.944472,1.016097,1.16928,1.140046,0.922919,0.914569,0.876895,1.255746,1.298012,1.002711
Mabel,1.071769,0.948101,1.098839,1.17996,1.479255,1.145005,0.841133,1.021109,1.174214,0.96752,1.353732,1.056616,1.042822,1.063938,1.01837,1.023509,1.149039,1.094309,1.607605,1.066375,1.400244
Cheveley,0.75977,1.110198,1.081334,1.11336,1.129525,0.949892,0.649698,0.835915,1.137351,0.984681,1.381887,0.789738,0.823695,0.997054,0.913406,0.876684,1.041562,0.931183,1.473596,0.934219,1.032912
Illingorth,0.794116,1.058433,1.111503,1.120901,1.212437,0.9006,0.83767,0.888472,1.250889,1.155778,1.299178,0.561114,1.069462,0.903449,0.731807,0.924919,1.195525,0.881236,1.55349,1.26537,1.239962
Allonby,1.197143,1.182088,1.321777,1.24743,1.457315,1.367638,1.089348,1.202147,1.306406,1.502052,1.350104,1.178447,1.330902,1.034472,1.008233,1.301887,1.542089,1.224258,1.718433,1.57339,1.526129
Gerald,1.033375,1.176916,1.291224,1.372249,1.484601,1.082839,0.979826,1.151353,1.273519,0.912077,1.085833,1.09876,1.131989,1.300497,1.06291,0.943163,1.186524,0.925176,1.406706,1.2672,1.197908
Mrs Artbuthnot,1.30201,1.268215,1.396809,1.449143,1.361715,1.185858,1.115614,1.070707,1.332545,1.359566,1.279934,1.193592,1.316964,1.311042,1.202622,1.268683,1.166761,0.991426,1.226699,1.518168,0.849024


In [39]:
# Getting the prediction from the closest distance match
predictions_wilde_1 = wilde_1_deltas.idxmin()

In [40]:
# Generating a data frame with the predictions of the partition as column
wilde_df_1 = pd.DataFrame(predictions_wilde_1, columns = ['Part_1'])
wilde_df_1

Unnamed: 0,Part_1
Algernon,Jack
Allonby,Goring
Berwick,Berwick
Bracknell,Bracknell
Caversham,Goring
Cecily,Algernon
Cheveley,Cheveley
Chiltern,Chiltern
Darlington,Cecily
Erlynne,Lady Windermere


#### Deltha Method bi-grams predictions

In [41]:
wilde_bigram_df_1 = model_predictions(wilde_bigrams_part_1, wilde_bigrams_test_1, 50, wilde_characters, 'Part_1')
wilde_bigram_df_1

Unnamed: 0,Part_1
Algernon,Algernon
Allonby,Illingorth
Berwick,Illingorth
Bracknell,Bracknell
Caversham,Illingorth
Cecily,Algernon
Cheveley,Lady Windermere
Chiltern,Chiltern
Darlington,Darlington
Erlynne,Lady Windermere


## Partition 2

#### Train z- scores

In [42]:
wilde_part_2_corpus = whole_corpus_generator(wilde_part_2)
wilde_part_2_corpus_freq = list(nltk.FreqDist(wilde_part_2_corpus).most_common(50))
wilde_part_2_features = features_generator(wilde_part_2_corpus_freq, wilde_part_2)
df_wilde_2 = pd.DataFrame.from_dict(wilde_part_2_features, orient = 'index')
wilde_zscores_2 = zscores(df_wilde_2)
wilde_zscores_2;

#### Test z - scores

In [43]:
wilde_test_2_features = features_generator(wilde_part_2_corpus_freq, wilde_test_2)
df_wilde_test_2 = pd.DataFrame.from_dict(wilde_test_2_features, orient = 'index')
wilde_zscores_test_2 = zscores(df_wilde_test_2)
wilde_zscores_test_2;

### Delta Distances

In [44]:
wilde_2_deltas = get_deltas(wilde_characters, wilde_zscores_test_2, wilde_zscores_2)
predictions_wilde_2 = wilde_2_deltas.idxmin()
wilde_df_2 = pd.DataFrame(predictions_wilde_2, columns = ['Part_2'])
wilde_df_2;

In [45]:
wilde_bigram_df_2 = model_predictions(wilde_bigrams_part_2, wilde_bigrams_test_2, 50, wilde_characters, 'Part_2')

## Partition 3

#### Train z - scores

In [46]:
wilde_part_3_corpus = whole_corpus_generator(wilde_part_3)
wilde_part_3_corpus_freq = list(nltk.FreqDist(wilde_part_3_corpus).most_common(50))
wilde_part_3_features = features_generator(wilde_part_3_corpus_freq, wilde_part_3)
df_wilde_3 = pd.DataFrame.from_dict(wilde_part_3_features, orient = 'index')
wilde_zscores_3 = zscores(df_wilde_3)
wilde_zscores_3;

#### Test z - scores

In [47]:
wilde_test_3_features = features_generator(wilde_part_3_corpus_freq, wilde_test_3)
df_wilde_test_3 = pd.DataFrame.from_dict(wilde_test_3_features, orient = 'index')
wilde_zscores_test_3 = zscores(df_wilde_test_3)
wilde_zscores_test_3;

### Delta Distances

In [48]:
wilde_3_deltas = get_deltas(wilde_characters, wilde_zscores_test_3, wilde_zscores_3)
predictions_wilde_3 = wilde_3_deltas.idxmin()
wilde_df_3 = pd.DataFrame(predictions_wilde_3, columns = ['Part_3'])
wilde_df_3;

In [49]:
wilde_bigram_df_3 = model_predictions(wilde_bigrams_part_3, wilde_bigrams_test_3, 50, wilde_characters, 'Part_3')

## Partition 4

#### Train z - scores

In [50]:
wilde_part_4_corpus = whole_corpus_generator(wilde_part_4)
wilde_part_4_corpus_freq = list(nltk.FreqDist(wilde_part_4_corpus).most_common(50))
wilde_part_4_features = features_generator(wilde_part_4_corpus_freq, wilde_part_4)
df_wilde_4 = pd.DataFrame.from_dict(wilde_part_4_features, orient = 'index')
wilde_zscores_4 = zscores(df_wilde_4)
wilde_zscores_4;

#### Test z - scores

In [51]:
wilde_test_4_features = features_generator(wilde_part_4_corpus_freq, wilde_test_4)
df_wilde_test_4 = pd.DataFrame.from_dict(wilde_test_4_features, orient = 'index')
wilde_zscores_test_4 = zscores(df_wilde_test_4)
wilde_zscores_test_4;

### Delta Distances

In [52]:
wilde_4_deltas = get_deltas(wilde_characters, wilde_zscores_test_4, wilde_zscores_4)
predictions_wilde_4 = wilde_4_deltas.idxmin()
wilde_df_4 = pd.DataFrame(predictions_wilde_4, columns = ['Part_4'])
wilde_df_4;

In [53]:
wilde_bigram_df_4 = model_predictions(wilde_bigrams_part_4, wilde_bigrams_test_4, 50, wilde_characters, 'Part_4')

## Partition 5

#### Train z - scores

In [54]:
wilde_part_5_corpus = whole_corpus_generator(wilde_part_5)
wilde_part_5_corpus_freq = list(nltk.FreqDist(wilde_part_5_corpus).most_common(50))
wilde_part_5_features = features_generator(wilde_part_5_corpus_freq, wilde_part_5)
df_wilde_5 = pd.DataFrame.from_dict(wilde_part_5_features, orient = 'index')
wilde_zscores_5 = zscores(df_wilde_5)
wilde_zscores_5;

#### Test z - scores

In [55]:
wilde_test_5_features = features_generator(wilde_part_5_corpus_freq, wilde_test_5)
df_wilde_test_5 = pd.DataFrame.from_dict(wilde_test_5_features, orient = 'index')
wilde_zscores_test_5 = zscores(df_wilde_test_5)
wilde_zscores_test_5;

### Delta distances

In [56]:
wilde_5_deltas = get_deltas(wilde_characters, wilde_zscores_test_5, wilde_zscores_5)
predictions_wilde_5 = wilde_5_deltas.idxmin()
wilde_df_5 = pd.DataFrame(predictions_wilde_5, columns = ['Part_5'])
wilde_df_5;

In [57]:
wilde_bigram_df_5 = model_predictions(wilde_bigrams_part_5, wilde_bigrams_test_5, 50, wilde_characters, 'Part_5')

## Oscar Wilde Characters Results

##### Burrows'  Delta Method Results

In [58]:
# We concatenate all the partition predictions data frames into a single one
# and then we create the result column as an accuracy metric
wilde_results = pd.concat([wilde_df_1, wilde_df_2, wilde_df_3, wilde_df_4, wilde_df_5], axis=1)
wilde_results = success_rate(wilde_results)
wilde_results

Unnamed: 0,Part_1,Part_2,Part_3,Part_4,Part_5,Results
Algernon,Jack,Chiltern,Algernon,Jack,Algernon,0.4
Allonby,Goring,Illingorth,Chiltern,Allonby,Allonby,0.4
Berwick,Berwick,Hunstanton,Hunstanton,Goring,Hunstanton,0.2
Bracknell,Bracknell,Goring,Jack,Bracknell,Bracknell,0.6
Caversham,Goring,Lady Windermere,Caversham,Caversham,Darlington,0.4
Cecily,Algernon,Jack,Cecily,Cecily,Cecily,0.6
Cheveley,Cheveley,Cheveley,Cheveley,Cheveley,Goring,0.8
Chiltern,Chiltern,Chiltern,Chiltern,Jack,Chiltern,0.8
Darlington,Cecily,Cheveley,Darlington,Goring,Cecily,0.2
Erlynne,Lady Windermere,Erlynne,Erlynne,Lady Windermere,Goring,0.4


#### Total accuracy

In [59]:
wilde_results['Results'].mean()

0.5047619047619049

#### Results of the top 7 

In [60]:
# We get the result of the top 7 characters (+4000 words), with the purpose of exploring
# their accuracy scores, when they are predicted among the whole list of characters, 
# where 4000 words ones could be treted as noise
wilde_top_7_results = wilde_results.filter(items = wilde_top_7_characters, axis=0)
wilde_top_7_results

Unnamed: 0,Part_1,Part_2,Part_3,Part_4,Part_5,Results
Goring,Illingorth,Goring,Goring,Goring,Goring,0.8
Chiltern,Chiltern,Chiltern,Chiltern,Jack,Chiltern,0.8
Cheveley,Cheveley,Cheveley,Cheveley,Cheveley,Goring,0.8
Illingorth,Illingorth,Goring,Illingorth,Goring,Goring,0.4
Lady Windermere,Lady Windermere,Lady Windermere,Lady Windermere,Lady Windermere,Goring,0.8
Algernon,Jack,Chiltern,Algernon,Jack,Algernon,0.4
Jack,Jack,Goring,Jack,Jack,Cheveley,0.6


#### Total accuracy

In [61]:
wilde_top_7_results['Results'].mean()

0.6571428571428573

##### Burrows' Delta Method with word bi-grams

In [62]:
wilde_bigram_results = pd.concat([wilde_bigram_df_1, wilde_bigram_df_2, wilde_bigram_df_3, wilde_bigram_df_4, wilde_bigram_df_5], axis =1)
wilde_bigram_results = success_rate(wilde_bigram_results)
wilde_bigram_results['Results'].mean()

0.4095238095238095

# George Bernard Shaw

### Characters (+1500)

In [63]:
shaw_characters = {**pygmalion_characters, **androcles_and_the_lion_characters, **caesar_and_cleopatra_characters, **candida_characters, **man_and_superman_characters}

In [64]:
len(shaw_characters.keys())

23

In [65]:
shaw_characters_split = corpus_split(shaw_characters, 5)

In [66]:
shaw_characters_partition = split_partitions(shaw_characters_split)
shaw_characters_test = split_test(shaw_characters_split)

In [67]:
shaw_part_1 = dict_word_tokenizer(shaw_characters_partition[0])
shaw_part_2 = dict_word_tokenizer(shaw_characters_partition[1])
shaw_part_3 = dict_word_tokenizer(shaw_characters_partition[2])
shaw_part_4 = dict_word_tokenizer(shaw_characters_partition[3])
shaw_part_5 = dict_word_tokenizer(shaw_characters_partition[4])

In [68]:
shaw_test_1 = dict_word_tokenizer(shaw_characters_test[0])
shaw_test_2 = dict_word_tokenizer(shaw_characters_test[1])
shaw_test_3 = dict_word_tokenizer(shaw_characters_test[2])
shaw_test_4 = dict_word_tokenizer(shaw_characters_test[3])
shaw_test_5 = dict_word_tokenizer(shaw_characters_test[4])

In [69]:
shaw_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_partition[0]), 2)
shaw_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_partition[1]), 2)
shaw_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_partition[2]), 2)
shaw_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_partition[3]), 2)
shaw_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_partition[4]), 2)

In [70]:
shaw_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_test[0]), 2)
shaw_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_test[1]), 2)
shaw_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_test[2]), 2)
shaw_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_test[3]), 2)
shaw_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(shaw_characters_test[4]), 2)

### Top 7 characters (+4000 words)

In [71]:
shaw_top_7_characters = ['Higgins', 'Liza', 'Caesar', 'Cleopatra', 'Morell', 'Tanner', 'Don Juan']

In [72]:
shaw_top_7 = {}
for key in shaw_characters:
    if key in shaw_top_7_characters:
        shaw_top_7[key] = shaw_characters[key]

## Partition 1

#### Train z - scores

In [73]:
shaw_part_1_corpus = whole_corpus_generator(shaw_part_1)
shaw_part_1_corpus_freq = list(nltk.FreqDist(shaw_part_1_corpus).most_common(50))
shaw_part_1_features = features_generator(shaw_part_1_corpus_freq, shaw_part_1)
df_shaw_1 = pd.DataFrame.from_dict(shaw_part_1_features, orient = 'index')
shaw_zscores_1 = zscores(df_shaw_1)
shaw_zscores_1;

#### Tests z - scores

In [74]:
shaw_test_1_features = features_generator(shaw_part_1_corpus_freq, shaw_test_1)
df_shaw_test_1 = pd.DataFrame.from_dict(shaw_test_1_features, orient = 'index')
shaw_zscores_test_1 = zscores(df_shaw_test_1)
shaw_zscores_test_1;

### Delta Distances

In [75]:
shaw_1_deltas = get_deltas(shaw_characters, shaw_zscores_test_1, shaw_zscores_1)
predictions_shaw_1 = shaw_1_deltas.idxmin()
shaw_df_1 = pd.DataFrame(predictions_shaw_1, columns = ['Part_1'])
shaw_df_1;

In [76]:
shaw_bigram_df_1 = model_predictions(shaw_bigrams_part_1, shaw_bigrams_test_1, 50, shaw_characters, 'Part_1')
shaw_bigram_df_1;

## Partition 2

#### Train z - scores

In [77]:
shaw_part_2_corpus = whole_corpus_generator(shaw_part_2)
shaw_part_2_corpus_freq = list(nltk.FreqDist(shaw_part_2_corpus).most_common(50))
shaw_part_2_features = features_generator(shaw_part_2_corpus_freq, shaw_part_2)
df_shaw_2 = pd.DataFrame.from_dict(shaw_part_2_features, orient = 'index')
shaw_zscores_2 = zscores(df_shaw_2)
shaw_zscores_2;

#### Test z - scores

In [78]:
shaw_test_2_features = features_generator(shaw_part_2_corpus_freq, shaw_test_2)
df_shaw_test_2 = pd.DataFrame.from_dict(shaw_test_2_features, orient = 'index')
shaw_zscores_test_2 = zscores(df_shaw_test_2)
shaw_zscores_test_2;

### Delta Distances

In [79]:
shaw_2_deltas = get_deltas(shaw_characters, shaw_zscores_test_2, shaw_zscores_2)
predictions_shaw_2 = shaw_2_deltas.idxmin()
shaw_df_2 = pd.DataFrame(predictions_shaw_2, columns = ['Part_2'])
shaw_df_2;

In [80]:
shaw_bigram_df_2 = model_predictions(shaw_bigrams_part_2, shaw_bigrams_test_2, 50, shaw_characters, 'Part_2')
shaw_bigram_df_2;

## Partition 3

#### Train z - scores

In [81]:
shaw_part_3_corpus = whole_corpus_generator(shaw_part_3)
shaw_part_3_corpus_freq = list(nltk.FreqDist(shaw_part_3_corpus).most_common(50))
shaw_part_3_features = features_generator(shaw_part_3_corpus_freq, shaw_part_3)
df_shaw_3 = pd.DataFrame.from_dict(shaw_part_3_features, orient = 'index')
shaw_zscores_3 = zscores(df_shaw_3)
shaw_zscores_3;

#### Test z - scores

In [82]:
shaw_test_3_features = features_generator(shaw_part_3_corpus_freq, shaw_test_3)
df_shaw_test_3 = pd.DataFrame.from_dict(shaw_test_3_features, orient = 'index')
shaw_zscores_test_3 = zscores(df_shaw_test_3)
shaw_zscores_test_3;

### Delta Distances

In [83]:
shaw_3_deltas = get_deltas(shaw_characters, shaw_zscores_test_3, shaw_zscores_3)
predictions_shaw_3 = shaw_3_deltas.idxmin()
shaw_df_3 = pd.DataFrame(predictions_shaw_3, columns = ['Part_3'])
shaw_df_3;

In [84]:
shaw_bigram_df_3 = model_predictions(shaw_bigrams_part_3, shaw_bigrams_test_3, 50, shaw_characters, 'Part_3')
shaw_bigram_df_3;

## Partition 4

#### Train z - scores

In [85]:
shaw_part_4_corpus = whole_corpus_generator(shaw_part_4)
shaw_part_4_corpus_freq = list(nltk.FreqDist(shaw_part_4_corpus).most_common(50))
shaw_part_4_features = features_generator(shaw_part_4_corpus_freq, shaw_part_4)
df_shaw_4 = pd.DataFrame.from_dict(shaw_part_4_features, orient = 'index')
shaw_zscores_4 = zscores(df_shaw_4)
shaw_zscores_4;

#### Test z - scores

In [86]:
shaw_test_4_features = features_generator(shaw_part_4_corpus_freq, shaw_test_4)
df_shaw_test_4 = pd.DataFrame.from_dict(shaw_test_4_features, orient = 'index')
shaw_zscores_test_4 = zscores(df_shaw_test_4)
shaw_zscores_test_4;

### Delta Distances

In [87]:
shaw_4_deltas = get_deltas(shaw_characters, shaw_zscores_test_4, shaw_zscores_4)
predictions_shaw_4 = shaw_4_deltas.idxmin()
shaw_df_4 = pd.DataFrame(predictions_shaw_4, columns = ['Part_4'])
shaw_df_4;

In [88]:
shaw_bigram_df_4 = model_predictions(shaw_bigrams_part_4, shaw_bigrams_test_4, 50, shaw_characters, 'Part_4')
shaw_bigram_df_4;

## Partition 5

#### Train z - scores

In [89]:
shaw_part_5_corpus = whole_corpus_generator(shaw_part_5)
shaw_part_5_corpus_freq = list(nltk.FreqDist(shaw_part_5_corpus).most_common(50))
shaw_part_5_features = features_generator(shaw_part_5_corpus_freq, shaw_part_5)
df_shaw_5 = pd.DataFrame.from_dict(shaw_part_5_features, orient = 'index')
shaw_zscores_5 = zscores(df_shaw_5)
shaw_zscores_5;

#### Test z - scores

In [90]:
shaw_test_5_features = features_generator(shaw_part_5_corpus_freq, shaw_test_5)
df_shaw_test_5 = pd.DataFrame.from_dict(shaw_test_5_features, orient = 'index')
shaw_zscores_test_5 = zscores(df_shaw_test_5)
shaw_zscores_test_5;

### Delta Distances

In [91]:
shaw_5_deltas = get_deltas(shaw_characters, shaw_zscores_test_5, shaw_zscores_5)
predictions_shaw_5 = shaw_5_deltas.idxmin()
shaw_df_5 = pd.DataFrame(predictions_shaw_5, columns = ['Part_5'])
shaw_df_5;

In [92]:
shaw_bigram_df_5 = model_predictions(shaw_bigrams_part_5, shaw_bigrams_test_5, 50, shaw_characters, 'Part_5')
shaw_bigram_df_5;

## George Bernard Shaw Characters Results

In [93]:
shaw_results = pd.concat([shaw_df_1, shaw_df_2, shaw_df_3, shaw_df_4, shaw_df_5], axis=1)
shaw_results = success_rate(shaw_results)
shaw_results

Unnamed: 0,Part_1,Part_2,Part_3,Part_4,Part_5,Results
Androcles,Tanner,Androcles,Lavinia,Morell,Androcles,0.4
Ann,Ann,Ann,Ann,Ann,Ann,1.0
Apollodorus,Don Juan,Mendoza,Apollodorus,Mendoza,Apollodorus,0.4
Burgess,Morell,Burgess,Burgess,Burgess,Burgess,0.8
Caesar,Caesar,Caesar,Caesar,Caesar,Caesar,1.0
Candida,Morell,Candida,Candida,Candida,Candida,0.8
Captain,Captain,Caesar,Captain,Caesar,Captain,0.6
Cleopatra,Cleopatra,Cleopatra,Cleopatra,Cleopatra,Cleopatra,1.0
Don Juan,Don Juan,Don Juan,Don Juan,Don Juan,Don Juan,1.0
Doolittle,Doolittle,Doolittle,Doolittle,Doolittle,Tanner,0.8


In [94]:
shaw_results['Results'].mean()

0.6695652173913043

In [95]:
shaw_top_7_results = shaw_results.filter(items = shaw_top_7_characters, axis=0)
shaw_top_7_results

Unnamed: 0,Part_1,Part_2,Part_3,Part_4,Part_5,Results
Higgins,Higgins,Higgins,Higgins,Higgins,Higgins,1.0
Liza,Liza,Liza,Liza,Liza,Liza,1.0
Caesar,Caesar,Caesar,Caesar,Caesar,Caesar,1.0
Cleopatra,Cleopatra,Cleopatra,Cleopatra,Cleopatra,Cleopatra,1.0
Morell,Tanner,Morell,Morell,Morell,Morell,0.8
Tanner,Tanner,Tanner,Tanner,Tanner,Tanner,1.0
Don Juan,Don Juan,Don Juan,Don Juan,Don Juan,Don Juan,1.0


In [96]:
shaw_top_7_results['Results'].mean()

0.9714285714285714

In [97]:
shaw_bigram_results = pd.concat([shaw_bigram_df_1, shaw_bigram_df_2, shaw_bigram_df_3, shaw_bigram_df_4, shaw_bigram_df_5], axis =1)
shaw_bigram_results = success_rate(shaw_bigram_results)
shaw_bigram_results['Results'].mean()

0.32173913043478264

# Ben Jonson

### Characters (+1500 words)

In [98]:
jonson_characters = {**cynthias_revels_characters, **every_man_on_his_humour_characters, **volpone_or_the_fox_characters, **the_alchemist_characters}

In [99]:
len(jonson_characters.keys())

22

In [100]:
jonson_characters_split = corpus_split(jonson_characters, 5)

In [101]:
jonson_characters_partition = split_partitions(jonson_characters_split)
jonson_characters_test = split_test(jonson_characters_split)

In [102]:
jonson_part_1 = dict_word_tokenizer(jonson_characters_partition[0])
jonson_part_2 = dict_word_tokenizer(jonson_characters_partition[1])
jonson_part_3 = dict_word_tokenizer(jonson_characters_partition[2])
jonson_part_4 = dict_word_tokenizer(jonson_characters_partition[3])
jonson_part_5 = dict_word_tokenizer(jonson_characters_partition[4])

In [103]:
jonson_test_1 = dict_word_tokenizer(jonson_characters_test[0])
jonson_test_2 = dict_word_tokenizer(jonson_characters_test[1])
jonson_test_3 = dict_word_tokenizer(jonson_characters_test[2])
jonson_test_4 = dict_word_tokenizer(jonson_characters_test[3])
jonson_test_5 = dict_word_tokenizer(jonson_characters_test[4])

In [104]:
jonson_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_partition[0]), 2)
jonson_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_partition[1]), 2)
jonson_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_partition[2]), 2)
jonson_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_partition[3]), 2)
jonson_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_partition[4]), 2)

In [105]:
jonson_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_test[0]), 2)
jonson_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_test[1]), 2)
jonson_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_test[2]), 2)
jonson_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_test[3]), 2)
jonson_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(jonson_characters_test[4]), 2)

###  Top 7 characters (+4000 words)

In [106]:
jonson_top_7_characters = ['Mercury', 'Amorphus', 'Crites', 'Volpone', 'Mosca', 'Face', 'Subtle']

In [107]:
jonson_top_7 = {}
for key in jonson_characters:
    if key in jonson_top_7_characters:
        jonson_top_7[key] = jonson_characters[key]

## Partition 1

#### Train z - sccores

In [108]:
jonson_part_1_corpus = whole_corpus_generator(jonson_part_1)
jonson_part_1_corpus_freq = list(nltk.FreqDist(jonson_part_1_corpus).most_common(50))
jonson_part_1_features = features_generator(jonson_part_1_corpus_freq, jonson_part_1)
df_jonson_1 = pd.DataFrame.from_dict(jonson_part_1_features, orient = 'index')
jonson_zscores_1 = zscores(df_jonson_1)
jonson_zscores_1;

#### Test z - scores

In [109]:
jonson_test_1_features = features_generator(jonson_part_1_corpus_freq, jonson_test_1)
df_jonson_test_1 = pd.DataFrame.from_dict(jonson_test_1_features, orient = 'index')
jonson_zscores_test_1 = zscores(df_jonson_test_1)
jonson_zscores_test_1;

### Delta Distances

In [110]:
jonson_1_deltas = get_deltas(jonson_characters, jonson_zscores_test_1, jonson_zscores_1)
predictions_jonson_1 = jonson_1_deltas.idxmin()
jonson_df_1 = pd.DataFrame(predictions_jonson_1, columns = ['Part_1'])
jonson_df_1;

In [111]:
jonson_bigram_df_1 = model_predictions(jonson_bigrams_part_1, jonson_bigrams_test_1, 50, jonson_characters, 'Part_1')
jonson_bigram_df_1;

## Partition 2

#### Train z - scores

In [112]:
jonson_part_2_corpus = whole_corpus_generator(jonson_part_2)
jonson_part_2_corpus_freq = list(nltk.FreqDist(jonson_part_2_corpus).most_common(50))
jonson_part_2_features = features_generator(jonson_part_2_corpus_freq, jonson_part_2)
df_jonson_2 = pd.DataFrame.from_dict(jonson_part_2_features, orient = 'index')
jonson_zscores_2 = zscores(df_jonson_2)
jonson_zscores_2;

#### Test z - scores

In [113]:
jonson_test_2_features = features_generator(jonson_part_2_corpus_freq, jonson_test_2)
df_jonson_test_2 = pd.DataFrame.from_dict(jonson_test_2_features, orient = 'index')
jonson_zscores_test_2 = zscores(df_jonson_test_2)
jonson_zscores_test_2;

### Delta Distances

In [114]:
jonson_2_deltas = get_deltas(jonson_characters, jonson_zscores_test_2, jonson_zscores_2)
predictions_jonson_2 = jonson_2_deltas.idxmin()
jonson_df_2 = pd.DataFrame(predictions_jonson_2, columns = ['Part_2'])
jonson_df_2;

In [115]:
jonson_bigram_df_2 = model_predictions(jonson_bigrams_part_2, jonson_bigrams_test_2, 50, jonson_characters, 'Part_2')
jonson_bigram_df_2;

## Partition 3

#### Train z - scores

In [116]:
jonson_part_3_corpus = whole_corpus_generator(jonson_part_3)
jonson_part_3_corpus_freq = list(nltk.FreqDist(jonson_part_3_corpus).most_common(50))
jonson_part_3_features = features_generator(jonson_part_3_corpus_freq, jonson_part_3)
df_jonson_3 = pd.DataFrame.from_dict(jonson_part_3_features, orient = 'index')
jonson_zscores_3 = zscores(df_jonson_3)
jonson_zscores_3;

#### Test z - scores

In [117]:
jonson_test_3_features = features_generator(jonson_part_3_corpus_freq, jonson_test_3)
df_jonson_test_3 = pd.DataFrame.from_dict(jonson_test_3_features, orient = 'index')
jonson_zscores_test_3 = zscores(df_jonson_test_3)
jonson_zscores_test_3;

### Delta Distances

In [118]:
jonson_3_deltas = get_deltas(jonson_characters, jonson_zscores_test_3, jonson_zscores_3)
predictions_jonson_3 = jonson_3_deltas.idxmin()
jonson_df_3 = pd.DataFrame(predictions_jonson_3, columns = ['Part_3'])
jonson_df_3;

In [119]:
jonson_bigram_df_3 = model_predictions(jonson_bigrams_part_3, jonson_bigrams_test_3, 50, jonson_characters, 'Part_3')
jonson_bigram_df_3;

## Partition 4

#### Train z - scores

In [120]:
jonson_part_4_corpus = whole_corpus_generator(jonson_part_4)
jonson_part_4_corpus_freq = list(nltk.FreqDist(jonson_part_4_corpus).most_common(50))
jonson_part_4_features = features_generator(jonson_part_4_corpus_freq, jonson_part_4)
df_jonson_4 = pd.DataFrame.from_dict(jonson_part_4_features, orient = 'index')
jonson_zscores_4 = zscores(df_jonson_4)
jonson_zscores_4;

#### Test z - scores

In [121]:
jonson_test_4_features = features_generator(jonson_part_4_corpus_freq, jonson_test_4)
df_jonson_test_4 = pd.DataFrame.from_dict(jonson_test_4_features, orient = 'index')
jonson_zscores_test_4 = zscores(df_jonson_test_4)
jonson_zscores_test_4;

### Delta Distances

In [122]:
jonson_4_deltas = get_deltas(jonson_characters, jonson_zscores_test_4, jonson_zscores_4)
predictions_jonson_4 = jonson_4_deltas.idxmin()
jonson_df_4 = pd.DataFrame(predictions_jonson_4, columns = ['Part_4'])
jonson_df_4;

In [123]:
jonson_bigram_df_4 = model_predictions(jonson_bigrams_part_4, jonson_bigrams_test_4, 50, jonson_characters, 'Part_4')
jonson_bigram_df_4;

## Partition 5

#### Train z - scores

In [124]:
jonson_part_5_corpus = whole_corpus_generator(jonson_part_5)
jonson_part_5_corpus_freq = list(nltk.FreqDist(jonson_part_5_corpus).most_common(50))
jonson_part_5_features = features_generator(jonson_part_5_corpus_freq, jonson_part_5)
df_jonson_5 = pd.DataFrame.from_dict(jonson_part_5_features, orient = 'index')
jonson_zscores_5 = zscores(df_jonson_5)
jonson_zscores_5;

#### Test z - scores

In [125]:
jonson_test_5_features = features_generator(jonson_part_5_corpus_freq, jonson_test_5)
df_jonson_test_5 = pd.DataFrame.from_dict(jonson_test_5_features, orient = 'index')
jonson_zscores_test_5 = zscores(df_jonson_test_5)
jonson_zscores_test_5;

### Delta Distances

In [126]:
jonson_5_deltas = get_deltas(jonson_characters, jonson_zscores_test_5, jonson_zscores_5)
predictions_jonson_5 = jonson_5_deltas.idxmin()
jonson_df_5 = pd.DataFrame(predictions_jonson_5, columns = ['Part_5'])
jonson_df_5;

In [127]:
jonson_bigram_df_5 = model_predictions(jonson_bigrams_part_5, jonson_bigrams_test_5, 50, jonson_characters, 'Part_5')
jonson_bigram_df_5;

## Ben Jonson Characters Results

In [128]:
jonson_results = pd.concat([jonson_df_1, jonson_df_2, jonson_df_3, jonson_df_4, jonson_df_5], axis=1)
jonson_results = success_rate(jonson_results)
jonson_results;

In [129]:
jonson_results['Results'].mean()

0.49090909090909096

In [130]:
jonson_top_7_results = jonson_results.filter(items = jonson_top_7_characters, axis=0)
jonson_top_7_results

Unnamed: 0,Part_1,Part_2,Part_3,Part_4,Part_5,Results
Mercury,Mercury,Mercury,Mercury,Mercury,Mercury,1.0
Amorphus,Amorphus,Amorphus,Amorphus,Amorphus,Subtle,0.8
Crites,Crites,Volpone,Mercury,Crites,Mercury,0.4
Volpone,Volpone,Volpone,Volpone,Volpone,Volpone,1.0
Mosca,Mosca,Mosca,Mosca,Mosca,Mosca,1.0
Face,Face,Face,Face,Face,Face,1.0
Subtle,Subtle,Subtle,Subtle,Subtle,Subtle,1.0


In [131]:
jonson_top_7_results['Results'].mean()

0.8857142857142858

In [132]:
jonson_bigram_results = pd.concat([jonson_bigram_df_1, jonson_bigram_df_2, jonson_bigram_df_3, jonson_bigram_df_4, jonson_bigram_df_5], axis =1)
jonson_bigram_results = success_rate(jonson_bigram_results)
jonson_bigram_results['Results'].mean()

0.21818181818181817

# William Shakespeare

### Characters (+1500 words)

In [133]:
shakespeare_characters = {**macbeth_characters, **romeo_and_juliet_characters, **othello_characters, **hamlet_characters, **king_lear_characters}

In [134]:
len(shakespeare_characters.keys())

21

In [135]:
shakespeare_characters_split = corpus_split(shakespeare_characters, 5)

In [136]:
shakespeare_characters_partition = split_partitions(shakespeare_characters_split)
shakespeare_characters_test = split_test(shakespeare_characters_split)

In [137]:
shakespeare_part_1 = dict_word_tokenizer(shakespeare_characters_partition[0])
shakespeare_part_2 = dict_word_tokenizer(shakespeare_characters_partition[1])
shakespeare_part_3 = dict_word_tokenizer(shakespeare_characters_partition[2])
shakespeare_part_4 = dict_word_tokenizer(shakespeare_characters_partition[3])
shakespeare_part_5 = dict_word_tokenizer(shakespeare_characters_partition[4])

In [138]:
shakespeare_test_1 = dict_word_tokenizer(shakespeare_characters_test[0])
shakespeare_test_2 = dict_word_tokenizer(shakespeare_characters_test[1])
shakespeare_test_3 = dict_word_tokenizer(shakespeare_characters_test[2])
shakespeare_test_4 = dict_word_tokenizer(shakespeare_characters_test[3])
shakespeare_test_5 = dict_word_tokenizer(shakespeare_characters_test[4])

In [139]:
shakespeare_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_partition[0]), 2)
shakespeare_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_partition[1]), 2)
shakespeare_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_partition[2]), 2)
shakespeare_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_partition[3]), 2)
shakespeare_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_partition[4]), 2)

In [140]:
shakespeare_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_test[0]), 2)
shakespeare_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_test[1]), 2)
shakespeare_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_test[2]), 2)
shakespeare_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_test[3]), 2)
shakespeare_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(shakespeare_characters_test[4]), 2)

### Top 7 characters (+4000)

In [141]:
shakespeare_top_7_characters = ['Macbeth', 'Romeo', 'Juliet', 'Othello', 'Iago', 'Hamlet', 'Lear']

In [142]:
shakespeare_top_7 = {}
for key in shakespeare_characters:
    if key in shakespeare_top_7_characters:
        shakespeare_top_7[key] = shakespeare_characters[key]

## Partition 1

#### Train z - scores

In [143]:
shakespeare_part_1_corpus = whole_corpus_generator(shakespeare_part_1)
shakespeare_part_1_corpus_freq = list(nltk.FreqDist(shakespeare_part_1_corpus).most_common(50))
shakespeare_part_1_features = features_generator(shakespeare_part_1_corpus_freq, shakespeare_part_1)
df_shakespeare_1 = pd.DataFrame.from_dict(shakespeare_part_1_features, orient = 'index')
shakespeare_zscores_1 = zscores(df_shakespeare_1)
shakespeare_zscores_1;

#### Test z - scores

In [144]:
shakespeare_test_1_features = features_generator(shakespeare_part_1_corpus_freq, shakespeare_test_1)
df_shakespeare_test_1 = pd.DataFrame.from_dict(shakespeare_test_1_features, orient = 'index')
shakespeare_zscores_test_1 = zscores(df_shakespeare_test_1)
shakespeare_zscores_test_1;

### Delta Distances

In [145]:
shakespeare_1_deltas = get_deltas(shakespeare_characters, shakespeare_zscores_test_1, shakespeare_zscores_1)
predictions_shakespeare_1 = shakespeare_1_deltas.idxmin()
shakespeare_df_1 = pd.DataFrame(predictions_shakespeare_1, columns = ['Part_1'])
shakespeare_df_1;

In [146]:
shakespeare_bigram_df_1 = model_predictions(shakespeare_bigrams_part_5, shakespeare_bigrams_test_5, 50, shakespeare_characters, 'Part_1')
shakespeare_bigram_df_1;

## Partition 2

#### Train z - scores

In [147]:
shakespeare_part_2_corpus = whole_corpus_generator(shakespeare_part_2)
shakespeare_part_2_corpus_freq = list(nltk.FreqDist(shakespeare_part_2_corpus).most_common(50))
shakespeare_part_2_features = features_generator(shakespeare_part_2_corpus_freq, shakespeare_part_2)
df_shakespeare_2 = pd.DataFrame.from_dict(shakespeare_part_2_features, orient = 'index')
shakespeare_zscores_2 = zscores(df_shakespeare_2)
shakespeare_zscores_2;

#### Test z - scores

In [148]:
shakespeare_test_2_features = features_generator(shakespeare_part_2_corpus_freq, shakespeare_test_2)
df_shakespeare_test_2 = pd.DataFrame.from_dict(shakespeare_test_2_features, orient = 'index')
shakespeare_zscores_test_2 = zscores(df_shakespeare_test_2)
shakespeare_zscores_test_2;

### Delta Distances

In [149]:
shakespeare_2_deltas = get_deltas(shakespeare_characters, shakespeare_zscores_test_2, shakespeare_zscores_2)
predictions_shakespeare_2 = shakespeare_2_deltas.idxmin()
shakespeare_df_2 = pd.DataFrame(predictions_shakespeare_2, columns = ['Part_2'])
shakespeare_df_2;

In [150]:
shakespeare_bigram_df_2 = model_predictions(shakespeare_bigrams_part_2, shakespeare_bigrams_test_2, 50, shakespeare_characters, 'Part_2')
shakespeare_bigram_df_2;

## Partition 3

#### Train z - scores

In [151]:
shakespeare_part_3_corpus = whole_corpus_generator(shakespeare_part_3)
shakespeare_part_3_corpus_freq = list(nltk.FreqDist(shakespeare_part_3_corpus).most_common(50))
shakespeare_part_3_features = features_generator(shakespeare_part_3_corpus_freq, shakespeare_part_3)
df_shakespeare_3 = pd.DataFrame.from_dict(shakespeare_part_3_features, orient = 'index')
shakespeare_zscores_3 = zscores(df_shakespeare_3)
shakespeare_zscores_3;

#### Test z - scores

In [152]:
shakespeare_test_3_features = features_generator(shakespeare_part_3_corpus_freq, shakespeare_test_3)
df_shakespeare_test_3 = pd.DataFrame.from_dict(shakespeare_test_3_features, orient = 'index')
shakespeare_zscores_test_3 = zscores(df_shakespeare_test_3)
shakespeare_zscores_test_3;

### Delta Distances

In [153]:
shakespeare_3_deltas = get_deltas(shakespeare_characters, shakespeare_zscores_test_3, shakespeare_zscores_3)
predictions_shakespeare_3 = shakespeare_3_deltas.idxmin()
shakespeare_df_3 = pd.DataFrame(predictions_shakespeare_3, columns = ['Part_3'])
shakespeare_df_3;

In [154]:
shakespeare_bigram_df_3 = model_predictions(shakespeare_bigrams_part_3, shakespeare_bigrams_test_3, 50, shakespeare_characters, 'Part_3')
shakespeare_bigram_df_3;

## Partition 4

#### Train z - scores

In [155]:
shakespeare_part_4_corpus = whole_corpus_generator(shakespeare_part_4)
shakespeare_part_4_corpus_freq = list(nltk.FreqDist(shakespeare_part_4_corpus).most_common(50))
shakespeare_part_4_features = features_generator(shakespeare_part_4_corpus_freq, shakespeare_part_4)
df_shakespeare_4 = pd.DataFrame.from_dict(shakespeare_part_4_features, orient = 'index')
shakespeare_zscores_4 = zscores(df_shakespeare_4)
shakespeare_zscores_4;

#### Test z - scores

In [156]:
shakespeare_test_4_features = features_generator(shakespeare_part_4_corpus_freq, shakespeare_test_4)
df_shakespeare_test_4 = pd.DataFrame.from_dict(shakespeare_test_4_features, orient = 'index')
shakespeare_zscores_test_4 = zscores(df_shakespeare_test_4)
shakespeare_zscores_test_4;

### Delta Distances

In [157]:
shakespeare_4_deltas = get_deltas(shakespeare_characters, shakespeare_zscores_test_4, shakespeare_zscores_4)
predictions_shakespeare_4 = shakespeare_4_deltas.idxmin()
shakespeare_df_4 = pd.DataFrame(predictions_shakespeare_4, columns = ['Part_4'])
shakespeare_df_4;

In [158]:
shakespeare_bigram_df_4 = model_predictions(shakespeare_bigrams_part_4, shakespeare_bigrams_test_4, 50, shakespeare_characters, 'Part_4')
shakespeare_bigram_df_4;

## Partition 5

#### Train z - scores

In [159]:
shakespeare_part_5_corpus = whole_corpus_generator(shakespeare_part_5)
shakespeare_part_5_corpus_freq = list(nltk.FreqDist(shakespeare_part_5_corpus).most_common(50))
shakespeare_part_5_features = features_generator(shakespeare_part_5_corpus_freq, shakespeare_part_5)
df_shakespeare_5 = pd.DataFrame.from_dict(shakespeare_part_5_features, orient = 'index')
shakespeare_zscores_5 = zscores(df_shakespeare_5)
shakespeare_zscores_5;

#### Test z - scores

In [160]:
shakespeare_test_5_features = features_generator(shakespeare_part_5_corpus_freq, shakespeare_test_5)
df_shakespeare_test_5 = pd.DataFrame.from_dict(shakespeare_test_5_features, orient = 'index')
shakespeare_zscores_test_5 = zscores(df_shakespeare_test_5)
shakespeare_zscores_test_5;

### Delta Distances

In [161]:
shakespeare_5_deltas = get_deltas(shakespeare_characters, shakespeare_zscores_test_5, shakespeare_zscores_5)
predictions_shakespeare_5 = shakespeare_5_deltas.idxmin()
shakespeare_df_5 = pd.DataFrame(predictions_shakespeare_5, columns = ['Part_5'])
shakespeare_df_5;

In [162]:
shakespeare_bigram_df_5 = model_predictions(shakespeare_bigrams_part_5, shakespeare_bigrams_test_5, 50, shakespeare_characters, 'Part_5')
shakespeare_bigram_df_5;

## William Shakespeare Characters Results

In [163]:
shakespeare_results = pd.concat([shakespeare_df_1, shakespeare_df_2, shakespeare_df_3, shakespeare_df_4, shakespeare_df_5], axis=1)
shakespeare_results = success_rate(shakespeare_results)
shakespeare_results;

In [164]:
shakespeare_results['Results'].mean()

0.5142857142857143

In [165]:
shakespeare_top_7_results = shakespeare_results.filter(items = shakespeare_top_7_characters, axis=0)
shakespeare_top_7_results;

In [166]:
shakespeare_top_7_results['Results'].mean()

0.8

In [167]:
shakespeare_bigram_results = pd.concat([shakespeare_bigram_df_1, shakespeare_bigram_df_2, shakespeare_bigram_df_3, shakespeare_bigram_df_4, shakespeare_bigram_df_5], axis =1)
shakespeare_bigram_results = success_rate(shakespeare_bigram_results)
shakespeare_bigram_results['Results'].mean()

0.21904761904761907

## All top 7 characters

#### We take all the top +4000 words characters together, with the purpose of exploring the accuracy of each author top characters when presented among another large corpuses ones as noise

In [168]:
all_top_7 = {**wilde_top_7, **shaw_top_7, **jonson_top_7, **shakespeare_top_7}

In [169]:
all_top = corpus_split(all_top_7, 5)

In [170]:
all_top_partition = split_partitions(all_top)
all_top_test = split_test(all_top)

In [171]:
all_top_part_1 = dict_word_tokenizer(all_top_partition[0])
all_top_part_2 = dict_word_tokenizer(all_top_partition[1])
all_top_part_3 = dict_word_tokenizer(all_top_partition[2])
all_top_part_4 = dict_word_tokenizer(all_top_partition[3])
all_top_part_5 = dict_word_tokenizer(all_top_partition[4])

In [172]:
all_top_test_1 = dict_word_tokenizer(all_top_test[0])
all_top_test_2 = dict_word_tokenizer(all_top_test[1])
all_top_test_3 = dict_word_tokenizer(all_top_test[2])
all_top_test_4 = dict_word_tokenizer(all_top_test[3])
all_top_test_5 = dict_word_tokenizer(all_top_test[4])

In [173]:
all_top_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(all_top_partition[0]), 2)
all_top_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(all_top_partition[1]), 2)
all_top_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(all_top_partition[2]), 2)
all_top_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(all_top_partition[3]), 2)
all_top_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(all_top_partition[4]), 2)

In [174]:
all_top_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(all_top_test[0]), 2)
all_top_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(all_top_test[1]), 2)
all_top_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(all_top_test[2]), 2)
all_top_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(all_top_test[3]), 2)
all_top_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(all_top_test[4]), 2)

## Deltas Distances

##### Burrows' Delta Method 

In [175]:
all_df_1 = model_predictions(all_top_part_1, all_top_test_1, 100, all_top_7, 'Part_1')
all_df_2 = model_predictions(all_top_part_2, all_top_test_2, 100, all_top_7, 'Part_2')
all_df_3 = model_predictions(all_top_part_3, all_top_test_3, 100, all_top_7, 'Part_3')
all_df_4 = model_predictions(all_top_part_4, all_top_test_4, 100, all_top_7, 'Part_4')
all_df_5 = model_predictions(all_top_part_5, all_top_test_5, 100, all_top_7, 'Part_5')

In [176]:
all_top_df = pd.concat([all_df_1, all_df_2, all_df_3, all_df_4, all_df_5], axis = 1)
all_top_df = success_rate(all_top_df)

In [177]:
all_top_df['Results'].mean()

0.835714285714286

#### Per author

In [178]:
wilde_in_all_results = all_top_df.filter(items = wilde_top_7_characters, axis=0)
shaw_in_all_results = all_top_df.filter(items = shaw_top_7_characters, axis=0)
jonson_in_all_results = all_top_df.filter(items = jonson_top_7_characters, axis=0)
shakespeare_in_all_results = all_top_df.filter(items = shakespeare_top_7_characters, axis=0)

In [179]:
wilde_in_all_results['Results'].mean()

0.7999999999999999

In [180]:
shaw_in_all_results['Results'].mean()

0.9428571428571428

In [181]:
jonson_in_all_results['Results'].mean()

0.8571428571428571

In [182]:
shakespeare_in_all_results['Results'].mean()

0.7428571428571429

##### Burrows' Delta Method with word bi-grams

In [183]:
all_bigrams_df_1 = model_predictions(all_top_bigrams_part_1, all_top_bigrams_test_1, 100, all_top_7, 'Part_1')
all_bigrams_df_2 = model_predictions(all_top_bigrams_part_2, all_top_bigrams_test_2, 100, all_top_7, 'Part_2')
all_bigrams_df_3 = model_predictions(all_top_bigrams_part_3, all_top_bigrams_test_3, 100, all_top_7, 'Part_3')
all_bigrams_df_4 = model_predictions(all_top_bigrams_part_4, all_top_bigrams_test_4, 100, all_top_7, 'Part_4')
all_bigrams_df_5 = model_predictions(all_top_bigrams_part_5, all_top_bigrams_test_5, 100, all_top_7, 'Part_5')


In [184]:
all_top_bigrams_df = pd.concat([all_bigrams_df_1, all_bigrams_df_2, all_bigrams_df_3, all_bigrams_df_4, all_bigrams_df_5], axis = 1)
all_top_bigrams_df = success_rate(all_top_bigrams_df)

In [185]:
all_top_bigrams_df['Results'].mean()

0.4714285714285714

#### Per author

In [186]:
wilde_bigrams_in_all_results = all_top_bigrams_df.filter(items = wilde_top_7_characters, axis=0)
shaw_bigrams_in_all_results = all_top_bigrams_df.filter(items = shaw_top_7_characters, axis=0)
jonson_bigrams_in_all_results = all_top_bigrams_df.filter(items = jonson_top_7_characters, axis=0)
shakespeare_bigrams_in_all_results = all_top_bigrams_df.filter(items = shakespeare_top_7_characters, axis=0)

In [187]:
wilde_bigrams_in_all_results['Results'].mean()

0.45714285714285713

In [188]:
shaw_bigrams_in_all_results['Results'].mean()

0.6857142857142857

In [189]:
jonson_bigrams_in_all_results['Results'].mean()

0.42857142857142855

In [190]:
shakespeare_bigrams_in_all_results['Results'].mean()

0.31428571428571433

# German Plays

# Friedrich Schiller

### Characters (+1500 words)

In [35]:
schiller_characters = {**kabale_und_liebe_characters, **die_verschwoerung_des_fiesco_zu_genua_characters, **die_räuber_characters, **die_jungfrau_von_orleans_characters}

In [36]:
len(schiller_characters)

17

In [37]:
schiller_characters_split = corpus_split(schiller_characters, 5)

In [38]:
schiller_characters_partition = split_partitions(schiller_characters_split)
schiller_characters_test = split_test(schiller_characters_split)

In [39]:
schiller_part_1 = dict_word_tokenizer(schiller_characters_partition[0])
schiller_part_2 = dict_word_tokenizer(schiller_characters_partition[1])
schiller_part_3 = dict_word_tokenizer(schiller_characters_partition[2])
schiller_part_4 = dict_word_tokenizer(schiller_characters_partition[3])
schiller_part_5 = dict_word_tokenizer(schiller_characters_partition[4])

In [40]:
schiller_test_1 = dict_word_tokenizer(schiller_characters_test[0])
schiller_test_2 = dict_word_tokenizer(schiller_characters_test[1])
schiller_test_3 = dict_word_tokenizer(schiller_characters_test[2])
schiller_test_4 = dict_word_tokenizer(schiller_characters_test[3])
schiller_test_5 = dict_word_tokenizer(schiller_characters_test[4])

In [41]:
schiller_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_partition[0]), 2)
schiller_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_partition[1]), 2)
schiller_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_partition[2]), 2)
schiller_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_partition[3]), 2)
schiller_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_partition[4]), 2)

In [42]:
schiller_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_test[0]), 2)
schiller_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_test[1]), 2)
schiller_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_test[2]), 2)
schiller_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_test[3]), 2)
schiller_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(schiller_characters_test[4]), 2)

### Top 6 characters (+4000 words)

In [43]:
schiller_top_6_characters = ['Ferdinand', 'Luise', 'Fiesco', 'Franz', 'Moor', 'Johanna']

In [44]:
schiller_top_6 = {}
for key in schiller_characters:
    if key in schiller_top_6_characters:
        schiller_top_6[key] = schiller_characters[key]

### Burrows' Delta Method

In [45]:
# Burrows' Delta Method
schiller_df_1 = model_predictions(schiller_part_1, schiller_test_1, 50, schiller_characters, 'Part_1')
schiller_df_2 = model_predictions(schiller_part_2, schiller_test_2, 50, schiller_characters, 'Part_2')
schiller_df_3 = model_predictions(schiller_part_3, schiller_test_3, 50, schiller_characters, 'Part_3')
schiller_df_4 = model_predictions(schiller_part_4, schiller_test_4, 50, schiller_characters, 'Part_4')
schiller_df_5 = model_predictions(schiller_part_5, schiller_test_5, 50, schiller_characters, 'Part_5')

In [46]:
# Burrows' Delta Method with word bi-grams
schiller_bigrams_df_1 = model_predictions(schiller_bigrams_part_1, schiller_bigrams_test_1, 50, schiller_characters, 'Part_1')
schiller_bigrams_df_2 = model_predictions(schiller_bigrams_part_2, schiller_bigrams_test_2, 50, schiller_characters, 'Part_2')
schiller_bigrams_df_3 = model_predictions(schiller_bigrams_part_3, schiller_bigrams_test_3, 50, schiller_characters, 'Part_3')
schiller_bigrams_df_4 = model_predictions(schiller_bigrams_part_4, schiller_bigrams_test_4, 50, schiller_characters, 'Part_4')
schiller_bigrams_df_5 = model_predictions(schiller_bigrams_part_5, schiller_bigrams_test_5, 50, schiller_characters, 'Part_5')

#### Results

In [47]:
schiller_results = pd.concat([schiller_df_1, schiller_df_2, schiller_df_3, schiller_df_4, schiller_df_5], axis = 1)
schiller_results = success_rate(schiller_results)
schiller_results;

In [48]:
schiller_results['Results'].mean()

0.5411764705882354

In [51]:
schiller_bigrams_results = pd.concat([schiller_bigrams_df_1, schiller_bigrams_df_2, schiller_bigrams_df_3, schiller_bigrams_df_4, schiller_bigrams_df_5], axis = 1)
schiller_bigrams_results = success_rate(schiller_bigrams_results)
schiller_bigrams_results;

In [206]:
schiller_bigrams_results['Results'].mean()

0.15294117647058827

In [207]:
schiller_top_6_results = schiller_results.filter(items = schiller_top_6_characters, axis=0)
schiller_top_6_results

Unnamed: 0,Part_1,Part_2,Part_3,Part_4,Part_5,Results
Ferdinand,Lady Milford,Ferdinand,Ferdinand,Ferdinand,Ferdinand,0.8
Luise,Luise,Luise,Lady Milford,Luise,Luise,0.8
Fiesco,Fiesco,Fiesco,Verrina,Fiesco,Fiesco,0.8
Franz,Franz,Moor,Franz,Franz,Franz,0.8
Moor,Moor,Moor,Franz,Moor,Moor,0.8
Johanna,Johanna,Johanna,Johanna,Johanna,Johanna,1.0


In [208]:
schiller_top_6_results['Results'].mean()

0.8333333333333334

# Johann Wolfgang von Goethe

### Characters (+1500 words)

In [21]:
goethe_characters = {**faust_1_characters, **faust2_characters, **egmont_characters, **iphigenie_auf_tauris_characters, **die_laune_des_verliebten_characters}

In [22]:
len(goethe_characters)

15

In [23]:
goethe_characters_split = corpus_split(goethe_characters, 5)

In [24]:
goethe_characters_partition = split_partitions(goethe_characters_split)
goethe_characters_test = split_test(goethe_characters_split)

In [25]:
goethe_part_1 = dict_word_tokenizer(goethe_characters_partition[0])
goethe_part_2 = dict_word_tokenizer(goethe_characters_partition[1])
goethe_part_3 = dict_word_tokenizer(goethe_characters_partition[2])
goethe_part_4 = dict_word_tokenizer(goethe_characters_partition[3])
goethe_part_5 = dict_word_tokenizer(goethe_characters_partition[4])

In [26]:
goethe_test_1 = dict_word_tokenizer(goethe_characters_test[0])
goethe_test_2 = dict_word_tokenizer(goethe_characters_test[1])
goethe_test_3 = dict_word_tokenizer(goethe_characters_test[2])
goethe_test_4 = dict_word_tokenizer(goethe_characters_test[3])
goethe_test_5 = dict_word_tokenizer(goethe_characters_test[4])

In [27]:
goethe_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_partition[0]), 2)
goethe_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_partition[1]), 2)
goethe_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_partition[2]), 2)
goethe_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_partition[3]), 2)
goethe_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_partition[4]), 2)

In [28]:
goethe_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_test[0]), 2)
goethe_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_test[1]), 2)
goethe_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_test[2]), 2)
goethe_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_test[3]), 2)
goethe_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(goethe_characters_test[4]), 2)

### Tp 6 characters (+4000 words)

In [29]:
goethe_top_6_characters = ['Faust', 'Mephistopheles', 'Faust II', 'Mephistopheles II', 'Egmont', 'Iphigenie']

In [30]:
goethe_top_6 = {}
for key in goethe_characters:
    if key in goethe_top_6_characters:
        goethe_top_6[key] = goethe_characters[key]

### Burrows' Delta Method

In [31]:
# Burrow's Delta Method
goethe_df_1 = model_predictions(goethe_part_1, goethe_test_1, 50, goethe_characters, 'Part_1')
goethe_df_2 = model_predictions(goethe_part_2, goethe_test_2, 50, goethe_characters, 'Part_2')
goethe_df_3 = model_predictions(goethe_part_3, goethe_test_3, 50, goethe_characters, 'Part_3')
goethe_df_4 = model_predictions(goethe_part_4, goethe_test_4, 50, goethe_characters, 'Part_4')
goethe_df_5 = model_predictions(goethe_part_5, goethe_test_5, 50, goethe_characters, 'Part_5')

In [32]:
# Burrows' Delta Method with word bi-grams
goethe_bigrams_df_1 = model_predictions(goethe_bigrams_part_1, goethe_bigrams_test_1, 50, goethe_characters, 'Part_1')
goethe_bigrams_df_2 = model_predictions(goethe_bigrams_part_2, goethe_bigrams_test_2, 50, goethe_characters, 'Part_2')
goethe_bigrams_df_3 = model_predictions(goethe_bigrams_part_3, goethe_bigrams_test_3, 50, goethe_characters, 'Part_3')
goethe_bigrams_df_4 = model_predictions(goethe_bigrams_part_4, goethe_bigrams_test_4, 50, goethe_characters, 'Part_4')
goethe_bigrams_df_5 = model_predictions(goethe_bigrams_part_5, goethe_bigrams_test_5, 50, goethe_characters, 'Part_5')

#### Result

In [53]:
goethe_results = pd.concat([goethe_df_1, goethe_df_2, goethe_df_3, goethe_df_4, goethe_df_5], axis = 1)
goethe_results = success_rate(goethe_results)
goethe_results;

In [222]:
goethe_results['Results'].mean()

0.64

In [223]:
goethe_bigrams_results = pd.concat([goethe_bigrams_df_1, goethe_bigrams_df_2, goethe_bigrams_df_3, goethe_bigrams_df_4, goethe_bigrams_df_5], axis = 1)
goethe_bigrams_results = success_rate(goethe_bigrams_results)
goethe_bigrams_results;

In [224]:
goethe_bigrams_results['Results'].mean()

0.14666666666666667

In [225]:
goethe_top_6_results = goethe_results.filter(items = goethe_top_6_characters, axis=0)
goethe_top_6_results

Unnamed: 0,Part_1,Part_2,Part_3,Part_4,Part_5,Results
Faust,Faust,Faust,Faust,Faust,Faust,1.0
Mephistopheles,Mephistopheles II,Mephistopheles,Mephistopheles,Mephistopheles,Mephistopheles,0.8
Faust II,Faust,Faust II,Mephistopheles II,Mephistopheles II,Faust II,0.4
Mephistopheles II,Mephistopheles II,Mephistopheles II,Mephistopheles II,Mephistopheles II,Mephistopheles II,1.0
Egmont,Egmont,Egmont,Egmont,Egmont,Egmont,1.0
Iphigenie,Faust,Egmont,Iphigenie,Iphigenie,Iphigenie,0.6


In [226]:
goethe_top_6_results['Results'].mean()

0.7999999999999999

## Both top 6

In [245]:
both_top_6 = {**schiller_top_6, **goethe_top_6}

In [246]:
both_top = corpus_split(both_top_6, 5)

In [247]:
both_top_partition = split_partitions(both_top)
both_top_test = split_test(both_top)

In [248]:
both_top_part_1 = dict_word_tokenizer(both_top_partition[0])
both_top_part_2 = dict_word_tokenizer(both_top_partition[1])
both_top_part_3 = dict_word_tokenizer(both_top_partition[2])
both_top_part_4 = dict_word_tokenizer(both_top_partition[3])
both_top_part_5 = dict_word_tokenizer(both_top_partition[4])

In [249]:
both_top_test_1 = dict_word_tokenizer(both_top_test[0])
both_top_test_2 = dict_word_tokenizer(both_top_test[1])
both_top_test_3 = dict_word_tokenizer(both_top_test[2])
both_top_test_4 = dict_word_tokenizer(both_top_test[3])
both_top_test_5 = dict_word_tokenizer(both_top_test[4])

In [250]:
both_top_bigrams_part_1 = ngrams_tokenizer(dict_word_tokenizer(both_top_partition[0]), 2)
both_top_bigrams_part_2 = ngrams_tokenizer(dict_word_tokenizer(both_top_partition[1]), 2)
both_top_bigrams_part_3 = ngrams_tokenizer(dict_word_tokenizer(both_top_partition[2]), 2)
both_top_bigrams_part_4 = ngrams_tokenizer(dict_word_tokenizer(both_top_partition[3]), 2)
both_top_bigrams_part_5 = ngrams_tokenizer(dict_word_tokenizer(both_top_partition[4]), 2)

In [251]:
both_top_bigrams_test_1 = ngrams_tokenizer(dict_word_tokenizer(both_top_test[0]), 2)
both_top_bigrams_test_2 = ngrams_tokenizer(dict_word_tokenizer(both_top_test[1]), 2)
both_top_bigrams_test_3 = ngrams_tokenizer(dict_word_tokenizer(both_top_test[2]), 2)
both_top_bigrams_test_4 = ngrams_tokenizer(dict_word_tokenizer(both_top_test[3]), 2)
both_top_bigrams_test_5 = ngrams_tokenizer(dict_word_tokenizer(both_top_test[4]), 2)

## Delta Distances

### Burrows Delta Method

In [252]:
both_df_1 = model_predictions(both_top_part_1, both_top_test_1, 100, both_top_6, 'Part_1')
both_df_2 = model_predictions(both_top_part_2, both_top_test_2, 100, both_top_6, 'Part_2')
both_df_3 = model_predictions(both_top_part_3, both_top_test_3, 100, both_top_6, 'Part_3')
both_df_4 = model_predictions(both_top_part_4, both_top_test_4, 100, both_top_6, 'Part_4')
both_df_5 = model_predictions(both_top_part_5, both_top_test_5, 100, both_top_6, 'Part_5')

In [253]:
both_top_df = pd.concat([both_df_1, both_df_2, both_df_3, both_df_4, both_df_5], axis = 1)
both_top_df = success_rate(both_top_df)
both_top_df;

In [254]:
both_top_df['Results'].mean()

0.9666666666666668

#### Per author

In [255]:
schiller_in_all_results = both_top_df.filter(items = schiller_top_6_characters, axis=0)
goethe_in_all_results = both_top_df.filter(items = goethe_top_6_characters, axis=0)

In [256]:
schiller_in_all_results['Results'].mean()

0.9666666666666667

In [257]:
goethe_in_all_results['Results'].mean()

0.9666666666666667

### Burrows Delta Method Bigrams

In [258]:
both_bigrams_df_1 = model_predictions(both_top_bigrams_part_1, both_top_bigrams_test_1, 100, both_top_6, 'Part_1')
both_bigrams_df_2 = model_predictions(both_top_bigrams_part_2, both_top_bigrams_test_2, 100, both_top_6, 'Part_2')
both_bigrams_df_3 = model_predictions(both_top_bigrams_part_3, both_top_bigrams_test_3, 100, both_top_6, 'Part_3')
both_bigrams_df_4 = model_predictions(both_top_bigrams_part_4, both_top_bigrams_test_4, 100, both_top_6, 'Part_4')
both_bigrams_df_5 = model_predictions(both_top_bigrams_part_5, both_top_bigrams_test_5, 100, both_top_6, 'Part_5')

In [259]:
both_top_bigrams_df = pd.concat([both_bigrams_df_1, both_bigrams_df_2, both_bigrams_df_3, both_bigrams_df_4, both_bigrams_df_5], axis = 1)
both_top_bigrams_df = success_rate(both_top_bigrams_df)
both_top_bigrams_df;

In [260]:
both_top_bigrams_df['Results'].mean()

0.26666666666666666

#### Per author

In [261]:
schiller_bigrams_in_all_results = both_top_bigrams_df.filter(items = schiller_top_6_characters, axis=0)
goethe_bigrams_in_all_results = both_top_bigrams_df.filter(items = goethe_top_6_characters, axis=0)

In [262]:
schiller_bigrams_in_all_results['Results'].mean()

0.19999999999999998

In [263]:
goethe_bigrams_in_all_results['Results'].mean()

0.3333333333333333