In [1]:
import math
import random
import re
from collections import Counter

import nltk
import pandas as pd
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
%matplotlib inline 

In [2]:
# Reload every variable previously saved with `%store` (IPython storemagic).
# NOTE(review): the *_characters dicts used further down are not defined in
# this notebook -- presumably they are restored here; confirm.
%store -r

In [3]:
# Remove the column limit so wide DataFrames are displayed without truncation.
pd.set_option('display.max_columns', None)

In [4]:
def partition (char_sens, n):
    """Shuffle a sequence reproducibly and deal it into ``n`` interleaved folds.

    Parameters
    ----------
    char_sens : sequence
        Items to split (here: the sentences of one character).
    n : int
        Number of folds to produce.

    Returns
    -------
    list of list
        ``n`` lists; fold ``i`` holds every ``n``-th shuffled item starting
        at position ``i``, so fold sizes differ by at most one.

    Notes
    -----
    The shuffle is applied to a copy, so the caller's list is left untouched
    and calling the function again on the same input yields the same folds.
    The global ``random`` generator is re-seeded with 123 on every call.
    """
    shuffled = list(char_sens)  # work on a copy: never reorder the caller's data
    random.seed(123)
    random.shuffle(shuffled)
    return [shuffled[i::n] for i in range(n)]

In [5]:
def corpus_split (play_characters, n):
    """Split every character's sentence list into ``n`` folds.

    Returns a new dict with the same keys as ``play_characters``; each value
    is the list of folds produced by ``partition`` for that character.
    """
    return {
        name: partition(sentences, n)
        for name, sentences in play_characters.items()
    }

In [6]:
def split_partitions (play_part):
    """Build the five training sets of a 5-fold split.

    Parameters
    ----------
    play_part : dict
        Maps each character to a sequence of five folds (indexes 0-4).

    Returns
    -------
    tuple of 5 dicts
        ``(part_1, ..., part_5)``. For every character each dict holds the
        concatenation of four folds, leaving one out: ``part_1`` omits
        fold 1, ``part_2`` fold 2, ``part_3`` fold 3, ``part_4`` fold 4 and
        ``part_5`` fold 0 -- the same order in which ``split_test`` returns
        the held-out folds.
    """
    part_1 = {}
    part_2 = {}
    part_3 = {}
    part_4 = {}
    part_5 = {}
    for char in play_part:
        # One pass per character is enough; the result does not depend on
        # the characters that make up the name.
        folds = play_part[char]
        part_1[char] = folds[0] + folds[2] + folds[3] + folds[4]
        part_2[char] = folds[0] + folds[1] + folds[3] + folds[4]
        part_3[char] = folds[0] + folds[1] + folds[2] + folds[4]
        part_4[char] = folds[0] + folds[1] + folds[2] + folds[3]
        part_5[char] = folds[1] + folds[2] + folds[3] + folds[4]
    return part_1, part_2, part_3, part_4, part_5

In [7]:
def split_test (play_part):
    """Extract the five held-out folds of a 5-fold split.

    Parameters
    ----------
    play_part : dict
        Maps each character to a sequence of five folds (indexes 0-4).

    Returns
    -------
    tuple of 5 dicts
        ``(test_1, ..., test_5)`` holding, per character, fold 1, 2, 3, 4
        and 0 respectively.
    """
    test_1 = {}
    test_2 = {}
    test_3 = {}
    test_4 = {}
    test_5 = {}
    for char in play_part:
        # One pass per character is enough; the result does not depend on
        # the characters that make up the name.
        folds = play_part[char]
        test_1[char] = folds[1]
        test_2[char] = folds[2]
        test_3[char] = folds[3]
        test_4[char] = folds[4]
        test_5[char] = folds[0]
    return test_1, test_2, test_3, test_4, test_5

In [8]:
def char_tok_no_punct (char):
    """Lower-case and tokenize every string in ``char``.

    Each string is passed through the module-level ``tokenizer`` (a
    ``RegexpTokenizer`` built with the pattern ``\\w+``), which keeps runs of
    word characters and therefore drops punctuation. Returns one token list
    per input string, in the original order.
    """
    return [tokenizer.tokenize(sentence.lower()) for sentence in char]

In [9]:
def sentences_unifier (character):
    """Flatten a list of token lists into one flat list of tokens.

    Parameters
    ----------
    character : iterable of iterables
        Tokenized sentences of one character.

    Returns
    -------
    list
        All tokens in their original order.
    """
    return [word for sen in character for word in sen]

In [10]:
def dict_word_tokenizer (play_characters):
    """Tokenize the text of every character in ``play_characters``.

    For each key, the value is run through ``char_tok_no_punct`` and the
    resulting token lists are flattened with ``sentences_unifier``. Returns a
    new dict mapping each character to a single flat list of tokens.
    """
    return {
        name: sentences_unifier(char_tok_no_punct(play_characters[name]))
        for name in list(play_characters.keys())
    }

In [11]:
def whole_corpus_generator (partition):
    """Concatenate the tokens of all characters into one lower-cased list.

    ``partition`` maps characters to token lists; tokens are emitted in the
    dict's iteration order, character by character.
    """
    return [
        token.lower()
        for tokens in partition.values()
        for token in tokens
    ]

In [12]:
def features_generator (whole_corpus_freq, partition):
    """Compute the relative frequency of each feature word per character.

    Parameters
    ----------
    whole_corpus_freq : iterable of (word, freq) pairs
        Feature words; only the word of each pair is used.
    partition : dict
        Maps each character to its list of tokens.

    Returns
    -------
    dict
        ``{character: {feature: count / number_of_tokens}}``. A character
        with no tokens gets a frequency of 0.0 for every feature instead of
        raising ``ZeroDivisionError``.
    """
    features = [word for word, _freq in whole_corpus_freq]
    feature_freqs = {}
    for char in partition:
        tokens = partition[char]
        overall = len(tokens)
        # Count all tokens once instead of rescanning the list per feature.
        counts = Counter(tokens)
        feature_freqs[char] = {
            feature: (counts[feature] / overall if overall else 0.0)
            for feature in features
        }
    return feature_freqs

In [13]:
def zscores (df):
    """Standardize every column of ``df`` and return the result as a new frame.

    Each column is centred on its own mean and divided by its population
    standard deviation (``ddof=0``). Output columns are named
    ``<original>_zscore`` and keep the original order and index.

    The input frame is not modified, so the function can be re-run on the
    same frame and always gives the same result. A column with zero variance
    yields NaN values (0 / 0), as pandas does not raise on float division.
    """
    standardized = {
        col + '_zscore': (df[col] - df[col].mean()) / df[col].std(ddof=0)
        for col in df.columns
    }
    return pd.DataFrame(standardized, index=df.index)

In [14]:
def delta_distance (play_characters, test_zscores, part_zscores, character):
    """Mean absolute z-score difference between one test profile and every training profile.

    Parameters
    ----------
    play_characters : dict
        Its keys are the candidate characters to compare against.
    test_zscores : pandas.DataFrame
        Z-scores of the test texts; the row labelled ``character`` is used.
    part_zscores : pandas.DataFrame
        Z-scores of the training texts; one row per candidate is read.
    character : hashable
        Row label of the test profile.

    Returns
    -------
    dict
        ``{candidate: delta}`` where delta is the sum of absolute z-score
        differences divided by the number of features. The divisor is taken
        from the data (50 for the 50-word feature sets used in this
        notebook) rather than being fixed.
    """
    test_profile = test_zscores.loc[character]
    delta = {}
    for char in play_characters.keys():
        gaps = abs(test_profile - part_zscores.loc[char])
        n_features = len(gaps)
        # With no features there is nothing to average; report 0.0.
        delta[char] = gaps.sum() / n_features if n_features else 0.0
    return delta

In [15]:
def get_deltas (writer_characters, test_zscores, train_zscores):
    """Build the table of Delta distances between all test and training profiles.

    Parameters
    ----------
    writer_characters : dict
        Its keys are the characters; each is used both as a test profile and
        as a candidate.
    test_zscores, train_zscores : pandas.DataFrame
        Z-score tables passed through to ``delta_distance``.

    Returns
    -------
    pandas.DataFrame
        One column per test character (sorted by name) and one row per
        candidate, so ``.idxmin()`` gives the closest candidate for each test
        character. An empty mapping yields an empty frame.
    """
    deltas = {
        char: delta_distance(writer_characters, test_zscores, train_zscores, char)
        for char in writer_characters.keys()
    }
    # Assemble and sort the table once, after all distances are known.
    df = pd.DataFrame.from_dict(deltas)
    return df.reindex(sorted(df.columns), axis=1)

# Burrows' Delta

## Oscar Wilde 

In [16]:
# Merge the per-play character dicts into a single mapping covering all four plays.
# With dict unpacking, a key that occurs in several plays keeps the value of the last one.
# NOTE(review): the *_characters dicts are not defined in this notebook -- presumably
# restored by `%store -r`; confirm.
wilde_characters = {**an_ideal_husband_characters, **a_woman_of_no_importance_characters, **lady_windermeres_fan_characters, **the_importance_of_being_earnest_characters}

In [17]:
# Exclude three characters from the analysis.
# NOTE(review): the reason is not shown here -- presumably too little dialogue; confirm.
# Re-running this cell without re-running the previous one raises KeyError.
del wilde_characters['Chasuble']
del wilde_characters['Prism']
del wilde_characters['Augustus']

In [18]:
# Number of characters left in the merged mapping.
len(wilde_characters.keys())

22

In [19]:
# Shuffle each character's entries and deal them into 5 folds.
wilde_characters_split = corpus_split(wilde_characters, 5)

In [20]:
# Tuple of five training sets, each the concatenation of four folds ...
wilde_characters_partition = split_partitions(wilde_characters_split)
# ... and tuple of the five held-out folds; index i here pairs with index i above.
wilde_characters_test = split_test(wilde_characters_split)

In [21]:
# Tokenize each training set: per character, one flat list of lower-cased word tokens.
wilde_part_1 = dict_word_tokenizer(wilde_characters_partition[0])
wilde_part_2 = dict_word_tokenizer(wilde_characters_partition[1])
wilde_part_3 = dict_word_tokenizer(wilde_characters_partition[2])
wilde_part_4 = dict_word_tokenizer(wilde_characters_partition[3])
wilde_part_5 = dict_word_tokenizer(wilde_characters_partition[4])

In [22]:
# Tokenize each held-out fold the same way as the training sets.
wilde_test_1 = dict_word_tokenizer(wilde_characters_test[0])
wilde_test_2 = dict_word_tokenizer(wilde_characters_test[1])
wilde_test_3 = dict_word_tokenizer(wilde_characters_test[2])
wilde_test_4 = dict_word_tokenizer(wilde_characters_test[3])
wilde_test_5 = dict_word_tokenizer(wilde_characters_test[4])

## Partition 1

### Train z-scores

In [23]:
# Pool the tokens of all characters in training set 1.
wilde_part_1_corpus = whole_corpus_generator(wilde_part_1)
# The 50 most frequent words of the pooled training corpus are the features.
wilde_part_1_corpus_freq = list(nltk.FreqDist(wilde_part_1_corpus).most_common(50))
# Relative frequency of every feature word for each character.
wilde_part_1_features = features_generator(wilde_part_1_corpus_freq, wilde_part_1)
# Rows = characters, columns = feature words.
df_wilde_1 = pd.DataFrame.from_dict(wilde_part_1_features, orient = 'index')
# Standardize each feature column across the training characters.
wilde_zscores_1 = zscores(df_wilde_1)
# The trailing semicolon suppresses the cell output.
wilde_zscores_1;

### Test z-scores

In [24]:
# NOTE(review): wilde_test_1 was already tokenized in an earlier cell; this second pass
# feeds single word tokens back through the tokenizer and rebinds the same name --
# looks redundant, confirm whether it is needed.
wilde_test_1 = dict_word_tokenizer(wilde_test_1)
# Use the feature words chosen from the TRAINING corpus, measured on the test fold.
wilde_test_1_features = features_generator(wilde_part_1_corpus_freq, wilde_test_1)
df_wilde_test_1 = pd.DataFrame.from_dict(wilde_test_1_features, orient = 'index')
# Standardized with the test fold's own column means and standard deviations.
wilde_zscores_test_1 = zscores(df_wilde_test_1)
wilde_zscores_test_1;

### Delta Distances

In [25]:
# Distance table: one column per test character, one row per training candidate.
wilde_1_deltas = get_deltas(wilde_characters, wilde_zscores_test_1, wilde_zscores_1)
# For each test character, the training character with the smallest distance.
predictions_wilde_1 = wilde_1_deltas.idxmin()
predictions_wilde_1

Algernon                  Algernon
Allonby                    Allonby
Berwick                   Cheveley
Bracknell                Bracknell
Caversham                Bracknell
Cecily                      Goring
Cheveley                    Goring
Chiltern                    Goring
Darlington                Chiltern
Erlynne            Lady Windermere
Gerald             Lady Windermere
Goring                      Goring
Gwendolen                 Cheveley
Hester                      Gerald
Hunstanton                  Goring
Illingorth              Illingorth
Jack                          Jack
Lady Chiltern               Goring
Lady Windermere    Lady Windermere
Lord Windermere    Lord Windermere
Mabel                       Goring
Mrs Artbuthnot      Mrs Artbuthnot
dtype: object

## Partition 2

### Train z-scores

In [26]:
# Pool the tokens of all characters in training set 2.
wilde_part_2_corpus = whole_corpus_generator(wilde_part_2)
# The 50 most frequent words of the pooled training corpus are the features.
wilde_part_2_corpus_freq = list(nltk.FreqDist(wilde_part_2_corpus).most_common(50))
# Relative frequency of every feature word for each character.
wilde_part_2_features = features_generator(wilde_part_2_corpus_freq, wilde_part_2)
# Rows = characters, columns = feature words.
df_wilde_2 = pd.DataFrame.from_dict(wilde_part_2_features, orient = 'index')
# Standardize each feature column across the training characters.
wilde_zscores_2 = zscores(df_wilde_2)
# The trailing semicolon suppresses the cell output.
wilde_zscores_2;

### Test z-scores

In [27]:
# NOTE(review): wilde_test_2 was already tokenized in an earlier cell; this second pass
# feeds single word tokens back through the tokenizer and rebinds the same name --
# looks redundant, confirm whether it is needed.
wilde_test_2 = dict_word_tokenizer(wilde_test_2)
# Use the feature words chosen from the TRAINING corpus, measured on the test fold.
wilde_test_2_features = features_generator(wilde_part_2_corpus_freq, wilde_test_2)
df_wilde_test_2 = pd.DataFrame.from_dict(wilde_test_2_features, orient = 'index')
# Standardized with the test fold's own column means and standard deviations.
wilde_zscores_test_2 = zscores(df_wilde_test_2)
wilde_zscores_test_2;

### Delta Distances

In [28]:
# Distance table: one column per test character, one row per training candidate.
wilde_2_deltas = get_deltas(wilde_characters, wilde_zscores_test_2, wilde_zscores_2)
# For each test character, the training character with the smallest distance.
predictions_wilde_2 = wilde_2_deltas.idxmin()
predictions_wilde_2

Algernon                  Algernon
Allonby                     Goring
Berwick                 Hunstanton
Bracknell                   Goring
Caversham                 Algernon
Cecily                    Algernon
Cheveley                    Goring
Chiltern                  Chiltern
Darlington              Illingorth
Erlynne                   Algernon
Gerald                     Erlynne
Goring                      Goring
Gwendolen                   Cecily
Hester                      Hester
Hunstanton              Hunstanton
Illingorth                Cheveley
Jack                          Jack
Lady Chiltern              Erlynne
Lady Windermere    Lady Windermere
Lord Windermere    Lady Windermere
Mabel                     Cheveley
Mrs Artbuthnot      Mrs Artbuthnot
dtype: object

## Partition 3

### Train z-scores

In [29]:
# Pool the tokens of all characters in training set 3.
wilde_part_3_corpus = whole_corpus_generator(wilde_part_3)
# The 50 most frequent words of the pooled training corpus are the features.
wilde_part_3_corpus_freq = list(nltk.FreqDist(wilde_part_3_corpus).most_common(50))
# Relative frequency of every feature word for each character.
wilde_part_3_features = features_generator(wilde_part_3_corpus_freq, wilde_part_3)
# Rows = characters, columns = feature words.
df_wilde_3 = pd.DataFrame.from_dict(wilde_part_3_features, orient = 'index')
# Standardize each feature column across the training characters.
wilde_zscores_3 = zscores(df_wilde_3)
# The trailing semicolon suppresses the cell output.
wilde_zscores_3;

### Test z-scores

In [30]:
# NOTE(review): wilde_test_3 was already tokenized in an earlier cell; this second pass
# feeds single word tokens back through the tokenizer and rebinds the same name --
# looks redundant, confirm whether it is needed.
wilde_test_3 = dict_word_tokenizer(wilde_test_3)
# Use the feature words chosen from the TRAINING corpus, measured on the test fold.
wilde_test_3_features = features_generator(wilde_part_3_corpus_freq, wilde_test_3)
df_wilde_test_3 = pd.DataFrame.from_dict(wilde_test_3_features, orient = 'index')
# Standardized with the test fold's own column means and standard deviations.
wilde_zscores_test_3 = zscores(df_wilde_test_3)
wilde_zscores_test_3;

### Delta Distances

In [31]:
# Distance table: one column per test character, one row per training candidate.
wilde_3_deltas = get_deltas(wilde_characters, wilde_zscores_test_3, wilde_zscores_3)
# For each test character, the training character with the smallest distance.
predictions_wilde_3 = wilde_3_deltas.idxmin()
predictions_wilde_3

Algernon                      Jack
Allonby                     Cecily
Berwick                 Hunstanton
Bracknell                Bracknell
Caversham                   Gerald
Cecily                   Gwendolen
Cheveley                  Cheveley
Chiltern                  Chiltern
Darlington         Lady Windermere
Erlynne            Lady Windermere
Gerald                      Gerald
Goring                    Cheveley
Gwendolen                   Cecily
Hester                    Cheveley
Hunstanton              Hunstanton
Illingorth              Illingorth
Jack                          Jack
Lady Chiltern        Lady Chiltern
Lady Windermere    Lady Windermere
Lord Windermere    Lady Windermere
Mabel                       Goring
Mrs Artbuthnot      Mrs Artbuthnot
dtype: object