##  Naive Bayes Feature Engineering 

Uses
- Naive Bayes Classifier
- provided data frame
- basic features POS tag frequency, stop word frequency, n-grams of POS and stop words

to test different additional features, namely
- word frequency
- keyword extraction
- stemming
- line length
- sentence length
- lemmatizing
- hapax legomena (unique word frequency)

In [78]:
#NOTE

#Results can differ depending on how the CountVectorizers in the get_feature functions are built. Two versions 
#have been tried and been found to affect the results for better or for worse in a non consistent way 
#in each feature.

#Version 1
#cvec1 = CountVectorizer(vocabulary = stopwords_507, strip_accents="ascii")#word freq of stop words
#cvec2 = CountVectorizer() #freq of POS tags
#cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii") #ngrams of POS/stops

#Version 2
#cvec1 = CountVectorizer(vocabulary= stopwords_507, strip_accents="ascii")#word freq of stop words
#cvec2 = CountVectorizer(max_features=1000, strip_accents="ascii") #freq of POS tags
#cvec3 = CountVectorizer(ngram_range=(2,2),max_features=1000, strip_accents="ascii") #ngrams of POS/stops 

In [1]:
import os
import re
import spacy
import pandas as pd
from glob import glob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn import linear_model

import nltk
from nltk.corpus import stopwords
from stop_words import get_stop_words

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

import matplotlib.pyplot as plt

import numpy as np

### Get Stop Words

In [2]:
nlp = spacy.load("en")
stopwrd1= []
for word in nlp.Defaults.stop_words:
    stopwrd1.append(word)


stopwords2 = stopwords.words('english')


stopwords3 = get_stop_words('english')

#from https://www.ranks.nl/stopwords
stopwords4 = ['a ', 'able', 'about', 'above', 'abst', 'accordance', 'according', 'accordingly', 'across', 'act', 'actually', 'added', 'adj', 'affected', 'affecting', 'affects', 'after', 'afterwards', 'again', 'against', 'ah', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'announce', 'another', 'any', 'anybody', 'anyhow', 'anymore', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apparently', 'approximately', 'are', 'aren', 'arent', 'arise', 'around', 'as', 'aside', 'ask', 'asking', 'at', 'auth', 'available', 'away', 'awfully', 'b', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'begin', 'beginning', 'beginnings', 'begins', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'between', 'beyond', 'biol', 'both', 'brief', 'briefly', 'but', 'by', 'c', 'ca', 'came', 'can', 'cannot', "can't", 'cause', 'causes', 'certain', 'certainly', 'co', 'com', 'come', 'comes', 'contain', 'containing', 'contains', 'could', 'couldnt', 'd', 'date', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'downwards', 'due', 'during', 'e', 'each', 'ed', 'edu', 'effect', 'eg', 'eight', 'eighty', 'either', 'else', 'elsewhere', 'end', 'ending', 'enough', 'especially', 'et', 'et-al', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'except', 'f', 'far', 'few', 'ff', 'fifth', 'first', 'five', 'fix', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'found', 'four', 'from', 'further', 'furthermore', 'g', 'gave', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'giving', 'go', 'goes', 'gone', 'got', 'gotten', 'h', 'had', 'happens', 'hardly', 'has', "hasn't", 'have', "haven't", 'having', 'he', 'hed', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'heres', 'hereupon', 'hers', 'herself', 'hes', 'hi', 'hid', 'him', 'himself', 'his', 'hither', 'home', 'how', 'howbeit', 'however', 'hundred', 'i', 'id', 'ie', 'if', "i'll", 'im', 'immediate', 'immediately', 'importance', 'important', 'in', 'inc', 'indeed', 'index', 'information', 'instead', 'into', 'invention', 'inward', 'is', "isn't", 'it', 'itd', "it'll", 'its', 'itself', "i've", 'j', 'just', 'k', 'keep\tkeeps', 'kept', 'kg', 'km', 'know', 'known', 'knows', 'l', 'largely', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'lets', 'like', 'liked', 'likely', 'line', 'little', "'ll", 'look', 'looking', 'looks', 'ltd', 'm', 'made', 'mainly', 'make', 'makes', 'many', 'may', 'maybe', 'me', 'mean', 'means', 'meantime', 'meanwhile', 'merely', 'mg', 'might', 'million', 'miss', 'ml', 'more', 'moreover', 'most', 'mostly', 'mr', 'mrs', 'much', 'mug', 'must', 'my', 'myself', 'n', 'na', 'name', 'namely', 'nay', 'nd', 'near', 'nearly', 'necessarily', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'ninety', 'no', 'nobody', 'non', 'none', 'nonetheless', 'noone', 'nor', 'normally', 'nos', 'not', 'noted', 'nothing', 'now', 'nowhere', 'o', 'obtain', 'obtained', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'omitted', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'ord', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'owing', 'own', 'p', 'page', 'pages', 'part', 'particular', 'particularly', 'past', 'per', 'perhaps', 'placed', 'please', 'plus', 'poorly', 'possible', 'possibly', 'potentially', 'pp', 'predominantly', 'present', 'previously', 'primarily', 'probably', 'promptly', 'proud', 'provides', 'put', 'q', 'que', 'quickly', 'quite', 'qv', 'r', 'ran', 'rather', 'rd', 're', 'readily', 'really', 'recent', 'recently', 'ref', 'refs', 'regarding', 'regardless', 'regards', 'related', 'relatively', 'research', 'respectively', 'resulted', 'resulting', 'results', 'right', 'run', 's', 'said', 'same', 'saw', 'say', 'saying', 'says', 'sec', 'section', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sent', 'seven', 'several', 'shall', 'she', 'shed', "she'll", 'shes', 'should', "shouldn't", 'show', 'showed', 'shown', 'showns', 'shows', 'significant', 'significantly', 'similar', 'similarly', 'since', 'six', 'slightly', 'so', 'some', 'somebody', 'somehow', 'someone', 'somethan', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specifically', 'specified', 'specify', 'specifying', 'still', 'stop', 'strongly', 'sub', 'substantially', 'successfully', 'such', 'sufficiently', 'suggest', 'sup', 'sure\tt', 'take', 'taken', 'taking', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that'll", 'thats', "that've", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 
             'thered', 'therefore', 'therein', "there'll", 'thereof', 'therere', 'theres', 'thereto', 'thereupon', "there've", 'these', 'they', 'theyd', "they'll", 'theyre', "they've", 'think', 'this', 'those', 'thou', 'though', 'though', 'thousand', 'throug', 'through', 'throughout', 'thru', 'thus', 'til', 'tip', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'ts', 'twice', 'two', 'u', 'un', 'under', 'unfortunately', 'unless', 'unlike', 'unlikely', 'until', 'unto', 'up', 'upon', 'ups', 'us', 'use', 'used', 'useful', 'usefully', 'usefulness', 'uses', 'using', 'usually', 'v', 'value', 'various', "'ve", 'very', 'via', 'viz', 'vol', 'vols', 'vs', 'w', 'want', 'wants', 'was', 'wasnt', 'way', 'we', 'wed', 'welcome', "we'll", 'went', 'were', 'werent', "we've", 'what', 'whatever', "what'll", 'whats', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'wheres', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whim', 'whither', 'who', 'whod', 'whoever', 'whole', "who'll", 'whom', 'whomever', 'whos', 'whose', 'why', 'widely', 'willing', 'wish', 'with', 'within', 'without', 'wont', 'words', 'world', 'would', 'wouldnt', 'www', 'x', 'y', 'yes', 'yet', 'you', 'youd', "you'll", 'your', 'youre', 'yours', 'yourself', 'yourselves', "you've", 'z', 'zero']

        

stopwords_305 = stopwrd1
stopwords_747= list(set(stopwrd1 + stopwords2+ stopwords3 +stopwords4))
stopwords_710= stopwords_747[:710]


### Naive Bayes Classifier

In [3]:
def  predict(train_features, train_label, test_features): 
    """
    input: features for train and test instances and label for train instances
    
    returns prediction for instance and coefficient matrix and classifier
    
    """
    # played around with smoothing 0.1 seemed to be the best
    clf = MultinomialNB(alpha=0.1) #using the same classifier as Fox et al.
    clf.fit(train_features, train_label)
    pred=clf.predict(test_features)
    coef = clf.coef_
    return pred, coef, clf

### Leave One Out Cross Validation

In [4]:
def get_LOO_instances(dataframe):
    """
    
    generates instances for LOO
    
    input: df
    
    output: list with instances as tuples: [(train_row, test_row)]
    
    """
    loo_instances=[]
    for instance in range(dataframe.shape[0]):
        inst= list(range(dataframe.shape[0]))
        inst.remove(instance)
        test=dataframe.drop(inst)
        train= dataframe.drop([instance])
        loo_instances.append((train, test))
    return loo_instances


In [84]:
#read in given data frame
df = pd.read_excel("DataFrame.xlsx")

In [85]:
df.head()

Unnamed: 0.1,Unnamed: 0,Play_id,Raw Text,POS_305,POS_710,POS_747,POSstops_305,POSstops_710,POSstops_747,Lemmas,Label,Author,Play
0,0,0,"As I remember , Adam , it was upon this fashio...",ADP PRON VERB PUNCT PROPN PUNCT NOUN SPACE VE...,ADP PRON VERB PUNCT PROPN PUNCT VERB NOUN SPA...,ADP PRON VERB PUNCT PROPN PUNCT NOUN SPACE VE...,ADP PRON VERB PUNCT PROPN PUNCT it was upon t...,ADP PRON VERB PUNCT PROPN PUNCT it VERB upon ...,ADP PRON VERB PUNCT PROPN PUNCT it was upon t...,"as -PRON- remember , adam , -PRON- be upon th...",0,E-Shakespeare,asyoulikeit.txt.E-Shakespeare.tok
1,1,1,"Proceed , Solinus , to procure my fall\nAnd by...",PROPN PUNCT PROPN PUNCT VERB NOUN SPACE CCONJ...,PROPN PUNCT PROPN PUNCT VERB NOUN SPACE CCONJ...,PROPN PUNCT PROPN PUNCT VERB NOUN SPACE CCONJ...,PROPN PUNCT PROPN PUNCT to VERB my NOUN SPACE...,PROPN PUNCT PROPN PUNCT to VERB my NOUN SPACE...,PROPN PUNCT PROPN PUNCT to VERB my NOUN SPACE...,"proceed , solinus , to procure -PRON- fall \n...",0,E-Shakespeare,comedy_errors.txt.E-Shakespeare.tok
2,2,2,"Who 's there ?\nNay , answer me : stand , and ...",NOUN VERB PUNCT SPACE PROPN PUNCT VERB PUNCT ...,NOUN VERB PUNCT SPACE PROPN PUNCT VERB PUNCT ...,NOUN VERB PUNCT SPACE PROPN PUNCT VERB PUNCT ...,NOUN VERB there PUNCT SPACE PROPN PUNCT VERB ...,NOUN VERB there PUNCT SPACE PROPN PUNCT VERB ...,NOUN VERB there PUNCT SPACE PROPN PUNCT VERB ...,"who be there ? \n nay , answer -PRON- : stand...",0,E-Shakespeare,Hamlet.txt.E-Shakespeare.tok
3,3,3,"So shaken as we are , so wan with care ,\nFind...",ADV VERB PUNCT NOUN NOUN PUNCT SPACE VERB NOU...,ADV VERB PUNCT NOUN NOUN PUNCT SPACE VERB NOU...,ADV VERB PUNCT NOUN NOUN PUNCT SPACE VERB NOU...,ADV VERB as we are PUNCT so NOUN with NOUN PU...,ADV VERB as we are PUNCT so NOUN with NOUN PU...,ADV VERB as we are PUNCT so NOUN with NOUN PU...,"so shake as -PRON- be , so wan with care , \n...",0,E-Shakespeare,henryivPart1.txt.E-Shakespeare.tok
4,4,4,Open your ears ; for which of you will stop\nT...,VERB NOUN PUNCT VERB SPACE DET NOUN VERB ADJ ...,VERB NOUN PUNCT SPACE DET NOUN VERB ADJ PROPN...,VERB NOUN PUNCT SPACE DET NOUN VERB ADJ PROPN...,VERB your NOUN PUNCT for which of you will VE...,VERB your NOUN PUNCT for which of you will st...,VERB your NOUN PUNCT for which of you will st...,open -PRON- ear ; for which of -PRON- will st...,0,E-Shakespeare,henryivPart2.txt.E-Shakespeare.tok


### Get Basic Features

In [34]:
def get_preds_imp(dataframe):
    
    """
    change function get_features_GM(train_data, test_data) for feature engineering!
    input: Dataframe
    
    output: predictions for test instances, and coefficient matrix, cls
    """
    
    instances= get_LOO_instances(dataframe)
    predictions=[]
    for inst in instances:
        train_data= inst[0]
        test_data = inst[1]
        train_label=train_data["Label"]
        train_features, test_features= get_features_GM_imp(train_data, test_data)
        pred, coeff, cls= predict(train_features, train_label, test_features)
        predictions.append(pred)
        
    return np.array(predictions), coeff, cls


In [8]:

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_710"]
    X_POSstops = X_train[ "POSstops_710"]
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_710"]
    X_test_POSstops = X_test[ "POSstops_710"]
    
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_710, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii") #ngrams of POS/stops

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
        ))

    
    return train_features, test_features


In [9]:
predictions, coeff, cls =  get_preds_imp(df)

In [10]:
df["Pred_imp"]=predictions

In [11]:
df["acc_imp"] =df["Label"]==df[ "Pred_imp"]
acc_imp=df.loc[df.acc_imp== True, 'acc_imp'].count()/df.shape[0]
print(acc_imp)


0.8051948051948052


In [None]:
#305 stop words
# if ngrams not restricted: 0.8181818181818182
# if ngrams restricted to most frequent 1000: 0.7792207792207793

#710 stop words
#if ngrams not restricted: 80.5

## Feature: Word Frequency

Add simple word frequency to the basic features rather than using it by itself as in the bag of words model.

In [20]:
#modified get_features function including word frequency

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_710"]
    X_POSstops = X_train[ "POSstops_710"]
   
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_710"]
    X_test_POSstops = X_test[ "POSstops_710"]
    
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_710, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), max_features = 1000, strip_accents="ascii") #ngrams of POS/stops
    
    cvec4 = CountVectorizer(strip_accents="ascii") 

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        cvec4.fit_transform(X_txt).toarray(),
        
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            cvec4.transform(X_test_txt).toarray(),
        ))

    
    return train_features, test_features


In [21]:
predictions_word_freq, coeff, cls =  get_preds_imp(df)

In [22]:
df["Pred_word_freq"]=predictions_word_freq

In [23]:
df["acc_word_freq"] =df["Label"]==df[ "Pred_word_freq"]
acc_word_freq=df.loc[df.acc_word_freq== True, 'acc_word_freq'].count()/df.shape[0]
print(acc_word_freq)


0.8051948051948052


In [None]:
#305 stop words
#if ngrams not restricted: 0.7662337662337663
#if ngrams restricted to most frequent 1000: 0.8181818181818182

#710 stop words
#if ngrams not restricted: 77.9
#if ngrams restricted: 80.5

## Feature: Keyword Extraction

Uses package RAKE (Rapid Automated Keyword Extraction) to extract keywords and key phrases from the text which can then be counted to form a feature vector.

In [30]:
"""Get set out of 1000 highest ranked keywords for each play saved in one single string"""

import RAKE
rake = RAKE.Rake(stopwords_305) #use stopwords with best results from before

all_keywords = []
punctuation = """!"',;:.-?)([]<>*#\n\t\r """

for text in df["Raw Text"]:
    keywords = ""
    text_keywords = rake.run(text.strip(punctuation))
    text_keywords = set([keywords[0] for keywords in sorted(text_keywords, key=lambda word: word[1])[:1000]])
    for keyword in text_keywords:
        keywords += keyword + " "
    
    all_keywords.append(keywords)

all_keywords = np.array(all_keywords)


    

In [31]:
#create column and save in df
df["Keywords"] = all_keywords

In [80]:
#modified get_features function including keywords

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_710"]
    X_POSstops = X_train[ "POSstops_710"]
    X_key = X_train["Keywords"]
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_710"]
    X_test_POSstops = X_test[ "POSstops_710"]
    X_test_key = X_test["Keywords"]
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_710, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii", max_features=1000) #ngrams of POS/stops
    
    cvec4 = CountVectorizer(strip_accents="ascii") #keyword frequency

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        cvec4.fit_transform(X_key).toarray(),
        
        ))

    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            cvec4.transform(X_test_key).toarray(),
        ))

    
    return train_features, test_features


In [81]:
predictions_key, coeff, cls =  get_preds_imp(df)

In [82]:
df["Pred_keywords"]=predictions_key

In [83]:
df["acc_keywords"] =df["Label"]==df[ "Pred_keywords"]
acc_keywords=df.loc[df.acc_keywords== True, 'acc_keywords'].count()/df.shape[0]
print(acc_keywords)


0.8571428571428571


In [None]:
#305 stop words
#if ngrams not restricted: 0.7922077922077922
#if ngrams restricted to most frequent 1000: 0.8571428571428571

#710 stop words
#if ngrams restricted: 0.8441558441558441

## Feature: Frequency of Stemmed Words

Three different nltk stemmers were tested to add frequency of stemmed words as a feature. The Porter Stemmer seemed to work best.

In [29]:
"""Transform given play to a stemmed version and save in df"""

#three different nltk stemmers to be tested
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer

punctuation = """!"',;:.-?)([]<>*#\n\t\r """
all_stemmed_texts = []

lancaster = LancasterStemmer()
porter = PorterStemmer()
snowball = SnowballStemmer("english")

#change this to test different stemmers
stemmer = porter

for text in df["Raw Text"]:
    stemmed_text = ""
    for word in text.split():
        stemmed_text += stemmer.stem(word.strip(punctuation)) + " "
    
    all_stemmed_texts.append(stemmed_text)

all_stemmed_texts = np.array(all_stemmed_texts)

In [30]:
#save Stem column in df
df["Stems"] = all_stemmed_texts

In [31]:
#modified get_features function including stemmed words frequency

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_710"]
    X_POSstops = X_train[ "POSstops_710"]
    X_stem = X_train["Stems"]
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_710"]
    X_test_POSstops = X_test[ "POSstops_710"]
    X_test_stem = X_test["Stems"]
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_710, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii", max_features = 1000) #ngrams of POS/stops 
    
    cvec4 = CountVectorizer()

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        cvec4.fit_transform(X_stem).toarray(),
        
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            cvec4.transform(X_test_stem).toarray(),
        ))

    
    return train_features, test_features


In [32]:
predictions_stem, coeff, cls =  get_preds_imp(df)

In [33]:
df["Pred_stems"]=predictions_stem

In [34]:
df["acc_stems"] =df["Label"]==df[ "Pred_stems"]
acc_stems=df.loc[df.acc_stems== True, 'acc_stems'].count()/df.shape[0]
print(acc_stems)

0.8441558441558441


In [None]:
#305 stop words
#restricted ngrams: 0.8311688311688312
#unrestricted ngrams: 0.8051948051948052

#710 stop words
#if ngrams restricted: 0.8441558441558441

## Feature: Line Length

A structural feature representing the frequency of all occurring line lengths.
Seems to make the classification a lot worse.

In [35]:
#save lengths as list

all_linelengths = []
max_line_length = max([len(text.split('\n')) for text in df["Raw Text"]])

for text in df["Raw Text"]:
    line_lengths_list = []
    
    for line in text.split('\n'):
        line_lengths_list.append(len(line))
    for remain_lines in range(max_line_length - len(line_lengths_list)):
        line_lengths_list.append(0)
    
    all_linelengths.append(line_lengths_list)
    

In [36]:
df["Line Length"] = all_linelengths

In [37]:
#modified get_features function including line lengths

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_710"]
    X_POSstops = X_train[ "POSstops_710"]
    X_lines = list(X_train["Line Length"])
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_710"]
    X_test_POSstops = X_test[ "POSstops_710"]
    X_test_lines = list(X_test["Line Length"])
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_710, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii", max_features = 1000) #ngrams of POS/stops 

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        np.asarray(X_lines),
        
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            np.asarray(X_test_lines),
        ))

    
    return train_features, test_features


In [38]:
predictions_lines, coeff, cls =  get_preds_imp(df)

In [39]:
df["Pred_lines"]=predictions_lines

In [40]:
df["acc_lines"] =df["Label"]==df[ "Pred_lines"]
acc_lines=df.loc[df.acc_lines== True, 'acc_lines'].count()/df.shape[0]
print(acc_lines)

0.7012987012987013


In [None]:
#305 stop words
#restricted ngrams: 0.7142857142857143
#unrestricted ngrams: 0.6753246753246753

#710 stop words
#restricted ngrams: 0.7012987012987013

## Feature: Sentence Length

In [35]:
from nltk.tokenize.punkt import PunktSentenceTokenizer

tok = PunktSentenceTokenizer()
all_sent_lengths = []
max_sent_length = max([len(tok.sentences_from_text(text)) for text in df["Raw Text"]])

            
    
for text in df["Raw Text"]:
    sent_lengths = []
    for sent in tok.sentences_from_text(text):
        sent_lengths.append(len(sent))
    for remain_sents in range(max_sent_length - len(sent_lengths)):
        sent_lengths.append(0)  
    
    all_sent_lengths.append(sent_lengths)


In [36]:
df["Sent Length"] = all_sent_lengths

In [187]:
#modified get_features function including line lengths

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_710"]
    X_POSstops = X_train[ "POSstops_710"]
    X_sent = list(X_train["Sent Length"])

    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_710"]
    X_test_POSstops = X_test[ "POSstops_710"]
    X_test_sent = list(X_test["Sent Length"])
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_710, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii") #ngrams of POS/stops 

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        np.asarray(X_sent),
        
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            np.asarray(X_test_sent),
        ))

    
    return train_features, test_features


In [188]:
predictions_sent, coeff, cls =  get_preds_imp(df)

In [45]:
df["Pred_sent"]=predictions_sent

In [46]:
df["acc_sent"] =df["Label"]==df[ "Pred_sent"]
acc_sent=df.loc[df.acc_sent== True, 'acc_sent'].count()/df.shape[0]
print(acc_sent)

0.4675324675324675


In [None]:
#305 stop words
#restricted ngrams: 0.45454545454545453
#unrestricted ngrams: 0.45454545454545453

#710 stop words
#unrestricted ngrams: 0.4675324675324675

## Feature: Frequency of Lemmatized Words

Lemmas of words are already to be found in the df (generated with spacy).

In [51]:
#modified get_features function including lemma frequency

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_710"]
    X_POSstops = X_train[ "POSstops_710"]
    X_lemmas = X_train["Lemmas"]
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_710"]
    X_test_POSstops = X_test[ "POSstops_710"]
    X_test_lemmas = X_test["Lemmas"]
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_710, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii") #ngrams of POS/stops 
    
    cvec4 = CountVectorizer(strip_accents="ascii")

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        cvec4.fit_transform(X_lemmas).toarray(),
        
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            cvec4.transform(X_test_lemmas).toarray(),
        ))

    
    return train_features, test_features


In [52]:
predictions_lemmas, coeff, cls =  get_preds_imp(df)

In [53]:
df["Pred_lemmas"]=predictions_lemmas

In [54]:
df["acc_lemmas"] =df["Label"]==df[ "Pred_lemmas"]
acc_lemmas=df.loc[df.acc_lemmas== True, 'acc_lemmas'].count()/df.shape[0]
print(acc_lemmas)

0.7792207792207793


In [None]:
#305 stop words
#restricted ngrams: 0.8441558441558441

#710 stop words
#restricted ngrams: 0.8441558441558441
#unrestricted ngrams: 0.7792207792207793

## Feature: Hapax Legomena Frequency

A lexical feature calculating the frequency of Hapax Legomena in the corpora.

In [7]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [54]:
def average_word_length(words):
    """Return the average length of all words in text."""
    lettersum = 0
    for x in words:
        lettersum = lettersum + len(x)
        
    return [lettersum / len(words)]

In [46]:
def hapax_legomena_ratio(words):
    """Return the hapax_legomena ratio for this list of words."""
    adict = {}
    hapaxlego = []
    for x in words:
        if x not in adict:
            adict[x] = 1
        else:
            adict[x] = adict[x] + 1
    for x in adict:
        if adict[x] == 1:
            hapaxlego.append(x)
    hapaxlegoratio = ( len(hapaxlego) / len(words) )
    
    return [hapaxlegoratio]

In [121]:
#Calculate both hapax legomena and word length for next feature
hapax_values = []
avg_wordlength_values = []
for play in df["Raw Text"]:
    processedplay = [word.lower() for word in tokenizer.tokenize(play)]
    hapax_values.append(hapax_legomena_ratio(processedplay))
    avg_wordlength_values.append(average_word_length(processedplay))

df["Average Word Length"] = avg_wordlength_values
df["Hapax Legomena Ratio"] = hapax_values

In [90]:
#modified get_feature_GM_imp function including hapax legomena ratio

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model, both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I wanna use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train[ "POS_305"]
    X_POSstops = X_train[ "POSstops_305"]
    X_Hapax = list(X_train["Hapax Legomena Ratio"])
    
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test[ "POS_305"]
    X_test_POSstops = X_test[ "POSstops_305"]
    X_test_Hapax = list(X_test["Hapax Legomena Ratio"])
    

    #using countvectorizors to get freqs 
    #cvec1 = CountVectorizer( max_features=1000, strip_accents="ascii") 
    cvec2 = CountVectorizer(stop_words= stopwords_305,  max_df=100, max_features=1000, strip_accents="ascii")
     
    cvec1 = CountVectorizer(max_features = 1000, strip_accents="ascii")#word freq of stop words
    #cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    #cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii") #ngrams of POS/stops 


    # fitting  and transforming of train data, and stack vectors at the same time to get X for model, 
    
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        #cvec3.fit_transform(X_POSstops).toarray(),
        np.asarray(X_Hapax),
        ))
    

    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            #cvec3.transform(X_test_POSstops).toarray(),
            np.asarray(X_test_Hapax),
        ))  
    
    

    return train_features, test_features



In [91]:
predictions_hapax, coeff, cls =  get_preds_imp(df)

In [92]:
df["Pred_hapax"]=predictions_hapax

In [93]:
df["acc_hapax"] =df["Label"]==df[ "Pred_hapax"]
acc_hapax=df.loc[df.acc_hapax== True, 'acc_hapax'].count()/df.shape[0]
print(acc_hapax)

0.8311688311688312


In [None]:
#305 stopwords
#0.8181818181818182

#combination of word frequency, POS and hapax (but also works without the hapax)
#0.8311688311688312

## Feature: Average Word Length

A lexical feature calculating the average word length in a given corpus. Has been already appended to the df in feature from before.

In [117]:
#modified get_feature_GM_imp function including average word length

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model, both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I wanna use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_305"]
    X_POSstops = X_train[ "POSstops_305"]
    X_AvgWordlength = list(X_train["Average Word Length"])
    
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_305"]
    X_test_POSstops = X_test[ "POSstops_305"]
    X_test_AvgWordlength = list(X_test["Average Word Length"])
    
    
    #using countvectorizors to get freqs 
    #cvec1 = CountVectorizer( max_features=1000, strip_accents="ascii") 
    cvec1 = CountVectorizer(vocabulary = stopwords_305,  strip_accents="ascii")
     
    #cvec1 = CountVectorizer(max_features = 1000, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii") #ngrams of POS/stops 

    # fitting  and transforming of train data, and stack vectors at the same time to get X for model, 
    
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        np.asarray(X_AvgWordlength),
        ))
    
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            np.asarray(X_test_AvgWordlength),
        ))  
    

    return train_features, test_features

In [118]:
predictions_word_length, coeff, cls =  get_preds_imp(df)

In [119]:
df["Pred_word_length"]=predictions_word_length

In [120]:
df["acc_word_length"] =df["Label"]==df[ "Pred_word_length"]
acc_word_length=df.loc[df.acc_word_length== True, 'acc_word_length'].count()/df.shape[0]
print(acc_word_length)

0.8181818181818182


In [None]:
#305 stopwords
#0.8181818181818182

## Combine Features that Increased Accuracy Most

Experimenting around the basic features.

In [72]:
#modified get_features function including a combination of stemming, lemmatizing, keyword extraction

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_747"]
    X_POSstops = X_train[ "POSstops_747"]
    X_lemmas = X_train["Lemmas"]
    X_keys = X_train["Keywords"]
    X_stems = X_train["Stems"]
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_747"]
    X_test_POSstops = X_test[ "POSstops_747"]
    X_test_lemmas = X_test["Lemmas"]
    X_test_keys = X_test["Keywords"]
    X_test_stems = X_test["Stems"]
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_747, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii", max_features = 1000) #ngrams of POS/stops 
    
    cvec4 = CountVectorizer(strip_accents="ascii")
    cvec5 = CountVectorizer(strip_accents="ascii")
    cvec6 = CountVectorizer()

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        cvec4.fit_transform(X_lemmas).toarray(),
        cvec5.fit_transform(X_keys).toarray(),
        #cvec6.fit_transform(X_stems).toarray(),
        
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            cvec4.transform(X_test_lemmas).toarray(),
            cvec5.transform(X_test_keys).toarray(),
            #cvec6.transform(X_test_stems).toarray(),
        ))

    
    return train_features, test_features


In [73]:
predictions_combo5, coeff, cls =  get_preds_imp(df)

In [74]:
df["Pred_combo5"]=predictions_combo5

In [75]:
df["acc_combo5"] =df["Label"]==df[ "Pred_combo5"]
acc_combo5=df.loc[df.acc_combo5== True, 'acc_combo5'].count()/df.shape[0]
print(acc_combo5)

0.8181818181818182


In [None]:
#combo 1: basic features with stemming, lemmatizing, keywords, restricted ngrams, 305 stop words
#0.7662337662337663

#combo 2: combo 1 without stop word freq and POS freq
#0.7402597402597403

#combo 3: only lemmas and keywords
#0.6883116883116883

#it's just getting worse

#combo 4: 747 stop words, basic features, lemmas
#0.8441558441558441

#combo 5: 747 stop words, basic features, lemmas, keywords
#0.8181818181818182

Experimenting without the basic features.

In [134]:
#modified get_feature_GM_imp function

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model, both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I wanna use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train[ "POS_305"]
    X_POSstops = X_train[ "POSstops_305"]
    X_keys = X_train["Keywords"]
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test[ "POS_305"]
    X_test_POSstops = X_test[ "POSstops_305"]
    X_test_keys = X_test["Keywords"]
    
    

    #using countvectorizors to get freqs 
    cvec1 = CountVectorizer( max_features=1000, strip_accents="ascii") 
    cvec2 = CountVectorizer(stop_words= stopwords_305,  max_df=100, max_features=1000, strip_accents="ascii")
    cvec3 = CountVectorizer(strip_accents = "ascii")
    
    #cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    #cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii") #ngrams of POS/stops 


    # fitting  and transforming of train data, and stack vectors at the same time to get X for model, 
    
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POSstops).toarray(),
        cvec3.fit_transform(X_keys).toarray(),
        
        ))
    

    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POSstops).toarray(),
            cvec3.transform(X_test_keys).toarray(),
            
        ))  
    
    

    return train_features, test_features



In [138]:
predictions_tryout, coeff, cls =  get_preds_imp(df)

In [139]:
df["Pred_tryout"]=predictions_tryout

In [140]:
df["acc_tryout"] =df["Label"]==df[ "Pred_tryout"]
acc_tryout=df.loc[df.acc_tryout== True, 'acc_tryout'].count()/df.shape[0]
print(acc_tryout)

0.7792207792207793


In [None]:
#word freq, POS, lemmas
#0.7662337662337663

#lemmas, POS
#0.7012987012987013

#word freq, POS, keywords
#0.7792207792207793

Trying to improve the best model (basic features and keywords).

In [181]:
#modified get_features function 

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_747"]
    X_POSstops = X_train[ "POSstops_747"]
    X_key = X_train["Keywords"]
    X_lemmas = X_train["Lemmas"]
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_747"]
    X_test_POSstops = X_test[ "POSstops_747"]
    X_test_key = X_test["Keywords"]
    X_test_lemmas = X_test["Lemmas"]
    
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_747, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer(max_features = 1000) #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii", max_features=1000) #ngrams of POS/stops
    
    cvec4 = CountVectorizer(strip_accents="ascii") #keyword frequency
    cvec5 = CountVectorizer(stop_words = stopwords_305, max_features = 500)

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        cvec4.fit_transform(X_key).toarray(),
        cvec5.fit_transform(X_lemmas).toarray(),
        
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            cvec4.transform(X_test_key).toarray(),
            cvec5.transform(X_test_lemmas).toarray(),
        ))

    
    return train_features, test_features


In [182]:
predictions_tryout, coeff, cls =  get_preds_imp(df)

In [183]:
df["Pred_tryout"]=predictions_tryout

In [184]:
df["acc_tryout"] =df["Label"]==df[ "Pred_tryout"]
acc_tryout=df.loc[df.acc_tryout== True, 'acc_tryout'].count()/df.shape[0]
print(acc_tryout)

0.8441558441558441


In [None]:
#basic, keywords
#0.8571428571428571

#basic - POS, keywords
#0.8311688311688312

#basic (restricting stop words), keywords
#0.8571428571428571

#word freq, POS, ngrams, keywords
#0.7792207792207793

#basic, keywords, lemmas with removed stopwords
#0.7792207792207793

#basic (restricting POS, restricting ngrams to 500, keywords)
#0.8571428571428571

#basic (restricting POS, restricting ngrams to 500 and using 3-grams, keywords)
#0.8181818181818182

#basic (restricting POS, restricting ngrams to 500 and using bigrams and 3-grams, keywords)
#0.8181818181818182

#basic with 747 stop words, keywords
#0.8571428571428571

#basic with 747 stop words, keywords, lemmas with removed stop words and restricted to 500
#0.8441558441558441

### Using the Best Model to Predict the Author for <i> A Second Maiden's Tragedy <i>

Fox et al. had <i> A Second Maiden's Tragedy </i> included in their corpus with Middleton as its true author. To ensure the comparability of our models with those of Fox et al., we left it in the corpus when evaluating our model. Now we can have a look at what our overall best model says about the attribution of the play in question.

In [189]:
#modified get_features function including keywords

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_747"]
    X_POSstops = X_train[ "POSstops_747"]
    X_key = X_train["Keywords"]
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_747"]
    X_test_POSstops = X_test[ "POSstops_747"]
    X_test_key = X_test["Keywords"]
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_747, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii", max_features=1000) #ngrams of POS/stops
    
    cvec4 = CountVectorizer(strip_accents="ascii") #keyword frequency

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        cvec4.fit_transform(X_key).toarray(),
        
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            cvec4.transform(X_test_key).toarray(),
        ))

    
    return train_features, test_features


In [190]:
predictions_best, coeff, cls =  get_preds_imp(df)

In [191]:
df["Pred_best"]=predictions_best

In [192]:
df["acc_best"] =df["Label"]==df[ "Pred_best"]
acc_best=df.loc[df.acc_best== True, 'acc_best'].count()/df.shape[0]
print(acc_best)


0.8571428571428571


In [199]:
if df.at[50, 'acc_best'] == True:
    print("Play is most likely written by Middleton")
else:
    print("Play is most likely written by author with label {}".format(df.at[50, 'Pred_best']))

Play is most likely written by Middleton


### Attributing <i> A Yorkeshire Tragedy </i>

This play is considered to be a collaboration and likely to include Shakespeare and Middleton amongst its authors.

In [19]:
#read in text
with open('yorkshire.txt') as text:
    yorkshire = text.read()
    yorkshire = yorkshire.replace('\xa0', ' ')
    yorkshire = yorkshire.replace('-', '')
    yorkshire = yorkshire.replace('\n', ' ')

In [26]:
#process text in order to compute features

nlp = spacy.load("en")
y_doc = nlp(yorkshire)

y_POS = ""
y_POSstops = ""
for word in y_doc:
    if word.is_stop:
        y_POSstops += str(word) 
        y_POSstops += str(" ") 
    else:
        y_POSstops  += str(word.pos_)
        y_POSstops += str(" ")
        y_POS += str(word.pos_)
        y_POS += " "


import RAKE
rake = RAKE.Rake(stopwords_305) 

y_keywords = ""

y_keys = rake.run(yorkshire)
y_keys = set([y_key[0] for y_key in sorted(y_keys, key=lambda word: word[1])[:1000]])
for y_key in y_keys:
    y_keywords += y_key + " "
    


In [71]:
#get features without function since the play is not included in the dataframe


X_txt = df["Raw Text"]
X_POS = df["POS_305"]
X_POSstops = df[ "POSstops_305"]
X_key = df["Keywords"]
    

X_test_txt = pd.Series(yorkshire)
X_test_POS = pd.Series(y_POS)
X_test_POSstops = pd.Series(y_POSstops)
X_test_key = pd.Series(y_keywords)


cvec1 = CountVectorizer(vocabulary = stopwords_305, strip_accents="ascii")#word freq of stop words
cvec2 = CountVectorizer() #freq of POS tags
    
#add max_feature=1000 to get different results
cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii", max_features=1000) #ngrams of POS/stops
    
cvec4 = CountVectorizer(strip_accents="ascii") #keyword frequency

# fitting  and transforming of train data, and stack vectors at the same time to get X for model
train_features= np.hstack((
    cvec1.fit_transform(X_txt).toarray(),
    cvec2.fit_transform(X_POS).toarray(),
    cvec3.fit_transform(X_POSstops).toarray(),
    cvec4.fit_transform(X_key).toarray(),
        
    ))


#only fit X_test data
test_features = np.hstack((
        cvec1.transform(X_test_txt).toarray(),
        cvec2.transform(X_test_POS).toarray(),
        cvec3.transform(X_test_POSstops).toarray(),
        cvec4.transform(X_test_key).toarray(),
        ))



In [72]:
#predict
train_label=df["Label"]
pred, coeff, cls= predict(train_features, train_label, test_features)

In [73]:
print("Play is most likely written by author with label {}".format(pred))

Play is most likely written by author with label [5]


Our model says that <i> A Yorkshire Tragedy </i> has most likely been written by Chapman though he is not even considered in the debate about the authorship of this play. Therefore it doesn't seem likely that our model is correct in this case.

### Attributing <i> The Puritan </i>

See what our model says about a play that is not included in the Fox corpus but has often been attributed to Middleton.

In [75]:
#read in text
with open('puritan.txt') as text:
    puritan = text.read()
    puritan = puritan.replace('\xa0', ' ')
    puritan = puritan.replace('-', '')
    puritan = puritan.replace('\n', ' ')

In [76]:
#process text in order to compute features

nlp = spacy.load("en")
p_doc = nlp(puritan)

p_POS = ""
p_POSstops = ""
for word in p_doc:
    if word.is_stop:
        p_POSstops += str(word) 
        p_POSstops += str(" ") 
    else:
        p_POSstops  += str(word.pos_)
        p_POSstops += str(" ")
        p_POS += str(word.pos_)
        p_POS += " "


import RAKE
rake = RAKE.Rake(stopwords_305) 

p_keywords = ""

p_keys = rake.run(puritan)
p_keys = set([p_key[0] for p_key in sorted(p_keys, key=lambda word: word[1])[:1000]])
for p_key in p_keys:
    p_keywords += p_key + " "
    


In [77]:
#get features without function since the play is not included in the dataframe


X_txt = df["Raw Text"]
X_POS = df["POS_305"]
X_POSstops = df[ "POSstops_305"]
X_key = df["Keywords"]
    

X_test_txt = pd.Series(puritan)
X_test_POS = pd.Series(p_POS)
X_test_POSstops = pd.Series(p_POSstops)
X_test_key = pd.Series(p_keywords)


cvec1 = CountVectorizer(vocabulary = stopwords_305, strip_accents="ascii")#word freq of stop words
cvec2 = CountVectorizer() #freq of POS tags
    
#add max_feature=1000 to get different results
cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii", max_features=1000) #ngrams of POS/stops
    
cvec4 = CountVectorizer(strip_accents="ascii") #keyword frequency

# fitting  and transforming of train data, and stack vectors at the same time to get X for model
train_features= np.hstack((
    cvec1.fit_transform(X_txt).toarray(),
    cvec2.fit_transform(X_POS).toarray(),
    cvec3.fit_transform(X_POSstops).toarray(),
    cvec4.fit_transform(X_key).toarray(),
        
    ))


#only fit X_test data
test_features = np.hstack((
        cvec1.transform(X_test_txt).toarray(),
        cvec2.transform(X_test_POS).toarray(),
        cvec3.transform(X_test_POSstops).toarray(),
        cvec4.transform(X_test_key).toarray(),
        ))



In [78]:
#predict
train_label=df["Label"]
pred, coeff, cls= predict(train_features, train_label, test_features)

In [79]:
print("Play is most likely written by author with label {}".format(pred))

Play is most likely written by author with label [3]


It (presumably) correctly states that the play was written by Middleton.