##  SVM Feature Engineering 

Uses
- Support Vector Machine as classifier
- provided original dataframe (DataFrame.xlsx), improved data frame (DataFrame_imp.xlsx)
- basic features POS tag frequency, stop word frequency (of 507 and 305 stop words), n-grams of POS and stop words

to test different additional features, namely
- word frequency
- keyword extraction
- stemming
- line length
- sentence length
- lemmatizing

In [78]:
#NOTE

#Results can differ depending on how the CountVectorizers in the get_feature functions are built. Two versions 
#have been tried and been found to affect the results for better or for worse in a non consistent way 
#in each feature.

#Version 1
#cvec1 = CountVectorizer(vocabulary = stopwords_507, strip_accents="ascii")#word freq of stop words
#cvec2 = CountVectorizer() #freq of POS tags
#cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii") #ngrams of POS/stops

#Version 2
#cvec1 = CountVectorizer(vocabulary= stopwords_507, strip_accents="ascii")#word freq of stop words
#cvec2 = CountVectorizer(max_features=1000, strip_accents="ascii") #freq of POS tags
#cvec3 = CountVectorizer(ngram_range=(2,2),max_features=1000, strip_accents="ascii") #ngrams of POS/stops 

In [2]:
import os
import re
import spacy
import pandas as pd
from glob import glob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn import linear_model

import nltk
from nltk.corpus import stopwords
from stop_words import get_stop_words

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

import matplotlib.pyplot as plt

import numpy as np

### Get Stop Words

In [3]:
nlp = spacy.load("en")
stopwrd1= []
for word in nlp.Defaults.stop_words:
    stopwrd1.append(word)


stopwords2 = stopwords.words('english')


stopwords3 = get_stop_words('english')

#from https://www.ranks.nl/stopwords
stopwords4 = ['a ', 'able', 'about', 'above', 'abst', 'accordance', 'according', 'accordingly', 'across', 'act', 'actually', 'added', 'adj', 'affected', 'affecting', 'affects', 'after', 'afterwards', 'again', 'against', 'ah', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'announce', 'another', 'any', 'anybody', 'anyhow', 'anymore', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apparently', 'approximately', 'are', 'aren', 'arent', 'arise', 'around', 'as', 'aside', 'ask', 'asking', 'at', 'auth', 'available', 'away', 'awfully', 'b', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'begin', 'beginning', 'beginnings', 'begins', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'between', 'beyond', 'biol', 'both', 'brief', 'briefly', 'but', 'by', 'c', 'ca', 'came', 'can', 'cannot', "can't", 'cause', 'causes', 'certain', 'certainly', 'co', 'com', 'come', 'comes', 'contain', 'containing', 'contains', 'could', 'couldnt', 'd', 'date', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'downwards', 'due', 'during', 'e', 'each', 'ed', 'edu', 'effect', 'eg', 'eight', 'eighty', 'either', 'else', 'elsewhere', 'end', 'ending', 'enough', 'especially', 'et', 'et-al', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'except', 'f', 'far', 'few', 'ff', 'fifth', 'first', 'five', 'fix', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'found', 'four', 'from', 'further', 'furthermore', 'g', 'gave', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'giving', 'go', 'goes', 'gone', 'got', 'gotten', 'h', 'had', 'happens', 'hardly', 'has', "hasn't", 'have', "haven't", 'having', 'he', 'hed', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'heres', 'hereupon', 'hers', 'herself', 'hes', 'hi', 'hid', 'him', 'himself', 'his', 'hither', 'home', 'how', 'howbeit', 'however', 'hundred', 'i', 'id', 'ie', 'if', "i'll", 'im', 'immediate', 'immediately', 'importance', 'important', 'in', 'inc', 'indeed', 'index', 'information', 'instead', 'into', 'invention', 'inward', 'is', "isn't", 'it', 'itd', "it'll", 'its', 'itself', "i've", 'j', 'just', 'k', 'keep\tkeeps', 'kept', 'kg', 'km', 'know', 'known', 'knows', 'l', 'largely', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'lets', 'like', 'liked', 'likely', 'line', 'little', "'ll", 'look', 'looking', 'looks', 'ltd', 'm', 'made', 'mainly', 'make', 'makes', 'many', 'may', 'maybe', 'me', 'mean', 'means', 'meantime', 'meanwhile', 'merely', 'mg', 'might', 'million', 'miss', 'ml', 'more', 'moreover', 'most', 'mostly', 'mr', 'mrs', 'much', 'mug', 'must', 'my', 'myself', 'n', 'na', 'name', 'namely', 'nay', 'nd', 'near', 'nearly', 'necessarily', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'ninety', 'no', 'nobody', 'non', 'none', 'nonetheless', 'noone', 'nor', 'normally', 'nos', 'not', 'noted', 'nothing', 'now', 'nowhere', 'o', 'obtain', 'obtained', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'omitted', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'ord', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'owing', 'own', 'p', 'page', 'pages', 'part', 'particular', 'particularly', 'past', 'per', 'perhaps', 'placed', 'please', 'plus', 'poorly', 'possible', 'possibly', 'potentially', 'pp', 'predominantly', 'present', 'previously', 'primarily', 'probably', 'promptly', 'proud', 'provides', 'put', 'q', 'que', 'quickly', 'quite', 'qv', 'r', 'ran', 'rather', 'rd', 're', 'readily', 'really', 'recent', 'recently', 'ref', 'refs', 'regarding', 'regardless', 'regards', 'related', 'relatively', 'research', 'respectively', 'resulted', 'resulting', 'results', 'right', 'run', 's', 'said', 'same', 'saw', 'say', 'saying', 'says', 'sec', 'section', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sent', 'seven', 'several', 'shall', 'she', 'shed', "she'll", 'shes', 'should', "shouldn't", 'show', 'showed', 'shown', 'showns', 'shows', 'significant', 'significantly', 'similar', 'similarly', 'since', 'six', 'slightly', 'so', 'some', 'somebody', 'somehow', 'someone', 'somethan', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specifically', 'specified', 'specify', 'specifying', 'still', 'stop', 'strongly', 'sub', 'substantially', 'successfully', 'such', 'sufficiently', 'suggest', 'sup', 'sure\tt', 'take', 'taken', 'taking', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that'll", 'thats', "that've", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 
             'thered', 'therefore', 'therein', "there'll", 'thereof', 'therere', 'theres', 'thereto', 'thereupon', "there've", 'these', 'they', 'theyd', "they'll", 'theyre', "they've", 'think', 'this', 'those', 'thou', 'though', 'though', 'thousand', 'throug', 'through', 'throughout', 'thru', 'thus', 'til', 'tip', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'ts', 'twice', 'two', 'u', 'un', 'under', 'unfortunately', 'unless', 'unlike', 'unlikely', 'until', 'unto', 'up', 'upon', 'ups', 'us', 'use', 'used', 'useful', 'usefully', 'usefulness', 'uses', 'using', 'usually', 'v', 'value', 'various', "'ve", 'very', 'via', 'viz', 'vol', 'vols', 'vs', 'w', 'want', 'wants', 'was', 'wasnt', 'way', 'we', 'wed', 'welcome', "we'll", 'went', 'were', 'werent', "we've", 'what', 'whatever', "what'll", 'whats', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'wheres', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whim', 'whither', 'who', 'whod', 'whoever', 'whole', "who'll", 'whom', 'whomever', 'whos', 'whose', 'why', 'widely', 'willing', 'wish', 'with', 'within', 'without', 'wont', 'words', 'world', 'would', 'wouldnt', 'www', 'x', 'y', 'yes', 'yet', 'you', 'youd', "you'll", 'your', 'youre', 'yours', 'yourself', 'yourselves', "you've", 'z', 'zero']

        

stopwords_305 = stopwrd1
stopwords_747= list(set(stopwrd1 + stopwords2+ stopwords3 +stopwords4))
stopwords_710= stopwords_747[:710]


In [4]:
stopwords_507=stopwords_747[:int(305+(710-305)/2)]

### SVM Classifier

In [31]:
def  predict(train_features, train_label, test_features): 
    """
    input: features for train and test instances and label for train instances
    
    returns prediction for instance and coefficient matrix and classifier
    
    """
    
    clf = LinearSVC()
    clf.fit(train_features, train_label)
    pred=clf.predict(test_features)
    coef = clf.coef_
    return pred, coef, clf

In [None]:
#NOTE

#using the SVM may create a non convergence warning, which does not affect the output 
#and cannot be fixed by an increase of iterations

### Leave One Out Cross Validation

In [27]:
def get_LOO_instances(dataframe):
    """
    
    generates instances for LOO
    
    input: df
    
    output: list with instances as tuples: [(train_row, test_row)]
    
    """
    loo_instances=[]
    for instance in range(dataframe.shape[0]):
        inst= list(range(dataframe.shape[0]))
        inst.remove(instance)
        test=dataframe.drop(inst)
        train= dataframe.drop([instance])
        loo_instances.append((train, test))
    return loo_instances


In [83]:
#read in given data frame for 507 stop words
df = pd.read_excel("DataFrame_imp.xlsx")
#add Lemmas
old_df = pd.read_excel("DataFrame.xlsx")
df["Lemmas"] = old_df["Lemmas"]

In [89]:
#read in other data frame for other stop words
df = pd.read_excel("DataFrame.xlsx")

In [90]:
df

Unnamed: 0.1,Unnamed: 0,Play_id,Raw Text,POS_305,POS_710,POS_747,POSstops_305,POSstops_710,POSstops_747,Lemmas,Label,Author,Play
0,0,0,"As I remember , Adam , it was upon this fashio...",ADP PRON VERB PUNCT PROPN PUNCT NOUN SPACE VE...,ADP PRON VERB PUNCT PROPN PUNCT VERB NOUN SPA...,ADP PRON VERB PUNCT PROPN PUNCT NOUN SPACE VE...,ADP PRON VERB PUNCT PROPN PUNCT it was upon t...,ADP PRON VERB PUNCT PROPN PUNCT it VERB upon ...,ADP PRON VERB PUNCT PROPN PUNCT it was upon t...,"as -PRON- remember , adam , -PRON- be upon th...",0,E-Shakespeare,asyoulikeit.txt.E-Shakespeare.tok
1,1,1,"Proceed , Solinus , to procure my fall\nAnd by...",PROPN PUNCT PROPN PUNCT VERB NOUN SPACE CCONJ...,PROPN PUNCT PROPN PUNCT VERB NOUN SPACE CCONJ...,PROPN PUNCT PROPN PUNCT VERB NOUN SPACE CCONJ...,PROPN PUNCT PROPN PUNCT to VERB my NOUN SPACE...,PROPN PUNCT PROPN PUNCT to VERB my NOUN SPACE...,PROPN PUNCT PROPN PUNCT to VERB my NOUN SPACE...,"proceed , solinus , to procure -PRON- fall \n...",0,E-Shakespeare,comedy_errors.txt.E-Shakespeare.tok
2,2,2,"Who 's there ?\nNay , answer me : stand , and ...",NOUN VERB PUNCT SPACE PROPN PUNCT VERB PUNCT ...,NOUN VERB PUNCT SPACE PROPN PUNCT VERB PUNCT ...,NOUN VERB PUNCT SPACE PROPN PUNCT VERB PUNCT ...,NOUN VERB there PUNCT SPACE PROPN PUNCT VERB ...,NOUN VERB there PUNCT SPACE PROPN PUNCT VERB ...,NOUN VERB there PUNCT SPACE PROPN PUNCT VERB ...,"who be there ? \n nay , answer -PRON- : stand...",0,E-Shakespeare,Hamlet.txt.E-Shakespeare.tok
3,3,3,"So shaken as we are , so wan with care ,\nFind...",ADV VERB PUNCT NOUN NOUN PUNCT SPACE VERB NOU...,ADV VERB PUNCT NOUN NOUN PUNCT SPACE VERB NOU...,ADV VERB PUNCT NOUN NOUN PUNCT SPACE VERB NOU...,ADV VERB as we are PUNCT so NOUN with NOUN PU...,ADV VERB as we are PUNCT so NOUN with NOUN PU...,ADV VERB as we are PUNCT so NOUN with NOUN PU...,"so shake as -PRON- be , so wan with care , \n...",0,E-Shakespeare,henryivPart1.txt.E-Shakespeare.tok
4,4,4,Open your ears ; for which of you will stop\nT...,VERB NOUN PUNCT VERB SPACE DET NOUN VERB ADJ ...,VERB NOUN PUNCT SPACE DET NOUN VERB ADJ PROPN...,VERB NOUN PUNCT SPACE DET NOUN VERB ADJ PROPN...,VERB your NOUN PUNCT for which of you will VE...,VERB your NOUN PUNCT for which of you will st...,VERB your NOUN PUNCT for which of you will st...,open -PRON- ear ; for which of -PRON- will st...,0,E-Shakespeare,henryivPart2.txt.E-Shakespeare.tok
5,5,5,"O for a Muse of fire , that would ascend\nThe ...",INTJ PROPN NOUN PUNCT VERB SPACE DET ADJ PROP...,INTJ PROPN NOUN PUNCT VERB SPACE DET ADJ PROP...,INTJ PROPN NOUN PUNCT VERB SPACE DET ADJ PROP...,INTJ for a PROPN of NOUN PUNCT that would VER...,INTJ for a PROPN of NOUN PUNCT that would VER...,INTJ for a PROPN of NOUN PUNCT that would VER...,"o for a muse of fire , that would ascend \n t...",0,E-Shakespeare,henryv.txt.E-Shakespeare.tok
6,6,6,"Hung be the heavens with black , yield day to ...",PROPN NOUN ADJ PUNCT NOUN NOUN NOUN PUNCT SPA...,PROPN NOUN ADJ PUNCT NOUN NOUN NOUN PUNCT SPA...,PROPN NOUN ADJ PUNCT NOUN NOUN NOUN PUNCT SPA...,PROPN be the NOUN with ADJ PUNCT NOUN NOUN to...,PROPN be the NOUN with ADJ PUNCT NOUN NOUN to...,PROPN be the NOUN with ADJ PUNCT NOUN NOUN to...,"hung be the heaven with black , yield day to ...",0,E-Shakespeare,henryviPart1.txt.E-Shakespeare.tok
7,7,7,As by your high imperial majesty\nI had in cha...,ADP ADJ ADJ NOUN SPACE PRON NOUN NOUN PROPN P...,ADP ADJ ADJ NOUN SPACE PRON NOUN ADP NOUN PRO...,ADP ADJ ADJ NOUN SPACE PRON NOUN NOUN PROPN P...,ADP by your ADJ ADJ NOUN SPACE PRON had in NO...,ADP by your ADJ ADJ NOUN SPACE PRON had in NO...,ADP by your ADJ ADJ NOUN SPACE PRON had in NO...,as by -PRON- high imperial majesty \n -PRON- ...,0,E-Shakespeare,henryviPart2.txt.E-Shakespeare.tok
8,8,8,I wonder how the king escaped our hands .\nWhi...,PRON VERB NOUN VERB NOUN PUNCT SPACE ADP VERB...,PRON VERB NOUN VERB NOUN PUNCT SPACE ADP VERB...,PRON VERB NOUN VERB NOUN PUNCT SPACE ADP VERB...,PRON VERB how the NOUN VERB our NOUN PUNCT SP...,PRON VERB how the NOUN VERB our NOUN PUNCT SP...,PRON VERB how the NOUN VERB our NOUN PUNCT SP...,-PRON- wonder how the king escape -PRON- hand...,0,E-Shakespeare,henryviPart3.txt.E-Shakespeare.tok
9,9,9,"Hence ! home , you idle creatures get you home...",ADV PUNCT ADV PUNCT ADJ NOUN ADV PUNCT SPACE ...,ADV PUNCT PUNCT ADJ NOUN PUNCT SPACE VERB NOU...,ADV PUNCT PUNCT ADJ NOUN PUNCT SPACE VERB NOU...,ADV PUNCT ADV PUNCT you ADJ NOUN get you ADV ...,ADV PUNCT home PUNCT you ADJ NOUN get you hom...,ADV PUNCT home PUNCT you ADJ NOUN get you hom...,"hence ! home , -PRON- idle creature get -PRON...",0,E-Shakespeare,julius_caesar.txt.E-Shakespeare.tok


### Get Basic Features

In [28]:
def get_preds_imp(dataframe):
    
    """
    change function get_features_GM(train_data, test_data) for feature engineering!
    input: Dataframe
    
    output: predictions for test instances, and coefficient matrix, cls
    """
    
    instances= get_LOO_instances(dataframe)
    predictions=[]
    for inst in instances:
        train_data= inst[0]
        test_data = inst[1]
        train_label=train_data["Label"]
        train_features, test_features= get_features_GM_imp(train_data, test_data)
        pred, coeff, cls= predict(train_features, train_label, test_features)
        predictions.append(pred)
        
    return np.array(predictions), coeff, cls


In [91]:

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_305"]
    X_POSstops = X_train[ "POSstops_305"]
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_305"]
    X_test_POSstops = X_test[ "POSstops_305"]
    
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_305, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii") #ngrams of POS/stops
    
    #cvec1 = CountVectorizer(vocabulary= stopwords_507, strip_accents="ascii")#word freq of stop words
    #cvec2 = CountVectorizer(max_features=1000, strip_accents="ascii") #freq of POS taggs
    #cvec3 = CountVectorizer(ngram_range=(2,2),max_features=1000, strip_accents="ascii") #ngrams of POS/stops 

    #fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
        ))

    
    return train_features, test_features


In [92]:
predictions, coeff, cls =  get_preds_imp(df)





In [93]:
df["Pred_imp"]=predictions

In [94]:
df["acc_imp"] =df["Label"]==df[ "Pred_imp"]
acc_imp=df.loc[df.acc_imp== True, 'acc_imp'].count()/df.shape[0]
print(acc_imp)


0.8181818181818182


In [None]:
#507 stop words
#0.8181818181818182

#305 stop words
#0.8181818181818182

## Feature: Word Frequency

Add simple word frequency to the basic features rather than using it by itself as in the bag of words model.

In [95]:
#modified get_features function including word frequency

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_305"]
    X_POSstops = X_train[ "POSstops_305"]
   
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_305"]
    X_test_POSstops = X_test[ "POSstops_305"]
    
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_305, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii") #ngrams of POS/stops
    
    cvec4 = CountVectorizer(strip_accents="ascii") 

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        cvec4.fit_transform(X_txt).toarray(),
        
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            cvec4.transform(X_test_txt).toarray(),
        ))

    
    return train_features, test_features


In [96]:
predictions_word_freq, coeff, cls =  get_preds_imp(df)





In [97]:
df["Pred_word_freq"]=predictions_word_freq

In [98]:
df["acc_word_freq"] =df["Label"]==df[ "Pred_word_freq"]
acc_word_freq=df.loc[df.acc_word_freq== True, 'acc_word_freq'].count()/df.shape[0]
print(acc_word_freq)


0.8181818181818182


In [None]:
#507 stop words
#0.8181818181818182

#305 stop words
#0.8181818181818182

## Feature: Keyword Extraction

Uses package RAKE (Rapid Automated Keyword Extraction) to extract keywords and key phrases from the text which can then be counted to form a feature vector.

In [101]:
"""Get set out of 1000 highest ranked keywords for each play saved in one single string"""

import RAKE
rake = RAKE.Rake(stopwords_305) #use stopwords with best results from before

all_keywords = []
punctuation = """!"',;:.-?)([]<>*#\n\t\r """

for text in df["Raw Text"]:
    keywords = ""
    text_keywords = rake.run(text.strip(punctuation))
    text_keywords = set([keywords[0] for keywords in sorted(text_keywords, key=lambda word: word[1])[:1000]])
    for keyword in text_keywords:
        keywords += keyword + " "
    
    all_keywords.append(keywords)

all_keywords = np.array(all_keywords)


    

In [102]:
#create column and save in df
df["Keywords"] = all_keywords

In [103]:
#modified get_features function including keywords

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_305"]
    X_POSstops = X_train[ "POSstops_305"]
    X_key = X_train["Keywords"]
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_305"]
    X_test_POSstops = X_test[ "POSstops_305"]
    X_test_key = X_test["Keywords"]
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_305, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii", max_features=1000) #ngrams of POS/stops
    
    cvec4 = CountVectorizer(strip_accents="ascii") #keyword frequency

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        cvec4.fit_transform(X_key).toarray(),
        
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            cvec4.transform(X_test_key).toarray(),
        ))

    
    return train_features, test_features


In [104]:
predictions_key, coeff, cls =  get_preds_imp(df)





In [105]:
df["Pred_keywords"]=predictions_key

In [106]:
df["acc_keywords"] =df["Label"]==df[ "Pred_keywords"]
acc_keywords=df.loc[df.acc_keywords== True, 'acc_keywords'].count()/df.shape[0]
print(acc_keywords)


0.8181818181818182


In [None]:
#507 stop words
#0.8051948051948052

#305 stop words
#0.8181818181818182

## Feature: Frequency of Stemmed Words

Three different nltk stemmers were tested to add frequency of stemmed words as a feature. 

In [107]:
"""Transform given play to a stemmed version and save in df"""

#three different nltk stemmers to be tested
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer

punctuation = """!"',;:.-?)([]<>*#\n\t\r """
all_stemmed_texts = []

lancaster = LancasterStemmer()
porter = PorterStemmer()
snowball = SnowballStemmer("english")

#change this to test different stemmers
#Porter works best (see results below)
stemmer = porter

for text in df["Raw Text"]:
    stemmed_text = ""
    for word in text.split():
        stemmed_text += stemmer.stem(word.strip(punctuation)) + " "
    
    all_stemmed_texts.append(stemmed_text)

all_stemmed_texts = np.array(all_stemmed_texts)

In [108]:
#save Stem column in df
df["Stems"] = all_stemmed_texts

In [109]:
#modified get_features function including stemmed words frequency

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_305"]
    X_POSstops = X_train[ "POSstops_305"]
    X_stem = X_train["Stems"]
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_305"]
    X_test_POSstops = X_test[ "POSstops_305"]
    X_test_stem = X_test["Stems"]
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_305, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii", max_features = 1000) #ngrams of POS/stops 
    
    cvec4 = CountVectorizer()

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        cvec4.fit_transform(X_stem).toarray(),
        
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            cvec4.transform(X_test_stem).toarray(),
        ))

    
    return train_features, test_features


In [110]:
predictions_stem, coeff, cls =  get_preds_imp(df)





In [111]:
df["Pred_stems"]=predictions_stem

In [112]:
df["acc_stems"] =df["Label"]==df[ "Pred_stems"]
acc_stems=df.loc[df.acc_stems== True, 'acc_stems'].count()/df.shape[0]
print(acc_stems)

0.8181818181818182


In [None]:
#507 stop words
#0.8181818181818182

#305 stop words
#0.8181818181818182

In [67]:
df

Unnamed: 0.1,Unnamed: 0,Play_id,Raw Text,POS_new,POSstops_new,Label,Author,Play,Pred_imp,acc_imp,Pred_word_freq,acc_word_freq,Keywords,Pred_keywords,acc_keywords,Stems,Pred_stems,acc_stems
0,0,0,"As I remember , Adam , it was upon this fashio...",ADP PRON VERB PUNCT PROPN PUNCT ADP DET NOUN ...,ADP PRON VERB PUNCT PROPN PUNCT it was ADP DE...,0,E-Shakespeare,asyoulikeit.txt.E-Shakespeare.tok,0,True,0,True,dear rose enemy value avoid nature trust thee ...,0,True,As I rememb adam it wa upon thi fashion bequ...,0,True
1,1,1,"Proceed , Solinus , to procure my fall\nAnd by...",PROPN PUNCT PROPN PUNCT PART VERB ADJ NOUN SP...,PROPN PUNCT PROPN PUNCT PART VERB ADJ NOUN SP...,0,E-Shakespeare,comedy_errors.txt.E-Shakespeare.tok,0,True,0,True,saddler welcome falsehood wench nature marvel ...,0,True,proce solinu to procur my fall and by the do...,0,True
2,2,2,"Who 's there ?\nNay , answer me : stand , and ...",NOUN VERB PUNCT SPACE PROPN PUNCT VERB PUNCT ...,NOUN VERB there PUNCT SPACE PROPN PUNCT VERB ...,0,E-Shakespeare,Hamlet.txt.E-Shakespeare.tok,1,False,1,False,enemy constantly welcome advantage avoid marve...,1,False,who s there nay answer me stand and unfold...,1,False
3,3,3,"So shaken as we are , so wan with care ,\nFind...",ADV VERB VERB PUNCT ADV NOUN ADP NOUN PUNCT S...,ADV VERB as we VERB PUNCT ADV NOUN ADP NOUN P...,0,E-Shakespeare,henryivPart1.txt.E-Shakespeare.tok,0,True,0,True,enemy plant advantage falsehood nature looked ...,0,True,So shaken as we are so wan with care find we...,0,True
4,4,4,Open your ears ; for which of you will stop\nT...,VERB ADJ NOUN PUNCT ADJ PRON VERB SPACE DET N...,VERB ADJ NOUN PUNCT for ADJ of PRON VERB stop...,0,E-Shakespeare,henryivPart2.txt.E-Shakespeare.tok,0,True,0,True,enemy holy old northumberland nature owed look...,0,True,open your ear for which of you will stop the ...,0,True
5,5,5,"O for a Muse of fire , that would ascend\nThe ...",INTJ DET PROPN NOUN PUNCT ADJ VERB SPACE DET ...,INTJ for DET PROPN of NOUN PUNCT ADJ would VE...,0,E-Shakespeare,henryv.txt.E-Shakespeare.tok,0,True,0,True,assume highness claiming admit quickly absence...,0,True,O for a muse of fire that would ascend the br...,0,True
6,6,6,"Hung be the heavens with black , yield day to ...",PROPN NOUN ADP ADJ PUNCT NOUN NOUN ADP NOUN P...,PROPN be the NOUN ADP ADJ PUNCT NOUN NOUN ADP...,0,E-Shakespeare,henryviPart1.txt.E-Shakespeare.tok,0,True,2,False,advantage speech marvel pair idly quickly scat...,2,False,hung be the heaven with black yield day to ni...,2,False
7,7,7,As by your high imperial majesty\nI had in cha...,ADP ADP ADJ ADJ ADJ NOUN SPACE PRON NOUN ADJ ...,ADP ADP ADJ ADJ ADJ NOUN SPACE PRON had in NO...,0,E-Shakespeare,henryviPart2.txt.E-Shakespeare.tok,0,True,0,True,dowry enemy speech welcome avoid pair marvel q...,0,True,As by your high imperi majesti I had in charg ...,0,True
8,8,8,I wonder how the king escaped our hands .\nWhi...,PRON VERB NOUN VERB ADJ NOUN PUNCT SPACE ADP ...,PRON VERB how the NOUN VERB ADJ NOUN PUNCT SP...,0,E-Shakespeare,henryviPart3.txt.E-Shakespeare.tok,2,False,2,False,toss s bird cleft abide fair fell clifford pri...,2,False,I wonder how the king escap our hand while we...,2,False
9,9,9,"Hence ! home , you idle creatures get you home...",ADV PUNCT PUNCT PRON ADJ NOUN PRON PUNCT SPAC...,ADV PUNCT home PUNCT PRON ADJ NOUN get PRON h...,0,E-Shakespeare,julius_caesar.txt.E-Shakespeare.tok,0,True,1,False,months welcome avoid nature o cicero threw con...,0,True,henc home you idl creatur get you home Is t...,0,True


## Feature: Line Length

A structural feature representing the frequency of all occurring line lengths.
Seems to make the classification a lot worse.

In [68]:
#save lengths as list

all_linelengths = []
max_line_length = max([len(text.split('\n')) for text in df["Raw Text"]])

for text in df["Raw Text"]:
    line_lengths_list = []
    
    for line in text.split('\n'):
        line_lengths_list.append(len(line))
    for remain_lines in range(max_line_length - len(line_lengths_list)):
        line_lengths_list.append(0)
    
    all_linelengths.append(line_lengths_list)
    

In [69]:
df["Line Length"] = all_linelengths

In [70]:
#modified get_features function including line lengths

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_new"]
    X_POSstops = X_train[ "POSstops_new"]
    X_lines = list(X_train["Line Length"])
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_new"]
    X_test_POSstops = X_test[ "POSstops_new"]
    X_test_lines = list(X_test["Line Length"])
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_507, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii", max_features = 1000) #ngrams of POS/stops 

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        np.asarray(X_lines),
        
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            np.asarray(X_test_lines),
        ))

    
    return train_features, test_features


In [71]:
predictions_lines, coeff, cls =  get_preds_imp(df)





In [72]:
df["Pred_lines"]=predictions_lines

In [73]:
df["acc_lines"] =df["Label"]==df[ "Pred_lines"]
acc_lines=df.loc[df.acc_lines== True, 'acc_lines'].count()/df.shape[0]
print(acc_lines)

0.6883116883116883


## Feature: Sentence Length

In [74]:
from nltk.tokenize.punkt import PunktSentenceTokenizer

tok = PunktSentenceTokenizer()
all_sent_lengths = []
max_sent_length = max([len(tok.sentences_from_text(text)) for text in df["Raw Text"]])

                       
for text in df["Raw Text"]:
    sent_lengths = []
    for sent in tok.sentences_from_text(text):
        sent_lengths.append(len(sent))
    for remain_sents in range(max_sent_length - len(sent_lengths)):
        sent_lengths.append(0)  
    
    all_sent_lengths.append(sent_lengths)


In [75]:
df["Sent Length"] = all_sent_lengths

In [76]:
#modified get_features function including line lengths

def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_new"]
    X_POSstops = X_train[ "POSstops_new"]
    X_sent = list(X_train["Sent Length"])
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_new"]
    X_test_POSstops = X_test[ "POSstops_new"]
    X_test_sent = list(X_test["Sent Length"])
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_507, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii") #ngrams of POS/stops 

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        np.asarray(X_sent),
        
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            np.asarray(X_test_sent),
        ))

    
    return train_features, test_features


In [77]:
predictions_sent, coeff, cls =  get_preds_imp(df)

In [79]:
df["Pred_sent"]=predictions_sent

In [80]:
df["acc_sent"] =df["Label"]==df[ "Pred_sent"]
acc_sent=df.loc[df.acc_sent== True, 'acc_sent'].count()/df.shape[0]
print(acc_sent)

0.44155844155844154


## Feature: Frequency of Lemmatized Words

Lemmas of words are already to be found in the df (generated with spacy).

In [113]:
#modified get_features function including lemma frequency


def get_features_GM_imp(X_train, X_test):
    
    """
    Input: data used to get model (created with the LOO function), both for testing and training
    output: features for train and test instances 
    """
    
    #getting the columns I want to use for my features from the instances passed
    X_txt = X_train["Raw Text"]
    X_POS = X_train["POS_305"]
    X_POSstops = X_train[ "POSstops_305"]
    X_lemmas = X_train["Lemmas"]
    
    X_test_txt= X_test["Raw Text"]
    X_test_POS = X_test["POS_305"]
    X_test_POSstops = X_test[ "POSstops_305"]
    X_test_lemmas = X_test["Lemmas"]
    
    # using countvectorizors to get freqs 
    cvec1 = CountVectorizer(vocabulary = stopwords_305, strip_accents="ascii")#word freq of stop words
    cvec2 = CountVectorizer() #freq of POS tags
    
    #add max_feature=1000 to get different results
    cvec3 = CountVectorizer(ngram_range=(2,2), strip_accents="ascii", max_features = 1000) #ngrams of POS/stops 
    
    cvec4 = CountVectorizer(strip_accents="ascii")

   # fitting  and transforming of train data, and stack vectors at the same time to get X for model
    train_features= np.hstack((
        cvec1.fit_transform(X_txt).toarray(),
        cvec2.fit_transform(X_POS).toarray(),
        cvec3.fit_transform(X_POSstops).toarray(),
        cvec4.fit_transform(X_lemmas).toarray(),
        
        ))
    
    #only fit X_test data
    test_features = np.hstack((
            cvec1.transform(X_test_txt).toarray(),
            cvec2.transform(X_test_POS).toarray(),
            cvec3.transform(X_test_POSstops).toarray(),
            cvec4.transform(X_test_lemmas).toarray(),
        ))

    
    return train_features, test_features


In [114]:
predictions_lemmas, coeff, cls =  get_preds_imp(df)





In [115]:
df["Pred_lemmas"]=predictions_lemmas

In [116]:
df["acc_lemmas"] =df["Label"]==df[ "Pred_lemmas"]
acc_lemmas=df.loc[df.acc_lemmas== True, 'acc_lemmas'].count()/df.shape[0]
print(acc_lemmas)

0.8181818181818182


In [None]:
#507 stop words
#0.8051948051948052

#305 stop words
#0.8181818181818182