In [1]:
import pandas as pd
import re
import numpy as np
import nltk
import spacy

nlp = spacy.load("en_core_web_lg")
pd.set_option('display.max_colwidth', None)

#I left this script as a Jupyter notebook to showcase results in case there are setup issues.
#Please see the readme for setup assistance.

In [3]:
"""
Assuming a large text input, this spacy pipeline will convert the text into a dataframe split by paragraph and sentence.
Those two elements are broken out to support rhetoric tagging.   

Input: String of text of 1+ paragraphs - note: it currently cannot handle large datasets.
Output: Pandas dataframe where each row is a sentence indexed by paragraph number and sentence number.
        
"""
#Input: Large text string
def splitParagraphSentences(text):
    text = str(text).replace('"', "'") #Replaces all double-quotes with single (double quotes later used to support df creation)
    dataPara = text.split('\n\n') #split out each section by paragraph. Assumes there is
    
    #Each value to use is put in list form to generate the dataframe all at once (rather than append iteratively).
        #This will help us keep runtime down
    sentList = [] 
    p = 1

    for paragraph in dataPara:
        doc = nlp(paragraph)
        s= 1
        for sentence in doc.sents:
            #Add double-quotes around the record (otherwise, pandas will convert this to a column and treat it as a list of words)
            sentenceQuotes = f'"{sentence}"'
            sentenceQuotesLower = sentenceQuotes.lower() #Given all-caps is an emphasizer, we'll retain it.
            
            posTags = [] #POS tag list for each individual sentence (will attempt to recapture before/after structure later)
            for token in sentence:
                posTags.append(token.tag_)
            
            sentList.append([p,s,sentenceQuotes, sentenceQuotesLower, posTags])
            s+=1
        p+=1 

    #Each value to use is put in list form to generate the dataframe all at once (rather than append iteratively)
    sentencedf = pd.DataFrame(sentList, columns=['paragraphNo', 'sentenceNo', 'sentence', 'sentenceLower', 'posTags'])

    sentencedf['sentenceLower'] = sentencedf['sentenceLower'].str.strip('"')

    return sentencedf

 
"""
To accomodate for infomation lost by breaking paragraphs out by sentence, this function will join the sentence before/after
into the list with the goal of finding overlap between sentences.

"""
def findSentenceBeforeandAfter(df):
    #Simple +1 -1 to the index to allow the merge to itself. These columns will always be on the right side of the join.
    df['sentenceNoP1'] =  df["sentenceNo"] + 1
    df['sentenceNoM1'] =  df["sentenceNo"] - 1
    
    #Base table on left. Pandas gives "_x" and "_y" to column names when there are duplicates. 
    #It was easier to just rename them at the end than to figure out how to work around this.
    df1 = df.merge(df, left_on=['paragraphNo','sentenceNo'], right_on=['paragraphNo','sentenceNoP1'], how='left').drop(columns = ['sentenceNoP1_x', 'sentenceNoM1_x', 'sentenceNo_y', 'sentenceNoP1_y', 'sentenceNoM1_y'] )
    df2 = df1.merge(df, left_on=['paragraphNo','sentenceNo_x'], right_on=['paragraphNo','sentenceNoM1'], how='left').drop(columns = ['sentenceNo', 'sentenceNoP1', 'sentenceNoM1', 'sentence', 'sentence_y'] )
    
    #Cleanup after the merge. Note - we keep the lowcase before/after rather than original
    dfClean = df2.rename({'sentenceNo_x': 'sentenceNo','sentence_x':'sentenceRaw', 'sentenceLower_x':'sentenceLower',
                          'posTags_x':'sentencePOS',  'sentenceLower': 'sentenceAfter', 'posTags':'sentenceAfterPOS',
                          'sentenceLower_y':'sentenceBefore', 'posTags_y':'sentenceBeforePOS'}, axis=1) 

    return dfClean



In [4]:
"""
Helper functions for sentence transformations or comparing parts of sentences.
"""

#Custom stopwords list from a Github page that is no longer maintained:
#https://tm4ss.github.io/docs/Tutorial_1_Read_textdata.html
stopwords = []
with open("stopwords_en.txt", "r") as f:
    for line in f:
        stopwords.append(str(line.strip()))

def removeStopwords(sentence):
    #From: https://stackoverflow.com/questions/25346058/removing-list-of-words-from-a-string
    sentenceWords = sentence.split()
    resultwords  = [word for word in sentenceWords if word.lower() not in stopwords]
    result = ' '.join(resultwords)
    return result

#Function for checking duplicate records within a list.
def checkListDuplicates(textList):
    #The sentences had repetition in structure and was likely a list.
    wordSetList = []
    for item in textList:
        item = tuple(item)
        wordSetList.append(item) 
        
    #Compare list of pos tags to one that removes duplicates - if sizes differ, then
    if len(textList) != len(set(wordSetList)):
        return 1
    else:
        return 0
    
def removeMostPunctuation(sentence):
#Retain commas to help with the split() function
     sentence = sentence.replace('.','').replace('?','').replace('!','')
     return sentence

    
#Takes in a list and returns a list with 
def getFirstXWordsinListItems(sentenceGroups, x):
    tempText = []
    for section in sentenceGroups:
        text = section.split()[:x]
        tempText.append(text)
    return tempText
    
def getLastWordinListItems(sentenceGroups):
    tempText = []
    for section in sentenceGroups:
        text = section.split()[-1]
        tempText.append(text)
    return tempText

def removeListDuplicates(temp):
    textList = []
    [textList.append(x) for x in temp if x not in textList]
    return textList

In [5]:
"""
Functions dedicated to finding specific rhetorical devices :

Definitions from "A Handbook of Rhetorical Devices" by Robert A. Harris.

1) Sentential Adverb: A single word or short phrase, usually interrupting normal syntax,
used to lend emphasis to words immediately proximate to the adverb.

2) Onomatopoeia: 

3) Parallelism: Recurrent syntactical similarity.

4) Litotes: A particular form of understatement, is generated by denying the opposite or contrary
of the word which otherwise would be used.

5) Anaphora: Repition of the same word or words at the beginning of successive phrases, clauses, or sentences,
commonly in conjunction with climax and with parallelism.

6) Epistrophe/Antistrophe: Forms the counterpart to anaphora, because the repititionof the same word or words comes
at the end of the successive phrases, clauses, or sentences.

"""

#Sentential Adverb:  emphasize words immediately around an adverb.
def find_SententialAdverb(x, sentenceCol, sentencePOS):
    #low hanging fruit search terms
    search = []
    with open("sentential_adverb.txt", "r") as f:
        for line in f:
            search.append(str(line.strip()))

    found = 0 #regex match 
    
    #Low hanging fruit, check within list of common examples
    for w in search:
        if w in x[sentenceCol]:
            found = 1
            break
    return found

def find_Onomatopoeia (x, sentenceCol):
#Simple onomatopoeia lists found below:
    #https://kathytemean.wordpress.com/2009/12/29/onomatopoeia-word-list/
    #https://en.wikipedia.org/wiki/List_of_onomatopoeias

    search = [] #Load in list and append.
    with open("onomatopoeia_list.txt", "r") as f:
        for line in f:
            search.append(str(line.strip()))
            
    found = 0 #regex match 
    for w in search:
        if w in x[sentenceCol]:
            found = 1
            break   

    return found

#Will identify grammatical lists with duplicating PoS within one sentence as well as prior/next sentences.

def find_Parallel_Structure(x, sentenceCol, sentencePOS, sentenceBeforePOS, sentenceAfterPOS):
    found = 0
    
    #Search for parallelism between the prior and next sentences (within the same paragraph)
    if (x[sentencePOS] == x[sentenceBeforePOS]) or (x[sentencePOS] == x[sentenceAfterPOS]):
        found = 1
    
    #Take the individual sentence record to look within for any grammatical lists or parallel structure
    sentence = x[sentenceCol]
    sentence = removeMostPunctuation(sentence) #Punctuation will mess with PoS Tags
    
    #Look for parallelism based on PoS reptition
    sentenceGroups = sentence.split(',') 
    posTags = []
    for section in sentenceGroups:
        text = nlp(section)
        posTemp = [] #temp list to maintain the separation of sections among the sentences
        for token in text:
            posTemp.append(token.tag_)
        posTags.append(posTemp)
            
    #The sentences had repetition in structure and was likely a list.
    posSetList = []
    for item in posTags:
        item = tuple(item)
        posSetList.append(item) 
        
    #Compare list of pos tags to one that removes duplicates - if sizes differ, then
    if len(posTags) != len(set(posSetList)):
        found = 1    
    
    return found


def find_Litotes(x, sentenceCol):
    found = 0
    
    position = 0
    sentence = x[sentenceCol]
    length = len(sentence)-1
    
    search = [" no "," did not "," not " ," isn't ", " isnt ", "didnt", "didn't"] 
    search = [w for w in search]

    #If a word matches on the list, ccheck the next word with part of speech
    for w in search:
        if w in sentence:
            posNot = 0
            posSpace = 0
            PosEnd = 0
            nextWord = ''
            
            #Find the word following no/isnt/didnt
            posNot = sentence.find(w, 0, length-1) 
            posSpace = sentence.find(' ', posNot+1,length) 
            PosEnd = sentence.find(' ', posSpace+1,length)
            
            nextWord = sentence[posSpace:PosEnd].strip() 
            
            text = nlp(nextWord) #Get PoS of the following word
            
            posNextWord = []
            for token in text:
                posNextWord.append(token.tag_)

            #e.g., "It was *no small* accomplishment"; "She was *not unhappy*"
            if (w == " no " or w == " not ")  and posNextWord == ['JJ']:
                found = 1
                return found
                break
            #e.g., "Staying up late did *not do* your work any favors.
            if (w == " not " or w == "didnt" or w == "didn't") and posNextWord == ['VB']:
                found = 1
                return found
                break
    if found != 1:
        found = 0
        return found
    
#Requires a minimum of two words repeating per clause or sentence
def find_Anaphora(x, sentenceCol, sentenceBefore, sentenceAfter):
    found = 0
    textList = []
    
    #Take first two words of each part of sentences (broken out by commas)
    sentence = x[sentenceCol]
    sentence = removeMostPunctuation(sentence)
    
    sentenceGroups = sentence.split(',') 
    
    #Look for instances of the same first two words used in parts of sentences
    temp = getFirstXWordsinListItems(sentenceGroups, 2)
    textList = textList + temp
    
    
    #Repeat function for sentenceBefore
    sentence = str(x[sentenceBefore]) #convert to string in case it is null
    sentence = removeMostPunctuation(sentence)
    sentenceGroups = sentence.split(',') 
    
    temp = getFirstXWordsinListItems(sentenceGroups, 2)
    textList = textList + temp
    
    #Repeat function for sentenceAfter
    sentence = str(x[sentenceAfter]) #convert to string in case it is null
    sentence = removeMostPunctuation(sentence)
    sentenceGroups = sentence.split(',') 
    
    temp = getFirstXWordsinListItems(sentenceGroups, 2)
    textList = textList + temp
    
    #The sentences had repetition in structure and was likely a list.
    wordSetList = []
    for item in textList:
        item = tuple(item)
        wordSetList.append(item) 
        
    #Compare list of pos tags to one that removes duplicates - if sizes differ, then
    if len(textList) != len(set(wordSetList)):
        found = 1    
    
    return found  

def find_Epistrophe(x, sentenceCol, sentenceBefore, sentenceAfter):
    found = 0
    textList = []
    
    #Take first two words of each part of sentences (broken out by commas)
    sentence = x[sentenceCol]
    sentence = removeMostPunctuation(sentence)
    
    sentenceGroups = sentence.split(',') 
    
    #Look for instances of the same first two words used in parts of sentences
    temp = getLastWordinListItems(sentenceGroups)
    textList = textList + temp
    
    #Repeat function for sentenceBefore
    sentence = str(x[sentenceBefore]) #convert to string in case it is null
    sentence = removeMostPunctuation(sentence)
    sentenceGroups = sentence.split(',') 
    
    temp = getLastWordinListItems(sentenceGroups)
    textList = textList + temp
    
    #Repeat function for sentenceAfter
    sentence = str(x[sentenceAfter]) #convert to string in case it is null
    sentence = removeMostPunctuation(sentence)
    sentenceGroups = sentence.split(',') 
    
    temp = getLastWordinListItems(sentenceGroups)
    textList = textList + temp
    

    #The sentences had repetition in structure and was likely a list.
    wordSetList = []
    for item in textList:
        item = tuple(item)
        wordSetList.append(item) 

    #Compare list of pos tags to one that removes duplicates - if sizes differ, then
    if len(textList) != len(set(wordSetList)):
        found = 1    
    
    return found 



In [20]:
"""
This section demonstrates the functions with some sample sentences found in the aforementioned "A Handbook of Rhetorical Devices" by Robert A. Harris.
"""

text= 'But the lake was not, in fact, drained before April. He pursues his way, and swims, or sinks, or wades, or creeps, or flies. Heat waves are not rare in the summer. To think on death it is a misery, to think on life it is a vanity. To think on the world verily it is, to think that here man hath no perfect bliss. \n\n And all the night he did nothing by weep Philoclea, sigh Philoclea, and cry out Philoclea. Pleasure might cause her read, reading might make her know, knowledge might pity win, and pity grace obtain. '

#Clean up the text data and return a pandas dataframe with one row per sentence. 
text.encode('ascii', errors='ignore')
text = splitParagraphSentences(text)
df = findSentenceBeforeandAfter(text)

#Pandas apply functions that apply at the row-level. 
df['hasSententialAdverb'] = df.apply(lambda x: find_SententialAdverb(x, 'sentenceLower', 'sentencePOS'), axis = 1)
df['hasOnomatopoeia'] = df.apply(lambda x: find_Onomatopoeia(x, 'sentenceLower'), axis = 1)
df['hasParallelStructures'] = df.apply(lambda x: find_Parallel_Structure(x, 'sentenceLower','sentencePOS', 'sentenceBeforePOS', 'sentenceAfterPOS'), axis = 1)
df['hasLitotes'] = df.apply(lambda x: find_Litotes(x, 'sentenceLower'), axis = 1)
df['hasAnaphora'] = df.apply(lambda x :find_Anaphora(x, 'sentenceLower', 'sentenceBefore', 'sentenceAfter'), axis = 1)
df['hasEpistrophe'] = df.apply(lambda x :find_Epistrophe(x, 'sentenceLower', 'sentenceBefore', 'sentenceAfter'), axis = 1)


#Show results
df.head(10)

Unnamed: 0,paragraphNo,sentenceNo,sentenceRaw,sentenceLower,sentencePOS,sentenceBefore,sentenceBeforePOS,sentenceAfter,sentenceAfterPOS,hasSententialAdverb,hasOnomatopoeia,hasParallelStructures,hasLitotes,hasAnaphora,hasEpistrophe
0,1,1,"""But the lake was not, in fact, drained before April.""","but the lake was not, in fact, drained before april.","[CC, DT, NN, VBD, RB, ,, IN, NN, ,, VBN, IN, NNP, .]",,,"he pursues his way, and swims, or sinks, or wades, or creeps, or flies.","[PRP, VBZ, PRP$, NN, ,, CC, VB, ,, CC, NNS, ,, CC, NNS, ,, CC, NNS, ,, CC, NNS, .]",1,1,0,0,0,0
1,1,2,"""He pursues his way, and swims, or sinks, or wades, or creeps, or flies.""","he pursues his way, and swims, or sinks, or wades, or creeps, or flies.","[PRP, VBZ, PRP$, NN, ,, CC, VB, ,, CC, NNS, ,, CC, NNS, ,, CC, NNS, ,, CC, NNS, .]","but the lake was not, in fact, drained before april.","[CC, DT, NN, VBD, RB, ,, IN, NN, ,, VBN, IN, NNP, .]",heat waves are not rare in the summer.,"[NN, NNS, VBP, RB, JJ, IN, DT, NN, .]",0,0,1,0,0,0
2,1,3,"""Heat waves are not rare in the summer.""",heat waves are not rare in the summer.,"[NN, NNS, VBP, RB, JJ, IN, DT, NN, .]","he pursues his way, and swims, or sinks, or wades, or creeps, or flies.","[PRP, VBZ, PRP$, NN, ,, CC, VB, ,, CC, NNS, ,, CC, NNS, ,, CC, NNS, ,, CC, NNS, .]","to think on death it is a misery, to think on life it is a vanity.","[TO, VB, IN, NN, PRP, VBZ, DT, NN, ,, TO, VB, IN, NN, PRP, VBZ, DT, NN, .]",0,0,0,1,1,0
3,1,4,"""To think on death it is a misery, to think on life it is a vanity.""","to think on death it is a misery, to think on life it is a vanity.","[TO, VB, IN, NN, PRP, VBZ, DT, NN, ,, TO, VB, IN, NN, PRP, VBZ, DT, NN, .]",heat waves are not rare in the summer.,"[NN, NNS, VBP, RB, JJ, IN, DT, NN, .]","to think on the world verily it is, to think that here man hath no perfect bliss.","[TO, VB, IN, DT, NN, RB, PRP, VBZ, ,, TO, VB, IN, RB, NN, VBP, DT, JJ, NN, .]",0,0,0,0,1,0
4,1,5,"""To think on the world verily it is, to think that here man hath no perfect bliss.""","to think on the world verily it is, to think that here man hath no perfect bliss.","[TO, VB, IN, DT, NN, RB, PRP, VBZ, ,, TO, VB, IN, RB, NN, VBP, DT, JJ, NN, .]","to think on death it is a misery, to think on life it is a vanity.","[TO, VB, IN, NN, PRP, VBZ, DT, NN, ,, TO, VB, IN, NN, PRP, VBZ, DT, NN, .]",,,0,0,0,1,1,0
5,2,1,""" And all the night he did nothing by weep Philoclea, sigh Philoclea, and cry out Philoclea.""","and all the night he did nothing by weep philoclea, sigh philoclea, and cry out philoclea.","[_SP, CC, PDT, DT, NN, PRP, VBD, NN, IN, VB, NNP, ,, NNP, NNP, ,, CC, VB, RP, NNP, .]",,,"pleasure might cause her read, reading might make her know, knowledge might pity win, and pity grace obtain.","[NN, MD, VB, PRP, VB, ,, VBG, MD, VB, PRP, VB, ,, NN, MD, VB, NN, ,, CC, NN, NN, VB, .]",0,0,0,0,0,1
6,2,2,"""Pleasure might cause her read, reading might make her know, knowledge might pity win, and pity grace obtain.""","pleasure might cause her read, reading might make her know, knowledge might pity win, and pity grace obtain.","[NN, MD, VB, PRP, VB, ,, VBG, MD, VB, PRP, VB, ,, NN, MD, VB, NN, ,, CC, NN, NN, VB, .]","and all the night he did nothing by weep philoclea, sigh philoclea, and cry out philoclea.","[_SP, CC, PDT, DT, NN, PRP, VBD, NN, IN, VB, NNP, ,, NNP, NNP, ,, CC, VB, RP, NNP, .]",,,0,1,0,0,0,1
