# Counting profane oaths present in .txt files and exporting with pandas
This code opens .txt files of plays and counts the occurances of profane oaths in lemma form. Results are stored in pandas dataframes and exorted as .csv files.

In [2]:
# import pandas for dataframes
import pandas as pd
import os

In [10]:
# read the metadata csv file
df = pd.read_csv('Folger_EMED_SHC_metadata_rel_csv.csv')

In [12]:
# create a list of file names which can be later iterated over from a column in the dataframe
fileNamesArray = df['SHC'].to_numpy()

In [793]:
# each play has 5 acts so sum together files in directory that end with .txt and divide by 5. Make float into integer.
numberOfPlays = int(sum(f.endswith('.txt') for f in os.listdir('Broken_SHC_Plays'))/5)

print(numberOfPlays)

223


IN FUTURE what could be done:
Tokenise and find phrases similar to these using Word2Vec I just dunno how lmaoooooo. I'm sorry. Would hopefully find instances of 'gods-precious' as well as 'god precious'.

# REMEMBER V, when you're writing, these things are not arrays in Python but LISTS. Make sure to do a ctrl+f for 'array' in your essay.

In [14]:
# function which counts all profane oaths present in oathDict for selected .txt files
def findProfaneOaths(actNum, dataList):

    # dict to hold count of profane oaths.
    # for profane oaths that form unique words (ie, 'swounds, 'sblood) there is no ' prefacing them.
    # for profane oaths that are spelled the same as other words (ie, 'slight) they are prefaced by a ' to filter out examples of 'slight'
    # used in a non-profane manner.
    # lemmas 
    oathDict = {"acts": actNum, "by god": 0, "by christ": 0, "by jesus": 0, "god sake": 0, "christ sake": 0, "jesus sake": 0, "for the love of god": 0, "for the love of christ": 0,
                "for the love of jesus": 0, "swounds": 0, "god wound": 0, "sblood": 0, "god blood": 0, "sbones": 0, "god bone": 0, "sbody": 0, "god body": 0, "sheart": 0, 
                "god heart": 0, "'snails": 0, "god nail": 0, "sfoot": 0, "god foot": 0, "sfeet": 0, "god feet": 0, "'sarms": 0, "god arm": 0, "sfinger": 0, "god finger": 0,
                "sflesh": 0, "god flesh": 0, "sguts": 0, "god gut": 0, "snostrils": 0, "god nostril": 0, "stongue": 0, "god tongue": 0, "seyes": 0, "god eye": 0, "'slight": 0,
                "god light": 0, "slife": 0, "god life": 0, "sdeath": 0, "god death": 0, "god soul": 0, "sfast": 0, "god fast": 0, "spassion": 0, "god passion": 0, "'smother": 0,
                "god mother": 0, "struth": 0, "god truth": 0, "slid": 0, "god lid": 0, "sdeynes": 0}

    wordCount = 0
    
    for file in fileNamesArray:
        with open(f'Broken_SHC_Plays/{file}.xml_ACT_{actNum}.txt', 'r') as play:
            oathCount = 0
            playy = play.read().lower()
            wordCount += len(playy)
            # the or statement here adds an apostrophe to the search term. So slife and 'slife will be searched for.
            # for instances where other lemmas would be counted without apostrophes, such as slight, the profane oath
            # in oathDict already has an apostrophe included. This for loop then searches for 'slight or ''slight.
            for item in oathDict or "\'" + item in oathDict:
                oathDict[item] += playy.count(item)
                oathCount += playy.count(item)
                
        oathsPerPlay[file].append(oathCount)

    oathDict.update({'totalOaths': sum(oathDict.values())-actNum})
    oathDict.update({'totalWordCount': wordCount})
    dataList.append(oathDict)

            

In [43]:
# the dictionary below was added later to further explore quantity of profane oaths against the file
# rather than examining for profane oaths per act. This is expanded on at the end of the notebook.
oathsPerPlay = {name: [] for name in fileNamesArray}

numOfAct = 1
data = []

while numOfAct < 6:
    findProfaneOaths(numOfAct, data)
    numOfAct += 1
    

Creating a dataframe to hold 'data', dropping columns (profane oaths) which have counts of 0 throughout each act, and exporting.

In [32]:
# data now holds a list of five dictionaries, create dataframe with five rows.
actsAndOathsDF = pd.DataFrame.from_records(data)

# replaces 0 with a null that pandas recognises, then drops columns that are have
# only null values. A more efficient way of filtering out columns that have zeroes only.
actsAndOathsDF = actsAndOathsDF.replace(0, pd.NA).dropna(axis=1, how='all')

# replaces null values with 0
actsAndOathsDF = actsAndOathsDF.replace(pd.NA, 0)

  actsAndOathsDF = actsAndOathsDF.replace(pd.NA, 0)


In [34]:
actsAndOathsDF

Unnamed: 0,acts,by god,by christ,by jesus,god sake,christ sake,jesus sake,for the love of god,for the love of christ,swounds,...,god light,slife,god life,sdeath,god mother,struth,slid,god lid,totalOaths,totalWordCount
0,1,6,0,0,11,0,0,1,0,3,...,1,2,0,4,1,0,25,3,116,3957042
1,2,3,0,0,9,0,0,0,0,1,...,0,1,0,3,0,0,20,0,115,4159550
2,3,2,0,0,19,1,0,1,1,1,...,1,1,0,7,0,0,32,1,140,4276586
3,4,5,0,0,21,0,1,0,0,3,...,2,0,0,7,0,1,28,2,174,4344076
4,5,8,1,1,14,0,0,0,0,1,...,3,2,1,10,0,0,17,0,125,4003675


In [36]:
actsAndOathsDF.to_csv('Numbered_Act_and_Present_Profane_Oaths.csv', index=False)

Creating a dataframe for the other point of examination which is total profane oaths against a cut-down version of the initial metadata file.

In [45]:
# sum all counts and add total to the end
for entry in oathsPerPlay:
    oathsPerPlay[entry].append(sum(oathsPerPlay[entry]))
    print(entry, oathsPerPlay[entry])

158-A06583 [0, 0, 0, 0, 0, 0]
158-A06619 [0, 0, 0, 0, 0, 0]
158-A06621 [0, 1, 0, 1, 1, 3]
158-A06625 [0, 0, 0, 0, 0, 0]
158-A06991_0 [0, 0, 0, 0, 0, 0]
158-A07004_01_0 [0, 1, 0, 0, 0, 1]
158-A07004_02_0 [0, 0, 0, 0, 1, 1]
158-A09220 [1, 0, 0, 0, 0, 1]
159-A02168 [0, 1, 0, 0, 1, 2]
159-A04648 [8, 10, 16, 13, 25, 72]
159-A06589 [0, 0, 0, 0, 0, 0]
159-A06620 [0, 0, 0, 0, 0, 0]
159-A06622 [0, 0, 0, 0, 0, 0]
159-A07063 [0, 4, 5, 0, 2, 11]
159-A11262 [1, 1, 0, 1, 0, 3]
159-A19738 [0, 0, 0, 0, 0, 0]
159-A31675 [0, 0, 0, 0, 0, 0]
160-A01911 [10, 0, 4, 5, 4, 23]
160-A03248 [1, 1, 0, 0, 0, 2]
160-A03255 [0, 0, 0, 0, 0, 0]
160-A04539 [0, 0, 0, 2, 1, 3]
160-A04881 [1, 1, 0, 0, 4, 6]
160-A06177 [1, 1, 0, 5, 0, 7]
160-A06252 [1, 2, 0, 0, 0, 3]
160-A06343 [0, 0, 0, 1, 0, 1]
160-A06458 [2, 0, 1, 0, 0, 3]
160-A06742 [0, 0, 1, 0, 0, 1]
160-A06975 [3, 2, 4, 0, 0, 9]
160-A07064 [1, 2, 0, 3, 0, 6]
160-A07065 [0, 0, 1, 2, 2, 5]
160-A07071 [0, 0, 0, 0, 1, 1]
160-A07077 [0, 0, 0, 0, 2, 2]
160-A07495 [0, 0, 0,

In [49]:
# copy certain columns from the initial dataframe (holding the metadata file) to create new dataframe
playsAndSumOathsDF = df[['PlayCode', 'shortTitle', 'Genre', 'Author', 'TitlePagePrintingDate',
            'UncertainPerformDate', 'performDate', 'Format', 'printingDate', 'SHC']].copy()

# create dataframe using oathsPerPlay, naming the columns for clarity. Index column renamed to 'SHC'
oathsPerPlayDF = pd.DataFrame.from_dict(oathsPerPlay, orient='index', columns = ['Act1', 'Act2', 'Act3', 'Act4', 'Act5', 'OathTotal'])
oathsPerPlayDF.index.name = 'SHC'

# merge playsAndSumOathsDF with oathsPerPlayDF, merging on SHC to results line up with appropriate metadata
playsAndSumOathsDF = playsAndSumOathsDF.merge(oathsPerPlayDF, on='SHC')

In [59]:
playsAndSumOathsDF

Unnamed: 0,PlayCode,shortTitle,Genre,Author,TitlePagePrintingDate,UncertainPerformDate,performDate,Format,printingDate,SHC,Act1,Act2,Act3,Act4,Act5,OathTotal
0,Camp,"Campaspe (Alexander, Campaspe, and Diogenes)",Comedies,John Lyly (Uncertain: No)|,1584,Yes,1583,quarto,1584,158-A06583,0,0,0,0,0,0
1,Gal,Gallathea,Comedies,John Lyly (Uncertain: No)|,1592,Yes,1585,quarto,1592,158-A06619,0,0,0,0,0,0
2,Midas,Midas,Comedies,John Lyly (Uncertain: No)|,1592,No,1589,quarto,1592,158-A06621,0,1,0,1,1,3
3,SaP,Sappho and Phao,Comedies,John Lyly (Uncertain: No)|,1584,Yes,1583,quarto,1584,158-A06625,0,0,0,0,0,0
4,JoM,The Jew of Malta,Tragedies,Christopher Marlowe (Uncertain: No)|,1633,Yes,1589,quarto,1633,158-A06991_0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,Sis,The Sisters,Comedies,James Shirley (Uncertain: No)|,1652,No,1642,octavo,1652,164-A93178,0,0,0,0,0,0
219,QE,The Queen's Exchange (The Royal Exchange),Tragicomedies,Richard Brome (Uncertain: No)|,1657,Yes,1631,quarto,1657,165-A29644,0,1,0,1,4,6
220,NewAc,"The New Academy, or The New Exchange",Comedies,Richard Brome (Uncertain: No)|,1659,Yes,1635,octavo,1658,165-A77567_04,0,0,0,0,0,0
221,QC,The Queen and Concubine,Tragicomedies,Richard Brome (Uncertain: No)|,1659,Yes,1635,octavo,1659,165-A77567_05,0,0,0,0,0,0


In [55]:
# export the metadata to .csv file
playsAndSumOathsDF.to_csv('Plays_Metadata_and_Profane_Oaths_Per_Act.csv', index=False)