In [1]:
# Import 
import os
import pandas as pd
import regex as re
from pathlib import Path
from collections import Counter

In [2]:
# Generate a corpus by loading all the BPO xml files from the chosen directory 
# and list the names of the first 10 xml files 
corpus = os.listdir('suspense/')
corpus[:10]

['3536957.xml',
 '3766672.xml',
 '6147239.xml',
 '6957126.xml',
 '2535187.xml',
 '3463942.xml',
 '8164170.xml',
 '8011510.xml',
 '3279566.xml',
 '4008380.xml']

In [3]:
# Print how many xml files are in the corpus
corpus_length = len(corpus)
print(corpus_length)

2635


In [4]:
# Create an empty dictionary for preparation of the conversion of the xml-file-corpus to a data frame
empty_dictionary = {}

# Loop through the folder of documents to open and read each one
for document in corpus:
    with open('suspense/' + document, 'r', encoding = 'utf-8') as to_open:
         empty_dictionary[document] = to_open.read()

# Populate the data frame with two columns: file name and document text
suspense_texts = (pd.DataFrame.from_dict(empty_dictionary, 
                                       orient = 'index')
                .reset_index().rename(index = str, 
                                      columns = {'index': 'file_name', 0: 'document_text'}))

In [5]:
# show the first 10 lines of the data frame
suspense_texts[:10]


Unnamed: 0,file_name,document_text
0,3536957.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor..."
1,3766672.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor..."
2,6147239.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor..."
3,6957126.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor..."
4,2535187.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor..."
5,3463942.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor..."
6,8164170.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor..."
7,8011510.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor..."
8,3279566.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor..."
9,4008380.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor..."


In [6]:
# controll that the number of lines with text in the data frame equals the number of files in the folder
df_length = len(suspense_texts)
print(df_length)
corpus_length == df_length


2635


True

In [7]:
# Create a new column 'suspense' to store the frequency of the word 'suspense'for each text
suspense_texts['suspense_frequ'] = suspense_texts['document_text'].str.count('suspense')
# show the first 10 lines of the data frame
suspense_texts[:10]

Unnamed: 0,file_name,document_text,suspense_frequ
0,3536957.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2
1,3766672.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1
2,6147239.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1
3,6957126.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1
4,2535187.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1
5,3463942.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1
6,8164170.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1
7,8011510.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1
8,3279566.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",3
9,4008380.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1


In [8]:
# use a reg ex to clean the document text from certain noisy wrong OCR 
suspense_texts['clean_text'] = suspense_texts['document_text'].str.replace("&amp;amp;","")
suspense_texts['clean_text'] = suspense_texts['clean_text'].str.replace("&amp;quot;","")
suspense_texts['clean_text'] = suspense_texts['clean_text'].str.replace("&amp;apos;","")
suspense_texts['clean_text'] = suspense_texts['clean_text'].str.replace("&apos;","")


In [None]:
# show the first 10 lines of the data frame
suspense_texts[:10]

In [20]:
# optional: to controll if every file/line contains xml-tags or one chosen xml-tag, see regular expression

#suspense_texts['suspense_Title_frequ'] = suspense_texts['document_text'].str.count('</Title>')

# optional: to filter for files/lines containing xml-tags
#suspense_title_once = suspense_texts[suspense_texts['suspense_Title_frequ'] == 1]

# optional: controll if there is no file with </Title> more or less than 1 time
#xml-tagged-file_length = len(suspense_title_once)
#xml-tagged-file_length == df_length

In [10]:
# Show the column names of the current state of the data frame
suspense_texts.columns

Index(['file_name', 'document_text', 'suspense_frequ', 'clean_text'], dtype='object')

In [42]:
# optional: to delete a column of a data frame --> if you added a column by accident

#del suspense_texts['suspense_periodical_titel']

2635

In [11]:
# optional: try different regular expressions to extract content from inbetween xml-tags
# with the given xml-tag you extract the titles of the periodicals in the corpus
suspense_texts['document_text'].str.extract(r'(<Title>((.)*)</Title>)')#[1]
# use the index [1] to extract only the one column that contains the content between the xml-tags and not the column with indicated xml-tags

Unnamed: 0,0,1,2
0,<Title>Good words</Title>,Good words,s
1,"<Title>The London journal, and weekly record o...","The London journal, and weekly record of liter...",6
2,<Title>The St. James&apos;s magazine</Title>,The St. James&apos;s magazine,e
3,"<Title>The Contemporary review, 1866-1900</Ti...","The Contemporary review, 1866-1900",0
4,<Title>Belgravia : a London magazine</Title>,Belgravia : a London magazine,e
...,...,...,...
2630,"<Title>The Dublin review, 1836-1910</Title>","The Dublin review, 1836-1910",0
2631,<Title>The Bystander</Title>,The Bystander,r
2632,"<Title>The New monthly magazine, Jan. 1853-Dec...","The New monthly magazine, Jan. 1853-Dec. 1881",1
2633,<Title>The Imperial magazine</Title>,The Imperial magazine,e


In [12]:
# use a reg ex to extract the content between the chosen xml-tags. save the index [1] column only to the data frame
suspense_texts['suspense_journal_title'] = suspense_texts['document_text'].str.extract(r'(<Title>((.)*)</Title>)')[1]
 

In [13]:
# use a reg ex to extract the publication date and store it in a new column of the dataframe
suspense_texts['suspense_journal_date'] = suspense_texts['document_text'].str.extract(r'(<AlphaPubDate>((.)*)</AlphaPubDate>)')[1]


In [14]:
# use a reg ex to extract the pure file text without the xml-tags and metadata
suspense_texts['pure_text'] = suspense_texts['clean_text'].str.extract(r'(<FullText>((.)*)</FullText>)')[1]

In [15]:
# show the cleaned text for controll
suspense_texts['pure_text'][:20]

0     THE WOMANS KINGDOM. obe itore. BY TIHE AUTHOR ...
1     [THiE LADY OF TIHE NIGHT WATCHI.1 IV OMAt A NS...
2     ROYAL FAVOURITES. PART IT. THE term minion  ha...
3     SEVENTEEN YEARS AFTER. I HAVE given this artic...
4     9Xoobua Ianarbio Dauotcr. BY THE AUTHOR OF LAD...
5     THE WRONG ROAD. Bx SA1RAII GRAND. S HE wvas a ...
6     NO NAME. I BY THE AUTHOR OF THE WOMAN IN WHITE...
7     TE MP T A TI ON. Br J. F. SULTH, miSr l  2% WE...
8     BADLESMERE KNOLL ST 2 AUITS OF CAPTAI OUCALD, ...
9     IN VANITY AND VEXATION. A NORTH COUNTRY STORY....
10             IF you want to be in the movement you...
11    THEATRICAL EXAMINER. ainus THBATae. Suhtrmmy, ...
12    ONE OF TWO; OR. A LEFT-HANDED BRIDE. BY SAIN F...
13             AS a testimony to the versatility of ...
14    ECHOES FROM THE UKRAINE. BY ONE WHO HAS COME H...
15    A FAIR FIGHT FOR HER GOOD NAME. CHAPTER XXX. T...
16    FIVE MINUTES TOO LATE.  Miss not the occasion;...
17    LTU5 aUrt CLOUD OF SIRROW.] A DOUBLE vow. 

In [16]:
# show the first 10 lines of the data frame
suspense_texts[:10]

Unnamed: 0,file_name,document_text,suspense_frequ,clean_text,suspense_journal_title,suspense_journal_date,pure_text
0,3536957.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Good words,Nov 1868,THE WOMANS KINGDOM. obe itore. BY TIHE AUTHOR ...
1,3766672.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...","The London journal, and weekly record of liter...","Jun 19, 1869",[THiE LADY OF TIHE NIGHT WATCHI.1 IV OMAt A NS...
2,6147239.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",The St. James&apos;s magazine,Mar 1863,ROYAL FAVOURITES. PART IT. THE term minion ha...
3,6957126.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...","The Contemporary review, 1866-1900",Nov 1882,SEVENTEEN YEARS AFTER. I HAVE given this artic...
4,2535187.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Belgravia : a London magazine,Aug 1876,9Xoobua Ianarbio Dauotcr. BY THE AUTHOR OF LAD...
5,3463942.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",The English illustrated magazine,Dec 1895,THE WRONG ROAD. Bx SA1RAII GRAND. S HE wvas a ...
6,8164170.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",All the year round,"Aug 30, 1862",NO NAME. I BY THE AUTHOR OF THE WOMAN IN WHITE...
7,8011510.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...","The London reader : of literature, science, ar...","Jan 20, 1866","TE MP T A TI ON. Br J. F. SULTH, miSr l 2% WE..."
8,3279566.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",3,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Bow bells : a magazine of general literature a...,"Oct 28, 1868","BADLESMERE KNOLL ST 2 AUITS OF CAPTAI OUCALD, ..."
9,4008380.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",1,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Quiver,Jan 1881,IN VANITY AND VEXATION. A NORTH COUNTRY STORY....


In [28]:
# use reg ex to extract the publication year only of the indicated publication date that can include day and month as additional metadata 
# indicate that the extracted year is stored as type: integer for the following steps
suspense_texts['suspense_journal_year'] = suspense_texts['suspense_journal_date'].str.extract(r'(\d{4})')
suspense_texts['suspense_journal_year'] = suspense_texts['suspense_journal_year'].astype(int) 
print(type(suspense_texts['suspense_journal_year']))

<class 'pandas.core.series.Series'>


In [27]:
# optional: show and controll the current state of the dataframe before you store it in a csv-file in the next step
#suspense_texts[:15]

In [19]:
# store the dataframe in a new directory
filepath = Path('select/out_all_bpo_texts_as_df.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
suspense_texts.to_csv(filepath) 

Here starts the filtering of all bpo files to get only those interesting to our case study.

In [20]:
# build a subset out of all the documents that contain the word "suspense" at least 2 times.
suspense_subset = suspense_texts[suspense_texts['suspense_frequ'] > 1]
suspense_subset[:10]

Unnamed: 0,file_name,document_text,suspense_frequ,clean_text,suspense_journal_title,suspense_journal_date,pure_text,suspense_journal_year
0,3536957.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Good words,Nov 1868,THE WOMANS KINGDOM. obe itore. BY TIHE AUTHOR ...,1868
8,3279566.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",3,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Bow bells : a magazine of general literature a...,"Oct 28, 1868","BADLESMERE KNOLL ST 2 AUITS OF CAPTAI OUCALD, ...",1868
23,2633758.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...","Fraser&apos;s magazine for town and country, 1...",Nov 1857,INTEIRPRETER. THE aMaled 4z War* BY G. J. WHY...,1857
26,3057421.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...","The Argosy : a magazine of tales, travels, ess...",Dec 1888,LADY TREVELYAN. A LONG a country road a young ...,1888
27,3485960.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Every week : a journal of entertaining literature,"Dec 20, 1886",GOLDEN FETTERS. CHAPTER IX. Th E day following...,1886
31,2700423.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Duffy&apos;s Hibernian magazine : a monthly jo...,Oct 1862,"LILLIE BROWNE. r.y hutu murkay, author of the ...",1862
35,8008492.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",3,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...","The London reader : of literature, science, ar...","Mar 19, 1887",WIUTZ TO-DAY MDX ASK THIN TO COMB I AlID HAKOL...,1887
62,1692948508.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",The Graphic,"Dec 25, 1889",It was more than enough for one of th...,1889
75,6814194.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Temple bar : a London magazine for town and co...,Sep 1878,TI)e first Viollit. BOOK V. FRlIEDHELMS STORY....,1878
85,6946823.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",The Modern review : a quarterly magazine,Jan 1884,Mr. Percy Gregs Without God. * HE expectation...,1884


In [21]:
# show the number of files containing the word "suspense" ≤ 2 times.
len(suspense_subset)
# 368 of the 2635 documents contain the word "suspense" ≤ 2 times.

368

In [22]:
#print the column names of the subset
suspense_subset.columns


Index(['file_name', 'document_text', 'suspense_frequ', 'clean_text',
       'suspense_journal_title', 'suspense_journal_date', 'pure_text',
       'suspense_journal_year'],
      dtype='object')

In [29]:
# create a new subset of "suspense"-containing texts by filtering by the year of publication between 1800 and 1900.
suspense_19_subset = suspense_texts[(suspense_texts['suspense_journal_year'] > 1799) & (suspense_texts['suspense_journal_year'] < 1901) & (suspense_texts['suspense_frequ'] > 1)]
# this is a 19th century subset of bpo files that contain the word "suspense" at least 2 times.


In [30]:
# show and controll the current state of the dataframe before you store it in a csv-file in the next step
suspense_19_subset[:10]

Unnamed: 0,file_name,document_text,suspense_frequ,clean_text,suspense_journal_title,suspense_journal_date,pure_text,suspense_journal_year
0,3536957.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Good words,Nov 1868,THE WOMANS KINGDOM. obe itore. BY TIHE AUTHOR ...,1868
8,3279566.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",3,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Bow bells : a magazine of general literature a...,"Oct 28, 1868","BADLESMERE KNOLL ST 2 AUITS OF CAPTAI OUCALD, ...",1868
23,2633758.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...","Fraser&apos;s magazine for town and country, 1...",Nov 1857,INTEIRPRETER. THE aMaled 4z War* BY G. J. WHY...,1857
26,3057421.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...","The Argosy : a magazine of tales, travels, ess...",Dec 1888,LADY TREVELYAN. A LONG a country road a young ...,1888
27,3485960.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Every week : a journal of entertaining literature,"Dec 20, 1886",GOLDEN FETTERS. CHAPTER IX. Th E day following...,1886
31,2700423.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Duffy&apos;s Hibernian magazine : a monthly jo...,Oct 1862,"LILLIE BROWNE. r.y hutu murkay, author of the ...",1862
35,8008492.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",3,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...","The London reader : of literature, science, ar...","Mar 19, 1887",WIUTZ TO-DAY MDX ASK THIN TO COMB I AlID HAKOL...,1887
62,1692948508.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",The Graphic,"Dec 25, 1889",It was more than enough for one of th...,1889
75,6814194.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Temple bar : a London magazine for town and co...,Sep 1878,TI)e first Viollit. BOOK V. FRlIEDHELMS STORY....,1878
85,6946823.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",The Modern review : a quarterly magazine,Jan 1884,Mr. Percy Gregs Without God. * HE expectation...,1884


In [31]:
# show the number of files in the 19th century subset of bpo files that contain the word "suspense" at least 2 times
len(suspense_19_subset)
# 313 of the 368 "suspense"-containing files contain the word "suspense" ≤ 2 times and are taken from 19th century periodicals.

313

In [32]:
# store the subset of the dataframe in a new directory
filepath = Path('select/out_all_suspense_texts_19.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
suspense_19_subset.to_csv(filepath) 

In [151]:
# optional: indicate if the words "Dublin" or "Edinburgh" are part of the title of a periodical.
suspense_texts['suspense_journal_title'].str.match('Dublin' or 'Edinburgh')


0       False
1       False
2       False
3       False
4       False
        ...  
2630    False
2631    False
2632    False
2633    False
2634    False
Name: suspense_journal_title, Length: 2635, dtype: bool

In [34]:
# create a new subset of "suspense"-containing texts by filtering by the year of publication between 1800 and 1900 and excluding periodicals from Edinburgh and Dublin to get an English only corpus.
suspense_19_Eng_subset = suspense_texts[(suspense_texts['suspense_journal_year'] > 1799) & (suspense_texts['suspense_journal_year'] < 1901) & (suspense_texts['suspense_frequ'] > 1) & (suspense_texts['suspense_journal_title'].str.match('Dublin' or 'Edinburgh') == False)]


In [35]:
# show and controll the current state of the dataframe before you store it in a csv-file in the next step
suspense_19_Eng_subset[:10]

Unnamed: 0,file_name,document_text,suspense_frequ,clean_text,suspense_journal_title,suspense_journal_date,pure_text,suspense_journal_year
0,3536957.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Good words,Nov 1868,THE WOMANS KINGDOM. obe itore. BY TIHE AUTHOR ...,1868
8,3279566.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",3,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Bow bells : a magazine of general literature a...,"Oct 28, 1868","BADLESMERE KNOLL ST 2 AUITS OF CAPTAI OUCALD, ...",1868
23,2633758.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...","Fraser&apos;s magazine for town and country, 1...",Nov 1857,INTEIRPRETER. THE aMaled 4z War* BY G. J. WHY...,1857
26,3057421.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...","The Argosy : a magazine of tales, travels, ess...",Dec 1888,LADY TREVELYAN. A LONG a country road a young ...,1888
27,3485960.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Every week : a journal of entertaining literature,"Dec 20, 1886",GOLDEN FETTERS. CHAPTER IX. Th E day following...,1886
31,2700423.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Duffy&apos;s Hibernian magazine : a monthly jo...,Oct 1862,"LILLIE BROWNE. r.y hutu murkay, author of the ...",1862
35,8008492.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",3,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...","The London reader : of literature, science, ar...","Mar 19, 1887",WIUTZ TO-DAY MDX ASK THIN TO COMB I AlID HAKOL...,1887
62,1692948508.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",The Graphic,"Dec 25, 1889",It was more than enough for one of th...,1889
75,6814194.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",Temple bar : a London magazine for town and co...,Sep 1878,TI)e first Viollit. BOOK V. FRlIEDHELMS STORY....,1878
85,6946823.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",2,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Recor...",The Modern review : a quarterly magazine,Jan 1884,Mr. Percy Gregs Without God. * HE expectation...,1884


In [157]:
# store the subset of the dataframe in a new directory
filepath = Path('select/out_all_suspense_texts_19_Eng.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
suspense_19_Eng_subset.to_csv(filepath) 
# "suspense" min. 2 times, English only by excluding "Dublin" & "Edinburgh", 19th century only
#end of the notebook