In [3]:
import pandas as pd
import docx # install python-docx
import glob

In [4]:
# See options here: https://stackoverflow.com/questions/25228106/how-to-extract-text-from-an-existing-docx-file-using-python-docx

def get_text(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        if para.text.strip():
            fullText.append(para.text.strip())
    return fullText

In [3]:
word_files = glob.glob("scrubbed/*")
word_files

['scrubbed/Basket_Woman_-_Edgeworth.docx',
 'scrubbed/P_P.docx',
 'scrubbed/The_Governess_-_Fielding.docx',
 'scrubbed/Mary_-_Wollstonecraft.docx',
 'scrubbed/Evelina.docx',
 'scrubbed/NorthangerAbbey.docx',
 'scrubbed/Charlotte_Temple.docx',
 'scrubbed/David_Simple.docx',
 'scrubbed/Sense_Sensibility.docx',
 'scrubbed/Millenium_Hall.docx',
 'scrubbed/Wanderer.docx',
 'scrubbed/Lady_Susan.docx',
 'scrubbed/Blazing_World.docx',
 'scrubbed/Blue_Jar_-_Edgeworth.docx',
 'scrubbed/Hermione_-_Lennox.docx',
 'scrubbed/Mansfield_Park.docx',
 'scrubbed/Emma.docx',
 'scrubbed/Camilla.docx',
 'scrubbed/Persuasion.docx',
 'scrubbed/Cecilia.docx',
 'scrubbed/Euphemia_-_Lennox.docx']

In [4]:
word_text = []

for file in word_files:
    word_text.append(
        {
            "title": file.split("/")[-1][:-5],
            "text": get_text(file)
        }
    )

In [5]:
word_text[0]

{'title': 'Basket_Woman_-_Edgeworth',
 'text': ['foot steep slippery white hill',
  'called hut hovel',
  'travellers scarcely suppose inhabited',
  'smoke rising peaked roof old woman lived',
  'hovel years ago little boy girl',
  'children beggar died orphans perishing',
  'hunger happy old woman',
  'took hut bid warm small fire',
  'gave crust mouldy bread eat give',
  'gave goodwill kind poor',
  'children worked hard spinning wheel knitting',
  'support earned money',
  'follow carriages',
  'horses stopped take breath rest stones',
  'carriage wheels prevent rolling backwards',
  'steep slippery hill',
  'little boy girl loved stand goodnatured old',
  'woman spinning wheel spinning talk',
  'times taught hoped',
  'remember lives explained meant',
  'telling truth honest taught',
  'dislike idleness wish useful',
  'evening standing little boy',
  'grandmother name liked',
  'children call grandmother forced',
  'spinning wheel follow chaises coaches',
  'steep hill stones unde

In [6]:
df = pd.DataFrame(word_text)

In [7]:
df.head()

Unnamed: 0,title,text
0,Basket_Woman_-_Edgeworth,"[foot steep slippery white hill, called hut ho..."
1,P_P,[truth universally acknowledged single possess...
2,The_Governess_-_Fielding,"[lived northern parts gentlewoman, undertook e..."
3,Mary_-_Wollstonecraft,"[heroine fiction daughter, married gentle fash..."
4,Evelina,"[painful friendly mind, necessity communicatin..."


In [8]:
df_lines = df.explode("text")

In [9]:
df_lines

Unnamed: 0,title,text
0,Basket_Woman_-_Edgeworth,foot steep slippery white hill
0,Basket_Woman_-_Edgeworth,called hut hovel
0,Basket_Woman_-_Edgeworth,travellers scarcely suppose inhabited
0,Basket_Woman_-_Edgeworth,smoke rising peaked roof old woman lived
0,Basket_Woman_-_Edgeworth,hovel years ago little boy girl
...,...,...
20,Euphemia_-_Lennox,room uncle glad sufferer civility proceeded in...
20,Euphemia_-_Lennox,spoke severe look firm accent fit reply sure d...
20,Euphemia_-_Lennox,expected letter yesterdays post happen disappo...
20,Euphemia_-_Lennox,solitude nurse tender


In [10]:
df_lines["line_num"] = (df_lines.groupby("title").cumcount() + 1)

In [11]:
df_lines.head(20)

Unnamed: 0,title,text,line_num
0,Basket_Woman_-_Edgeworth,foot steep slippery white hill,1
0,Basket_Woman_-_Edgeworth,called hut hovel,2
0,Basket_Woman_-_Edgeworth,travellers scarcely suppose inhabited,3
0,Basket_Woman_-_Edgeworth,smoke rising peaked roof old woman lived,4
0,Basket_Woman_-_Edgeworth,hovel years ago little boy girl,5
0,Basket_Woman_-_Edgeworth,children beggar died orphans perishing,6
0,Basket_Woman_-_Edgeworth,hunger happy old woman,7
0,Basket_Woman_-_Edgeworth,took hut bid warm small fire,8
0,Basket_Woman_-_Edgeworth,gave crust mouldy bread eat give,9
0,Basket_Woman_-_Edgeworth,gave goodwill kind poor,10


In [12]:
df_lines.shape

(187683, 3)

In [13]:
df_lines.text.str.split().explode().str.strip().value_counts()[:50]

little      4071
time        3952
dear        2304
mind        2187
give        2140
house       2075
take        2060
room        1944
sir         1804
heart       1782
day         1764
saw         1752
life        1737
world       1731
lord        1665
having      1624
sure        1619
quite       1602
away        1600
poor        1599
friend      1589
came        1588
pleasure    1501
look        1491
mother      1488
father      1460
heard       1457
eyes        1367
place       1365
wish        1359
felt        1357
woman       1345
sister      1330
knew        1326
find        1308
hand        1306
believe     1303
family      1297
letter      1288
manner      1284
morning     1265
old         1255
present     1250
leave       1246
speak       1230
person      1208
gave        1207
happy       1183
done        1178
hear        1178
Name: text, dtype: int64