In [4]:
import pandas as pd
import spacy
import re

from spacytextblob.spacytextblob import SpacyTextBlob

from goodreads_book_scraper import get_listopia
from book_importer import BookImporter
from language_analysis import Book
from similarity import SimilarityChecker


## Get a list of books from Goodreads

In [5]:
# Goodreads Listopia page whose entries seed the book list.
url = 'https://www.goodreads.com/list/show/1.Best_Books_Ever'

# Fetch the list through the project's scraper (the structure of the returned
# entries is defined in goodreads_book_scraper.get_listopia).
goodreads_list = get_listopia(url)

In [3]:
# One record per Goodreads entry for which BookImporter.gutendex returned a title.
book_data = []

for book in goodreads_list:
    title, author, text, file_path = BookImporter.gutendex(book)
    if not title:
        # Nothing usable came back for this entry, so leave it out.
        continue
    book_data.append({
        'Title': title,
        'Author': author,
        'Text': text,
        'File Path': file_path,
    })


## Import the books from Project Gutenberg and store the information in a pandas DataFrame

In [6]:
# Project Gutenberg IDs of the books that are analysed in detail further down.
book_id = [43, 345, 41445, 209, 1513, 11, 36, 768, 1342, 2701, 4300]

for book in book_id:
    title, author, text, file_path = BookImporter.get_book(book)
    # Same guard as the Gutendex loop: only keep entries that came back with a
    # title, so a failed import does not add an empty row to the table.
    # NOTE(review): assumes get_book signals failure with a falsy title, like
    # gutendex appears to — confirm in book_importer.
    if title:
        book_data.append({'Title': title, 'Author': author, 'Text': text, 'File Path': file_path})


In [7]:
# Turn the accumulated list of record dicts into a DataFrame.
# NOTE(review): this re-binds `book_data` from a list to a DataFrame, so the
# Gutenberg-ID cell above (which appends to `book_data`) should not be re-run
# after this one without re-creating the list first.
book_data = pd.DataFrame(book_data)
# Summary of columns, non-null counts and dtypes.
book_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Title      89 non-null     object
 1   Author     89 non-null     object
 2   Text       89 non-null     object
 3   File Path  89 non-null     object
dtypes: object(4)
memory usage: 2.9+ KB


In [8]:
# List the imported titles (pandas elides the middle of a long Series).
print(book_data['Title'])

0                                   Pride and prejudice
1                                   Pride and prejudice
2     The Raid of John Brown at Harper's Ferry as I ...
3                                  A Thief in the Night
4                                      Domestic Animals
                            ...                        
84                                The War of the Worlds
85                                    Wuthering Heights
86                                  Pride and prejudice
87                              Moby-Dick; or The Whale
88                                              Ulysses
Name: Title, Length: 89, dtype: object


## Load the spaCy NLP pipeline and add the TextBlob component to it

In [None]:
# Load spaCy's large English model; this pipeline is shared by every book below.
nlp = spacy.load('en_core_web_lg')
# Append the TextBlob component (the 'spacytextblob' factory presumably becomes
# available through the SpacyTextBlob import at the top — confirm in its docs).
nlp.add_pipe('spacytextblob')

# Analysed Book objects are collected here for the similarity comparison at the end.
books = []

## Read the files and analyse the novels

### Jekyll and Hyde

In [None]:
# Read the novella from disk and wrap it in a Book.
# Forward slashes keep the path portable (Windows accepts them as well) and
# avoid the invalid '\T' escape that the previous 'data\The_...' literal relied on.
with open('data/The_Strange_Case_Of_Dr._Jekyll_And_Mr._Hyde.txt', 'r', encoding='UTF-8') as file:
    jekyll_hyde = Book('Jekyll and Hyde', file.read())
# One alternative per chapter title, each required to start on a new line.
chapter_markers = r'((\nSTORY OF THE DOOR)|(\nSEARCH FOR MR. HYDE)|(\nDR. JEKYLL WAS QUITE AT EASE)|(\nTHE CAREW MURDER CASE)|(\nINCIDENT OF THE LETTER)|(\nINCIDENT OF DR. LANYON)|(\nINCIDENT AT THE WINDOW)|(\nTHE LAST NIGHT)|(\nDR. LANYON’S NARRATIVE)|(\nHENRY JEKYLL’S FULL STATEMENT OF THE CASE))'


In [None]:
# Split at the headings matched by `chapter_markers`; that name is re-bound for
# every book, so the cell that defines it for this book must run first.
jekyll_hyde.split_into_chapters(chapter_markers)


In [None]:
# Process the book with the shared spaCy pipeline `nlp`
# (presumably the full text — confirm in language_analysis.Book.do_nlp).
jekyll_hyde.do_nlp(nlp)


In [None]:
# Chapter-level counterpart of do_nlp, using the same pipeline
# (presumably relies on split_into_chapters having run — confirm).
jekyll_hyde.chapter_nlp(nlp)


In [None]:
# NOTE(review): presumably gathers the TextBlob values supplied by the
# 'spacytextblob' pipe — confirm in language_analysis.Book.blobify.
jekyll_hyde.blobify()

In [None]:
# Book-level analysis step; as the last expression of the cell, whatever it
# returns is shown as the cell output.
jekyll_hyde.get_analysis()

In [None]:
# Chapter-level analysis step (see language_analysis.Book.chapter_analysis).
jekyll_hyde.chapter_analysis()


In [None]:
# Queue the book for the similarity comparison at the end of the notebook.
# Re-running this cell appends the same object to `books` a second time.
books.append(jekyll_hyde)

### Dracula

In [None]:
# Read the novel from disk and wrap it in a Book.
# Forward slashes keep the path portable (Windows accepts them as well) and
# avoid the invalid '\D' escape that the previous 'data\Dracula.txt' literal relied on.
with open('data/Dracula.txt', 'r', encoding='UTF-8') as file:
    dracula = Book('Dracula', file.read())
# Chapter headings are "CHAPTER <roman numeral>" followed by a line break.
# NOTE(review): the PREFACE/LETTER alternatives are identical to the ones used
# for Frankenstein — confirm they are wanted for this text.
chapter_markers = r'(PREFACE.)|(LETTER I+\.)|(CHAPTER [IVXLCDM]+\n)'




In [None]:
# Split at the headings matched by `chapter_markers`; that name is re-bound for
# every book, so the cell that defines it for this book must run first.
dracula.split_into_chapters(chapter_markers)


In [None]:
# Process the book with the shared spaCy pipeline `nlp`
# (presumably the full text — confirm in language_analysis.Book.do_nlp).
dracula.do_nlp(nlp)


In [None]:
# Chapter-level counterpart of do_nlp, using the same pipeline
# (presumably relies on split_into_chapters having run — confirm).
dracula.chapter_nlp(nlp)


In [None]:
# NOTE(review): presumably gathers the TextBlob values supplied by the
# 'spacytextblob' pipe — confirm in language_analysis.Book.blobify.
dracula.blobify()

In [None]:
# Book-level analysis step; as the last expression of the cell, whatever it
# returns is shown as the cell output.
dracula.get_analysis()


In [None]:
# Chapter-level analysis step (see language_analysis.Book.chapter_analysis).
dracula.chapter_analysis()
# Queue the book for the similarity comparison at the end of the notebook.
# Re-running this cell appends the same object to `books` a second time.
books.append(dracula)

### Frankenstein

In [None]:
# Read the novel from disk and wrap it in a Book.
# Forward slashes keep the path portable (Windows accepts them as well) and
# avoid the invalid '\F' escape that the previous 'data\Frankenstein.txt' literal relied on.
with open('data/Frankenstein.txt', 'r', encoding='UTF-8') as file:
    frankenstein = Book('Frankenstein', file.read())

# Sections are introduced by PREFACE, "LETTER <I...>." or "CHAPTER <roman numeral>.".
chapter_markers = r'(PREFACE.)|(LETTER I+\.)|(CHAPTER [IVXLCDM]+\.)'
frankenstein.split_into_chapters(chapter_markers)


In [None]:
# Process the book with the shared spaCy pipeline `nlp`
# (presumably the full text — confirm in language_analysis.Book.do_nlp).
frankenstein.do_nlp(nlp)


In [None]:
# Chapter-level counterpart of do_nlp, using the same pipeline
# (presumably relies on split_into_chapters having run — confirm).
frankenstein.chapter_nlp(nlp)


In [None]:
# NOTE(review): presumably gathers the TextBlob values supplied by the
# 'spacytextblob' pipe — confirm in language_analysis.Book.blobify.
frankenstein.blobify()


In [None]:
# Book-level analysis step; as the last expression of the cell, whatever it
# returns is shown as the cell output.
frankenstein.get_analysis()


In [None]:
# Chapter-level analysis step (see language_analysis.Book.chapter_analysis).
frankenstein.chapter_analysis()
# Queue the book for the similarity comparison at the end of the notebook.
# Re-running this cell appends the same object to `books` a second time.
books.append(frankenstein)

### The Turn of the Screw

In [None]:
# Read the novella from disk and wrap it in a Book.
# Forward slashes keep the path portable (Windows accepts them as well) and
# avoid the invalid '\T' escape that the previous 'data\The_...' literal relied on.
with open('data/The_Turn_of_the_Screw.txt', 'r', encoding='UTF-8') as file:
    turn_of_the_screw = Book('The Turn of the Screw', file.read())

# Chapters are headed by a bare roman numeral (I to XXIV) at the start of a line.
# NOTE(review): apart from the first, the alternatives are not anchored at the
# end, so e.g. (\nV) also matches any line that merely begins with "V"; an
# anchored form such as \n[IVX]+\n may be what is intended — confirm against
# how split_into_chapters consumes the pattern before changing it.
chapter_markers = r'((\nI\n)|(\nII)|(\nIII)|(\nIV)|(\nV)|(\nVI)|(\nVII)|(\nVIII)|(\nIX)|(\nX)|(\nXI)|(\nXII)|(\nXIII)|(\nXIV)|(\nXV)|(\nXVI)|(\nXVII)|(\nXVIII)|(\nXIX)|(\nXX)|(\nXXI)|(\nXXII)|(\nXXIII)|(\nXXIV))'


In [None]:
# Split at the headings matched by `chapter_markers`; that name is re-bound for
# every book, so the cell that defines it for this book must run first.
turn_of_the_screw.split_into_chapters(chapter_markers)


In [None]:
# Process the book with the shared spaCy pipeline `nlp`
# (presumably the full text — confirm in language_analysis.Book.do_nlp).
turn_of_the_screw.do_nlp(nlp)


In [None]:
# Chapter-level counterpart of do_nlp, using the same pipeline
# (presumably relies on split_into_chapters having run — confirm).
turn_of_the_screw.chapter_nlp(nlp)


In [None]:
# NOTE(review): presumably gathers the TextBlob values supplied by the
# 'spacytextblob' pipe — confirm in language_analysis.Book.blobify.
turn_of_the_screw.blobify()


In [None]:
# Book-level analysis step; as the last expression of the cell, whatever it
# returns is shown as the cell output.
turn_of_the_screw.get_analysis()


In [None]:
# Chapter-level analysis step (see language_analysis.Book.chapter_analysis).
turn_of_the_screw.chapter_analysis()
# Queue the book for the similarity comparison at the end of the notebook.
# Re-running this cell appends the same object to `books` a second time.
books.append(turn_of_the_screw)

### Romeo and Juliet

In [None]:
# Read the play from disk and wrap it in a Book.
# Forward slashes keep the path portable (Windows accepts them as well) and
# avoid the invalid '\R' escape that the previous 'data\Romeo_...' literal relied on.
with open('data/Romeo_and_Juliet.txt', 'r', encoding='UTF-8') as file:
    romeo_and_juliet = Book('Romeo and Juliet', file.read())

# The play is divided at the prologue and at each scene heading; scenes are
# identified by their full heading text because scene numbers repeat in every act.
chapter_markers = r'(THE PROLOGUE\n)|(SCENE I. A public place)|(SCENE II. A Street)|(SCENE III. Room in Capulet’s House)|(SCENE IV. A Street)|(SCENE V. A Hall in Capulet’s House)|(ACT II\n\n)|(SCENE I. An open place adjoining Capulet’s Garden)|(SCENE II. Capulet’s Garden)|(SCENE III. Friar Lawrence’s Cell)|(SCENE IV. A Street)|(SCENE V. Capulet’s Garden)|(SCENE VI. Friar Lawrence’s Cell)|(SCENE I. A public Place)|(SCENE II. A Room in Capulet’s House)|(SCENE III. Friar Lawrence’s cell)|(SCENE IV. A Room in Capulet’s House)|(SCENE V. An open Gallery to Juliet’s Chamber, overlooking the Garden)|(SCENE I. Friar Lawrence’s Cell)|(SCENE II. Hall in Capulet’s House)|(SCENE III. Juliet’s Chamber)|(SCENE IV. Hall in Capulet’s House)|(SCENE V. Juliet’s Chamber; Juliet on the bed)|(SCENE I. Mantua. A Street)|(SCENE II. Friar Lawrence’s Cell)|(SCENE III. A churchyard; in it a Monument belonging to the Capulets)'


In [None]:
# Split at the headings matched by `chapter_markers`; that name is re-bound for
# every book, so the cell that defines it for this book must run first.
romeo_and_juliet.split_into_chapters(chapter_markers)


In [None]:
# Process the book with the shared spaCy pipeline `nlp`
# (presumably the full text — confirm in language_analysis.Book.do_nlp).
romeo_and_juliet.do_nlp(nlp)


In [None]:
# Chapter-level counterpart of do_nlp, using the same pipeline
# (presumably relies on split_into_chapters having run — confirm).
romeo_and_juliet.chapter_nlp(nlp)


In [None]:
# NOTE(review): presumably gathers the TextBlob values supplied by the
# 'spacytextblob' pipe — confirm in language_analysis.Book.blobify.
romeo_and_juliet.blobify()


In [None]:
# Book-level analysis step; as the last expression of the cell, whatever it
# returns is shown as the cell output.
romeo_and_juliet.get_analysis()


In [None]:
# Chapter-level analysis step (see language_analysis.Book.chapter_analysis).
romeo_and_juliet.chapter_analysis()
# Queue the book for the similarity comparison at the end of the notebook.
# Re-running this cell appends the same object to `books` a second time.
books.append(romeo_and_juliet)

### Alice's Adventures in Wonderland

In [None]:
# Read the novel from disk and wrap it in a Book.
# Forward slashes keep the path portable (Windows accepts them as well) and
# avoid the invalid '\A' escape that the previous 'data\Alice...' literal relied on.
with open('data/Alice’s_Adventures_in_Wonderland.txt', 'r', encoding='UTF-8') as file:
    alice = Book('Alice in Wonderland', file.read())
# Each alternative is a chapter number line followed by the chapter title line.
chapter_markers = r'(CHAPTER I.\nDown the Rabbit-Hole)|(CHAPTER II.\nThe Pool of Tears)|(CHAPTER III.\nA Caucus-Race and a Long Tale)|(CHAPTER IV.\nThe Rabbit Sends in a Little Bill)|(CHAPTER V.\nAdvice from a Caterpillar)|(CHAPTER VI.\nPig and Pepper)|(CHAPTER VII.\nA Mad Tea-Party)|(CHAPTER VIII.\nThe Queen’s Croquet-Ground)|(CHAPTER IX.\nThe Mock Turtle’s Story)|(CHAPTER X.\nThe Lobster Quadrille)|(CHAPTER XI.\nWho Stole the Tarts\?)|(CHAPTER XII.\nAlice’s Evidence)'


In [None]:
# Split at the headings matched by `chapter_markers`; that name is re-bound for
# every book, so the cell that defines it for this book must run first.
alice.split_into_chapters(chapter_markers)


In [None]:
# Process the book with the shared spaCy pipeline `nlp`
# (presumably the full text — confirm in language_analysis.Book.do_nlp).
alice.do_nlp(nlp)


In [None]:
# Chapter-level counterpart of do_nlp, using the same pipeline
# (presumably relies on split_into_chapters having run — confirm).
alice.chapter_nlp(nlp)


In [None]:
# NOTE(review): presumably gathers the TextBlob values supplied by the
# 'spacytextblob' pipe — confirm in language_analysis.Book.blobify.
alice.blobify()


In [None]:
# Book-level analysis step; as the last expression of the cell, whatever it
# returns is shown as the cell output.
alice.get_analysis()


In [None]:
# Chapter-level analysis step (see language_analysis.Book.chapter_analysis).
alice.chapter_analysis()
# Queue the book for the similarity comparison at the end of the notebook.
# Re-running this cell appends the same object to `books` a second time.
books.append(alice)

### The War of the Worlds

In [None]:
# Read the novel from disk and wrap it in a Book.
# Forward slashes keep the path portable (Windows accepts them as well) and
# avoid the invalid '\T' escape that the previous 'data\The_...' literal relied on.
with open('data/The_War_of_the_Worlds.txt', 'r', encoding='UTF-8') as file:
    war_of_the_worlds = Book('War of the Worlds', file.read())

# A chapter heading is a line consisting only of a roman numeral and a full stop.
chapter_markers = r'\n[IVX]+\.\n'


In [None]:
# Split at the headings matched by `chapter_markers`; that name is re-bound for
# every book, so the cell that defines it for this book must run first.
war_of_the_worlds.split_into_chapters(chapter_markers)


In [None]:
# Process the book with the shared spaCy pipeline `nlp`
# (presumably the full text — confirm in language_analysis.Book.do_nlp).
war_of_the_worlds.do_nlp(nlp)


In [None]:
# Chapter-level counterpart of do_nlp, using the same pipeline
# (presumably relies on split_into_chapters having run — confirm).
war_of_the_worlds.chapter_nlp(nlp)


In [None]:
# NOTE(review): presumably gathers the TextBlob values supplied by the
# 'spacytextblob' pipe — confirm in language_analysis.Book.blobify.
war_of_the_worlds.blobify()


In [None]:
# Book-level analysis step; as the last expression of the cell, whatever it
# returns is shown as the cell output.
war_of_the_worlds.get_analysis()


In [None]:
# Chapter-level analysis step (see language_analysis.Book.chapter_analysis).
war_of_the_worlds.chapter_analysis()
# Queue the book for the similarity comparison at the end of the notebook.
# Re-running this cell appends the same object to `books` a second time.
books.append(war_of_the_worlds)

### Wuthering Heights

In [None]:
# Read the novel from disk and wrap it in a Book.
# Forward slashes keep the path portable (Windows accepts them as well) and
# avoid the invalid '\W' escape that the previous 'data\Wuthering_...' literal relied on.
with open('data/Wuthering_Heights.txt', 'r', encoding='UTF-8') as file:
    wuthering_heights = Book('Wuthering Heights', file.read())

# Chapter headings have the form "CHAPTER <roman numeral>".
chapter_markers = r'CHAPTER [IVX]+'


In [None]:
# Split at the headings matched by `chapter_markers`; that name is re-bound for
# every book, so the cell that defines it for this book must run first.
wuthering_heights.split_into_chapters(chapter_markers)


In [None]:
# Process the book with the shared spaCy pipeline `nlp`
# (presumably the full text — confirm in language_analysis.Book.do_nlp).
wuthering_heights.do_nlp(nlp)


In [None]:
# Chapter-level counterpart of do_nlp, using the same pipeline
# (presumably relies on split_into_chapters having run — confirm).
wuthering_heights.chapter_nlp(nlp)


In [None]:
# NOTE(review): presumably gathers the TextBlob values supplied by the
# 'spacytextblob' pipe — confirm in language_analysis.Book.blobify.
wuthering_heights.blobify()


In [None]:
# Book-level analysis step; as the last expression of the cell, whatever it
# returns is shown as the cell output.
wuthering_heights.get_analysis()


In [None]:
# Chapter-level analysis step (see language_analysis.Book.chapter_analysis).
wuthering_heights.chapter_analysis()
# Queue the book for the similarity comparison at the end of the notebook.
# Re-running this cell appends the same object to `books` a second time.
books.append(wuthering_heights)

### Pride and Prejudice

In [None]:
# Read the novel from disk and wrap it in a Book.
# Forward slashes keep the path portable (Windows accepts them as well) and
# avoid the invalid '\P' escape that the previous 'data\Pride_...' literal relied on.
with open('data/Pride_and_prejudice.txt', 'r', encoding='UTF-8') as file:
    pride_prejudice = Book('Pride and Prejudice', file.read())

# The first chapter is marked "Chapter I.]" in this edition; every later one
# uses the upper-case form "CHAPTER <roman numeral>.".
chapter_markers = r'(Chapter I\.\])|CHAPTER [IVXL]+\.'


In [None]:
# Split at the headings matched by `chapter_markers`; that name is re-bound for
# every book, so the cell that defines it for this book must run first.
pride_prejudice.split_into_chapters(chapter_markers)

In [None]:
# Process the book with the shared spaCy pipeline `nlp`
# (presumably the full text — confirm in language_analysis.Book.do_nlp).
pride_prejudice.do_nlp(nlp)

In [None]:
# Chapter-level counterpart of do_nlp, using the same pipeline
# (presumably relies on split_into_chapters having run — confirm).
pride_prejudice.chapter_nlp(nlp)

In [None]:
# Book-level analysis step; as the last expression of the cell, whatever it
# returns is shown as the cell output.
# NOTE(review): every other book in this notebook calls blobify() between
# chapter_nlp() and get_analysis(); this one does not — confirm whether the
# omission is intentional.
pride_prejudice.get_analysis()


In [None]:
# Chapter-level analysis step (see language_analysis.Book.chapter_analysis).
pride_prejudice.chapter_analysis()
# Queue the book for the similarity comparison at the end of the notebook.
# Re-running this cell appends the same object to `books` a second time.
books.append(pride_prejudice)

### Moby Dick

In [None]:
# Read the novel from disk and wrap it in a Book.
# Forward slashes keep the path portable (Windows accepts them as well) and
# avoid the invalid '\M' escape that the previous 'data\Moby-Dick...' literal relied on.
with open('data/Moby-Dick;_or_The_Whale.txt', 'r', encoding='UTF-8') as file:
    moby_dick = Book('Moby Dick', file.read())


In [None]:
# Section headings of Moby-Dick, in reading order. The list is used further down
# both to strip one occurrence of each heading from the text and to build the
# chapter-splitting pattern, so every entry must match the book text exactly.
# Headings that wrap onto a second line in the text contain "\n" at the wrap
# (chapters 56 and 57).
contents = [
    "ETYMOLOGY.",
    "EXTRACTS (Supplied by a Sub-Sub-Librarian).",
    "CHAPTER 1. Loomings.",
    "CHAPTER 2. The Carpet-Bag.",
    "CHAPTER 3. The Spouter-Inn.",
    "CHAPTER 4. The Counterpane.",
    "CHAPTER 5. Breakfast.",
    "CHAPTER 6. The Street.",
    "CHAPTER 7. The Chapel.",
    "CHAPTER 8. The Pulpit.",
    "CHAPTER 9. The Sermon.",
    "CHAPTER 10. A Bosom Friend.",
    "CHAPTER 11. Nightgown.",
    "CHAPTER 12. Biographical.",
    "CHAPTER 13. Wheelbarrow.",
    "CHAPTER 14. Nantucket.",
    "CHAPTER 15. Chowder.",
    "CHAPTER 16. The Ship.",
    "CHAPTER 17. The Ramadan.",
    "CHAPTER 18. His Mark.",
    "CHAPTER 19. The Prophet.",
    "CHAPTER 20. All Astir.",
    "CHAPTER 21. Going Aboard.",
    "CHAPTER 22. Merry Christmas.",
    "CHAPTER 23. The Lee Shore.",
    "CHAPTER 24. The Advocate.",
    "CHAPTER 25. Postscript.",
    "CHAPTER 26. Knights and Squires.",
    "CHAPTER 27. Knights and Squires.",
    "CHAPTER 28. Ahab.",
    "CHAPTER 29. Enter Ahab; to Him, Stubb.",
    "CHAPTER 30. The Pipe.",
    "CHAPTER 31. Queen Mab.",
    "CHAPTER 32. Cetology.",
    "CHAPTER 33. The Specksnyder.",
    "CHAPTER 34. The Cabin-Table.",
    "CHAPTER 35. The Mast-Head.",
    "CHAPTER 36. The Quarter-Deck.",
    "CHAPTER 37. Sunset.",
    "CHAPTER 38. Dusk.",
    "CHAPTER 39. First Night-Watch.",
    "CHAPTER 40. Midnight, Forecastle.",
    "CHAPTER 41. Moby Dick.",
    "CHAPTER 42. The Whiteness of the Whale.",
    "CHAPTER 43. Hark!",
    "CHAPTER 44. The Chart.",
    "CHAPTER 45. The Affidavit.",
    "CHAPTER 46. Surmises.",
    "CHAPTER 47. The Mat-Maker.",
    "CHAPTER 48. The First Lowering.",
    "CHAPTER 49. The Hyena.",
    "CHAPTER 50. Ahab’s Boat and Crew. Fedallah.",
    "CHAPTER 51. The Spirit-Spout.",
    "CHAPTER 52. The Albatross.",
    "CHAPTER 53. The Gam.",
    "CHAPTER 54. The Town-Ho’s Story.",
    "CHAPTER 55. Of the Monstrous Pictures of Whales.",
    "CHAPTER 56. Of the Less Erroneous Pictures of Whales, and the True\nPictures of Whaling Scenes.",
    # Fixed: this entry previously read "in\Stone" (a stray backslash instead of
    # the line break), which could never match the heading in the text.
    "CHAPTER 57. Of Whales in Paint; in Teeth; in Wood; in Sheet-Iron; in\nStone; in Mountains; in Stars.",
    "CHAPTER 58. Brit.",
    "CHAPTER 59. Squid.",
    "CHAPTER 60. The Line.",
    "CHAPTER 61. Stubb Kills a Whale.",
    "CHAPTER 62. The Dart.",
    "CHAPTER 63. The Crotch.",
    "CHAPTER 64. Stubb’s Supper.",
    "CHAPTER 65. The Whale as a Dish.",
    "CHAPTER 66. The Shark Massacre.",
    "CHAPTER 67. Cutting In.",
    "CHAPTER 68. The Blanket.",
    "CHAPTER 69. The Funeral.",
    "CHAPTER 70. The Sphynx.",
    "CHAPTER 71. The Jeroboam’s Story.",
    "CHAPTER 72. The Monkey-Rope.",
    "CHAPTER 73. Stubb and Flask kill a Right Whale; and Then Have a Talk over Him.",
    "CHAPTER 74. The Sperm Whale’s Head—Contrasted View.",
    "CHAPTER 75. The Right Whale’s Head—Contrasted View.",
    "CHAPTER 76. The Battering-Ram.",
    "CHAPTER 77. The Great Heidelburgh Tun.",
    "CHAPTER 78. Cistern and Buckets.",
    "CHAPTER 79. The Prairie.",
    "CHAPTER 80. The Nut.",
    "CHAPTER 81. The Pequod Meets The Virgin.",
    "CHAPTER 82. The Honor and Glory of Whaling.",
    "CHAPTER 83. Jonah Historically Regarded.",
    "CHAPTER 84. Pitchpoling.",
    "CHAPTER 85. The Fountain.",
    "CHAPTER 86. The Tail.",
    "CHAPTER 87. The Grand Armada.",
    "CHAPTER 88. Schools and Schoolmasters.",
    "CHAPTER 89. Fast-Fish and Loose-Fish.",
    "CHAPTER 90. Heads or Tails.",
    "CHAPTER 91. The Pequod Meets The Rose-Bud.",
    "CHAPTER 92. Ambergris.",
    "CHAPTER 93. The Castaway.",
    "CHAPTER 94. A Squeeze of the Hand.",
    "CHAPTER 95. The Cassock.",
    "CHAPTER 96. The Try-Works.",
    "CHAPTER 97. The Lamp.",
    "CHAPTER 98. Stowing Down and Clearing Up.",
    "CHAPTER 99. The Doubloon.",
    "CHAPTER 100. Leg and Arm.",
    "CHAPTER 101. The Decanter.",
    "CHAPTER 102. A Bower in the Arsacides.",
    "CHAPTER 103. Measurement of The Whale’s Skeleton.",
    "CHAPTER 104. The Fossil Whale.",
    "CHAPTER 105. Does the Whale’s Magnitude Diminish?—Will He Perish?",
    "CHAPTER 106. Ahab’s Leg.",
    "CHAPTER 107. The Carpenter.",
    "CHAPTER 108. Ahab and the Carpenter.",
    "CHAPTER 109. Ahab and Starbuck in the Cabin.",
    "CHAPTER 110. Queequeg in His Coffin.",
    "CHAPTER 111. The Pacific.",
    "CHAPTER 112. The Blacksmith.",
    "CHAPTER 113. The Forge.",
    "CHAPTER 114. The Gilder.",
    "CHAPTER 115. The Pequod Meets The Bachelor.",
    "CHAPTER 116. The Dying Whale.",
    "CHAPTER 117. The Whale Watch.",
    "CHAPTER 118. The Quadrant.",
    "CHAPTER 119. The Candles.",
    "CHAPTER 120. The Deck Towards the End of the First Night Watch.",
    "CHAPTER 121. Midnight.—The Forecastle Bulwarks.",
    "CHAPTER 122. Midnight Aloft.—Thunder and Lightning.",
    "CHAPTER 123. The Musket.",
    "CHAPTER 124. The Needle.",
    "CHAPTER 125. The Log and Line.",
    "CHAPTER 126. The Life-Buoy.",
    "CHAPTER 127. The Deck.",
    "CHAPTER 128. The Pequod Meets The Rachel.",
    "CHAPTER 129. The Cabin.",
    "CHAPTER 130. The Hat.",
    "CHAPTER 131. The Pequod Meets The Delight.",
    "CHAPTER 132. The Symphony.",
    "CHAPTER 133. The Chase—First Day.",
    "CHAPTER 134. The Chase—Second Day.",
    "CHAPTER 135. The Chase.—Third Day.",
    "Epilogue"
]


In [None]:

# Remove the first occurrence of every heading from the book text, so that only
# one copy of each heading is left for the chapter split below.
# str.replace returns a new string (strings are immutable), so the result has to
# be assigned back; without the assignment this loop had no effect at all.
# NOTE(review): assumes Book.text is a plain, assignable str attribute — confirm
# in language_analysis. Run this cell once only: every further run strips the
# next occurrence of each heading as well.
for marker in contents:
    moby_dick.text = moby_dick.text.replace(marker, '', 1)

In [None]:
# Build one alternation with a capture group per heading in `contents`.
# The headings are literal text, so each one is passed through re.escape first:
# unescaped, the parentheses in the EXTRACTS title formed a regex group and the
# "?" in the chapter 105 title acted as a quantifier, so neither heading could
# ever match, and every "." matched any character.
chapter_markers = '|'.join('(' + re.escape(marker) + ')' for marker in contents)

In [None]:

# Split at the headings matched by `chapter_markers`; that name is re-bound for
# every book, so the cell that defines it for this book must run first.
moby_dick.split_into_chapters(chapter_markers)


In [None]:
# Process the book with the shared spaCy pipeline `nlp`.
# NOTE(review): the extra argument 10000 is only passed for Moby Dick and
# Ulysses; its meaning (a chunk size?) should be confirmed in Book.do_nlp.
moby_dick.do_nlp(nlp, 10000)


In [None]:
# Chapter-level counterpart of do_nlp, using the same pipeline
# (presumably relies on split_into_chapters having run — confirm).
moby_dick.chapter_nlp(nlp)


In [None]:
# NOTE(review): presumably gathers the TextBlob values supplied by the
# 'spacytextblob' pipe — confirm in language_analysis.Book.blobify.
moby_dick.blobify()


In [None]:
# Book-level analysis step; as the last expression of the cell, whatever it
# returns is shown as the cell output.
moby_dick.get_analysis()


In [None]:
# Chapter-level analysis step (see language_analysis.Book.chapter_analysis).
moby_dick.chapter_analysis()
# Queue the book for the similarity comparison at the end of the notebook.
# Re-running this cell appends the same object to `books` a second time.
books.append(moby_dick)

### Ulysses

In [None]:
# Read the novel from disk and wrap it in a Book.
# A forward slash replaces the escaped backslash so the path also resolves on
# non-Windows systems (Windows accepts forward slashes as well).
with open('data/Ulysses.txt', 'r', encoding='UTF-8') as file:
    ulysses = Book('Ulysses', file.read())


In [None]:
# Episodes are introduced by a line starting with "[ n ]" for n = 1..18; build
# one capture group per episode and join them into a single alternation.
chapter_markers = '|'.join(r'(\n\[ {} \])'.format(number) for number in range(1, 19))

In [None]:

# Split at the headings matched by `chapter_markers`; that name is re-bound for
# every book, so the cell that defines it for this book must run first.
ulysses.split_into_chapters(chapter_markers)


In [None]:
# Process the book with the shared spaCy pipeline `nlp`.
# NOTE(review): the extra argument 10000 is only passed for Moby Dick and
# Ulysses; its meaning (a chunk size?) should be confirmed in Book.do_nlp.
ulysses.do_nlp(nlp, 10000)


In [None]:
# Chapter-level counterpart of do_nlp, using the same pipeline
# (presumably relies on split_into_chapters having run — confirm).
ulysses.chapter_nlp(nlp)


In [None]:
# NOTE(review): presumably gathers the TextBlob values supplied by the
# 'spacytextblob' pipe — confirm in language_analysis.Book.blobify.
ulysses.blobify()


In [None]:

# Book-level analysis step; as the last expression of the cell, whatever it
# returns is shown as the cell output.
ulysses.get_analysis()


In [None]:
# Chapter-level analysis step (see language_analysis.Book.chapter_analysis).
ulysses.chapter_analysis()
# Queue the book for the similarity comparison at the end of the notebook.
# Re-running this cell appends the same object to `books` a second time.
books.append(ulysses)

## Conduct similarity analysis and display results

In [None]:
# Build the checker over every Book collected in `books`, then compute the
# similarities (presumably pairwise — confirm in similarity.SimilarityChecker).
checker = SimilarityChecker(books)
checker.calculate_all_similarities()


In [None]:
# Show the similarity results computed in the previous cell.
checker.display_matrix()