In [1]:
import ebooklib
from ebooklib import epub
import bs4 as bs
import pandas as pd
import csv
from data_extract_clean import clean_text
from sklearn.feature_extraction.text import CountVectorizer
import requests
from fake_useragent import UserAgent
import pandas as pd
from data_extract_clean import clean_text, get_links, find_text

## Extract eBook and clean text into corpus

### By Chapter

In [2]:
# Location of the combined Wheel of Time EPUB.
# NOTE: this is an absolute, machine-specific path; adjust it for your environment.
EPUB_PATH = '/Users/stephan/Data_Science/Metis/Bootcamp/project_4/Book/the_wheel_of_time.epub'
book = epub.read_epub(EPUB_PATH)

The eBook contains differently formatted individual books, so I am defining exclusions and file names that will be treated differently.

In [3]:
# Special files and exclusions. The eBook contains differently formatted individual books.

# Title pages are not added to the corpus; they are only used to read the name
# of the book that the following documents belong to.
title_pages = ['OEBPS/9781429961530_tp01.xhtml','OEBPS/title_2.xhtml','OEBPS/title_3.xhtml',
               'OEBPS/title_4.xhtml','OEBPS/e9781429960199_tp01.xhtml','OEBPS/title_6.xhtml','OEBPS/title_8.xhtml',
               'OEBPS/title_9.xhtml','OEBPS/title_10.xhtml','OEBPS/title_11.xhtml','OEBPS/title_12.xhtml',
               'OEBPS/title_13.xhtml','OEBPS/9780765325945_tp01.xhtml','OEBPS/title_15.xhtml','OEBPS/title_7.xhtml']


# Flatten every cell of exclusions.csv into one list of item names to skip.
# 'utf-8-sig' strips a leading byte-order mark if present; newline='' lets the
# csv module do its own line-ending handling, as its documentation requires.
with open('../data_files/exclusions.csv', encoding='utf-8-sig', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    exclusions = [item for sublist in reader for item in sublist]

In [4]:
# CSS class of the <h1> holding the book title, for title pages whose markup
# differs from the default layout.
TITLE_CLASS_BY_PAGE = {
    'OEBPS/9781429961530_tp01.xhtml': 'tbook-title',
    'OEBPS/e9781429960199_tp01.xhtml': 'tbook-title',
    'OEBPS/9780765325945_tp01.xhtml': 'tbook-title',
    'OEBPS/title_15.xhtml': 'tbook-title',
    'OEBPS/title_9.xhtml': 'h1',
}
DEFAULT_TITLE_CLASS = 'book-title'

items = []
# Reset on every run so a re-executed cell cannot silently reuse the last
# title left over from a previous execution.
book_name = None
for item in book.get_items():
    name = item.get_name()
    # Only (X)HTML documents that are not explicitly excluded are of interest.
    if item.get_type() != ebooklib.ITEM_DOCUMENT or name in exclusions:
        continue

    soup = bs.BeautifulSoup(item.get_body_content(), 'lxml')
    if name in title_pages:
        # A title page only updates the "current book"; it adds no row itself.
        title_class = TITLE_CLASS_BY_PAGE.get(name, DEFAULT_TITLE_CLASS)
        book_name = soup.find('h1', {'class': title_class}).text
    else:
        if book_name is None:
            raise ValueError(f'{name} appears before any title page; cannot assign a book name')
        items.append({
            'book_name': book_name,
            'item_name': name,
            'text': soup.get_text(separator=' ', strip=True),
        })

In [30]:
# One row per extracted document, with columns book_name, item_name and text.
df = pd.DataFrame(items)

The final book has a chapter that was split into two files. I am re-combining it below and dropping the extra row.

In [31]:
# Chapter 37 of the final book is stored as two EPUB documents; merge the
# second half into the first and drop the now-redundant row.
is_first_half = df.item_name == "OEBPS/chapter37_15.xhtml"
is_second_half = df.item_name == "OEBPS/chapter37_15a.xhtml"

# .item() raises unless exactly one row matches, which guards against a
# missing or duplicated chapter file.
chap_15_37 = df.loc[is_first_half, 'text'].item()
chap_15_37a = df.loc[is_second_half, 'text'].item()
# The extracted text is stripped at both ends, so join with a space; otherwise
# the last word of the first half fuses with the first word of the second.
combined = chap_15_37 + ' ' + chap_15_37a

df.loc[is_first_half, 'text'] = combined
i = df[is_second_half].index

df.drop(i, inplace=True)
# Renumber rows 0..n-1 (the old index is kept as an 'index' column).
df.reset_index(inplace=True)

Cleaning the text involves removing punctuation, lowercasing, and expanding contractions.

In [32]:
# Apply the project's clean_text helper (imported from data_extract_clean) to every row's text.
df['clean_text'] = df.text.apply(clean_text)

I manually gathered some metadata and collected it into an Excel file.

In [33]:
# Load the hand-curated metadata: columns A through H of the 'Chapters' sheet.
meta_data = pd.read_excel('../data_files/chapters_and_pov.xlsx', sheet_name='Chapters', 
                          engine='openpyxl', usecols='A:H')

In [34]:
# DataFrame.join aligns on the row index, so this pairs row k of df with row k
# of meta_data. NOTE(review): presumably the sheet lists chapters in the same
# order as the extracted rows; verify the two frames have matching lengths.
df2 = df.join(meta_data)

In [36]:
# Persist the extracted text together with its metadata for later use.
df2.to_pickle('../data_files/extracted_text.pickle')

In [10]:
# Chapter-level corpus: the cleaned text column as a Series, saved to disk.
chapter_corpus = df2.clean_text
chapter_corpus.to_pickle('../data_files/chapter_corpus.pickle')

### Create corpus for full books

In [11]:
# Distinct book titles in order of first appearance.
# NOTE(review): book_title presumably comes from the metadata sheet (the
# extracted frame's own column is named book_name); confirm.
book_names = df2.book_title.unique()

In [12]:
# Build one document per book by joining the cleaned text of all its chapters.
book_dict = {}
# Loop names are chosen so they do not overwrite the notebook-level `book`
# (the EPUB object) or `df` (the chapter frame).
for title in book_names:
    chapter_texts = df2.loc[df2.book_title == title, 'clean_text']
    # Join with a space so the last word of one chapter does not fuse with the
    # first word of the next.
    book_dict[title] = ' '.join(chapter_texts)

# Index is the book title; the single column holds the combined text.
book_df = pd.DataFrame.from_dict(book_dict, orient='index')
book_df.columns = ['clean_text']
book_corpus = book_df.clean_text
book_corpus.to_pickle('../data_files/book_corpus.pickle')

## Extract chapter summaries

Extracting summaries from the [WOT Re-Read by Leigh Butler](https://www.tor.com/series/wot-reread/)

In [13]:
# Fetch the re-read index page using a randomly chosen browser User-Agent.
url = 'https://www.tor.com/features/series/wot-reread/'
ua = UserAgent()
user_agent = {'User-agent': ua.random}
# requests has no default timeout; without one a stalled connection would hang
# the notebook indefinitely.
response = requests.get(url, headers=user_agent, timeout=30)

In [14]:
# Sanity check: the index page request should have returned HTTP 200.
response.status_code

200

In [15]:
# Inspection of the re-read website showed only the first 337 links are relevant.
N_RELEVANT_LINKS = 337
links = get_links(response)[:N_RELEVANT_LINKS]

In [16]:
# Visit every re-read post and collect the section that follows each
# "What Happens" heading.
summaries = []
for counter, link in enumerate(links):
    # A separate name keeps the index-page `response` intact; the timeout
    # prevents one stalled request from hanging the whole scrape.
    page = requests.get(link, headers=user_agent, timeout=30)
    if not page.ok:
        # An error page has no summaries to extract; report it and move on.
        print(counter, link, f'skipped (HTTP {page.status_code})')
        continue
    soup = bs.BeautifulSoup(page.text, 'lxml')
    # `string=` is the current name of the argument formerly called `text=`.
    for p in soup.find_all(string='What Happens'):
        summaries.append(find_text(links, link, p))
    print(counter, link)

0 https://www.tor.com/2009/01/20/the-wheel-of-time-re-read-the-eye-of-the-world-part-1/
1 https://www.tor.com/2009/01/23/the-wheel-of-time-re-read-the-eye-of-the-world-part-2/
2 https://www.tor.com/2009/01/27/the-wheel-of-time-re-read-the-eye-of-the-world-part-3/
3 https://www.tor.com/2009/01/30/the-wheel-of-time-re-read-the-eye-of-the-world-part-4/
4 https://www.tor.com/2009/02/02/the-wheel-of-time-re-read-the-eye-of-the-world-part-5/
5 https://www.tor.com/2009/02/04/the-wheel-of-time-re-read-the-eye-of-the-world-part-6/
6 https://www.tor.com/2009/02/06/the-wheel-of-time-re-read-the-eye-of-the-world-part-7/
7 https://www.tor.com/2009/02/09/the-wheel-of-time-re-read-the-great-hunt-part-1/
8 https://www.tor.com/2009/02/11/the-wheel-of-time-re-read-the-great-hunt-part-2/
9 https://www.tor.com/2009/02/13/the-wheel-of-time-re-read-the-great-hunt-part-3/
10 https://www.tor.com/2009/02/16/the-wheel-of-time-re-read-the-great-hunt-part-4/
11 https://www.tor.com/2009/02/18/the-wheel-of-time-re-

96 https://www.tor.com/2009/10/07/the-wheel-of-time-re-read-lord-of-chaos-part-26/
97 https://www.tor.com/2009/10/09/the-wheel-of-time-re-read-lord-of-chaos-part-27/
98 https://www.tor.com/2009/10/12/the-wheel-of-time-re-read-lord-of-chaos-part-28/
99 https://www.tor.com/2009/10/14/the-wheel-of-time-re-read-lord-of-chaos-part-29/
100 https://www.tor.com/2009/10/16/the-wheel-of-time-re-read-lord-of-chaos-part-30/
101 https://www.tor.com/2009/10/19/the-wheel-of-time-re-read-lord-of-chaos-part-31/
102 https://www.tor.com/2009/12/07/the-wheel-of-time-re-read-a-crown-of-swords-part-1/
103 https://www.tor.com/2009/12/11/the-wheel-of-time-re-read-a-crown-of-swords-part-2/
104 https://www.tor.com/2009/12/14/the-wheel-of-time-re-read-a-crown-of-swords-part-3/
105 https://www.tor.com/2009/12/18/the-wheel-of-time-re-read-a-crown-of-swords-part-4/
106 https://www.tor.com/2010/01/04/the-wheel-of-time-re-read-a-crown-of-swords-part-5/
107 https://www.tor.com/2010/01/08/the-wheel-of-time-re-read-a-cr

190 https://www.tor.com/2011/02/18/the-wheel-of-time-re-read-new-spring-part-4/
191 https://www.tor.com/2011/02/22/the-wheel-of-time-re-read-new-spring-part-5/
192 https://www.tor.com/2011/02/25/the-wheel-of-time-re-read-new-spring-part-6/
193 https://www.tor.com/2011/03/04/the-wheel-of-time-re-read-new-spring-part-7/
194 https://www.tor.com/2011/03/08/the-wheel-of-time-re-read-new-spring-part-8/
195 https://www.tor.com/2011/03/11/the-wheel-of-time-re-read-new-spring-part-9/
196 https://www.tor.com/2011/03/15/the-wheel-of-time-re-read-new-spring-part-10/
197 https://www.tor.com/2011/03/22/the-wheel-of-time-re-read-new-spring-part-11/
198 https://www.tor.com/2011/03/29/the-wheel-of-time-re-read-new-spring-part-12/
199 https://www.tor.com/2011/04/05/the-wheel-of-time-re-read-new-spring-part-13/
200 https://www.tor.com/2011/04/12/the-wheel-of-time-re-read-knife-of-dreams-part-1/
201 https://www.tor.com/2011/04/26/the-wheel-of-time-re-read-knife-of-dreams-part-2/
202 https://www.tor.com/20

284 https://www.tor.com/2013/03/19/the-wheel-of-time-re-read-a-memory-of-light-part-7/
285 https://www.tor.com/2013/03/26/the-wheel-of-time-re-read-a-memory-of-light-part-8/
286 https://www.tor.com/2013/04/02/the-wheel-of-time-re-read-a-memory-of-light-part-9/
287 https://www.tor.com/2013/04/16/the-wheel-of-time-re-read-a-memory-of-light-part-10/
288 https://www.tor.com/2013/04/30/the-wheel-of-time-re-read-a-memory-of-light-part-11/
289 https://www.tor.com/2013/05/07/the-wheel-of-time-re-read-a-memory-of-light-part-12/
290 https://www.tor.com/2013/05/14/the-wheel-of-time-re-read-a-memory-of-light-part-13/
291 https://www.tor.com/2013/05/21/the-wheel-of-time-re-read-a-memory-of-light-part-14/
292 https://www.tor.com/2013/05/28/the-wheel-of-time-re-read-a-memory-of-light-part-15/
293 https://www.tor.com/2013/06/04/the-wheel-of-time-re-read-a-memory-of-light-part-16/
294 https://www.tor.com/2013/06/11/the-wheel-of-time-re-read-a-memory-of-light-part-17/
295 https://www.tor.com/2013/06/18/

In [17]:
# Collect the scraped summaries into a frame and cache it on disk so the
# scrape does not have to be repeated.
df = pd.DataFrame(summaries)
df.to_pickle('../data_files/re_read.pickle')

### Align chapters with book corpus and clean

I manually aligned the chapters in Excel by exporting the DataFrame to an Excel file, cleaning it up, and saving the results in the "reread" sheet of "chapters_and_pov.xlsx". The cleaned file is available, so this step does not need to be repeated.

In [18]:
# df.to_excel('../data_files/reread.xlsx')

In [19]:
# Load the manually aligned summaries: columns A through E of the 'reread' sheet.
df_updated = pd.read_excel('../data_files/chapters_and_pov.xlsx', sheet_name='reread', engine='openpyxl', usecols='A:E')

In [20]:
# Combine cells that were too large to do in Excel
last_battle_text = df_updated[df_updated.chapter_title.str.contains('Last Battle')]['text'].str.cat(sep = ' ')
df_updated.loc[df_updated.chapter_reread == 'Chapter 37: The Last Battle (Part 1)','text'] = last_battle_text
i = df_updated[df_updated.chapter_reread.str.contains('Last Battle \[')].index
df_updated.drop(i, inplace=True)

In [21]:
# Merge the split 'Embers Falling on Dry Grass' prologue: the combined text is
# written to part 1 and the part 2 row is removed.
embers_rows = df_updated.chapter_title.str.contains('Embers Falling')
kod_pr_text = df_updated.loc[embers_rows, 'text'].str.cat(sep=' ')
df_updated.loc[df_updated.chapter_title == 'Prologue: Embers Falling on Dry Grass 1', 'text'] = kod_pr_text
i = df_updated.index[df_updated.chapter_title == 'Prologue: Embers Falling on Dry Grass 2']
df_updated.drop(i, inplace=True)

In [22]:
# Merge the split 'Glimmers of the Pattern' prologue: the combined text is
# written to part 1 and the part 2 row is removed.
glimmers_rows = df_updated.chapter_title.str.contains('Prologue: Glimmers')
cot_pr_text = df_updated.loc[glimmers_rows, 'text'].str.cat(sep=' ')
df_updated.loc[df_updated.chapter_title == 'Prologue: Glimmers of the Pattern 1', 'text'] = cot_pr_text
i = df_updated.index[df_updated.chapter_title == 'Prologue: Glimmers of the Pattern 2']
df_updated.drop(i, inplace=True)

In [23]:
# Merge the split 'Snow' prologue: the combined text is written to part 1 and
# the part 2 row is removed.
snow_rows = df_updated.chapter_title.str.contains('Prologue: Snow')
wh_pr_text = df_updated.loc[snow_rows, 'text'].str.cat(sep=' ')
df_updated.loc[df_updated.chapter_title == 'Prologue: Snow 1', 'text'] = wh_pr_text
i = df_updated.index[df_updated.chapter_title == 'Prologue: Snow 2']
df_updated.drop(i, inplace=True)

In [24]:
# Merge the split 'The First Message' prologue: the combined text is written to
# part 1 and the part 2 row is removed.
first_message_rows = df_updated.chapter_title.str.contains('Prologue: The First Message')
loc_pr_text = df_updated.loc[first_message_rows, 'text'].str.cat(sep=' ')
df_updated.loc[df_updated.chapter_title == 'Prologue: The First Message 1', 'text'] = loc_pr_text
i = df_updated.index[df_updated.chapter_title == 'Prologue: The First Message 2']
df_updated.drop(i, inplace=True)

In [25]:
# reset_index returns a new frame, so the result must be assigned back;
# otherwise the gaps left by the dropped rows stay in the index.
df_updated = df_updated.reset_index(drop=True)
# Apply the project's clean_text helper to every summary.
df_updated['clean_text'] = df_updated.text.apply(clean_text)

In [26]:
# Re-read corpus: the cleaned summary text as a Series, saved to disk.
reread_corpus = df_updated.clean_text
reread_corpus.to_pickle('../data_files/reread_corpus.pickle')