### Get Book Title and Plot  

In [1]:
from threading import Thread

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import wikipediaapi
import wptools

In [2]:
book = pd.read_csv('book.csv')
book = book['name']

In [3]:
wiki_wiki = wikipediaapi.Wikipedia('en')
def get_plot(bookname, firstSection=True):
    try:
        if firstSection :
            p = wiki_wiki.page(bookname)
            sections = p.sections
            s = sections[0]
            return [s.title, s.text]
        else:
            p = wiki_wiki.page(bookname)
            sections = p.sections
            titles = []
            texts = []
            for s in sections:
                titles.append(s.title)
                texts.append(s.text)
            return dict(zip(titles, texts))  
    except:
        pass

def get_wiki(books, book_dict=None, firstSection=True):
    if book_dict is None:
        book_dict = {}
    for i in books:
        book_dict[i] = get_plot(i,firstSection)
    return book_dict

In [4]:
# thread process
def threaded_process(books,nthreads, firstSection=True):
    book_dict = {}
    threads = []
    for i in range(nthreads):
        b = books[i::nthreads]
        t = Thread(target = get_wiki, args =(b,book_dict,firstSection))
        threads.append(t)
    
    # start the threads
    [ t.start() for t in threads ]
    # wait for the threads to finish
    [ t.join() for t in threads ]
    return book_dict

In [6]:
print('Currently, there are %.0f books in the list' % (len(book)))

Currently, there are 35991 books in the list


In [19]:
data = threaded_process(book,100) 

In [20]:
data = pd.DataFrame.from_dict(data).T

data.head(3)

Unnamed: 0,0,1
Future Perfect (book),Key concepts,The main idea that Johnson promotes in Future ...
The Road (London book),Film adaptation,"The 1973 film Emperor of the North Pole, starr..."
Encyclopedia of Art,Chapters,"Introduction\nGuide to Use\n1. Main body, 2537..."


In [21]:
data.columns = ['section', 'content']

Check how many null values are included and remove them.

In [22]:
data.isnull().sum(axis=0)

section    16324
content    16324
dtype: int64

In [23]:
# remove invalid rows
data = data.dropna(how='any')

Now we need to determine if the first section is the plot.

In [24]:
names = data['section'].unique()
data.section.value_counts()[:50]

Plot summary               3579
Plot                       2892
Contents                   1321
Synopsis                   1118
References                  992
Summary                     871
Background                  836
Plot introduction           718
Reception                   465
See also                    463
Overview                    452
External links              283
Content                     242
History                     191
Characters                  176
Description                 142
Publication history         142
Stories                     133
Publication                  97
Setting                      95
Plot synopsis                95
Notes                        86
Reviews                      83
Awards                       79
Title                        76
Editions                     73
Story                        62
Themes                       58
Author                       57
Critical reception           53
Development                  52
Structur

We can categorize those sections into three types,
- Type one, the first section is plot. For example, when the section title is Plot Summary, Synopsis. 
- Type two, the first section is background of a book, which implies the plot might follow it and be the second or third section of a wiki page, so we need to explore the whole wiki page instead of only the first section.
- Type three, the plot is skipped, for example, when the first section title is References, Reception.

In [25]:
# Type one
plots = ['Plot summary','Plot', 'Contents', 'Synopsis', 'Summary', 'Plot introduction',\
         'Overview', 'Content', 'Description', 'Plot synopsis ', 'Context', 'Plot outline', 'Story outline',
         'Plot Summary', 'Chapters', 'Story', 'Book']

len(data[data.section.isin(plots)])/len(data)


0.5894137387501907

The type one only accounts for 58.9% of the whole dataset. Let's convert it into a clean dataset.

In [26]:
df = data[data.section.isin(plots)].reset_index()
df = df.drop('section', 1) 
df.columns = ['Title', 'Plot']

Now, let's process books of type two.

In [35]:
special = ['Background', 'History', 'Characters', 'Setting','Title', 'Development', \
           'Structure',  'Introduction', 'Inspiration']

print('Books in type two account for %.0f%% of the entire dataset' % (100*len(data[data.section.isin(special)])/len(data)))

Books in type two account for 8% of the entire dataset


Here we scrape the whole page of these books and get plot sections.

In [37]:
specialwiki = list(data[data.section.isin(special)].index)
specialbook = threaded_process(specialwiki, 100, firstSection=False)
specialbook =  pd.DataFrame(specialbook.items()) 

special_book_name = []
special_book_plot = []
for i in range(len(specialbook)):
    if specialbook.iloc[i,1]:
        for k,v in specialbook.iloc[i,1].items():
            if k in plots:
                special_book_name.append(specialbook.iloc[i,0])
                special_book_plot.append(v)
                continue

In [38]:
special_df= pd.DataFrame(
    {'Title': special_book_name,
     'Plot': special_book_plot,
    }) 

In [39]:
len(special_df)

900

In [40]:
df = df.append(special_df)

In [41]:
len(df)

12492

In [42]:
df.head(5)

Unnamed: 0,Title,Plot
0,Encyclopedia of Art,"Introduction\nGuide to Use\n1. Main body, 2537..."
1,No Time Like the Present,The novel is set during the period after the l...
2,Time of the Dragon,Time of the Dragon is an accessory for the Dra...
3,Hero Builder's Guidebook,Hero Builder's Guidebook provides assistance t...
4,Brother Dusty-Feet,Hugh Copplestone is an orphaned eleven-year-ol...


Now we get a very clean dataset containing 12492 wiki pages.

In [43]:
df.to_csv('plot.csv')

### Clean Data