In [1]:
from pyquery import PyQuery as pq
import requests
import csv
url = "http://quotes.toscrape.com/tag/books/page/"
columns=['id','author','quote','tags','quote_length','born_date','born_location','author_url'] #Columns for CSV header

In [2]:
authorSet=dict()
dataSet=list()
page=1
nextPage=True
uid=0

In [3]:
while (nextPage):
    
    print(url+str(page))
    response = requests.get(url+str(page))
    source = pq(response.content)    
    print(source.find('title').text())
    
    if source.find('ul.pager li.next a:contains("Next")'):    
        nextPage=True
    else:
        nextPage=False
        
    print(f"Processing {page}") 
    for quotes in source.find('.quote').items():
        quote = quotes.find('[itemprop="text"]').text().strip()
        author = quotes.find('[itemprop="author"]').text().strip()
        tags = quotes.find('[itemprop="keywords"]').attr('content').strip()
        authorUrl = quotes.find('a[href*="/author/"]').attr('href')
        
        if authorUrl:
            print(authorUrl)
            authorKey = author.replace('.','_').replace(' ','_').strip()
            print(authorKey)
            
        if authorUrl and authorKey not in authorSet.keys():
            authorUrl = "http://quotes.toscrape.com"+authorUrl
            print(authorUrl)
            source_author = pq(requests.get(authorUrl).content)
            bornDate = source_author.find('.author-born-date').text()
            bornLocation = source_author.find('.author-born-location').text().replace('in','').strip()
            authorSet[authorKey]={'name':author,'url':authorUrl,'date':bornDate,'location':bornLocation}
        else:
            print(f"Author ({authorKey}) details already found!")
            
        uid+=1    
        dataSet.append([uid,author,quote,tags.replace(',','|'),len(quote),
                        authorSet[authorKey]['date'],
                        authorSet[authorKey]['location'],
                        authorSet[authorKey]['url']            
        ]) 
    page+=1  

http://quotes.toscrape.com/tag/books/page/1
Quotes to Scrape
Processing 1
/author/Jane-Austen
Jane_Austen
http://quotes.toscrape.com/author/Jane-Austen
/author/Mark-Twain
Mark_Twain
http://quotes.toscrape.com/author/Mark-Twain
/author/Jorge-Luis-Borges
Jorge_Luis_Borges
http://quotes.toscrape.com/author/Jorge-Luis-Borges
/author/C-S-Lewis
C_S__Lewis
http://quotes.toscrape.com/author/C-S-Lewis
/author/Haruki-Murakami
Haruki_Murakami
http://quotes.toscrape.com/author/Haruki-Murakami
/author/Ernest-Hemingway
Ernest_Hemingway
http://quotes.toscrape.com/author/Ernest-Hemingway
/author/J-D-Salinger
J_D__Salinger
http://quotes.toscrape.com/author/J-D-Salinger
/author/Mark-Twain
Mark_Twain
Author (Mark_Twain) details already found!
/author/Jane-Austen
Jane_Austen
Author (Jane_Austen) details already found!
/author/Madeleine-LEngle
Madeleine_L'Engle
http://quotes.toscrape.com/author/Madeleine-LEngle
http://quotes.toscrape.com/tag/books/page/2
Quotes to Scrape
Processing 2
/author/George-R-R-Mar

In [4]:
dataSet

[[1,
  'Jane Austen',
  '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
  'aliteracy|books|classic|humor',
  104,
  'December 16, 1775',
  'Steventon Rectory, Hampshire, The United Kgdom',
  'http://quotes.toscrape.com/author/Jane-Austen'],
 [2,
  'Mark Twain',
  '“Good friends, good books, and a sleepy conscience: this is the ideal life.”',
  'books|contentment|friends|friendship|life',
  76,
  'November 30, 1835',
  'Florida, Missouri, The United States',
  'http://quotes.toscrape.com/author/Mark-Twain'],
 [3,
  'Jorge Luis Borges',
  '“I have always imagined that Paradise will be a kind of library.”',
  'books|library',
  65,
  'August 24, 1899',
  'Buenos Aires, Argenta',
  'http://quotes.toscrape.com/author/Jorge-Luis-Borges'],
 [4,
  'C.S. Lewis',
  '“You can never get a cup of tea large enough or a book long enough to suit me.”',
  'books|inspirational|reading|tea',
  79,
  'November 29, 1898',
  'Belfast, Ireland',
  '

In [5]:
authorSet

{'Jane_Austen': {'name': 'Jane Austen',
  'url': 'http://quotes.toscrape.com/author/Jane-Austen',
  'date': 'December 16, 1775',
  'location': 'Steventon Rectory, Hampshire, The United Kgdom'},
 'Mark_Twain': {'name': 'Mark Twain',
  'url': 'http://quotes.toscrape.com/author/Mark-Twain',
  'date': 'November 30, 1835',
  'location': 'Florida, Missouri, The United States'},
 'Jorge_Luis_Borges': {'name': 'Jorge Luis Borges',
  'url': 'http://quotes.toscrape.com/author/Jorge-Luis-Borges',
  'date': 'August 24, 1899',
  'location': 'Buenos Aires, Argenta'},
 'C_S__Lewis': {'name': 'C.S. Lewis',
  'url': 'http://quotes.toscrape.com/author/C-S-Lewis',
  'date': 'November 29, 1898',
  'location': 'Belfast, Ireland'},
 'Haruki_Murakami': {'name': 'Haruki Murakami',
  'url': 'http://quotes.toscrape.com/author/Haruki-Murakami',
  'date': 'January 12, 1949',
  'location': 'Kyoto, Japan'},
 'Ernest_Hemingway': {'name': 'Ernest Hemingway',
  'url': 'http://quotes.toscrape.com/author/Ernest-Hemingwa

### Write to CSV

In [6]:
def writeto_csv(data,filename,columns):
    with open(filename,'w+',newline='',encoding="UTF-8") as file:
        writer = csv.DictWriter(file, fieldnames=columns)
        writer.writeheader()
        writer = csv.writer(file)
        for element in data:
            writer.writerows([element])

#### Create CSV file from dataSet

In [7]:
writeto_csv(dataSet,'quotes.csv',columns)

### Write to JSON - author

In [8]:
import json

with open("quotes_author.json", "w") as file:
    json.dump(authorSet, file, indent=4, sort_keys=False)