## Scraping using bs4: CSS Selector

#### select() : Uses CSS selector and returns all the matching ones in [] (like find_all())
#### select_one() : Uses CSS selector and returns single matching one (like find()). 

In [15]:
from bs4 import BeautifulSoup as BSoup
import requests
import csv
import json
url = "http://quotes.toscrape.com/tag/hope/page/"
columns=['id','author','quote','tags','quote_length','born_date','born_location','author_url'] #Columns for CSV header

In [16]:
authorSet=dict()
dataSet=list()
page=1
nextPage=True
uid=0

In [17]:
while (nextPage):
    
    print(url+str(page))
    response = requests.get(url+str(page))
    source = BSoup(response.content)    
    print(source.select_one('title').get_text())
    
    if source.select_one('ul.pager li.next'):
        txtNext = source.select_one('ul.pager li.next a').get_text()
        print(f"Processing {page} {nextPage} {txtNext}") 
    else:
        txtNext=None
    
    if txtNext and re.findall(r".*(Next).*",txtNext)[0]=="Next":    
        nextPage=True
    else:
        nextPage=False
        
    print(f"Processing {page} {nextPage}") 
    
    for quotes in source.select('div.quote'):
        quote = quotes.select_one('[itemprop*="text"]').get_text().strip()
        author = quotes.select_one('.author').get_text().strip()
        tags = quotes.select_one('[itemprop*="keywords"]').get('content').strip()
        authorUrl = quotes.select_one('[href*="/author/"]').get('href')
        
        if authorUrl:
            print(authorUrl)
            authorKey = author.replace('.','_').replace(' ','_').strip()
            print(authorKey)
            
        if authorUrl and authorKey not in authorSet.keys():
            authorUrl = "http://quotes.toscrape.com"+authorUrl
            print(authorUrl)
            source_author = BSoup(requests.get(authorUrl).content)
            bornDate = source_author.select_one('.author-born-date').get_text().strip()
            bornLocation = source_author.select_one('.author-born-location').get_text().replace('in','').strip()
            authorSet[authorKey]={'name':author,'url':authorUrl,'date':bornDate,'location':bornLocation}
        else:
            print(f"Author ({authorKey}) details already found!")
            
        uid+=1    
        dataSet.append([uid,author,quote,tags.replace(',','|'),len(quote),
                        authorSet[authorKey]['date'],
                        authorSet[authorKey]['location'],
                        authorSet[authorKey]['url']            
        ]) 
    page+=1

http://quotes.toscrape.com/tag/hope/page/1
Quotes to Scrape
Processing 1 False
/author/Martin-Luther-King-Jr
Martin_Luther_King_Jr_
http://quotes.toscrape.com/author/Martin-Luther-King-Jr
/author/John-Lennon
John_Lennon
http://quotes.toscrape.com/author/John-Lennon


In [18]:
authorSet

{'Martin_Luther_King_Jr_': {'name': 'Martin Luther King Jr.',
  'url': 'http://quotes.toscrape.com/author/Martin-Luther-King-Jr',
  'date': 'January 15, 1929',
  'location': 'Atlanta, Georgia, The United States'},
 'John_Lennon': {'name': 'John Lennon',
  'url': 'http://quotes.toscrape.com/author/John-Lennon',
  'date': 'October 09, 1940',
  'location': 'Liverpool, England, The United Kgdom'}}

In [19]:
dataSet

[[1,
  'Martin Luther King Jr.',
  '“Only in the darkness can you see the stars.”',
  'hope|inspirational',
  45,
  'January 15, 1929',
  'Atlanta, Georgia, The United States',
  'http://quotes.toscrape.com/author/Martin-Luther-King-Jr'],
 [2,
  'John Lennon',
  "“You may say I'm a dreamer, but I'm not the only one. I hope someday you'll join us. And the world will live as one.”",
  'beatles|connection|dreamers|dreaming|dreams|hope|inspirational|peace',
  117,
  'October 09, 1940',
  'Liverpool, England, The United Kgdom',
  'http://quotes.toscrape.com/author/John-Lennon']]

## Write to CSV

In [20]:
def writeto_csv(data,filename,columns):
    with open(filename,'w+',newline='',encoding="UTF-8") as file:
        writer = csv.DictWriter(file, fieldnames=columns)
        writer.writeheader()
        writer = csv.writer(file)
        for element in data:
            writer.writerows([element])

### Create CSV file from dataSet

In [21]:
writeto_csv(dataSet,'quotes_hope.csv',columns)

### Write to JSON - author

In [22]:
with open("quotes_author_hope.json", "w") as file:
    json.dump(authorSet, file, indent=4, sort_keys=False)