### Web Scraping Example - Form handling (Login), Quotes found across pages & Logout.

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import re

# Location of the ChromeDriver executable; adjust the path as required.
# A raw string keeps the Windows backslashes literal, so sequences such as
# "\H", "\d" and "\c" are not parsed as (invalid) escape sequences.
chromedriver_path = r"C:\HOWScraping2E\driver\chromedriver.exe"

#### Prepare a 'driver' to use with Chrome

In [2]:
# Build the ChromeDriver service from the configured executable location.
# The keyword must be `executable_path`; passing the path as `service=` does
# not hand it to the driver service.
service = Service(executable_path=chromedriver_path)

# Start a Chrome browser session that is controlled through that service.
driver = webdriver.Chrome(service=service)

#### URL : https://toscrape.com/

In [3]:
# Entry page of the scraping sandbox; later cells navigate from here.
mainUrl = "https://toscrape.com/"

In [4]:
# Load the entry page in the browser session.
driver.get(mainUrl)

#### Choose the link with text 'Login'

In [5]:
# Locate the anchor whose visible text is exactly 'Login', then read its target URL.
login_anchor = driver.find_element(By.LINK_TEXT, "Login")
loginPage = login_anchor.get_attribute('href')

In [6]:
# Show the extracted login URL as the cell output.
loginPage

'http://quotes.toscrape.com/login'

In [7]:
# Follow the 'Login' link so the browser lands on the login form.
driver.find_element(By.LINK_TEXT, "Login").click()

#### Form Handling - login

In [8]:
print(f"Before Login : {driver.current_url}")

# Locate both credential inputs, then clear and fill each one in turn.
username = driver.find_element(By.ID, "username")
password = driver.find_element(By.ID, "password")
for field, value in ((username, "test"), (password, "test")):
    field.clear()
    field.send_keys(value)

# Submit the form through the element carrying the 'btn' class.
submit_button = driver.find_element(By.CLASS_NAME, 'btn')
submit_button.click()

# Record the URL the browser reports after the form is submitted.
quotesUrl = driver.current_url
print(f"After Login : {quotesUrl}")

# Keep the 'Logout' link target so the session can be ended later.
logout_anchor = driver.find_element(By.LINK_TEXT, "Logout")
logoutUrl = logout_anchor.get_attribute('href')
print(f"Logout : {logoutUrl}")

Before Login : http://quotes.toscrape.com/login
After Login : http://quotes.toscrape.com/
Logout : http://quotes.toscrape.com/logout


#### Verify if the link is quotes.toscrape.com

In [9]:
# Initialise the shared scraping state unconditionally, so the following
# cells can always reference these names even when the URL check fails.
pagination = False  # default; switched on once quotes are found
column = ['author', 'quote', 'tags', 'tag_count', 'author_url', 'goodread_url']
dataSet = []        # one list per scraped quote, in `column` order
count = 0           # running number of quotes collected
quotesCheck = []    # quote elements found on the current page

if 'quotes.toscrape.com' in quotesUrl:
    # Probe the page for quote blocks before starting the pagination loop.
    quotesCheck = driver.find_elements(By.CSS_SELECTOR, "div.row .quote")
    print(f"Total Quotes : {len(quotesCheck)}")
else:
    print("URL for quotes is not found!")

Total Quotes : 10


In [10]:
# Walk through every listing page, collecting one row per quote, until no
# 'next' link is left or an unexpected error stops the run.
if len(quotesCheck) > 0:
    pagination = True
    page = 1
    while pagination:
        try:
            print(f"Processing Page {page} -- {(count+1)}")
            quotes = driver.find_elements(By.CSS_SELECTOR, "div.row .quote")
            # Iterate the quotes available in the page
            for quote in quotes:
                content = quote.find_element(By.CSS_SELECTOR, '.text').text
                author = quote.find_element(By.CLASS_NAME, "author").text
                authorLink = quote.find_element(By.PARTIAL_LINK_TEXT, "about").get_attribute('href')
                authorGoodread = quote.find_element(By.PARTIAL_LINK_TEXT, "Goodread").get_attribute('href')
                # get_attribute() yields None when the attribute is missing;
                # fall back to '' so the tag handling below cannot raise.
                tags = quote.find_element(By.TAG_NAME, 'meta').get_attribute('content') or ''
                print(f"Data..{author} -- {content[0:20]} -- {tags}")

                # Count the non-empty comma-separated tags ('' -> 0, 'a' -> 1, 'a,b' -> 2).
                tag_count = len([tag for tag in tags.split(',') if tag])

                # Add values to dataSet
                count += 1
                dataSet.append([author, content, tags, tag_count, authorLink, authorGoodread])

            try:
                # Check for Pagination with text 'next' in the Listing page
                driver.find_element(By.CSS_SELECTOR, 'li.next a').click()
                page += 1
            except NoSuchElementException:
                # No 'next' link: the last page has been processed.
                pagination = False
                print(f"Further Pagination is not possible, currently at {page}")
        except Exception as e:
            # Best effort: report the problem and stop paginating, keeping the
            # rows gathered so far.
            print(f"Exception Occured: {str(e)}")
            pagination = False
# print(dataSet)

Processing Page 1 -- 1
Data..Albert Einstein -- “The world as we hav -- change,deep-thoughts,thinking,world
Data..J.K. Rowling -- “It is our choices,  -- abilities,choices
Data..Albert Einstein -- “There are only two  -- inspirational,life,live,miracle,miracles
Data..Jane Austen -- “The person, be it g -- aliteracy,books,classic,humor
Data..Marilyn Monroe -- “Imperfection is bea -- be-yourself,inspirational
Data..Albert Einstein -- “Try not to become a -- adulthood,success,value
Data..André Gide -- “It is better to be  -- life,love
Data..Thomas A. Edison -- “I have not failed.  -- edison,failure,inspirational,paraphrased
Data..Eleanor Roosevelt -- “A woman is like a t -- misattributed-eleanor-roosevelt
Data..Steve Martin -- “A day without sunsh -- humor,obvious,simile
Processing Page 2 -- 11
Data..Marilyn Monroe -- “This life is what y -- friends,heartbreak,inspirational,life,love,sisters
Data..J.K. Rowling -- “It takes a great de -- courage,friends
Data..Albert Einstein -- “If you can

#### Logout of the system

In [11]:
# Visit the logout URL captured right after login to end the session,
# then report where the browser ends up.
driver.get(logoutUrl)
print(f"Current URL: {driver.current_url}")

Current URL: http://quotes.toscrape.com/


#### With the final dataSet in hand, let's write the data to CSV and JSON files.

In [12]:
# Report how many quote rows were collected across all pages.
print(f"Total rows in dataSet : {len(dataSet)}")

Total rows in dataSet : 100


In [13]:
import csv
import json

##### Create a CSV file using column as first row


In [14]:
# Write the header row followed by every scraped record. The context manager
# closes the file even if a write fails part-way; newline='' leaves line
# endings to the csv module.
with open('quote_details.csv', 'w', newline='', encoding='utf-8') as fileCsv:
    writer = csv.writer(fileCsv)
    writer.writerow(column)
    writer.writerows(dataSet)
print("CSV file Created")

CSV file Created


##### Create a JSON file.

In [15]:
# Pair each row's values with the column names, giving one dictionary per quote.
finalDataSet = [dict(zip(column, row)) for row in dataSet]

# Serialise the list of dictionaries to disk as a JSON array.
with open('quote_details.json', 'w') as jsonfile:
    json.dump(finalDataSet, jsonfile)
print("JSON file Created")

JSON file Created


#### Closing and ending the 'driver' session using quit()

In [16]:
# End the WebDriver session now that scraping and export are finished.
driver.quit()