"""Scrape book listings from books.toscrape.com through an authenticated proxy.

For each book on the first catalogue page, print its title, price,
availability, a truncated description, and the book's URL.
"""
import requests
from bs4 import BeautifulSoup

# Proxy authentication information.
# NOTE(review): the original had a stray space before '@'
# ('password @gate...'), which makes the URL invalid — removed.
proxy = {'http': 'http://username:password@gate.smartproxy.com:10000'}
url = 'http://books.toscrape.com/'  # Website to make a GET request to


def main():
    """Fetch the catalogue page and print details for every listed book."""
    # Make the GET request to the target URL through the proxy.
    # A timeout prevents the script from hanging forever on a dead connection.
    r = requests.get(url, proxies=proxy, timeout=30)
    r.raise_for_status()  # Fail loudly instead of parsing an error page.
    html = BeautifulSoup(r.content, 'html.parser')

    # Each book on the page is an <article class="product_pod"> element.
    for book in html.find_all('article', class_='product_pod'):
        title = book.h3.a['title']
        price = book.find('p', class_='price_color').text
        availability = book.find('p', class_='instock availability').text.strip()
        link = "{0}{1}".format(url, book.h3.a['href'])

        # Follow the book's own page (also via the proxy) for its description.
        r2 = requests.get(link, proxies=proxy, timeout=30)
        html2 = BeautifulSoup(r2.content, 'html.parser')
        # The description is the first class-less <p>; it may be absent,
        # in which case fall back to an empty string instead of crashing.
        description_tag = html2.find('p', class_='')
        description = description_tag.text if description_tag else ''

        print(title)
        print(price)
        print(availability)
        # Truncate text that is too long (over 150 characters).
        print("{0}...".format(description[:150]))
        print(link)
        print()  # Empty line to separate each result.


if __name__ == "__main__":
    main()