In [1]:
pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [2]:
pip install requests



In [3]:
pip install texttable

Collecting texttable
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable
Successfully installed texttable-1.7.0


# **P1 - Scraping Quotes from website**

**URL:** [Quotes Website](https://www.passiton.com/inspirational-quotes)

**scrapes the website and saves quotes to a file inspirational_quotes.csv**

In [4]:
#Python program to scrape website and save quotes to a file inspirational_quotes.csv

import requests
from bs4 import BeautifulSoup
import csv

URL = "https://www.passiton.com/inspirational-quotes"
r = requests.get(URL)

soup = BeautifulSoup(r.content, 'html5lib')

quotes=[] # a list to store quotes

table = soup.find('div', attrs = {'id':'all_quotes'})

for row in table.findAll('div',
						attrs = {'class':'col-6 col-lg-4 text-center margin-30px-bottom sm-margin-30px-top'}):
	quote = {}
	quote['theme'] = row.h5.text
	quote['url'] = row.a['href']
	quote['img'] = row.img['src']
	quote['lines'] = row.img['alt'].split(" #")[0]
	quote['author'] = row.img['alt'].split(" #")[1]
	quotes.append(quote)

filename = 'inspirational_quotes.csv'
with open(filename, 'w', newline='') as f:
	w = csv.DictWriter(f,['theme','url','img','lines','author'])
	w.writeheader()
	for quote in quotes:
		w.writerow(quote)


# **P2 - Scraping Covid-19 stats**

URL: [COVID-19 STATS COUNTRY WISE](https://www.worldometers.info/coronavirus/countries-where-coronavirus-has-spread/)

In [5]:
# URl to Scrap: https://www.worldometers.info/coronavirus/countries-where-coronavirus-has-spread/

import requests
from bs4 import BeautifulSoup
import texttable as tt

# URL for scrapping data
url = 'https://www.worldometers.info/coronavirus/countries-where-coronavirus-has-spread/'

# get URL's html
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

data = []

# soup.find_all('td') will scrape every element in the url's table
data_iterator = iter(soup.find_all('td'))
# data_iterator is the iterator of the table

# This loop will keep repeating till there is data available in the iterator
while True:
	try:
		country = next(data_iterator).text
		confirmed = next(data_iterator).text
		deaths = next(data_iterator).text
		continent = next(data_iterator).text

		# For 'confirmed' and 'deaths', make sure to remove the commas and convert to int
		data.append((
			country,
			int(confirmed.replace(',', '')),
			int(deaths.replace(',', '')),
			continent
		))

	# StopIteration error is raised when there are no more elements left to iterate through
	except StopIteration:
		break

# Sort the data by the number of confirmed cases
data.sort(key = lambda row: row[1], reverse = True)


# create texttable object
table = tt.Texttable()
table.add_rows([(None, None, None, None)] + data)  # Add an empty row at the beginning for the headers
table.set_cols_align(('c', 'c', 'c', 'c'))  # 'l' denotes left, 'c' denotes center, and 'r' denotes right
table.header((' Country ', ' Number of cases ', ' Deaths ', ' Continent '))

print(table.draw())


+---------------------------+-------------------+----------+-------------------+
|          Country          |  Number of cases  |  Deaths  |     Continent     |
|       United States       |     1.118e+08     | 1219487  |   North America   |
+---------------------------+-------------------+----------+-------------------+
|           India           |     45035393      |  533570  |       Asia        |
+---------------------------+-------------------+----------+-------------------+
|          France           |     40138560      |  167642  |      Europe       |
+---------------------------+-------------------+----------+-------------------+
|          Germany          |     38828995      |  183027  |      Europe       |
+---------------------------+-------------------+----------+-------------------+
|          Brazil           |     38743918      |  711380  |   South America   |
+---------------------------+-------------------+----------+-------------------+
|        South Korea        

# **P3 - Scraping GPU Card Product Information**

URL: [GPU Card Info](https://www.newegg.com/p/pl?d=graphics+card&nm_mc=KNC-GoogleKWLess-Search-Broad&cm_mmc=KNC-GoogleKWLess-Search-Broad-_-VGA-_-graphics-card-_-PLP-Feature&page=2)

In [8]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq

my_url = 'https://www.newegg.com/p/pl?d=graphics+card&nm_mc=KNC-GoogleKWLess-Search-Broad&cm_mmc=KNC-GoogleKWLess-Search-Broad-_-VGA-_-graphics-card-_-PLP-Feature&page=2'
uclient = ureq(my_url)
page_html = uclient.read()
uclient.close()

page_soup = soup(page_html, "html.parser")
print(page_soup)
#print(page_soup.body.id)
containers = page_soup.findAll("div",{"class":"item-container"})

filename = "products.csv"
f = open(filename, "w")
headers = "brand, productname, shipping\n"
f.write(headers)
print("before for")
for container in containers:
  print("after for")
  brand = container.div.div.a.img["title"]
  title_container = container.findAll("a",{"class":"item-title"})
  product_name = title_container[0].text
  shipping = container.findAll("li",{"class":"price-ship"})
  shipping_price = shipping[0].text.strip()
  print(brand)
  print(product_name)
  print(shipping_price)
  f.write(brand + "," + product_name.replace(",","|") + "," + shipping_price + "\n")
f.close()


<!DOCTYPE html>
<html class="show-tab-store" lang="en-us"><head><title>graphics card | Newegg.com</title><meta charset="utf-8"/><meta content="text/html; charset=utf-8" http-equiv="content-type"/><meta content="always" name="referrer"/><meta content="graphics card" name="keywords"/><meta content="Search Newegg.com for graphics card. Get fast shipping and top-rated customer service." name="description"/><meta content="https://c1.neweggimages.com/WebResource/Themes/Nest/logos/logo_newegg_400400.png" property="og:image"/><meta content="Search Newegg.com for graphics card. Get fast shipping and top-rated customer service." property="og:description"/><meta content="https://www.newegg.com/p/pl?page=2&amp;d=graphics+card" property="og:url"/><meta content="website" property="og:type"/><meta content="graphics card | Newegg.com" property="og:title"/><meta content="english" name="language"/><meta content="© 2000-2024 Newegg Inc." name="copyright"/><meta content="IE=edge" http-equiv="X-UA-Compatib

AttributeError: 'NoneType' object has no attribute 'a'

In [7]:
pip install fake_useragent

Collecting fake_useragent
  Downloading fake_useragent-1.5.1-py3-none-any.whl.metadata (15 kB)
Downloading fake_useragent-1.5.1-py3-none-any.whl (17 kB)
Installing collected packages: fake_useragent
Successfully installed fake_useragent-1.5.1


# **P4 - Scraping Multiple web Pages**

Task is to Scrap java questions from codingbat website

URL: http://codingbat.com/java

I will divide the project into 3 parts:
1.   First - script will describe you how to fetch the link of each section of Java questions.
2.   Second -  we will open each section(catagory)and we scrap link for each question.
3.   Third -  we will open each question and get the problem statement, example associated with it.





In [9]:
#Part 1 - script will describe you how to fetch the link of each section of Java questions.

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


user_agent = UserAgent()
main_url = 'http://codingbat.com/java'
page = requests.get(main_url,headers={'user-agent':user_agent.chrome})
soup = BeautifulSoup(page.content,'lxml')

base_url = 'http://codingbat.com'

'''
Here we are scraping the link to each section.
Observe in inspect element that link is a ralative link (Warm-up) not absolute link
thus we used base_url above
'''
all_divs = soup.find_all('div',class_='summ')

#prints all the relative link
for div in all_divs:
    print(div.a['href']) #Here 'a' is a child of 'div' tag


#prints all the absolute link
for div in all_divs:
    print(base_url + div.a['href'])  #Here 'a' is a child of 'div' tag

/java/Warmup-1
/java/Warmup-2
/java/String-1
/java/Array-1
/java/Logic-1
/java/Logic-2
/java/String-2
/java/String-3
/java/Array-2
/java/Array-3
/java/AP-1
/java/Recursion-1
/java/Recursion-2
/java/Map-1
/java/Map-2
/java/Functional-1
/java/Functional-2
http://codingbat.com/java/Warmup-1
http://codingbat.com/java/Warmup-2
http://codingbat.com/java/String-1
http://codingbat.com/java/Array-1
http://codingbat.com/java/Logic-1
http://codingbat.com/java/Logic-2
http://codingbat.com/java/String-2
http://codingbat.com/java/String-3
http://codingbat.com/java/Array-2
http://codingbat.com/java/Array-3
http://codingbat.com/java/AP-1
http://codingbat.com/java/Recursion-1
http://codingbat.com/java/Recursion-2
http://codingbat.com/java/Map-1
http://codingbat.com/java/Map-2
http://codingbat.com/java/Functional-1
http://codingbat.com/java/Functional-2


In [10]:
#Second we will open each section and we scrap link for each question.
#--------Start - Same as above Script ----------------------------
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


user_agent = UserAgent()
main_url = 'http://codingbat.com/java'
page = requests.get(main_url,headers={'user-agent':user_agent.chrome})
soup = BeautifulSoup(page.content,'lxml')

base_url = 'http://codingbat.com'

all_divs = soup.find_all('div',class_='summ')


# all_links has link for each section (Page 1)
all_links = [base_url + div.a['href'] for div in all_divs] # This is list Comprahension

#--------End - Same as above Script ----------------------------

#Below code is to get link for each/all the section

for link in all_links:
    #link correspons to 2nd page ex:https://codingbat.com/java/Warmup-1
    inner_page = requests.get(link,headers={'user-agent':user_agent.chrome})
    inner_soup = BeautifulSoup(inner_page.content,'lxml')

    #Now we need to scrap the link from 2nd inner page. (Inspect the HTML Page)

    div = inner_soup.find('div',class_='tabc')
    question_links = [base_url + td.a['href'] for td in div.table.find_all('td')] # has link to all the questions (list Comprahension)
    print(question_links)

    break #on commenting it you will get complete links for all the sections



['http://codingbat.com/prob/p187868', 'http://codingbat.com/prob/p181646', 'http://codingbat.com/prob/p154485', 'http://codingbat.com/prob/p116624', 'http://codingbat.com/prob/p140449', 'http://codingbat.com/prob/p182873', 'http://codingbat.com/prob/p184004', 'http://codingbat.com/prob/p159227', 'http://codingbat.com/prob/p191914', 'http://codingbat.com/prob/p190570', 'http://codingbat.com/prob/p123384', 'http://codingbat.com/prob/p136351', 'http://codingbat.com/prob/p161642', 'http://codingbat.com/prob/p112564', 'http://codingbat.com/prob/p183592', 'http://codingbat.com/prob/p191022', 'http://codingbat.com/prob/p192082', 'http://codingbat.com/prob/p144535', 'http://codingbat.com/prob/p178986', 'http://codingbat.com/prob/p165701', 'http://codingbat.com/prob/p100905', 'http://codingbat.com/prob/p151713', 'http://codingbat.com/prob/p199720', 'http://codingbat.com/prob/p101887', 'http://codingbat.com/prob/p172021', 'http://codingbat.com/prob/p132134', 'http://codingbat.com/prob/p177372', 

In [11]:
#Final Script

#part 1

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


user_agent = UserAgent()
main_url = 'http://codingbat.com/java'
page = requests.get(main_url,headers={'user-agent':user_agent.chrome})
soup = BeautifulSoup(page.content,'lxml')

base_url = 'http://codingbat.com'

all_divs = soup.find_all('div',class_='summ')

all_links = [base_url + div.a['href'] for div in all_divs]


# part 2

for link in all_links:
    inner_page = requests.get(link,headers={'user-agent':user_agent.chrome})
    inner_soup = BeautifulSoup(inner_page.content,'lxml')
    div = inner_soup.find('div',class_='tabc')
    question_links = [base_url + td.a['href'] for td in div.table.find_all('td')]


# part 3

    for question_link in question_links:
        final_page = requests.get(question_link)
        final_soup = BeautifulSoup(final_page.content, 'lxml')
        indent_div = final_soup.find('div', attrs={'class':'indent'})

        problem_statement = indent_div.table.div.string

        siblings_of_statement = indent_div.table.div.next_siblings

        examples = [sibling for sibling in siblings_of_statement if sibling.string is not None]

        print(problem_statement)
        for example in examples:
            print(example)

        print('\n\n\n')

The parameter weekday is true if it is a weekday, and the parameter vacation is true if we are on vacation. We sleep in if it is not a weekday or we're on vacation. Return true if we sleep in.
sleepIn(false, false) → true
sleepIn(true, false) → false
sleepIn(false, true) → true




We have two monkeys, a and b, and the parameters aSmile and bSmile indicate if each is smiling. We are in trouble if they are both smiling or if neither of them is smiling. Return true if we are in trouble.
monkeyTrouble(true, true) → true
monkeyTrouble(false, false) → true
monkeyTrouble(true, false) → false




Given two int values, return their sum. Unless the two values are the same, then return double their sum.
sumDouble(1, 2) → 3
sumDouble(3, 2) → 5
sumDouble(2, 2) → 8




Given an int n, return the absolute difference between n and 21, except return double the absolute difference if n is over 21.
diff21(19) → 2
diff21(10) → 11
diff21(21) → 0




We have a loud talking parrot. The "hour" parameter is t