In [1]:
# webscraping in python: Web scraping in python is used to extract and collect, data from websites
# 2 most common libraries for web scraping are BeautifulSoup and Requests
# this can be used in any public websites

# beautiful Soup & Requests

from bs4 import BeautifulSoup
import requests

In [2]:
url = 'https://webscraper.io/test-sites/e-commerce/allinone/phones'

In [3]:
requests.get(url)  # get function sends a get request to this url 

<Response [200]>

In [4]:
page = requests.get(url)

In [5]:
soup = BeautifulSoup(page.text, 'html')

In [6]:
print(soup)

<!DOCTYPE html>
<html lang="en">
<head>
<!-- Google Tag Manager -->
<script nonce="hly7gcDCV2JW5EzpKTYdBpVql5TjJUT3">(function (w, d, s, l, i) {
		w[l] = w[l] || [];
		w[l].push({
			'gtm.start':
				new Date().getTime(), event: 'gtm.js'
		});
		var f = d.getElementsByTagName(s)[0],
			j = d.createElement(s), dl = l != 'dataLayer' ? '&l=' + l : '';
		j.async = true;
		j.src =
			'https://www.googletagmanager.com/gtm.js?id=' + i + dl;
		f.parentNode.insertBefore(j, f);
	})(window, document, 'script', 'dataLayer', 'GTM-NVFPDWB');</script>
<!-- End Google Tag Manager -->
<title>Allinone | Web Scraper Test Sites</title>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="web scraping,Web Scraper,Chrome extension,Crawling,Cross platform scraper" name="keywords"/>
<meta content="Test Web Scraper's features and performance on mock e-commerce sites. Extract product data, prices, and categories in a controlled environment." name="description"/>

In [7]:
soup_pretty = soup.prettify()

print(soup_pretty)

<!DOCTYPE html>
<html lang="en">
 <head>
  <!-- Google Tag Manager -->
  <script nonce="hly7gcDCV2JW5EzpKTYdBpVql5TjJUT3">
   (function (w, d, s, l, i) {
		w[l] = w[l] || [];
		w[l].push({
			'gtm.start':
				new Date().getTime(), event: 'gtm.js'
		});
		var f = d.getElementsByTagName(s)[0],
			j = d.createElement(s), dl = l != 'dataLayer' ? '&l=' + l : '';
		j.async = true;
		j.src =
			'https://www.googletagmanager.com/gtm.js?id=' + i + dl;
		f.parentNode.insertBefore(j, f);
	})(window, document, 'script', 'dataLayer', 'GTM-NVFPDWB');
  </script>
  <!-- End Google Tag Manager -->
  <title>
   Allinone | Web Scraper Test Sites
  </title>
  <meta charset="utf-8"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <meta content="web scraping,Web Scraper,Chrome extension,Crawling,Cross platform scraper" name="keywords"/>
  <meta content="Test Web Scraper's features and performance on mock e-commerce sites. Extract product data, prices, and categories in a controlled env

In [8]:
# tags, variable strings, attributes

# tags are purple line code, when we inspect the webpage

soup.head

<head>
<!-- Google Tag Manager -->
<script nonce="hly7gcDCV2JW5EzpKTYdBpVql5TjJUT3">(function (w, d, s, l, i) {
		w[l] = w[l] || [];
		w[l].push({
			'gtm.start':
				new Date().getTime(), event: 'gtm.js'
		});
		var f = d.getElementsByTagName(s)[0],
			j = d.createElement(s), dl = l != 'dataLayer' ? '&l=' + l : '';
		j.async = true;
		j.src =
			'https://www.googletagmanager.com/gtm.js?id=' + i + dl;
		f.parentNode.insertBefore(j, f);
	})(window, document, 'script', 'dataLayer', 'GTM-NVFPDWB');</script>
<!-- End Google Tag Manager -->
<title>Allinone | Web Scraper Test Sites</title>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="web scraping,Web Scraper,Chrome extension,Crawling,Cross platform scraper" name="keywords"/>
<meta content="Test Web Scraper's features and performance on mock e-commerce sites. Extract product data, prices, and categories in a controlled environment." name="description"/>
<link href="/favicon.png" rel="i

In [10]:
soup.header

<header class="navbar fixed-top navbar-expand-lg navbar-dark navbar-static svg-background" id="navbar-top" role="banner">
<div class="container">
<div class="navbar-header">
<a data-bs-target=".side-collapse" data-bs-target-2=".side-collapse-container" data-bs-toggle="collapse-side">
<button aria-controls="navbar" aria-expanded="false" class="navbar-toggler float-end collapsed" data-bs-target="#navbar" data-bs-target-2=".side-collapse-container" data-bs-target-3=".side-collapse" data-bs-toggle="collapse" type="button">
<span class="visually-hidden">Toggle navigation</span>
<span class="icon-bar top-bar"></span>
<span class="icon-bar middle-bar"></span>
<span class="icon-bar bottom-bar"></span>
<span class="icon-bar extra-bottom-bar"></span>
</button>
</a>
<div class="navbar-brand">
<a href="/"><img alt="Web Scraper" height="50px" src="/img/logo_white.svg" width="150px"/></a>
</div>
</div>
<div class="side-collapse in">
<nav class="navbar-collapse collapse" id="navbar" role="navigation"

In [11]:
soup.h1

<h1>Test Sites</h1>

In [12]:
soup.title

<title>Allinone | Web Scraper Test Sites</title>

In [13]:
soup.title.string

'Allinone | Web Scraper Test Sites'

In [14]:
soup.title.name

'title'

In [15]:
soup.title.parent.name

'head'

In [16]:
# Attributes are properties that will influence the styling and layout of the elements in the website like sizes and coulor

soup.header.attrs # attrs stands for attributes



{'role': 'banner',
 'class': ['navbar',
  'fixed-top',
  'navbar-expand-lg',
  'navbar-dark',
  'navbar-static',
  'svg-background'],
 'id': 'navbar-top'}

In [17]:
# find and find all: find will find the 1st tag with a specific name or ID and return the tag.
# find all will return all tags with that specific tag name

soup.find('h1')

<h1>Test Sites</h1>

In [18]:
soup.find('h4')

<h4 class="price float-end card-title pull-right" itemprop="offers" itemscope="" itemtype="https://schema.org/Offer">
<span itemprop="price">$24.99</span>
<meta content="USD" itemprop="priceCurrency"/>
</h4>

In [21]:
soup.find('h4',{'class': 'price float-end card-title pull-right'})

<h4 class="price float-end card-title pull-right" itemprop="offers" itemscope="" itemtype="https://schema.org/Offer">
<span itemprop="price">$24.99</span>
<meta content="USD" itemprop="priceCurrency"/>
</h4>

In [22]:
soup.find('h4',{ 'price float-end card-title pull-right'})

<h4 class="price float-end card-title pull-right" itemprop="offers" itemscope="" itemtype="https://schema.org/Offer">
<span itemprop="price">$24.99</span>
<meta content="USD" itemprop="priceCurrency"/>
</h4>

In [24]:
soup.find('a', 'title')

<a class="title" href="/test-sites/e-commerce/allinone/product/1" itemprop="name" title="Nokia 123">
						Nokia 123
					</a>

In [25]:
soup.find_all('h4')

[<h4 class="price float-end card-title pull-right" itemprop="offers" itemscope="" itemtype="https://schema.org/Offer">
 <span itemprop="price">$24.99</span>
 <meta content="USD" itemprop="priceCurrency"/>
 </h4>,
 <h4>
 <a class="title" href="/test-sites/e-commerce/allinone/product/1" itemprop="name" title="Nokia 123">
 						Nokia 123
 					</a>
 </h4>,
 <h4 class="price float-end card-title pull-right" itemprop="offers" itemscope="" itemtype="https://schema.org/Offer">
 <span itemprop="price">$109.99</span>
 <meta content="USD" itemprop="priceCurrency"/>
 </h4>,
 <h4>
 <a class="title" href="/test-sites/e-commerce/allinone/product/4" itemprop="name" title="Nokia X">
 						Nokia X
 					</a>
 </h4>,
 <h4 class="price float-end card-title pull-right" itemprop="offers" itemscope="" itemtype="https://schema.org/Offer">
 <span itemprop="price">$93.99</span>
 <meta content="USD" itemprop="priceCurrency"/>
 </h4>,
 <h4>
 <a class="title" href="/test-sites/e-commerce/allinone/product/3" ite

In [26]:
soup.find_all('h4','price float-end card-title pull-right')

[<h4 class="price float-end card-title pull-right" itemprop="offers" itemscope="" itemtype="https://schema.org/Offer">
 <span itemprop="price">$24.99</span>
 <meta content="USD" itemprop="priceCurrency"/>
 </h4>,
 <h4 class="price float-end card-title pull-right" itemprop="offers" itemscope="" itemtype="https://schema.org/Offer">
 <span itemprop="price">$109.99</span>
 <meta content="USD" itemprop="priceCurrency"/>
 </h4>,
 <h4 class="price float-end card-title pull-right" itemprop="offers" itemscope="" itemtype="https://schema.org/Offer">
 <span itemprop="price">$93.99</span>
 <meta content="USD" itemprop="priceCurrency"/>
 </h4>]

In [28]:
soup.find_all('a', 'title')[:2]

[<a class="title" href="/test-sites/e-commerce/allinone/product/1" itemprop="name" title="Nokia 123">
 						Nokia 123
 					</a>,
 <a class="title" href="/test-sites/e-commerce/allinone/product/4" itemprop="name" title="Nokia X">
 						Nokia X
 					</a>]

In [29]:
soup.find_all(id = True)

[<header class="navbar fixed-top navbar-expand-lg navbar-dark navbar-static svg-background" id="navbar-top" role="banner">
 <div class="container">
 <div class="navbar-header">
 <a data-bs-target=".side-collapse" data-bs-target-2=".side-collapse-container" data-bs-toggle="collapse-side">
 <button aria-controls="navbar" aria-expanded="false" class="navbar-toggler float-end collapsed" data-bs-target="#navbar" data-bs-target-2=".side-collapse-container" data-bs-target-3=".side-collapse" data-bs-toggle="collapse" type="button">
 <span class="visually-hidden">Toggle navigation</span>
 <span class="icon-bar top-bar"></span>
 <span class="icon-bar middle-bar"></span>
 <span class="icon-bar bottom-bar"></span>
 <span class="icon-bar extra-bottom-bar"></span>
 </button>
 </a>
 <div class="navbar-brand">
 <a href="/"><img alt="Web Scraper" height="50px" src="/img/logo_white.svg" width="150px"/></a>
 </div>
 </div>
 <div class="side-collapse in">
 <nav class="navbar-collapse collapse" id="navbar"

In [33]:
soup.find_all(string = "Nokia X")

[]

In [40]:
# getting text from website: 

all_phone_titles = soup.find_all('a', 'title')

In [41]:
all_titles = [title.text for title in all_phone_titles]
print(all_titles)

['\n\t\t\t\t\t\tNokia 123\n\t\t\t\t\t', '\n\t\t\t\t\t\tNokia X\n\t\t\t\t\t', '\n\t\t\t\t\t\tSamsung Galaxy\n\t\t\t\t\t']


In [44]:
import pandas as pd

pd.DataFrame(all_titles)

Unnamed: 0,0
0,\n\t\t\t\t\t\tNokia 123\n\t\t\t\t\t
1,\n\t\t\t\t\t\tNokia X\n\t\t\t\t\t
2,\n\t\t\t\t\t\tSamsung Galaxy\n\t\t\t\t\t
