In [1]:
# install the bs4 library
# !pip install bs4   or !pip install beautifulsoup4

In [2]:
# install the lxml parser - it processes both XML and HTML
# !pip install lxml

In [3]:
# we can also use the html5lib parser - it's up to us which parser we want to use
# html5lib is a pure-python library for parsing HTML
# !pip install html5lib

In [4]:
# Both html5lib and lxml are popular

In [5]:
from bs4 import BeautifulSoup
import requests

In [6]:
# Open the local sample page and parse it with the lxml parser.
# The `with` block closes the file automatically when the block ends.
with open('test.html') as html_file:
    soup = BeautifulSoup(html_file,'lxml')

In [7]:
# printing the soup shows the parsed HTML markup (tags are not indented here)
print(soup)

<!DOCTYPE HTML>
<html>
<head>
<title>Dummy Website - for WebScraping</title>
</head>
<body>
<h1>Test Website</h1>
<hr/>
<div class="article">
<h2><a href="article1.html">Article 1 Headline</a></h2>
<p>This is a summary of article1</p><p>
</p></div>
<hr/>
<div class="article">
<h2><a href="article2.html">Article 2 Headline</a></h2>
<p>This is a summary of article2</p><p>
</p></div>
<hr/>
<div class="footer">
<p>Footer Information</p>
</div>
</body>
</html>


In [8]:
# prettify() returns the markup with indentation,
# which makes it easy to see how the tags are nested
print(soup.prettify())

<!DOCTYPE HTML>
<html>
 <head>
  <title>
   Dummy Website - for WebScraping
  </title>
 </head>
 <body>
  <h1>
   Test Website
  </h1>
  <hr/>
  <div class="article">
   <h2>
    <a href="article1.html">
     Article 1 Headline
    </a>
   </h2>
   <p>
    This is a summary of article1
   </p>
   <p>
   </p>
  </div>
  <hr/>
  <div class="article">
   <h2>
    <a href="article2.html">
     Article 2 Headline
    </a>
   </h2>
   <p>
    This is a summary of article2
   </p>
   <p>
   </p>
  </div>
  <hr/>
  <div class="footer">
   <p>
    Footer Information
   </p>
  </div>
 </body>
</html>


In [9]:
# let's grab the title
match = soup.title
print(match)
# note: this returns the whole element, tags included

<title>Dummy Website - for WebScraping</title>


In [10]:
# let's fetch only the text of the title (without the surrounding tags)
match = soup.title.text
print(match)

Dummy Website - for WebScraping


In [11]:
# Accessing a tag as an attribute (soup.h1, soup.div, ...) returns only the
# first matching element found on the page
match = soup.h1.text
print(match)

Test Website


In [12]:
# soup.div is the first <div> on the page, tags included
print(soup.div)

<div class="article">
<h2><a href="article1.html">Article 1 Headline</a></h2>
<p>This is a summary of article1</p><p>
</p></div>


In [13]:
# .text keeps only the text found inside the first <div>
print(soup.div.text)


Article 1 Headline
This is a summary of article1



In [14]:
# So we will use the find method; called like this it also returns the first div tag
match = soup.find('div')
print(match)

<div class="article">
<h2><a href="article1.html">Article 1 Headline</a></h2>
<p>This is a summary of article1</p><p>
</p></div>


In [15]:
# We can pass arguments to the find method to narrow the search.
# Here we get the div whose class is 'footer', rather than the first div.
# `class` is a reserved keyword in Python, so the argument is spelled `class_`;
# other attributes such as 'id' need no underscore and can be passed directly.
match = soup.find('div',class_='footer')
print(match)

<div class="footer">
<p>Footer Information</p>
</div>


In [17]:
# Now let's pull the first article
# If it is hard to find the right tag for an element by reading the page source,
# we can select that element in the browser and click on Inspect

article = soup.find('div',class_='article')
print(article)

<div class="article">
<h2><a href="article1.html">Article 1 Headline</a></h2>
<p>This is a summary of article1</p><p>
</p></div>


In [18]:
# child tags can be accessed as attributes of the parent tag
headline = article.h2
print(headline)

<h2><a href="article1.html">Article 1 Headline</a></h2>


In [20]:
# nested child tags can be reached by chaining the attribute access
anchor = article.h2.a
print(anchor)
print(anchor.text)

<a href="article1.html">Article 1 Headline</a>
Article 1 Headline


In [21]:
# article.p is the first <p> inside the article, which holds the summary text
summary = article.p
print(summary.text)

This is a summary of article1


In [23]:
# To find all the articles we use the find_all method instead of find;
# it returns a list of all the matching tags
articles = soup.find_all('div',class_='article')
print(articles)

[<div class="article">
<h2><a href="article1.html">Article 1 Headline</a></h2>
<p>This is a summary of article1</p><p>
</p></div>, <div class="article">
<h2><a href="article2.html">Article 2 Headline</a></h2>
<p>This is a summary of article2</p><p>
</p></div>]


In [27]:
# Now we can loop over this list and print each article's headline and summary
for article in articles:
    print(article.h2.a.text)
    print(article.p.text)
    # blank line to separate one article from the next
    print()

Article 1 Headline
This is a summary of article1

Article 2 Headline
This is a summary of article2



In [53]:
# Now let's do a similar thing with an actual website
# https://coreyms.com/
# our aim is to grab the title, summary and link of the videos from this website

import requests
from bs4 import BeautifulSoup

# First we need the source code of the website;
# we get that using the requests library.
# A timeout stops the cell from hanging forever if the site does not answer,
# and raise_for_status() fails loudly on an HTTP error (4xx/5xx) instead of
# letting us parse an error page by mistake.

response = requests.get('https://coreyms.com/', timeout=10)
response.raise_for_status()
source = response.text

# Now we can pass this source to BeautifulSoup

soup = BeautifulSoup(source,'lxml')

# Now we start parsing the information we want.
# We use the Inspect functionality of the browser to find the exact tags;
# there we could see that everything for one post sits under an <article> tag

article = soup.find('article')
# print(article.prettify())
# Now fetch child/nested elements

headline = article.h2.text
summary = article.find('div',class_='entry-content').text

# the video link is the 'src' attribute of an <iframe> (it is not a tag of its own)
vid_src = article.find('iframe',class_='youtube-player')

# Now get the 'src' attribute; a tag's attributes can be read like dictionary keys
vid_src = vid_src['src']

# We only want the video ID, i.e. z0gguhEmWiY in the url below
# (it comes right after the last forward slash)
# https://www.youtube.com/embed/z0gguhEmWiY?ver

vid_id = vid_src.split('/')

# splitting that url on '/' gives
# ['https:', '', 'www.youtube.com', 'embed', 'z0gguhEmWiY?ver'],
# so the piece holding the video ID is at index 4
vid_id = vid_id[4]

# '?' marks where the query parameters start, and the video ID is the part before it
vid_id = vid_id.split('?')

# Now the video ID is at index 0
vid_id = vid_id[0]

# Now we build our own YouTube link using this video ID
yt_link = f"https://youtube.com/watch?v={vid_id}"
print(yt_link)

https://youtube.com/watch?v=z0gguhEmWiY


In [68]:
# Now we know how to get the information,
# so we repeat the process for all the articles

import requests
from bs4 import BeautifulSoup

# timeout: do not hang forever if the site does not answer
# raise_for_status(): stop on an HTTP error (4xx/5xx) instead of parsing an error page
response = requests.get('https://coreyms.com/', timeout=10)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source,'lxml')

articles = soup.find_all('article')

for article in articles:
    headline = article.h2.text
    summary = article.find('div',class_='entry-content').text

    # Not every article embeds a video, so the lookup is wrapped in try/except.
    # Only the "no usable video link" failures are caught, so that genuine bugs
    # are not silently hidden:
    #   TypeError  - find() returned None (no iframe) and None cannot be indexed
    #   KeyError   - the iframe has no 'src' attribute
    #   IndexError - the url has fewer '/'-separated parts than expected
    try:
        vid_src = article.find('iframe',class_='youtube-player')
        vid_src = vid_src['src']
        vid_src = vid_src.split('/')
        vid_id = vid_src[4]
        vid_id = vid_id.split('?')
        vid_id = vid_id[0]
        yt_link = f"https://youtube.com/watch?v={vid_id}"
    except (TypeError, KeyError, IndexError):
        # we could simply pass here, but we set yt_link to None so that
        # the previous article's link is not printed again
        yt_link = None

    print(headline.strip())
    print(summary.strip())
    print(yt_link)
    print()

Python Tutorial: Zip Files – Creating and Extracting Zip Archives
In this video, we will be learning how to create and extract zip archives. We will start by using the zipfile module, and then we will see how to do this using the shutil module. We will learn how to do this with single files and directories, as well as learning how to use gzip as well. Let’s get started…
https://youtube.com/watch?v=z0gguhEmWiY

Python Data Science Tutorial: Analyzing the 2019 Stack Overflow Developer Survey
In this Python Programming video, we will be learning how to download and analyze real-world data from the 2019 Stack Overflow Developer Survey. This is terrific practice for anyone getting into the data science field. We will learn different ways to analyze this data and also some best practices. Let’s get started…
https://youtube.com/watch?v=_P7X8tMplsw

Python Multiprocessing Tutorial: Run Code in Parallel Using the Multiprocessing Module
In this Python Programming video, we will be learning how t

In [88]:
# So far we have been printing the information to the screen,
# but now we will save it to a csv file

import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
from tabulate import tabulate

# timeout: do not hang forever if the site does not answer
# raise_for_status(): stop on an HTTP error (4xx/5xx) instead of parsing an error page
response = requests.get('https://coreyms.com/', timeout=10)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source,'lxml')

articles = soup.find_all('article')

# The `with` block closes the file even if scraping one of the articles fails.
# newline='' is what the csv module expects for files it writes; without it
# extra blank rows can appear on Windows.
# encoding='utf-8' is set explicitly because the scraped text contains non-ASCII
# characters and pd.read_csv below reads the file as UTF-8 by default.
with open('cms_scrape.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['headline', 'summary', 'video_link'])

    for article in articles:
        # strip the surrounding whitespace, as we did when printing,
        # so the csv fields hold clean text
        headline = article.h2.text.strip()
        summary = article.find('div',class_='entry-content').text.strip()

        # Not every article embeds a video; catch only the failures that mean
        # "no usable video link" (missing iframe -> TypeError, missing 'src'
        # attribute -> KeyError, unexpectedly short url -> IndexError)
        try:
            vid_src = article.find('iframe',class_='youtube-player')
            vid_src = vid_src['src']
            vid_src = vid_src.split('/')
            vid_id = vid_src[4]
            vid_id = vid_id.split('?')
            vid_id = vid_id[0]
            yt_link = f"https://youtube.com/watch?v={vid_id}"
        except (TypeError, KeyError, IndexError):
            # None is written to the csv as an empty field
            yt_link = None

        csv_writer.writerow([headline, summary, yt_link])

# read the file back to check what was written
df = pd.read_csv('cms_scrape.csv')
print(tabulate(df))

-  ------------------------------------------------------------------------------------------------  -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------  ---------------------------------------
0  Python Tutorial: Zip Files – Creating and Extracting Zip Archives                                 In this video, we will be learning how to create and extract zip archives. We will start by using the zipfile module, and then we will see how to do 

In [89]:
# If we want to scrape data from a larger website like Twitter/Facebook,
# then it's not recommended to use this method.
# We should use the public API instead - such sites usually have one available.

# For small and medium-sized websites the above method is good.