# Web scraping exercises

In [1]:
#Disable autosave
%autosave 0

Autosave disabled


In [2]:
#Import dependencies
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import json

## Exercise 1

Codeup Blog Articles

Visit [Codeup's Blog](https://codeup.com/blog/) and record the urls for at least 5 distinct blog posts. For each post, you should scrape at least the post's title and content.

Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article.

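URLs for blog posts recorded while exploring the blog (the first one is used for the exploratory request below; the posts that are actually scraped are listed in `links`):  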
https://codeup.com/data-science/recession-proof-career/  
https://codeup.com/codeup-news/codeup-x-comic-con/  
https://codeup.com/featured/series-part-3-web-development/  
https://codeup.com/codeup-news/codeup-dallas-campus/  
https://codeup.com/codeup-news/codeup-tv-commercial/

In [3]:
#Define headers: a custom User-Agent that is sent with every Codeup request below
headers = {'User-Agent': 'Codeup Data Science'}

In [4]:
#Make a request for a single blog post so its HTML can be explored before scraping the full list
response = get('https://codeup.com/data-science/recession-proof-career/', headers=headers)

In [5]:
#Create the soup from the response body using the built-in HTML parser and investigate
#The first <h1> on the page holds the article title
soup = BeautifulSoup(response.content, 'html.parser')
example = soup.find('h1')
example.text

'Is a Career in Tech Recession-Proof?'

In [6]:
#Access the date published: the text of the first <span> with class "published"
example2 = soup.find('span', class_='published')
example2.text

'Aug 12, 2022'

In [7]:
#Access the article content: the text of the first <div> with class "entry-content"
example3 = soup.find('div', class_='entry-content')
example3.text

'\n\n\n\n\n\nGiven the current economic climate, many economists are considering the U.S. to be entering a recession. This can cause confusion, fear, and uncertainty, especially as it pertains to job security.'

In [8]:
#Create a list of links to scrape: five Codeup blog post urls
#This list feeds both the exploratory loop and get_blog_articles further down
links = ['https://codeup.com/codeup-news/dei-report/',
         'https://codeup.com/codeup-news/diversity-and-inclusion-award/',
         'https://codeup.com/featured/financing-career-transition/',
         'https://codeup.com/tips-for-prospective-students/tips-for-women/',
         'https://codeup.com/cloud-administration/cloud-computing-and-aws/']

In [9]:
#Loop through the links to collect the relevant information from the blog posts
#Each post becomes one dictionary with the keys title, date_published, category and content
article_info = []

for link in links:

    #Start a fresh record for this post
    info_dict = {}

    #Download and parse the page (this reassigns the response and soup names used for exploration above)
    response = get(link, headers=headers)

    soup = BeautifulSoup(response.content, 'html.parser')

    #Title: text of the first <h1>
    info_dict['title'] = soup.find('h1').text

    #Publication date: text of the first <span class="published">
    info_dict['date_published'] = soup.find('span', class_='published').text

    #Categories: text of every <a> whose rel attribute is "category tag", kept as a list
    lst = []
    cat = soup.find_all('a',rel="category tag")
    for kitty in cat:
        lst.append(kitty.text)

    info_dict['category'] = lst

    #Body: text of the first <div class="entry-content">
    info_dict['content'] = soup.find('div', class_='entry-content').text

    article_info.append(info_dict)
    

In [10]:
#Create a function to collect the information and cache it as a json file
def get_blog_articles(article_list):
    """
    Scrape Codeup blog posts and cache the results in blog_posts.json.

    If blog_posts.json already exists in the working directory, its contents
    are returned as-is and no request is made. The cache is not keyed on
    article_list, so delete the file to scrape a different set of urls.

    Parameters
    ----------
    article_list : iterable of str
        Urls of the blog posts to scrape.

    Returns
    -------
    list of dict
        One dictionary per post with the keys 'title', 'date_published',
        'category' (a list of category names) and 'content'.

    Raises
    ------
    requests.HTTPError
        If a request comes back with a 4xx/5xx status. Nothing is written to
        the cache in that case.
    AttributeError
        If a page has no <h1>, <span class="published"> or
        <div class="entry-content"> element.
    """
    cache_file = 'blog_posts.json'

    #Reuse earlier results when they are already on disk
    if os.path.exists(cache_file):
        with open(cache_file) as f:
            return json.load(f)

    headers = {'User-Agent': 'Codeup Data Science'}

    article_info = [_scrape_blog_article(url, headers) for url in article_list]

    #Only reached when every page was scraped, so a failed run never leaves a partial cache
    with open(cache_file, 'w') as f:
        json.dump(article_info, f)

    return article_info


def _scrape_blog_article(url, headers):
    """Download one Codeup blog post and return its details as a dictionary."""
    response = get(url, headers=headers)

    #Fail loudly on HTTP errors so an error page is never parsed and cached as an article
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    return {
        'title': soup.find('h1').text,
        'date_published': soup.find('span', class_='published').text,
        #A post can carry several category links, so this stays a list
        'category': [tag.text for tag in soup.find_all('a', rel="category tag")],
        'content': soup.find('div', class_='entry-content').text,
    }

In [11]:
#Run my function to make sure it works!
#NOTE: when blog_posts.json already exists this returns the cached contents and links is not used
article_info = get_blog_articles(links)
article_info

[{'title': 'Diversity Equity and Inclusion Report',
  'date_published': 'Oct 7, 2022',
  'category': ['Codeup News'],
  'content': '\nCodeup is excited to launch our first Diversity Equity, and Inclusion (DEI) report! In over eight years as an organization, we’ve implemented policies and grown our DEI efforts. We are extremely proud of the progress we’ve made as a staff and Codeup community, and we recognize there is more to learn. This report captures some of the ways that we’ve lived our value of Cultivating Inclusive Growth, and how we will continue doing so as we look to the future.\nWe wanted to shine a light on the demographics of our students and staff, and in particular how that compares to the tech industry as a whole. How we collect, organize, and share employee demographic data is informed by standards set by the Equal Employment Opportunity Commission (EEOC).\nWe are proud to celebrate how we’ve grown and are motivated and committed to do more and be better. To view the rep

## Exercise 2

News Articles

We will now be scraping text data from [inshorts](https://inshorts.com/), a website that provides a brief overview of many different topics.

Write a function that scrapes the news articles for the following topics:

Business  
Sports  
Technology  
Entertainment  

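The end product of this should be a function named `get_news_articles` that returns a list of dictionaries, where each dictionary has this shape:

```
{
    'title': 'The article title',
    'content': 'The article content',
    'category': 'business'  # for example
}
```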

In [12]:
#Make a request of the business page and parse the HTML
#Preview only the first 400 characters of the page text to keep the output short
response2 = get('https://inshorts.com/en/read/business')
soup2 = BeautifulSoup(response2.content, 'html.parser')
soup2.text[:400]

'\n\n\n\n\n    /* The Modal (background) */\n    .modal_contact {\n        display: none; /* Hidden by default */\n        position: fixed; /* Stay in place */\n        z-index: 8; /* Sit on top */\n        left: 0;\n        top: 0;\n        width: 100%; /* Full width */\n        height: 100%;\n        overflow: auto; /* Enable scroll if needed */\n        background-color: rgb(0,0,0); /* Fallback color */\n      '

In [13]:
#Access the titles: every <span> whose itemprop attribute is "headline"
titles = soup2.find_all('span', itemprop='headline')
titles

[<span itemprop="headline">HCLTech adds highest-ever 10,339 freshers, reports 23.8% attrition in Jul-Sept</span>,
 <span itemprop="headline">11 lakh railway employees to get 78 days' wages as productivity bonus</span>,
 <span itemprop="headline">Moonlighting a question of ethics, not legalities: Wipro CEO</span>,
 <span itemprop="headline">Retail inflation averages at 7.02% in Jul-Sept, down from 7.28% in Apr-Jun</span>,
 <span itemprop="headline">Mercedes-Benz, Microsoft collaborate to improve auto production</span>,
 <span itemprop="headline">Wipro to give 100% variable pay to 85% of employees in Q2 FY23: CEO</span>,
 <span itemprop="headline">MobiKwik raises more debt, changes ESOP policy: Report</span>,
 <span itemprop="headline">India-UK FTA on 'verge of collapse' over visa comments: Report</span>,
 <span itemprop="headline">OPEC cuts 2022 oil demand growth for 4th time amid rising inflation</span>,
 <span itemprop="headline">HCL Tech net profit increases 7% to ₹3,489 crore in Q2<

In [14]:
#Access the summaries: every <div> whose itemprop attribute is "articleBody"
summaries = soup2.find_all('div', itemprop='articleBody')
summaries

[<div itemprop="articleBody">HCLTech reported its highest-ever hiring of 10,339 freshers in July-September quarter, MD and CEO C Vijayakumar said on Wednesday. HCLTech's attrition rate in the quarter stood at 23.8%, same as that in April-June quarter. Meanwhile, the firm reported net employee addition of 8,359 employees in Q2 FY23, up from net employee addition of 2,089 employees in Q1 FY23.</div>,
 <div itemprop="articleBody">Union Minister Anurag Thakur on Wednesday announced that the Centre has approved a productivity-linked bonus equivalent to the wage of 78 days for eligible non-gazetted railway employees. This will benefit more than 11 lakh non-gazetted railway employees and it will cost the government approximately ₹1,832 crore. The maximum amount payable per eligible railway employee is ₹17,951 for 78 days.</div>,
 <div itemprop="articleBody">On being asked whether moonlighting is legal or illegal, Wipro CEO Thierry Delaporte said, "It isn't a question of legalities, it's a que

In [15]:
#Make sure I'm grabbing an equal number of titles and summaries
#They get matched up by position later, so the two counts need to agree
len(titles), len(summaries)

(25, 25)

In [16]:
#Define a function to scrape articles from one topic
def scrape_one_page(topic):
    """
    Scrape the inshorts page for one topic and return its articles.

    Parameters
    ----------
    topic : str
        Last segment of the inshorts url (for example 'business'); it is
        appended to 'https://inshorts.com/en/read/' and also stored as each
        article's category.

    Returns
    -------
    list of dict
        One dictionary per article with the keys 'title', 'content' and
        'category', in page order.

    Raises
    ------
    requests.HTTPError
        If the page request comes back with a 4xx/5xx status.
    """
    base_url = 'https://inshorts.com/en/read/'

    response = get(base_url + topic)

    #Stop on HTTP errors instead of quietly turning an error page into an empty article list
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    titles = soup.find_all('span', itemprop='headline')

    summaries = soup.find_all('div', itemprop='articleBody')

    #Headlines and bodies are matched by position; zip stops at the shorter of the two lists
    return [
        {'title': title.text, 'content': summary.text, 'category': topic}
        for title, summary in zip(titles, summaries)
    ]

In [17]:
#Test my function on the business page
#Each item should be a dictionary with title, content and category keys
business_test = scrape_one_page('business')
business_test

[{'title': 'HCLTech adds highest-ever 10,339 freshers, reports 23.8% attrition in Jul-Sept',
  'content': "HCLTech reported its highest-ever hiring of 10,339 freshers in July-September quarter, MD and CEO C Vijayakumar said on Wednesday. HCLTech's attrition rate in the quarter stood at 23.8%, same as that in April-June quarter. Meanwhile, the firm reported net employee addition of 8,359 employees in Q2 FY23, up from net employee addition of 2,089 employees in Q1 FY23.",
  'category': 'business'},
 {'title': "11 lakh railway employees to get 78 days' wages as productivity bonus",
  'content': 'Union Minister Anurag Thakur on Wednesday announced that the Centre has approved a productivity-linked bonus equivalent to the wage of 78 days for eligible non-gazetted railway employees. This will benefit more than 11 lakh non-gazetted railway employees and it will cost the government approximately ₹1,832 crore. The maximum amount payable per eligible railway employee is ₹17,951 for 78 days.',
  

In [18]:
#Define a function that will scrape information about an array of topics
def get_news_articles():
    """
    Return inshorts articles for the business, sports, technology and
    entertainment topics, using news_articles.json as a cache.

    When news_articles.json exists in the working directory its contents are
    returned and nothing is scraped. Otherwise every topic is scraped with
    scrape_one_page, the combined list is written to news_articles.json, and
    that list is returned.

    Returns
    -------
    list
        The article dictionaries produced by scrape_one_page, grouped by
        topic in the order listed above.
    """
    cache_path = 'news_articles.json'

    #No cache yet: scrape each topic in turn and save the combined result
    if not os.path.exists(cache_path):

        articles = []

        for section in ('business', 'sports', 'technology', 'entertainment'):
            articles += scrape_one_page(section)

        with open(cache_path, 'w') as handle:
            json.dump(articles, handle)

        return articles

    #Cache hit: hand back what was saved earlier
    with open(cache_path) as handle:
        return json.load(handle)

In [19]:
#Test my function!
#NOTE: when news_articles.json already exists this returns the cached contents without scraping
final_list = get_news_articles()
final_list

[{'title': 'HCLTech adds highest-ever 10,339 freshers, reports 23.8% attrition in Jul-Sept',
  'content': "HCLTech reported its highest-ever hiring of 10,339 freshers in July-September quarter, MD and CEO C Vijayakumar said on Wednesday. HCLTech's attrition rate in the quarter stood at 23.8%, same as that in April-June quarter. Meanwhile, the firm reported net employee addition of 8,359 employees in Q2 FY23, up from net employee addition of 2,089 employees in Q1 FY23.",
  'category': 'business'},
 {'title': "11 lakh railway employees to get 78 days' wages as productivity bonus",
  'content': 'Union Minister Anurag Thakur on Wednesday announced that the Centre has approved a productivity-linked bonus equivalent to the wage of 78 days for eligible non-gazetted railway employees. This will benefit more than 11 lakh non-gazetted railway employees and it will cost the government approximately ₹1,832 crore. The maximum amount payable per eligible railway employee is ₹17,951 for 78 days.',
  

In [20]:
#Confirm I've collected enough information: count the article dictionaries returned by get_news_articles
len(final_list)

99