# Web scraping exercises

In [1]:
#Disable autosave
%autosave 0

Autosave disabled


In [2]:
#Import dependencies
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import json

## Exercise 1

Codeup Blog Articles

Visit [Codeup's Blog](https://codeup.com/blog/) and record the urls for at least 5 distinct blog posts. For each post, you should scrape at least the post's title and content.

Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article.

URLs for blog posts:  
https://codeup.com/data-science/recession-proof-career/  
https://codeup.com/codeup-news/codeup-x-comic-con/  
https://codeup.com/featured/series-part-3-web-development/  
https://codeup.com/codeup-news/codeup-dallas-campus/  
https://codeup.com/codeup-news/codeup-tv-commercial/

In [3]:
#Define the request headers that are sent with the Codeup requests below
#NOTE(review): presumably a custom User-Agent is needed because the site rejects the default one - confirm
headers = {'User-Agent': 'Codeup Data Science'}

In [4]:
#Make a request
#Bound the wait with a timeout and stop on a 4xx/5xx status so an error page is never parsed as a blog post
response = get('https://codeup.com/data-science/recession-proof-career/', headers=headers, timeout=10)
response.raise_for_status()

In [5]:
#Create the soup and investigate
#Parse the response body with Python's built-in html.parser, then look up the first <h1>, which holds the post title
soup = BeautifulSoup(response.content, 'html.parser')
example = soup.find('h1')
example.text

'Is a Career in Tech Recession-Proof?'

In [6]:
#Access the date published
#The publication date is the text of the first <span class="published"> element
example2 = soup.find('span', class_='published')
example2.text

'Aug 12, 2022'

In [7]:
#Access the article content
#The article body is the text of the first <div class="entry-content"> element
example3 = soup.find('div', class_='entry-content')
example3.text

'\n\n\n\n\n\nGiven the current economic climate, many economists are considering the U.S. to be entering a recession. This can cause confusion, fear, and uncertainty, especially as it pertains to job security.'

In [8]:
#Create a list of links to scrape
#These blog post urls are looped over below and later passed to get_blog_articles
links = ['https://codeup.com/codeup-news/dei-report/',
         'https://codeup.com/codeup-news/diversity-and-inclusion-award/',
         'https://codeup.com/featured/financing-career-transition/',
         'https://codeup.com/tips-for-prospective-students/tips-for-women/',
         'https://codeup.com/cloud-administration/cloud-computing-and-aws/']

In [9]:
#Loop through the links to collect the relevant information from the blog posts
article_info = []

for link in links:

    #Bound the wait and stop on a 4xx/5xx status so an error page is never parsed as an article
    response = get(link, headers=headers, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    #A post can carry several category links, so 'category' keeps all of them as a list
    info_dict = {
        'title': soup.find('h1').text,
        'date_published': soup.find('span', class_='published').text,
        'category': [tag.text for tag in soup.find_all('a', rel="category tag")],
        'content': soup.find('div', class_='entry-content').text,
    }

    article_info.append(info_dict)
    

In [10]:
#Create a function to collect the information and cache it as a json file
def get_blog_articles(article_list):
    """
    Scrape Codeup blog posts and cache the results as 'blog_posts.json'.

    If 'blog_posts.json' already exists in the working directory its contents
    are returned as-is and no requests are made. The cache is not keyed on
    article_list, so delete the file to scrape a different set of urls.

    Parameters
    ----------
    article_list : iterable of str
        Urls of the blog posts to scrape.

    Returns
    -------
    list of dict
        One dictionary per article with the keys 'title', 'date_published',
        'category' (a list of category names) and 'content'.

    Raises
    ------
    requests.HTTPError
        If a url answers with a 4xx/5xx status. Nothing is cached in that case.
    AttributeError
        If a page lacks the <h1>, published <span> or entry-content <div>.
    """
    file = 'blog_posts.json'

    #Serve from the cache when it exists so repeated runs do not hit the site again
    if os.path.exists(file):
        with open(file) as f:
            return json.load(f)

    headers = {'User-Agent': 'Codeup Data Science'}

    article_info = []

    for article in article_list:

        #Bound the wait and stop on an HTTP error; otherwise an error page would be
        #parsed like an article and written to the cache
        response = get(article, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        article_info.append({
            'title': soup.find('h1').text,
            'date_published': soup.find('span', class_='published').text,
            #A post can carry several category links, so keep them all
            'category': [tag.text for tag in soup.find_all('a', rel="category tag")],
            'content': soup.find('div', class_='entry-content').text,
        })

    #Only reached when every article was scraped successfully
    with open(file, 'w') as f:
        json.dump(article_info, f)

    return article_info

In [11]:
#Run my function to make sure it works!
#If blog_posts.json already exists this returns the cached articles instead of re-scraping
article_info = get_blog_articles(links)
article_info

[{'title': 'Diversity Equity and Inclusion Report',
  'date_published': 'Oct 7, 2022',
  'category': ['Codeup News'],
  'content': '\nCodeup is excited to launch our first Diversity Equity, and Inclusion (DEI) report! In over eight years as an organization, we’ve implemented policies and grown our DEI efforts. We are extremely proud of the progress we’ve made as a staff and Codeup community, and we recognize there is more to learn. This report captures some of the ways that we’ve lived our value of Cultivating Inclusive Growth, and how we will continue doing so as we look to the future.\nWe wanted to shine a light on the demographics of our students and staff, and in particular how that compares to the tech industry as a whole. How we collect, organize, and share employee demographic data is informed by standards set by the Equal Employment Opportunity Commission (EEOC).\nWe are proud to celebrate how we’ve grown and are motivated and committed to do more and be better. To view the rep

## Exercise 2

News Articles

We will now be scraping text data from [inshorts](https://inshorts.com/), a website that provides a brief overview of many different topics.

Write a function that scrapes the news articles for the following topics:

Business  
Sports  
Technology  
Entertainment  

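The end product of this should be a function named `get_news_articles` that returns a list of dictionaries, where each dictionary has this shape:

```python
{
    'title': 'The article title',
    'content': 'The article content',
    'category': 'business'  # for example
}
```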

In [12]:
#Make a request of the business page
#Bound the wait with a timeout and stop on a 4xx/5xx status so an error page is never parsed as news
response2 = get('https://inshorts.com/en/read/business', timeout=10)
response2.raise_for_status()
soup2 = BeautifulSoup(response2.content, 'html.parser')
soup2.text[:400]

'\n\n\n\n\n    /* The Modal (background) */\n    .modal_contact {\n        display: none; /* Hidden by default */\n        position: fixed; /* Stay in place */\n        z-index: 8; /* Sit on top */\n        left: 0;\n        top: 0;\n        width: 100%; /* Full width */\n        height: 100%;\n        overflow: auto; /* Enable scroll if needed */\n        background-color: rgb(0,0,0); /* Fallback color */\n      '

In [13]:
#Access the titles
#Each headline sits in a <span itemprop="headline"> element
titles = soup2.find_all('span', itemprop='headline')
titles

[<span itemprop="headline">Bandhan Bank onboards Sourav Ganguly as brand ambassador</span>,
 <span itemprop="headline">Infosys let go of employees working for two companies in last 12 months: CEO</span>,
 <span itemprop="headline">Layoffs will be the absolute last thing at Zoho, it destroys loyalty: CEO</span>,
 <span itemprop="headline">Musk is under federal probe over his conduct in $44 billion deal, says Twitter</span>,
 <span itemprop="headline">Centre announces one-time aide for paddy straw pellet makers</span>,
 <span itemprop="headline">JPMorgan cuts ties with Kanye, gives him until Nov 21 to move assets</span>,
 <span itemprop="headline">US-based Bitcoin firm NYDIG lays off around 33% of its staff: Report</span>,
 <span itemprop="headline">US-based 6sense fires 150 workers globally, including India: Report</span>,
 <span itemprop="headline">India's 5G is indigenous and can be provided to other nations: FM</span>,
 <span itemprop="headline">Infosys not to mandate return to offic

In [14]:
#Access the summaries
#Each summary sits in a <div itemprop="articleBody"> element
summaries = soup2.find_all('div', itemprop='articleBody')
summaries

[<div itemprop="articleBody">Bandhan Bank has announced Sourav Ganguly as its brand ambassador. Fondly called 'Dada' and 'Maharaja of Indian Cricket', Sourav Ganguly will be the voice of Bandhan Bank, helping the brand take its message to the masses. "This association is another step towards connecting strongly with consumers across the spectrum and reinforcing our ongoing mission of inclusive banking," the brand stated.</div>,
 <div itemprop="articleBody">Infosys CEO Salil Parekh has revealed that the company let go of employees who were found to be working at two specific companies in the last 12 months. The fired employees were working for companies where there were issues regarding confidentiality, Parekh added. Last month, Wipro fired around 300 employees who were working directly for the IT company's competitors.</div>,
 <div itemprop="articleBody">Software startup Zoho's CEO Sridhar Vembu said that layoffs will be the "absolute last thing" that the company will ever consider. "I

In [15]:
#Make sure I'm grabbing an equal number of titles and summaries
#scrape_one_page pairs titles with summaries by position, so the two counts need to match
len(titles), len(summaries)

(25, 25)

In [16]:
#Define a function to scrape articles from one topic
def scrape_one_page(topic):
    """
    Scrape the inshorts page for one topic.

    Parameters
    ----------
    topic : str
        Topic slug appended to 'https://inshorts.com/en/read/', e.g. 'business'.

    Returns
    -------
    list of dict
        One dictionary per article with the keys 'title', 'content' and
        'category', where 'category' is the topic that was passed in.

    Raises
    ------
    requests.HTTPError
        If the page answers with a 4xx/5xx status.
    """
    base_url = 'https://inshorts.com/en/read/'

    #Bound the wait and stop on an HTTP error so an error page does not
    #silently produce an empty result
    response = get(base_url + topic, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    titles = soup.find_all('span', itemprop='headline')

    summaries = soup.find_all('div', itemprop='articleBody')

    #Headlines and summaries are paired by position; zip keeps only complete
    #pairs instead of raising IndexError when a summary is missing
    return [
        {'title': title.text, 'content': summary.text, 'category': topic}
        for title, summary in zip(titles, summaries)
    ]

In [17]:
#Test my function on the business page
#Every returned dictionary should carry 'business' as its category
business_test = scrape_one_page('business')
business_test

[{'title': 'Bandhan Bank onboards Sourav Ganguly as brand ambassador',
  'content': 'Bandhan Bank has announced Sourav Ganguly as its brand ambassador. Fondly called \'Dada\' and \'Maharaja of Indian Cricket\', Sourav Ganguly will be the voice of Bandhan Bank, helping the brand take its message to the masses. "This association is another step towards connecting strongly with consumers across the spectrum and reinforcing our ongoing mission of inclusive banking," the brand stated.',
  'category': 'business'},
 {'title': 'Infosys let go of employees working for two companies in last 12 months: CEO',
  'content': "Infosys CEO Salil Parekh has revealed that the company let go of employees who were found to be working at two specific companies in the last 12 months. The fired employees were working for companies where there were issues regarding confidentiality, Parekh added. Last month, Wipro fired around 300 employees who were working directly for the IT company's competitors.",
  'catego

In [18]:
#Define a function that will scrape information about an array of topics
def get_news_articles():
    """
    Collect inshorts articles for the business, sports, technology and
    entertainment topics, caching them as 'news_articles.json'.

    When the cache file already exists its contents are returned and nothing
    is scraped. Otherwise each topic is scraped with scrape_one_page, the
    combined list is written to the cache file and then returned.

    Returns
    -------
    list of dict
        The article dictionaries of all topics, in topic order.
    """
    cache_path = 'news_articles.json'

    if not os.path.exists(cache_path):

        #Flatten the per-topic lists into one list, keeping the topic order
        articles = [
            article
            for topic in ('business', 'sports', 'technology', 'entertainment')
            for article in scrape_one_page(topic)
        ]

        with open(cache_path, 'w') as cache:
            json.dump(articles, cache)

        return articles

    with open(cache_path) as cache:
        return json.load(cache)

In [19]:
#Test my function!
#If news_articles.json already exists this returns the cached articles instead of re-scraping
final_list = get_news_articles()
final_list

[{'title': 'Bandhan Bank onboards Sourav Ganguly as brand ambassador',
  'content': 'Bandhan Bank has announced Sourav Ganguly as its brand ambassador. Fondly called \'Dada\' and \'Maharaja of Indian Cricket\', Sourav Ganguly will be the voice of Bandhan Bank, helping the brand take its message to the masses. "This association is another step towards connecting strongly with consumers across the spectrum and reinforcing our ongoing mission of inclusive banking," the brand stated.',
  'category': 'business'},
 {'title': 'Musk is under federal probe over his conduct in $44 billion deal, says Twitter',
  'content': 'Twitter has claimed that the world\'s richest person Elon Musk is being investigated by federal authorities over his conduct in his $44-billion takeover deal. Attorneys for the Tesla CEO claimed "investigative privilege" when refusing to hand over documents, Twitter said in a court filing. "This game of \'hide the ball\' must end," the company, which sued Musk in July, added.'

In [20]:
#Confirm I've collected enough information
#Counts the articles gathered across all four topics
len(final_list)

99