# Natural Language Processing
### Acquiring data from the internet

In [1]:
import os
import time
from random import randint
from time import sleep
# Must stay after `import time`: rebinds the name `time` to the time.time function.
from time import time
from warnings import warn

import pandas as pd
from bs4 import BeautifulSoup
from IPython.core.display import clear_output
from requests import get

By the end of this exercise, you should have a file named acquire.py that contains the specified functions. If you wish, you may break your work into separate files for each website (e.g. acquire_codeup_blog.py and acquire_news_articles.py), but the final functions should still be importable from acquire.py (that is, acquire.py should import get_blog_articles from the acquire_codeup_blog module).

Codeup Blog Articles

Scrape the article text from the following pages:

    https://codeup.com/codeups-data-science-career-accelerator-is-here/
    https://codeup.com/data-science-myths/
    https://codeup.com/data-science-vs-data-analytics-whats-the-difference/
    https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/
    https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/

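Encapsulate your work in a function named get_blog_articles that returns a list of dictionaries, with each dictionary representing one article. Each dictionary should have this shape:

    {
        'title': 'the title of the article',
        'date_published': 'the publication date shown on the page',
        'article': 'the full text of the article'
    }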

In [2]:
# Fetch a single blog post so its HTML can be explored before writing the scraper.
blog_url = 'https://codeup.com/codeups-data-science-career-accelerator-is-here/'
# Explicit User-Agent header sent with the request.
# NOTE: this notebook-level `headers` is also used by the first Inshorts request cell below.
headers = {"User-Agent": "Codeup Data Science"}
response = get(blog_url, headers=headers)

In [3]:
# Parse the response body using the 'html.parser' backend.
soup = BeautifulSoup(response.text, 'html.parser')

In [20]:
# Publication date: text of the <time> element whose itemprop is 'datePublished'.
soup.find('time', itemprop='datePublished').text

'September 30, 2018'

In [None]:
# September 30, 2018  (the publication date extracted in the cell above)

In [5]:
# Article body: text of the <div> with class 'jupiterx-post-content'.
soup.find('div', class_='jupiterx-post-content').text

'The rumors are true! The time has arrived. Codeup has officially opened applications to our new Data Science career accelerator, with only 25 seats available! This immersive program is one of a kind in San Antonio, and will help you land a job in\xa0Glassdoor’s #1 Best Job in America.\nData Science is a method of providing actionable intelligence from data.\xa0The data revolution has hit San Antonio,\xa0resulting in an explosion in Data Scientist positions\xa0across companies like USAA, Accenture, Booz Allen Hamilton, and HEB. We’ve even seen\xa0UTSA invest $70 M for a Cybersecurity Center and School of Data Science.\xa0We built a program to specifically meet the growing demands of this industry.\nOur program will be 18 weeks long, full-time, hands-on, and project-based. Our curriculum development and instruction is led by Senior Data Scientist, Maggie Giust, who has worked at HEB, Capital Group, and Rackspace, along with input from dozens of practitioners and hiring partners. Student

In [21]:
def get_blog_articles():
    '''
    Return the Codeup blog articles as a list of dictionaries.

    Each dictionary has the keys 'title', 'date_published' and 'article'.
    If 'blog_posts.csv' exists in the working directory it is used as a cache
    and no requests are made. Otherwise the five hard-coded blog URLs are
    scraped, and the result is cached to that file when every page was
    retrieved successfully.

    Returns
    -------
    list of dict
        One dictionary per article, for both the cached and the freshly
        scraped case.

    Warns
    -----
    UserWarning
        For every URL that does not answer with HTTP 200; that page is skipped.
    '''
    file_name = 'blog_posts.csv'

    if os.path.isfile(file_name):
        cached = pd.read_csv(file_name, index_col=False)
        # Cache files written together with the DataFrame index carry a
        # spurious 'Unnamed: 0' column; drop it so the keys stay as documented.
        cached = cached.drop(columns='Unnamed: 0', errors='ignore')
        return cached.to_dict('records')

    request_count = 0
    start_time = time()
    blog_posts = []
    every_page_ok = True
    headers = {"User-Agent": "Codeup Data Science"}

    blogs = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/',
             'https://codeup.com/data-science-myths/',
             'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/',
             'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
             'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']

    for blog in blogs:
        response = get(blog, headers=headers)

        # Pause 1-3 seconds after each request so the site is not hammered.
        sleep(randint(1, 3))
        request_count += 1
        elapsed_time = time() - start_time

        # Progress line; clear_output(wait=True) replaces it on the next print.
        print('Request: {}; Frequency: {} requests/s'.format(request_count, request_count/elapsed_time))
        clear_output(wait=True)

        if response.status_code != 200:
            warn(f"Request {blog}, Status Code {response.status_code}")
            # Do not try to parse an error page.
            every_page_ok = False
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        title = soup.find('title').text
        date_published = soup.find('time', itemprop='datePublished').text
        article = soup.find('div', class_='jupiterx-post-content').text

        blog_posts.append({'title': title,
                           'date_published': date_published,
                           'article': article})

    # Only cache a complete scrape: a partial or empty file would otherwise be
    # served as the full result on every later call.
    if blog_posts and every_page_ok:
        pd.DataFrame(blog_posts).to_csv(file_name, index=False)
    return blog_posts

In [22]:
# Show the first entry returned by get_blog_articles().
get_blog_articles()[0]

{'title': 'Codeup’s Data Science Career Accelerator is Here! - Codeup',
 'date_published': 'September 30, 2018',
 'article': 'The rumors are true! The time has arrived. Codeup has officially opened applications to our new Data Science career accelerator, with only 25 seats available! This immersive program is one of a kind in San Antonio, and will help you land a job in\xa0Glassdoor’s #1 Best Job in America.\nData Science is a method of providing actionable intelligence from data.\xa0The data revolution has hit San Antonio,\xa0resulting in an explosion in Data Scientist positions\xa0across companies like USAA, Accenture, Booz Allen Hamilton, and HEB. We’ve even seen\xa0UTSA invest $70 M for a Cybersecurity Center and School of Data Science.\xa0We built a program to specifically meet the growing demands of this industry.\nOur program will be 18 weeks long, full-time, hands-on, and project-based. Our curriculum development and instruction is led by Senior Data Scientist, Maggie Giust, wh

## Inshorts: Stay Informed - Web Scraping

In [None]:
# Fetch the Inshorts landing page, reusing the notebook-level `headers` dict defined earlier.
# NOTE: this rebinds the `response` and `soup` names used in the Codeup exploration cells.
response = get('https://inshorts.com/en/read', headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
# Collect every <div> whose class attribute is 'news-card z-depth-1'.
article_containers = soup.find_all('div', class_='news-card z-depth-1')

In [None]:
# Count the article containers found on the page (the author observed 25 articles per page).
len(article_containers)

In [None]:
# Preview the first container: headline text, a blank line, then the article body text.
print('Title')
print(article_containers[0].find('span', itemprop='headline').text, end='\n\n')
print('Article')
print(article_containers[0].find('div', itemprop="articleBody").text)

In [28]:
def get_news_articles():
    '''
    Return Inshorts news articles as a list of dictionaries.

    Each dictionary has the keys 'title', 'content' and 'category'.
    If 'news_articles.csv' exists in the working directory it is used as a
    cache and no requests are made. Otherwise the 'business', 'sports',
    'technology' and 'entertainment' pages are scraped, and the result is
    cached to that file when every page was retrieved successfully.

    Returns
    -------
    list of dict
        One dictionary per article, for both the cached and the freshly
        scraped case.

    Warns
    -----
    UserWarning
        For every topic page that does not answer with HTTP 200; that topic
        is skipped.
    '''
    file_name = 'news_articles.csv'

    if os.path.isfile(file_name):
        cached = pd.read_csv(file_name, index_col=False)
        # Cache files written together with the DataFrame index carry a
        # spurious 'Unnamed: 0' column; drop it so the keys stay as documented.
        cached = cached.drop(columns='Unnamed: 0', errors='ignore')
        return cached.to_dict('records')

    topics = ['business', 'sports', 'technology', 'entertainment']
    # HTTP header names use a hyphen: 'User-Agent', not 'User_Agent'.
    headers = {'User-Agent': 'Promeos'}
    collection = []
    every_page_ok = True
    request_count = 0
    start_time = time()

    for topic in topics:
        inshorts_url = f'https://inshorts.com/en/read/{topic}'

        # Pass headers by keyword: the second positional argument of
        # requests.get is `params` (the query string), not the headers.
        response = get(inshorts_url, headers=headers)

        # Pause 1-3 seconds after each request so the site is not hammered.
        sleep(randint(1, 3))

        request_count += 1
        elapsed_time = time() - start_time
        # Progress line; clear_output(wait=True) replaces it on the next print.
        print('Request: {}; Frequency: {} requests/s'.format(request_count, request_count/elapsed_time))
        clear_output(wait=True)

        if response.status_code != 200:
            warn(f"Request {inshorts_url}, Status Code {response.status_code}")
            # Do not try to parse an error page.
            every_page_ok = False
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        article_containers = soup.find_all('div', class_='news-card z-depth-1')

        for article in article_containers:
            title = article.find('span', itemprop='headline').text
            content = article.find('div', itemprop='articleBody').text
            collection.append({'title': title,
                               'content': content,
                               'category': topic})

    # Write the cache once, and only for a complete scrape: a partial or empty
    # file would otherwise be served as the full result on every later call.
    if collection and every_page_ok:
        pd.DataFrame(collection).to_csv(file_name, index=False)
    return collection

In [29]:
# Show the first entry returned by get_news_articles().
get_news_articles()[0]

{'title': "Moderna's early data shows its COVID-19 vaccine is 94.5% effective",
 'content': "American biotechnology company Moderna on Monday announced its experimental vaccine was 94.5% effective in preventing COVID-19 based on interim data from a late-stage clinical trial. Moderna's interim analysis was based on 95 infections among trial participants who received either a placebo or the vaccine. Among those, only five infections occurred in those who received the vaccine.",
 'category': 'business'}