In [1]:
import csv
import json
import re
from time import sleep
from urllib.parse import unquote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [19]:
# # # # NEWS API STREAMER # # # #
class ArticleStreamer():
    """
    Scrape Yahoo News search results into a pandas DataFrame.

    ``get_the_news`` walks the paginated result pages for a query and
    ``get_article`` turns one result card into a plain tuple.
    """

    SEARCH_URL = 'https://news.search.yahoo.com/search'

    # Result links are wrapped in a redirect shaped like ".../RU=<target>/RK=...";
    # group 1 captures the real target URL.
    _REDIRECT_TARGET = re.compile(r'RU=(.+)\/RK')

    def __init__(self):
        pass

    @staticmethod
    def _text_of(card, tag, css_class):
        """Return the text of the first matching child of ``card``, or '' if absent."""
        node = card.find(tag, css_class)
        return node.text if node is not None else ''

    def get_article(self, card):
        """
        Extract article information from the raw html of one result card.

        Parameters
        ----------
        card : element exposing a BeautifulSoup-style ``find(tag, css_class)``.

        Returns
        -------
        tuple
            ``(headline, source, posted, description, clean_link)``. A missing
            element yields an empty string instead of raising, so one malformed
            card cannot abort a whole scrape. When the link is not wrapped in
            the redirect pattern, the unquoted link itself is returned.
        """
        headline = self._text_of(card, 'h4', 's-title')
        source = self._text_of(card, 'span', 's-source')
        # The timestamp text carries a leading middle-dot separator.
        posted = self._text_of(card, 'span', 's-time').replace('·', '').strip()
        description = self._text_of(card, 'p', 's-desc').strip()

        anchor = card.find('a')
        raw_link = anchor.get('href') if anchor is not None else None
        unquoted_link = unquote(raw_link) if raw_link else ''
        redirect = self._REDIRECT_TARGET.search(unquoted_link)
        clean_link = redirect.group(1) if redirect else unquoted_link

        return (headline, source, posted, description, clean_link)

    def get_the_news(self, query, headers=None, timeout=10):
        """
        Collect every search result for ``query``, following "next" links.

        Parameters
        ----------
        query : str
            Raw (not URL-encoded) search terms; encoding is done by requests.
        headers : dict, optional
            HTTP request headers. When omitted, a module-level ``headers``
            dict is used if one exists (this method used to read that global
            implicitly); otherwise requests' own defaults apply.
        timeout : float, optional
            Seconds to wait for each HTTP response before requests raises.

        Returns
        -------
        pandas.DataFrame
            One row per distinct article link, with columns
            ``title, outlet, uploaded, description, url``.
        """
        article_headers = ['title', 'outlet', 'uploaded', 'description', 'url']

        if headers is None:
            # Backward compatibility with callers that only define a global dict.
            headers = globals().get('headers')

        articles = []
        links = set()
        visited = set()

        url = self.SEARCH_URL
        params = {'p': query}

        # The `visited` guard stops pagination if a "next" link ever loops back.
        while url and url not in visited:
            visited.add(url)
            response = requests.get(url, params=params, headers=headers, timeout=timeout)
            # "next" hrefs are used as-is, so the query is only attached once.
            params = None
            soup = BeautifulSoup(response.text, 'html.parser')

            # extract articles from page
            for card in soup.find_all('div', 'NewsArticle'):
                article = self.get_article(card)
                link = article[-1]
                # De-duplicate on the link; cards without any link are all kept.
                if link and link in links:
                    continue
                links.add(link)
                articles.append(article)

            # find the next page
            next_anchor = soup.find('a', 'next')
            if next_anchor is None:
                break
            url = next_anchor.get('href')
            if not url:
                break
            sleep(1)  # pause between page requests

        all_articles = pd.DataFrame(articles, columns=article_headers)

        return all_articles

In [16]:
class ArticleCleaner():
    """
    Functionality for importing and cleaning news articles
    """

    # Compiled once and written as a raw string so that \w, \/ and \S reach the
    # regex engine as written instead of being invalid string escapes.
    # Alternatives, in order: @mentions, any character that is not an ASCII
    # letter, digit, space or tab, and scheme://... URLs.
    _NOISE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

    def import_json(self, fetched_json_file):
        """
        Load a JSON file and flatten its ``articles`` entries.

        Parameters
        ----------
        fetched_json_file : path or buffer accepted by ``pandas.read_json``.
            The parsed frame must have an ``articles`` column.

        Returns
        -------
        pandas.DataFrame
            ``pandas.json_normalize`` applied to the ``articles`` column.
        """
        # Import json and normalize data
        df = pd.read_json(fetched_json_file)
        norm_articles = pd.json_normalize(df['articles'])

        return norm_articles

    def clean_article(self, article):
        """
        Strip mentions, URLs and punctuation from ``article`` and squeeze whitespace.

        Each match of the noise pattern is replaced by a space, then runs of
        whitespace are collapsed to single spaces. Non-string input (e.g.
        ``None`` or NaN from a sparse column) returns an empty string rather
        than raising ``TypeError``.
        """
        if not isinstance(article, str):
            return ''
        return ' '.join(self._NOISE.sub(' ', article).split())

    def create_article(self, articles):
        """
        Split a normalized articles frame into article and outlet views.

        Parameters
        ----------
        articles : pandas.DataFrame
            Must contain the columns ``author``, ``title``, ``description``,
            ``url``, ``source.name``, ``publishedAt`` and ``source.id``;
            pandas raises ``KeyError`` if any is missing.

        Returns
        -------
        tuple of (pandas.DataFrame, pandas.DataFrame)
            ``(article, outlet)``: the article columns, and the
            ``source.id`` / ``source.name`` pair for every row.
        """
        article = articles[['author', 'title', 'description', 'url', 'source.name', 'publishedAt']]
        outlet = articles[['source.id', 'source.name']]

        return article, outlet

In [24]:
if __name__ == '__main__':
    # Scrape Yahoo News for one query, add cleaned title/description columns,
    # and write the result to a CSV in the working directory.

    # Not used by the scraping flow below; kept for ArticleCleaner.import_json.
    fetched_json_file = './news_articles.json'
    query = 'cryptocurrency'
    # Browser-like request headers. ArticleStreamer.get_the_news reads this dict
    # through the module-level name `headers`, so keep the name unchanged.
    # NOTE(review): advertising 'br' presumably needs a brotli decoder installed
    # for requests to decode such responses; confirm, or drop 'br'.
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'referer': 'https://www.google.com',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
    }

    article_cleaner = ArticleCleaner()
    article_streamer = ArticleStreamer()

    all_articles = article_streamer.get_the_news(query)

    # articles, outlets = article_cleaner.create_article(all_articles)

    # Element-wise cleaning of the two free-text columns.
    all_articles['clean_title'] = all_articles['title'].map(article_cleaner.clean_article)
    all_articles['clean_description'] = all_articles['description'].map(article_cleaner.clean_article)

    # Forward-slash path: the previous r'.\results.csv' only meant "current
    # directory" on Windows and produced a file literally named '.\results.csv'
    # elsewhere.
    all_articles.to_csv('./results.csv', index=False)
    # outlets.to_csv('./outlets.csv', index=False)

In [21]:
# Display the scraped articles; pandas shows long frames truncated (head and tail rows).
all_articles

Unnamed: 0,title,outlet,uploaded,description,url
0,Ethereum price hits record high amid ‘cryptocu...,The Independent via Yahoo News,1 hour ago,"The cryptocurrency reached $1,430 (£1,044) on ...",https://www.yahoo.com/finance/news/ethereum-pr...
1,Trustee of Collapsed Exchange Moves to Resolve...,CoinDesk via Yahoo Finance,3 hours ago,"Ernst and Young (EY), the bankruptcy trustee f...",https://finance.yahoo.com/news/trustee-collaps...
2,Bitcoin Steady as Analysts Say Getting Back to...,Bloomberg via Yahoo Finance,2 days ago,"Bitcoin hovered near $36,000 on Monday, below ...",https://finance.yahoo.com/news/bitcoin-retreat...
3,Bitcoin Crash Is Excellent Opportunity to Buy ...,InvestorPlace via Yahoo Finance,5 days ago,The cryptocurrency is still up roughly 89% on ...,https://finance.yahoo.com/news/bitcoin-crash-e...
4,First Mover: Ethereum Steals Limelight With Ne...,CoinDesk via Yahoo Finance,22 hours ago,“This period of consolidation is building a so...,https://finance.yahoo.com/news/first-mover-eth...
...,...,...,...,...,...
744,Five things you need to know about on the mark...,Business Insider,4 days ago,The stock market has shrugged off turmoil in D...,https://www.businessinsider.com/markets-outloo...
745,Is NVIDIA About to Bring More Pain to AMD?,Motley Fool,3 days ago,NVIDIA (NASDAQ: NVDA) and Advanced Micro Devic...,https://www.fool.com/investing/2021/01/17/is-n...
746,Will Bitcoin Or Ethereum Grow More By 2022?,Benzinga,4 days ago,"Every week, Benzinga conducts a survey to coll...",https://www.benzinga.com/markets/cryptocurrenc...
747,"Biden picks Chopra, Gensler for financial over...",WBRC Fox 6 Birmingham,2 days ago,Biden announced his intent to nominate Chopra ...,https://www.wbrc.com/2021/01/18/biden-picks-ch...
