In [1]:
import re
import csv
from time import sleep
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd
import numpy as np

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
# # # # NEWS API STREAMER # # # #
class ArticleStreamer():
    """
    Class for streaming and processing articles daily
    """

    def __init__(self):
        pass

    def get_article(self, card):
        """Extract article information from the raw html"""
        headline = card.find('h4', 's-title').text
        source = card.find("span", 's-source').text
        posted = card.find('span', 's-time').text.replace('·', '').strip()
        description = card.find('p', 's-desc').text.strip()
        raw_link = card.find('a').get('href')
        unquoted_link = requests.utils.unquote(raw_link)
        pattern = re.compile(r'RU=(.+)\/RK')
        clean_link = re.search(pattern, unquoted_link).group(1)

        article = (headline, source, posted, description, clean_link)
        return article

    def get_the_news(self, query):
        """Run the main program"""
        article_headers = ['title', 'outlet', 'uploaded', 'description', 'url']
        
        template = 'https://news.search.yahoo.com/search?p={}'
        url = template.format(query)
        articles = []
        links = set()

        while True:
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            cards = soup.find_all('div', 'NewsArticle')

            # extract articles from page
            for card in cards:
                article = self.get_article(card)
                link = article[-1]
                if not link in links:
                    links.add(link)
                    articles.append(article)

                    # find the next page
            try:
                url = soup.find('a', 'next').get('href')
                sleep(1)
            except AttributeError:
                break
        
        all_articles = pd.DataFrame(articles, columns = article_headers)

        return all_articles

In [3]:
class ArticleCleaner():
    """
    Functionality for importing and cleaning news articles
    """

    def import_json(self, fetched_json_file):
        # Import json and normalize data
        df = pd.read_json(fetched_json_file)
        norm_articles = pd.json_normalize(df['articles'])

        return norm_articles

    def clean_article(self, article):
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", article).split())

    def create_article(self, articles):
        article = articles[['author', 'title', 'description', 'url', 'source.name', 'publishedAt']]
        outlet = articles[['source.id', 'source.name']]

        return article, outlet

In [4]:
class SentimentAnalyzer():
    """
    Functionality for analyzing headlines sentiment
    """
    def __init__(self):
        pass
    
    
    def analyze_headlines(self, data):
        # New words and values
        new_words = {
            'crushes': 10,
            'beats': 5,
            'misses': -5,
            'trouble': -10,
            'falls': -100,
            'slides': -50,
            'record high': 15,
            'low': -15,
            'one week low': -30,
            'worth more': 5,
            'digital gold': 5,
            'high': 15,
            'cryptocurrency fund': 10,
            'up': 5,
            'soars': 70,
            'rebound': 20,
            'pullback': -40,
            'slumps': -60,
            'jumps': 50,
            'record low': -100,
            'soaring': 70,
            'bearish': -50,
            'bullish': 50,
            'bulls': 10,
            'bears' : -10,
            'hodl': 10,
            'pulls back': -40,
            'selloff': -70,
            'retrace': -70,
            'drop': -50,
            'buying': 10,
            'selling': -10,
            'rally': 15,
            'bounces': 20,
            'testing support': -5,
            'climb': 5,
            'rise': 20,
            'crashes': -100,
            'crash': -100,
            'downward': -30,

        }
        
        # Instantiate the sentiment intensity analyzer with the existing lexicon
        vader = SentimentIntensityAnalyzer()
        
        # Update the lexicon
        vader.lexicon.update(new_words)
        
        data = self.reading_dataset()
        
        # Iterate through the headlines and get the polarity scores
        scores = data['Headlines'].apply(vader.polarity_scores)
        
        # Convert the list of dicts into a DataFrame
        scores_df = pd.DataFrame.from_records(scores)

        # Join the DataFrames
        scored_news = data.join(scores_df)
        
        scored_news['assigned_label'] = scored_news['Sentiment'].apply(lambda Sentiment: 'pos' if Sentiment>0 else 'neg')
        scored_news['predicted_label'] = scored_news['compound'].apply(lambda compound: 'pos' if compound>=0 else 'neg')
        
        return scored_news

In [None]:
if __name__ == '__main__':
    query = 'cryptocurrency'
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'referer': 'https://www.google.com',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
    }

    article_cleaner = ArticleCleaner()
    article_streamer = ArticleStreamer()
    sentiment_analyzer = SentimentAnalyzer()

    all_articles = article_streamer.get_the_news(query)

    clean_title = np.array([article_cleaner.clean_article(article) for article in all_articles['title']])
    clean_desc = np.array([article_cleaner.clean_article(article) for article in all_articles['description']])

    all_articles['clean_title'] = clean_title
    all_articles['clean_description'] = clean_desc
    
    scored_news = sentiment_analyzer.analyze_headlines(all_articles['clean_title'])
    
    scored_news.to_csv(r'.\sentiment.csv', index=False)

In [26]:
scored_news

Unnamed: 0,Headlines,Sentiment,neg,neu,pos,compound,assigned_label,predicted_label
0,UPDATE 1 Bitcoin slides more than 5 after topp...,0,0.836,0.164,0.000,-0.9970,neg,neg
1,UPDATE 1 Bitcoin trades near Sunday record of ...,1,0.000,1.000,0.000,0.0000,pos,pos
2,Don t Use Telegram s New People Nearby Feature.,0,0.000,1.000,0.000,0.0000,neg,pos
3,UPDATE 1 Bitcoin hits one week low as rising U...,0,0.381,0.238,0.381,0.0000,neg,pos
4,Jack Dorsey criticized proposed cryptocurrency...,0,0.200,0.800,0.000,-0.3612,neg,neg
...,...,...,...,...,...,...,...,...
278,Guggenheim CIO expects Bitcoin to drop to 20 000,0,0.864,0.136,0.000,-0.9970,neg,neg
279,drop,0,1.000,0.000,0.000,-0.9970,neg,neg
280,Bitcoin Price Could Retrace to $20K This Year:...,0,0.917,0.083,0.000,-0.9995,neg,neg
281,Why did Bitcoin fall below $33K? Coinbase whal...,0,0.000,1.000,0.000,0.0000,neg,pos


In [30]:
accuracy_score(scored_news['assigned_label'],scored_news['predicted_label'])

0.7455830388692579

In [32]:
confusion_matrix(scored_news['assigned_label'],scored_news['predicted_label'])

array([[ 59,  49],
       [ 23, 152]], dtype=int64)

In [44]:
high_news = scored_news[(scored_news['compound'] > .5) | (scored_news['compound'] < -0.5)]

In [45]:
high_news

Unnamed: 0,Headlines,Sentiment,neg,neu,pos,compound,assigned_label,predicted_label
0,UPDATE 1 Bitcoin slides more than 5 after topp...,0,0.836,0.164,0.000,-0.9970,neg,neg
5,Bitcoin trading at 32 990 off Sunday record hi...,1,0.000,0.407,0.593,0.9682,pos,pos
8,Bitcoin hits one week high.,1,0.000,0.200,0.800,0.9682,pos,pos
10,Breakingviews Chancellor Was I totally wrong a...,1,0.361,0.639,0.000,-0.5256,pos,neg
15,Bitcoin Keeps Moving On Up Reaching 33 000 in ...,1,0.000,0.407,0.593,0.8807,pos,pos
...,...,...,...,...,...,...,...,...
274,Long term holders remain confident about Bitco...,1,0.000,0.385,0.615,0.9756,pos,pos
276,Ethereum shoots past 1 400 sets new all time high,1,0.000,0.333,0.667,0.9682,pos,pos
278,Guggenheim CIO expects Bitcoin to drop to 20 000,0,0.864,0.136,0.000,-0.9970,neg,neg
279,drop,0,1.000,0.000,0.000,-0.9970,neg,neg


In [47]:
high_news.count()

Headlines          107
Sentiment          107
neg                107
neu                107
pos                107
compound           107
assigned_label     107
predicted_label    107
dtype: int64

In [46]:
accuracy_score(high_news['assigned_label'],high_news['predicted_label'])

0.8691588785046729