In [252]:
import re
import csv
from time import sleep
import requests
import json
import pandas as pd
import numpy as np
import requests, bs4
from bs4 import BeautifulSoup
import datetime
from datetime import datetime, timedelta, date

import pandas_datareader as pdr
import matplotlib.pyplot as plt

from yahoofinancials import YahooFinancials
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [190]:
# # # # NEWS API STREAMER # # # #
class ArticleStreamer():
    """
    Class for streaming and processing articles daily
    """

    def __init__(self):
        pass

    def get_article(self, card):
        """Extract article information from the raw html"""
        headline = card.find('h4', 's-title').text
        source = card.find("span", 's-source').text
        posted = card.find('span', 's-time').text.replace('·', '').strip()
        description = card.find('p', 's-desc').text.strip()
        raw_link = card.find('a').get('href')
        unquoted_link = requests.utils.unquote(raw_link)
        pattern = re.compile(r'RU=(.+)\/RK')
        clean_link = re.search(pattern, unquoted_link).group(1)

        article = (headline, source, posted, description, clean_link)
        return article

    def get_the_news(self, query):
        """Run the main program"""
        article_headers = ['title', 'outlet', 'uploaded', 'description', 'url']
        
        template = 'https://news.search.yahoo.com/search?p={}'
        url = template.format(query)
        articles = []
        links = set()

        while True:
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            cards = soup.find_all('div', 'NewsArticle')

            # extract articles from page
            for card in cards:
                article = self.get_article(card)
                link = article[-1]
                if not link in links:
                    links.add(link)
                    articles.append(article)

                    # find the next page
            try:
                url = soup.find('a', 'next').get('href')
                sleep(1)
            except AttributeError:
                break
        
        all_articles = pd.DataFrame(articles, columns = article_headers)

        #for article in all_articles:
        #    if article.uploaded
        
        return all_articles

In [257]:
class ArticleCleaner():
    """
    Functionality for importing and cleaning news articles
    """

    def import_json(self, fetched_json_file):
        # Import json and normalize data
        df = pd.read_json(fetched_json_file)
        norm_articles = pd.json_normalize(df['articles'])

        return norm_articles

    def clean_article(self, article):
        
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", article).split())

    def create_article(self, articles):
        article = articles[['author', 'title', 'description', 'url', 'source.name', 'publishedAt']]
        outlet = articles[['source.id', 'source.name']]

        return article, outlet
    
    def get_recent_articles(self, all_articles):
        recent_articles = pd.DataFrame()

        for index, row in all_articles.iterrows():
            if 'hour' in row.uploaded:
                recent_articles = recent_articles.append(row)

            if 'minute' in row.uploaded:
                recent_articles = recent_articles.append(row)

            if 'second' in row.uploaded:
                recent_articles = recent_articles.append(row)

        time = []
        hours = recent_articles['uploaded'].str.split(' ')
        df = pd.DataFrame(hours.values.tolist(), index=hours.index)
        hours_ago = df[0]

        for index, hours in hours_ago.items():
            if hours < 24:
                d = datetime.today() - timedelta(hours=int(hours), minutes=0)
                time.append(d.strftime("%Y-%m-%d %H:%M"))
            else:
                d = datetime.today() - timedelta(hours=0, minutes=int(hours))
                time.append(d.strftime("%Y-%m-%d %H:%M"))
        
        recent_articles['publishedAt'] = time
        recent_articles = recent_articles.drop(['uploaded'])

        return recent_articles

In [192]:
class SentimentAnalyzer():
    """
    Functionality for analyzing headlines sentiment
    """
    def __init__(self):
        self.data = pd.read_csv('./datasets/headlines_labelled.txt',
                        sep='\t', header= None)
        self.scores_df = pd.DataFrame()
        
        # New words and values
        self.new_words = {
            'crushes': 10,
            'beats': 5,
            'misses': -5,
            'trouble': -10,
            'falls': -100,
            'slides': -50,
            'slide': -50,
            'record high': 15,
            'low': -15,
            'one week low': -30,
            'worth more': 5,
            'digital gold': 5,
            'high': 15,
            'cryptocurrency fund': 10,
            'up': 5,
            'soars': 70,
            'rebound': 20,
            'pullback': -40,
            'slumps': -60,
            'jumps': 50,
            'record low': -100,
            'soaring': 70,
            'bearish': -50,
            'bullish': 50,
            'bulls': 10,
            'bears' : -10,
            'hodl': 10,
            'pulls back': -40,
            'selloff': -70,
            'retrace': -70,
            'drop': -50,
            'buying': 10,
            'selling': -10,
            'rally': 15,
            'bounces': 20,
            'testing support': -5,
            'climb': 5,
            'rise': 20,
            'crashes': -100,
            'crash': -100,
            'downward': -30,
            'plunges': -100,
            'plunge' : -80,
            'cardano': 0,
            'descends': -30,
            'descend': -30,
            'gain' : 20,
            'gains' : 20,
            'worst' : -25,
            'loss' : -15,
            'without risk': 10,
            'tumbles': -50,
            'jeopardy': -50
        }
        
        
    def reading_dataset(self):
        columnName = ['Headlines','Sentiment']
        self.data.columns = columnName
        self.data.head()
        
        return self.data
    
    
    def analyze_test_headlines(self):        
        # Instantiate the sentiment intensity analyzer with the existing lexicon
        vader = SentimentIntensityAnalyzer()
        
        # Update the lexicon
        vader.lexicon.update(self.new_words)
        
        data = self.reading_dataset()
        
        # Iterate through the headlines and get the polarity scores
        scores = data['Headlines'].apply(vader.polarity_scores)
        
        # Convert the list of dicts into a DataFrame
        scores_df = pd.DataFrame.from_records(scores)

        # Join the DataFrames
        scored_news = data.join(scores_df)
        
        scored_news['assigned_label'] = scored_news['Sentiment'].apply(lambda Sentiment: 'pos' if Sentiment>0 else 'neg')
        scored_news['predicted_label'] = scored_news['compound'].apply(lambda compound: 'pos' if compound>=0 else 'neg')
        
        return scored_news
    
    def analyze_recent_headlines(self, data):        
        # Instantiate the sentiment intensity analyzer with the existing lexicon
        vader = SentimentIntensityAnalyzer()
        
        # Update the lexicon
        vader.lexicon.update(self.new_words)
        
        # Iterate through the headlines and get the polarity scores
        #scores = data['clean_title'].apply(vader.polarity_scores)
        
        self.scores_df['neg'] = [vader.polarity_scores(x)['neg'] for x in data['clean_title']]
        self.scores_df['neu'] = [vader.polarity_scores(x)['neu'] for x in data['clean_title']]
        self.scores_df['pos'] = [vader.polarity_scores(x)['pos'] for x in data['clean_title']]
        self.scores_df['compound'] = [vader.polarity_scores(x)['compound'] for x in data['clean_title']]
        
        # Convert the list of dicts into a DataFrame
        #scores_df = pd.DataFrame.from_records(scores)

        # Join the DataFrames
        data = data.reset_index()
        scored_news = data.merge(self.scores_df,left_index=True, right_index=True, how='inner')
        
        scored_news['predicted_label'] = scored_news['compound'].apply(lambda compound: 'pos' if compound > 0 else ('neu' if compound == 0 else 'neg'))
        
        return scored_news

In [193]:
class PriceStreamer():
    """
    Functionality for constantly streaming BTC price
    """
    def parse_price():
        res = requests.get('https://finance.yahoo.com/quote/BTC-USD?p=BTC-USD&.tsrc=fin-srch')
        soup = bs4.BeautifulSoup(res.text, 'lxml')

        price = soup.find_all('span', class_='Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(ib)')[0].text
        price_change = soup.find_all('span', class_='Trsdu(0.3s) Fw(500) Pstart(10px) Fz(24px) C($positiveColor)')[0].text

        return price, price_change
    
    def get_hist_data():
        cryptocurrencies = ['BTC-USD', 'ETH-USD', 'ADA-USD']
        yahoo_financials_cryptocurrencies = YahooFinancials(cryptocurrencies)

        d6 = (datetime.date.today() - datetime.timedelta(days=6)).strftime("%Y-%m-%d")
        d13 = (datetime.date.today() - datetime.timedelta(days=13)).strftime("%Y-%m-%d")

        daily_crypto_prices = yahoo_financials_cryptocurrencies.get_historical_price_data(d12, d6, 'daily')
        data = pd.DataFrame.from_records(daily_crypto_prices)
        price_df = pd.json_normalize(data['BTC-USD']['prices'])
        
        return price_df

In [256]:
if __name__ == '__main__':
    query = 'cryptocurrency'
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'referer': 'https://www.google.com',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
    }
    todays_date = date.today().strftime("%d%m%Y")

    article_cleaner = ArticleCleaner()
    article_streamer = ArticleStreamer()
    sentiment_analyzer = SentimentAnalyzer()

    #all_articles = article_streamer.get_the_news(query)

    clean_title = np.array([article_cleaner.clean_article(article) for article in all_articles['title']])
    clean_desc = np.array([article_cleaner.clean_article(article) for article in all_articles['description']])

    all_articles['clean_title'] = clean_title
    all_articles['clean_description'] = clean_desc
    
    recent_articles = article_cleaner.get_recent_articles(all_articles)
    
    scored_news = sentiment_analyzer.analyze_recent_headlines(recent_articles)
    
    high_news = scored_news[(scored_news['compound'] > .5) | (scored_news['compound'] < -0.5)]

    high_news.to_csv(f'.\{todays_date}sentiment.csv', index=False)

In [176]:
from yahoofinancials import YahooFinancials

cryptocurrencies = ['BTC-USD', 'ETH-USD', 'ADA-USD']
yahoo_financials_cryptocurrencies = YahooFinancials(cryptocurrencies)

d6 = (datetime.date.today() - datetime.timedelta(days=6)).strftime("%Y-%m-%d")
d12 = (datetime.date.today() - datetime.timedelta(days=12)).strftime("%Y-%m-%d")

daily_crypto_prices = yahoo_financials_cryptocurrencies.get_historical_price_data(d12, d6, 'daily')
data = pd.DataFrame.from_records(daily_crypto_prices)
price_df = pd.json_normalize(data['BTC-USD']['prices'])

In [196]:
final_news = high_news[['date','clean_title','compound']]
recent_articles

Unnamed: 0,clean_description,clean_title,description,outlet,title,uploaded,url,date
0,Strong hands look to have been backing the rec...,Big Investors Stacked Up Ether as Price Rose t...,Strong hands look to have been backing the rec...,CoinDesk via Yahoo Finance,Big Investors Stacked Up Ether as Price Rose t...,22 hours ago,https://finance.yahoo.com/news/big-investors-s...,2021-01-26
1,Investment flows into cryptocurrency funds and...,Bitcoin crypto inflows hit record last week Co...,Investment flows into cryptocurrency funds and...,Reuters via Yahoo Finance,"Bitcoin, crypto inflows hit record last week -...",7 hours ago,https://finance.yahoo.com/news/bitcoin-crypto-...,2021-01-26
7,The vertiginous rise that the price of Bitcoin...,Bitcoin May Never Go Above 40 000 Again JP Mor...,The vertiginous rise that the price of Bitcoin...,Entrepreneur via Yahoo Finance,"Bitcoin May Never Go Above $ 40,000 Again, JP ...",15 hours ago,https://finance.yahoo.com/news/bitcoin-may-nev...,2021-01-26
9,The pace of flows into the 20 billion Grayscal...,Bitcoin Return to 40 000 in Doubt as Flows to ...,The pace of flows into the $20 billion Graysca...,Bloomberg via Yahoo Finance,"Bitcoin Return to $40,000 in Doubt as Flows to...",23 hours ago,https://finance.yahoo.com/news/bitcoin-return-...,2021-01-26
10,Bitcoin s price could exceed 50 000 over the l...,Bitcoin Seen Topping 50 000 Long Term as it Vi...,"Bitcoin’s price could exceed $50,000 over the ...",Bloomberg,"Bitcoin Seen Topping $50,000 Long Term as it V...",5 hours ago,https://www.bloomberg.com/news/articles/2021-0...,2021-01-26
...,...,...,...,...,...,...,...,...
549,Futures The Dow futures are up by 0 13 and the...,Global Markets Start Week On Positive Note Eth...,"Futures: The Dow futures are up by 0.13%, and ...",Benzinga via Yahoo Finance,"Global Markets Start Week On Positive Note, Et...",24 hours ago,https://finance.yahoo.com/news/global-markets-...,2021-01-26
550,The U K s former cybersecurity chief has said ...,Former UK Cybersecurity Chief Says Laws Needed...,The U.K.’s former cybersecurity chief has said...,Coindesk,Former UK Cybersecurity Chief Says Laws Needed...,24 hours ago,https://www.coindesk.com/former-uk-cybersecuri...,2021-01-26
580,In financial markets filled to the brim with f...,Crypto Markets Show More Signs of Excess Amid ...,In financial markets filled to the brim with f...,Bloomberg,Crypto Markets Show More Signs of Excess Amid ...,18 hours ago,https://www.bloomberg.com/news/articles/2021-0...,2021-01-26
583,We ll maintain our own presence everywhere we ...,Online merchants linked to QAnon down but not ...,"""We'll maintain our own presence everywhere we...",Reuters via AOL,"Online merchants linked to QAnon down, but not...",20 hours ago,https://www.aol.com/online-merchants-linked-qa...,2021-01-26


In [248]:
recent_articles['publishedAt'] = time

In [213]:
df = pd.DataFrame(hours.values.tolist(), index=hours.index)
df

Unnamed: 0,0,1,2
0,22,hours,ago
1,7,hours,ago
7,15,hours,ago
9,23,hours,ago
10,5,hours,ago
...,...,...,...
549,24,hours,ago
550,24,hours,ago
580,18,hours,ago
583,20,hours,ago
