In [5]:
# Reuters articles used as input for the pipeline.
uber_url = "https://www.reuters.com/article/us-uber-lawsuit-california/uber-is-sued-over-resistance-to-california-gig-employment-law-idUSKCN1VX1VE"
apple_url = "https://www.reuters.com/article/us-apple-macbook/apple-refreshes-macbook-pro-laptop-with-16-inch-screen-idUSKBN1XN1V8"
# Same address as apple_url, so the list below contains one repeated article.
apple_url2 = apple_url

article_urls = [uber_url, apple_url, apple_url2]

In [4]:
import requests
import string
from collections import Counter

In [None]:
class Scraper:
    """Downloads the raw contents of news article pages."""

    def fetch_news(self, urls, timeout=10):
        """Fetch the body of each URL, skipping the ones that fail.

        Args:
            urls: Iterable of URL strings to download.
            timeout: Seconds to wait for each request before giving up.
                ``requests`` waits indefinitely when no timeout is given,
                so a single unresponsive host would stall the whole run.

        Returns:
            A list with the response text of every URL that was fetched
            successfully, in input order. Failed URLs are reported via
            ``print`` and left out, so the result may be shorter than
            ``urls``.
        """
        article_contents = []
        for url in urls:
            try:
                response = requests.get(url, timeout=timeout)
                # Turn 4xx/5xx answers into exceptions so that error pages are
                # not handed downstream as if they were articles.
                response.raise_for_status()
                article_contents.append(response.text)
            except Exception as e:
                # Best effort: report the failure and carry on with the rest.
                print(e)
        return article_contents

In [6]:
def is_clean(word):
    """Tell whether ``word`` looks like a plain word worth keeping.

    A word is accepted when it is non-empty, is not one of the blacklisted
    tokens, is made up solely of ASCII letters, and carries no capital
    letter after its first character.
    """
    noise_tokens = {"var", "img", "e", "void"}
    if not word or word in noise_tokens:
        return False
    # Per character: no capital past position 0, and ASCII letters only.
    return all(
        not (pos > 0 and char in string.ascii_uppercase)
        and char in string.ascii_letters
        for pos, char in enumerate(word)
    )

In [10]:
class Cleaner:
    """Reduces raw article text to the words accepted by ``is_clean``."""

    def clean_articles(self, articles):
        """Return one cleaned string per input article.

        Each article is split on single spaces and only the tokens that
        ``is_clean`` accepts are kept, joined back with single spaces. If
        processing an article raises, the error is printed and whatever was
        collected up to that point is used, so the result always has as many
        entries as ``articles``.
        """
        cleaned = []
        for raw_text in articles:
            kept = []
            try:
                for token in raw_text.split(" "):
                    if not is_clean(token):
                        continue
                    kept.append(token)
            except Exception as err:
                print(err)
            cleaned.append(" ".join(kept))
        return cleaned

In [11]:
class Deduplicator:
    """Drops repeated articles while keeping the first occurrence of each."""

    def deduplicate_articles(self, articles):
        """Return ``articles`` without duplicates, in first-seen order.

        Articles are compared by equality. Comparing only their ``hash()``
        values would make two different articles with colliding hashes look
        like duplicates and silently discard one of them.

        Args:
            articles: Iterable of hashable articles (strings in this notebook).

        Returns:
            A new list holding each distinct article once.
        """
        # dict keys are unique by equality and keep insertion order.
        return list(dict.fromkeys(articles))

In [12]:
class Analyzer:
    """Keyword-based extraction of main entities and a sentiment score."""

    # Words that add one point to / subtract one point from an article's score.
    good_words = {"unveiled", "available", "faster", "stable"}
    bad_words = {"sued", "defiance", "violation"}

    def extract_entities_and_sentiment(self, articles):
        """Compute the main entities and sentiment score of each article.

        Every word starting with an uppercase letter counts as an entity
        mention; the two most frequent ones are reported. The score is the
        number of ``good_words`` occurrences minus the number of
        ``bad_words`` occurrences, matched case-insensitively.

        Args:
            articles: Iterable of article strings.

        Returns:
            A list of ``(main_entities, score)`` tuples, one per article,
            where ``main_entities`` is a list of at most two words.
        """
        entity_score_pairs = []
        for article in articles:
            score = 0
            entities = []
            # split() without arguments never yields empty tokens, so an
            # empty article or repeated spaces cannot break word[0] below.
            for word in article.split():
                # isupper() is False for digits and punctuation, which would
                # otherwise compare equal to their own upper() form.
                if word[0].isupper():
                    entities.append(word)
                lowered = word.lower()
                if lowered in self.good_words:
                    score += 1
                elif lowered in self.bad_words:
                    score -= 1
            main_entities = [name for name, _ in Counter(entities).most_common(2)]
            entity_score_pairs.append((main_entities, score))
        return entity_score_pairs

In [14]:
class DecisionMaker:
    """Turns per-article entity/score pairs into buy or sell orders."""

    # Only entities named here produce an order.
    target_companies = {'Apple', 'Uber', 'Google'}

    def make_decisions(self, entity_score_pairs):
        """Build one order per target company mentioned in each article.

        Args:
            entity_score_pairs: Iterable of ``(entities, score)`` tuples.

        Returns:
            A list of ``(order, quantity, entity)`` tuples where ``order`` is
            ``"Buy"`` for a positive score and ``"Sell"`` for a negative one,
            and ``quantity`` is the absolute value of the score. Articles
            with a score of zero produce no order.
        """
        decisions = []
        for entities, score in entity_score_pairs:
            if score == 0:
                # A neutral article carries no signal; do not emit a
                # zero-quantity "Sell".
                continue
            order = "Buy" if score > 0 else "Sell"
            quantity = abs(score)
            for entity in entities:
                if entity in self.target_companies:
                    decisions.append((order, quantity, entity))
        return decisions

In [15]:
# One instance per pipeline stage; the cells below use them in this order.
scraper = Scraper()
cleaner = Cleaner()
deduplicator = Deduplicator()
analyzer = Analyzer()
decision_maker = DecisionMaker()

In [19]:
# Download the articles and preview the first 500 characters of the first one.
# NOTE(review): fetch_news leaves out URLs that fail, so contents[0] raises
# IndexError when no article could be downloaded.
contents = scraper.fetch_news(article_urls)
contents[0][:500]

'<!--[if !IE]> This has been served from cache <![endif]-->\n<!--[if !IE]> Request served from apache server: produs--i-0a4a08336159d88d2 <![endif]-->\n<!--[if !IE]> Cached on Wed, 26 Feb 2020 23:01:28 GMT and will expire on Wed, 26 Feb 2020 23:16:19 GMT <![endif]-->\n<!--[if !IE]> token: f9fd82a6-e004-4871-85e1-63089475bceb <![endif]-->\n<!--[if !IE]> App Server /produs--i-0655f4557687834a5/ <![endif]-->\n\n<!doctype html><html lang="en" data-edition="BETAUS">\n    <head>\n\n    <title>\n                U'

In [22]:
# Keep only the words accepted by is_clean and preview the first cleaned article.
clean_articles = cleaner.clean_articles(contents)
clean_articles[0][:500]

'This has been served from cache Request served from apache Cached on Feb and will expire on Feb App Server Uber is sued over resistance to California employment law Segment snippet included if Page hiding snippet Data Layer Object Declaration New Google Tag Manager new End Google Tag Manager new driver for Uber has sued the company for misclassifying its drivers as independent hours after California legislators voted to help thousands of those workers and enjoy the benefits of produced in Proces'

In [25]:
# Article count before and after removing duplicates; article_urls lists the
# same Apple address twice, so the second count is expected to be lower.
print(len(clean_articles))
deduplicated = deduplicator.deduplicate_articles(clean_articles)
print(len(deduplicated))

3
2


In [27]:
# One (main_entities, score) tuple per deduplicated article.
entity_score_pairs = analyzer.extract_entities_and_sentiment(deduplicated)
print(entity_score_pairs)

[(['Uber', 'California'], -18), (['Pro', 'Apple'], 16)]


In [30]:
# Each decision is an (order, quantity, entity) tuple for a target company.
decisions = decision_maker.make_decisions(entity_score_pairs)
print(decisions)

[('Sell', 18, 'Uber'), ('Buy', 16, 'Apple')]
