In [0]:
import requests
import time
import os
from pathlib import Path
from random import randint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import datetime
from datetime import date
# Last expression of the cell, so the notebook displays today's date as a string.
str(date.today())

'2020-02-11'

In [0]:
# Directory for the scraped news JSON files; used as the default write_dir of scrape_news.
WRITE_DIR=Path('news')

In [0]:
def random_sleep(min_sleep=1, max_sleep=10, verbose=False):
    '''
    Pause execution for a randomly chosen whole number of seconds.

    parameters:
    min_sleep -> smallest pause that can be drawn (inclusive)
    max_sleep -> largest pause that can be drawn (inclusive)
    verbose -> when True, print the drawn duration before pausing
    '''
    pause_seconds = randint(min_sleep, max_sleep)
    if verbose:
        print('sleep', pause_seconds)
    time.sleep(pause_seconds)

In [0]:
def parse_date(date_str):
    '''
    Turn a timestamp string into a calendar date.

    parameters:
    date_str -> timestamp such as '2020-02-11T10:20:30Z'; every 'T' becomes a
                space and every 'Z' is dropped before the string is parsed
                with the format '%Y-%m-%d %H:%M:%S'

    returns: datetime.date built from the parsed timestamp (time part discarded)
    '''
    normalized = date_str.replace('T', ' ').replace('Z', '')
    parsed = datetime.datetime.strptime(normalized, '%Y-%m-%d %H:%M:%S')
    return parsed.date()

In [0]:
def extract_news(content):
    '''
    Build a list of flat news dicts from the decoded feed JSON.

    parameters:
    content -> dict with a 'wireitems' list; only entries whose
               'wireitem_type' equals 'story' are converted

    returns: list of dicts with the keys 'id', 'headline', 'lede', 'date' and
             'url', plus 'img_cap' and 'img_url' when the first template of the
             entry has an 'image' section
    '''
    def to_record(wire_item):
        # Only the first template of an entry is read.
        item_id = wire_item['wireitem_id']
        template = wire_item['templates'][0]
        story = template['story']
        record = {'id': item_id,
                  'headline': story['hed'],
                  'lede': story['lede'],
                  'date': story['updated_at'],
                  'url': template['template_action']['url']}
        if 'image' in template:
            image = template['image']
            record['img_cap'] = image['caption']
            record['img_url'] = image['url']
        return record

    return [to_record(wire_item)
            for wire_item in content['wireitems']
            if wire_item['wireitem_type'] == 'story']

In [0]:
def fetch_news(symbol,stop_count=100000, stop_date=date(2019,1,1), last_id=None, verbose=False):
    '''
    Fetch news items for a given stock symbol, requesting page after page of
    the feed until a stop criterion is met.

    parameters:
    symbol -> instrument code inserted into the feed URL (e.g. 'AAPL.O')
    stop_count -> min num of news articles to fetch
    stop_date -> date in past that should be included in the fetched news items
    last_id -> start fetching from the intermediate id (sent as the 'until' parameter)
    verbose -> print progress information while fetching

    returns: list of news item dicts as produced by extract_news. The list may
             be shorter than requested when the feed stops returning new
             stories or when a page cannot be fetched after several attempts.
    '''
    if verbose:
        print(f'\nfetch news for {symbol}, stop_count:{stop_count}, stop_date:{stop_date}')
    URL=f'https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:{symbol}'
    max_attempts = 3
    news_items=[]
    errors = []
    last_date = date.today()
    while len(news_items) < stop_count and stop_date <= last_date:
        params = {'until':last_id} if last_id else {}
        # Every page gets its own attempt budget and starts with no response,
        # so a failed page can never fall through to a stale response.
        response = None
        for _ in range(max_attempts):
            random_sleep(verbose=verbose)
            if verbose:
                print('Fetching URL:', URL, 'params:', params)
            try:
                # A timeout keeps a stalled connection from blocking forever.
                attempt = requests.get(URL, params=params, timeout=30)
            except requests.RequestException as e:
                print(f'Exception thrown in fetch_news for {symbol}',e)
                errors.append(e)
                continue
            if attempt:  # a Response is truthy only for a non-error status code
                response = attempt
                break
        if response is None:
            print(f'Giving up on {symbol}: no successful response after {max_attempts} attempts')
            break
        items = extract_news(response.json())
        # Stop when the page has no stories or does not move the cursor;
        # otherwise the same page would be requested over and over.
        if not items or items[-1]['id'] == last_id:
            break
        news_items.extend(items)
        last_id = news_items[-1]['id']
        last_date = parse_date(news_items[-1]['date'])
        if verbose:
            print(f'Collected {len(news_items)} news items for {symbol} with {len(errors)} errors' )
    if verbose:
        print(f'last_id: {last_id}, last_date: {str(last_date)}')
    return news_items

In [0]:
def scrape_news(symbols,stop_count=100000, stop_date=date(2019,1,1),write_dir=WRITE_DIR,verbose=False):
    '''
    Fetch news items for a list of stock symbols and store the results in a
    local directory, one json file per symbol.

    parameters:
    symbols -> iterable of instrument codes (e.g. 'AAPL.O'); the part before
               the first '.' becomes the file name
    stop_count -> min num of news articles to fetch
    stop_date -> date in past that should be included in the fetched news items;
                 either one date for all symbols or a dict mapping symbol -> date
                 (symbols missing from the dict fall back to date(2019,1,1))
    write_dir -> directory to write results in json format (created if missing)
    verbose -> print progress information while fetching
    '''
    # Accept plain strings as well as Path objects and make sure the target exists.
    write_dir = Path(write_dir)
    write_dir.mkdir(parents=True, exist_ok=True)
    fallback_stop_date = date(2019,1,1)
    for symbol in symbols:
        if isinstance(stop_date, dict):
            # Never hand the whole dict to fetch_news: it compares stop_date with a date.
            sym_stop_date = stop_date.get(symbol, fallback_stop_date)
        else:
            sym_stop_date = stop_date
        news_items = fetch_news(symbol,stop_count=stop_count,stop_date=sym_stop_date,verbose=verbose)
        data_dict = {'ticker':symbol,
                     'news_items':news_items}
        filename=symbol.split('.')[0]
        file_path = write_dir / (filename +'.json')
        print(f'Writing file for {symbol}')
        with open(file_path, "w") as write_file:
            json.dump(data_dict, write_file)

In [0]:
def find_stop_date(data,symbol):
    '''
    Return the smallest 'created_at' value among the rows of one ticker.

    parameters:
    data -> DataFrame with 'ticker' and 'created_at' columns
    symbol -> instrument code; only the part before the first '.' is
              compared with the 'ticker' column

    returns: minimum of the matching 'created_at' values
    '''
    ticker = symbol.split('.')[0]
    ticker_rows = data['ticker'] == ticker
    return data.loc[ticker_rows, 'created_at'].min()

In [0]:
# Load the message data (presumably labelled StockTwits posts, judging by the file name) from a local pickle.
# NOTE(review): unpickling can execute arbitrary code - only load this file if it comes from a trusted source.
all_data = pd.read_pickle('stocktwits_labelled.pkl')
print(all_data.shape)
print(all_data.columns)
# Convert the 'created_at' timestamp strings to datetime.date via parse_date so they can be compared with dates.
all_data['created_at'] = all_data['created_at'].apply(parse_date)

(137996, 35)
Index(['id', 'body', 'created_at', 'user', 'source', 'symbols',
       'mentioned_users', 'entities', 'filters', 'conversation', 'likes',
       'links', 'reshare_message', 'reshares', 'structurable', 'ticker',
       'user_followers', 'user_following', 'user_join_date', 'user_ideas',
       'user_identity', 'user_like_count', 'user_official',
       'user_wtchlst_count', 'username', 'sentiment', 'num_likes',
       'num_reshares', 'num_replies', 'day_counts', 'raw_body', 'char_length',
       'bearish_score', 'bullish_score', 'sentiment_pred'],
      dtype='object')


In [0]:
# Instrument codes to scrape; the part before the '.' is what find_stop_date and scrape_news use as ticker / file name.
symbols=['AAPL.O','TWTR.N','FB.O','TSLA.O', 'SPY.N', 'DIA.N','GOOGL.O', 'MSFT.O','AMZN.O', 'AMD.O', 'INTC.O','BABA.N']
# NOTE(review): this single date is overwritten by the per-symbol dict on the next line and is never used.
stop_date = date(2019,11,25)
stop_date = {s:find_stop_date(all_data,s) for s in symbols} # extract stop_date for each symbol
# Fetch the news for every symbol and write one json file per symbol into WRITE_DIR.
scrape_news(symbols,stop_date=stop_date,write_dir=WRITE_DIR,verbose=True)


fetch news for AAPL.O, stop_count:100000, stop_date:2020-01-11
sleep 5
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:AAPL.O params: {}
Collected 20 news items for AAPL.O with 0 errors
sleep 5
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:AAPL.O params: {'until': '1580425516023208000'}
Collected 39 news items for AAPL.O with 0 errors
sleep 2
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:AAPL.O params: {'until': '1580213040039227000'}
Collected 58 news items for AAPL.O with 0 errors
sleep 2
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:AAPL.O params: {'until': '1579179543044276000'}
Collected 77 news items for AAPL.O with 0 errors
last_id: 1576261313014149000, last_date: 2019-12-13
Writing file for AAPL.O

fetch news for TWTR.N, stop_count:100000, stop_date:2019-10-29
sleep 6
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:TWTR.N params: {}
Collected 20 ne

Collected 267 news items for DIA.N with 0 errors
sleep 3
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:DIA.N params: {'until': '1579254595004780000'}
Collected 286 news items for DIA.N with 0 errors
sleep 6
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:DIA.N params: {'until': '1579095097053501000'}
Collected 305 news items for DIA.N with 0 errors
sleep 7
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:DIA.N params: {'until': '1578947787058494000'}
Collected 324 news items for DIA.N with 0 errors
sleep 4
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:DIA.N params: {'until': '1578816028032484000'}
Collected 343 news items for DIA.N with 0 errors
sleep 4
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:DIA.N params: {'until': '1578675488044050000'}
Collected 362 news items for DIA.N with 0 errors
sleep 2
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/

Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:DIA.N params: {'until': '1571896813064976000'}
Collected 1160 news items for DIA.N with 0 errors
sleep 2
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:DIA.N params: {'until': '1571731616053501000'}
Collected 1179 news items for DIA.N with 0 errors
sleep 3
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:DIA.N params: {'until': '1571565483032484000'}
Collected 1198 news items for DIA.N with 0 errors
sleep 1
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:DIA.N params: {'until': '1571387062004780000'}
Collected 1217 news items for DIA.N with 0 errors
sleep 6
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:DIA.N params: {'until': '1571221440058494000'}
Collected 1236 news items for DIA.N with 0 errors
sleep 7
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:DIA.N params: {'until': '157110770103

Collected 20 news items for GOOGL.O with 0 errors
sleep 4
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:GOOGL.O params: {'until': '1575930438014149000'}
Collected 39 news items for GOOGL.O with 0 errors
sleep 3
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:GOOGL.O params: {'until': '1573091230002256000'}
Collected 58 news items for GOOGL.O with 0 errors
sleep 6
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:GOOGL.O params: {'until': '1568278362039227000'}
Collected 77 news items for GOOGL.O with 0 errors
sleep 2
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:GOOGL.O params: {'until': '1564352936015257000'}
Collected 96 news items for GOOGL.O with 0 errors
last_id: 1559615493054695000, last_date: 2019-06-04
Writing file for GOOGL.O

fetch news for MSFT.O, stop_count:100000, stop_date:2019-11-15
sleep 6
Fetching URL: https://wireapi.reuters.com/v8/feed/rcom/us/marketnews/ric:MSFT.O p

In [0]:
# Combine the per-ticker json files from NEWS_DIR into one DataFrame.
NEWS_DIR=Path('news')
df_list = []
# Read only *.json files, in sorted order: other directory entries (checkpoint
# folders, hidden files) would break json.load, and the sorted order makes the
# row order of the combined frame reproducible.
for file in sorted(NEWS_DIR.glob('*.json')):
    with open(file, "r") as read_file:
        data = json.load(read_file)
    df = pd.DataFrame(data['news_items'])
    # Store the ticker without its exchange suffix (the part after the '.').
    df['ticker'] = data['ticker'].split('.')[0]
    df_list.append(df)

news = pd.concat(df_list, ignore_index=True,sort=False)
# Convert the timestamp strings to datetime.date via parse_date.
news['date'] = news['date'].apply(parse_date)

In [0]:
# Persist the combined news DataFrame to a pickle, then reload it from disk.
news.to_pickle('reuters.pkl')
news = pd.read_pickle('reuters.pkl')
# Last expression of the cell, so the notebook displays (rows, columns) of the reloaded frame.
news.shape

(2881, 8)