###  ---------------------------------------------------------------------------------------------------------------------------------------
##### Copyright (c) Rajdeep Biswas
##### Licensed under the MIT license.
###### File: get_snp500_news.ipynb
###### Date: 09/27/2021
###  ---------------------------------------------------------------------------------------------------------------------------------------

### Table of Contents

* [Initial Configurations](#IC)
    * [Import Libraries](#IL)
    * [Autheticate the AML Workspace](#AML)
* [Get Data (Bronze)](#GD)
    * [Setup Directory Structure](#SD)
    * [Configure SnP500 List](#SL)
    * [Download Current Stock News](#DD)    
* [Transform Data](#TD)   
    * [Write Data (Silver)](#WS)   
    * [Clean the Text](#CT)
    * [Write Data (Gold)](#WG)    

### Initial Configurations <a class="anchor" id="IC"></a>

#### Import Libraries <a class="anchor" id="IL"></a>

In [4]:
#Import required Libraries
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.image import imread
import cv2
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

#pip install pandas_datareader
import pandas_datareader.data as web
import pandas as pd
import datetime as dt

import azureml.core
import azureml.automl
from azureml.core import Workspace, Dataset, Datastore

#### Autheticate the AML Workspace <a class="anchor" id="AML"></a>

In [3]:
import azureml.core
from azureml.core import Workspace

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.33.0 to work with houston-techsummit-workspace


### Get Data (Bronze) <a class="anchor" id="GD"></a>
- Raw data extraction for the file, API based and web datasets. Let us call this __Bronze Layer__.
- Data transformation using python from Raw to Processed stage. We will call this __Silver Layer__.
- Finally store the processed data using standard taxonomy in a SQL based serving layer. We will call this __Gold Layer__.

#### Setup Directory Structure <a class="anchor" id="SD"></a>

In [6]:
data_folder = os.path.join(os.getcwd(), 'data')

#Create the data directory
os.makedirs(data_folder, exist_ok=True)

#Create the bronze, silver and gold folders
bronze_data_folder = data_folder +"/bronze"
os.makedirs(bronze_data_folder, exist_ok=True)

silver_data_folder = data_folder +"/silver"
os.makedirs(silver_data_folder, exist_ok=True)

gold_data_folder = data_folder +"/gold"
os.makedirs(gold_data_folder, exist_ok=True)

#Create sub folder for stock news data in bronze
news_data_bronze = bronze_data_folder +"/snp500_news"
os.makedirs(news_data_bronze, exist_ok=True)

#Create sub folder for stock news data in silver
news_data_silver = silver_data_folder +"/snp500_news"
os.makedirs(news_data_silver, exist_ok=True)

#Create sub folder for stock news data in gold
news_data_gold = gold_data_folder +"/snp500_news"
os.makedirs(news_data_gold, exist_ok=True)

#### Configure SnP500 List<a class="anchor" id="SL"></a>

In [7]:
snp500_list = si.tickers_sp500()

#### Download Current Stock News <a class="anchor" id="DD"></a>

In [19]:
all_news = news.get_rss("A")
all_news[0]

{'summary': 'SANTA CLARA, Calif., September 23, 2021--Pfizer executive Dr. Mikael Dolsten appointed to Agilent board of directors.',
 'summary_detail': {'type': 'text/html',
  'language': None,
  'base': 'https://feeds.finance.yahoo.com/rss/2.0/headline?s=A&region=US&lang=en-US',
  'value': 'SANTA CLARA, Calif., September 23, 2021--Pfizer executive Dr. Mikael Dolsten appointed to Agilent board of directors.'},
 'id': '2fa4717a-ebe3-3c6e-9bde-57c6629f6574',
 'guidislink': False,
 'links': [{'rel': 'alternate',
   'type': 'text/html',
   'href': 'https://finance.yahoo.com/news/dr-mikael-dolsten-named-agilent-200500494.html?.tsrc=rss'}],
 'link': 'https://finance.yahoo.com/news/dr-mikael-dolsten-named-agilent-200500494.html?.tsrc=rss',
 'published': 'Thu, 23 Sep 2021 20:05:00 +0000',
 'published_parsed': time.struct_time(tm_year=2021, tm_mon=9, tm_mday=23, tm_hour=20, tm_min=5, tm_sec=0, tm_wday=3, tm_yday=266, tm_isdst=0),
 'title': 'Dr. Mikael Dolsten Named to Agilent Board of Directors

In [11]:
all_news[0]['summary']

"Netflix's co-CEO Ted Sarandos noted that French-language crime drama Lupin was the first non-English-language show to become the platform's top program. “Squid Game is bigger,” he says."

In [12]:
len(all_news)

20

In [33]:
snp500_news = {}
combined_snp500_news = pd.DataFrame(columns = ["Ticker", "NewsNum", "Value"])
#combined_snp500_news.columns = ["Ticker", "NewsNum", "Value"]
for i in range (len(all_news)):
    snp500_news[ticker+'_'+str(i)] = all_news_list[i]['summary']
for key,value in snp500_news.items():
    news_num = key.split('_')[1]
    new_row = {'Ticker':ticker, 'NewsNum':news_num, 'Value':value}
    #append row to the dataframe
    combined_snp500_news = combined_snp500_news.append(new_row, ignore_index=True)    

combined_snp500_news    
#snp500_news
#combined_snp500_news = pd.concat(snp500_news)
 
#combined_snp500_news = combined_snp500_news.reset_index()
 
#del combined_extra_stats["level_1"]
 
#combined_extra_stats.columns = ["Ticker", "Value"]
#combined_extra_stats

Unnamed: 0,Ticker,NewsNum,Value
0,CDAY,0,Shares of cruise line stocks had a great start...
1,CDAY,1,Shares of cruise line stocks including Carniva...
2,CDAY,2,Shares of Carnival were up almost 5%. The gai...
3,CDAY,3,Carnival's (CCL) gradual resumption of busines...
4,CDAY,4,Princess Cruises sails guests from the Port of...
5,CDAY,5,CCL earnings call for the period ending August...
6,CDAY,6,Doral-based Carnival Corp. & plc is on course ...
7,CDAY,7,Yahoo Finance’s Ines Ferre reports on the day'...
8,CDAY,8,View more earnings on CCLSee more from Benzing...
9,CDAY,9,In this article we will take a look at the som...


In [52]:
#snp500_list = si.tickers_sp500()
snp500_news = {}
combined_snp500_news = pd.DataFrame(data=None, columns = ["Ticker", "NewsNum", "Value"])

import time
from time import sleep
import importlib

for ticker in snp500_list:
    try: 
        all_news_list = news.get_yf_rss(ticker)
        for i in range (len(all_news_list)):
            snp500_news[ticker+'_'+str(i)] = all_news_list[i]['summary']
        for key,value in snp500_news.items():
            news_num = key.split('_')[1]
            new_row = {'Ticker':ticker, 'NewsNum':news_num, 'Value':value}
            #append row to the dataframe
            combined_snp500_news = combined_snp500_news.append(new_row, ignore_index=True)    
        #break
    except Exception as e:
        print (ticker, e)
        sleep(300) 
        all_news_list = news.get_yf_rss(ticker)
        for i in range (len(all_news_list)):
            snp500_news[ticker+'_'+str(i)] = all_news_list[i]['summary']
        for key,value in snp500_news.items():
            news_num = key.split('_')[1]
            new_row = {'Ticker':ticker, 'NewsNum':news_num, 'Value':value}
            #append row to the dataframe
            combined_snp500_news = combined_snp500_news.append(new_row, ignore_index=True) 

In [37]:
combined_snp500_news

Unnamed: 0,Ticker,NewsNum,Value
0,A,0,"SANTA CLARA, Calif., September 23, 2021--Pfize..."
1,A,1,"SANTA CLARA, Calif., September 22, 2021--Agile..."
2,A,2,Hedge fund manager William Ackman's 10% bet on...
3,A,3,Smart Beta ETF report for FTCS
4,A,4,Stocks Cut Losses Amid Mixed Economic Data
5,A,5,"The worst result, after buying shares in a com..."
6,A,6,Agilent (A) reported earnings 30 days ago. Wha...
7,A,7,"SANTA CLARA, Calif., September 07, 2021--Agile..."
8,A,8,Agilent (A) signs an agreement with Visiopharm...
9,A,9,Is (A) Outperforming Other Computer and Techno...


### Transform Data <a class="anchor" id="TD"></a>

#### Write Data (Silver) <a class="anchor" id="WS"></a>

In [53]:
output_file_name = news_data_silver + '/snp500_all_news_1.csv'
combined_snp500_news.to_csv(output_file_name, index=False)  

In [39]:
combined_snp500_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2043940 entries, 0 to 2043939
Data columns (total 3 columns):
Ticker     object
NewsNum    object
Value      object
dtypes: object(3)
memory usage: 46.8+ MB


In [46]:
current_array = combined_snp500_news['Ticker'].unique()

In [47]:
snp500_list = si.tickers_sp500()

In [48]:
for ticker in current_array:
    snp500_list.remove(ticker)

In [49]:
snp500_list

['TYL',
 'UA',
 'UAA',
 'UAL',
 'UDR',
 'UHS',
 'ULTA',
 'UNH',
 'UNP',
 'UPS',
 'URI',
 'USB',
 'V',
 'VFC',
 'VIAC',
 'VLO',
 'VMC',
 'VNO',
 'VRSK',
 'VRSN',
 'VRTX',
 'VTR',
 'VTRS',
 'VZ',
 'WAB',
 'WAT',
 'WBA',
 'WDC',
 'WEC',
 'WELL',
 'WFC',
 'WHR',
 'WLTW',
 'WM',
 'WMB',
 'WMT',
 'WRB',
 'WRK',
 'WST',
 'WU',
 'WY',
 'WYNN',
 'XEL',
 'XLNX',
 'XOM',
 'XRAY',
 'XYL',
 'YUM',
 'ZBH',
 'ZBRA',
 'ZION',
 'ZTS']

In [50]:
combined_snp500_news.head()

Unnamed: 0,Ticker,NewsNum,Value
0,A,0,"SANTA CLARA, Calif., September 23, 2021--Pfize..."
1,A,1,"SANTA CLARA, Calif., September 22, 2021--Agile..."
2,A,2,Hedge fund manager William Ackman's 10% bet on...
3,A,3,Smart Beta ETF report for FTCS
4,A,4,Stocks Cut Losses Amid Mixed Economic Data


In [51]:
combined_snp500_news_backup = combined_snp500_news

#### Clean the Text <a class="anchor" id="CT"></a>

In [55]:
#Build punctuation dictionary
import unicodedata
import sys

# Create a dictionary of punctuation characters
punctuation = dict.fromkeys(i for i in range(sys.maxunicode)
                            if unicodedata.category(chr(i)).startswith('P'))
# Add the backtick/ Grave accent character
punctuation.update({96:None})

In [58]:
import glob

#Read he 
all_files = glob.glob(news_data_silver + "/*.csv")
li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

df_snp500_news_silver = pd.concat(li, axis=0, ignore_index=True)

In [59]:
df_snp500_news_silver.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2071500 entries, 0 to 2071499
Data columns (total 3 columns):
Ticker     object
NewsNum    int64
Value      object
dtypes: int64(1), object(2)
memory usage: 47.4+ MB


In [61]:
df_snp500_news_silver.shape

(2071500, 3)

In [63]:
df_snp500_news_silver.head(100)

Unnamed: 0,Ticker,NewsNum,Value
0,A,0,"SANTA CLARA, Calif., September 23, 2021--Pfize..."
1,A,1,"SANTA CLARA, Calif., September 22, 2021--Agile..."
2,A,2,Hedge fund manager William Ackman's 10% bet on...
3,A,3,Smart Beta ETF report for FTCS
4,A,4,Stocks Cut Losses Amid Mixed Economic Data
...,...,...,...
95,AAP,15,Yahoo Finance’s Ines Ferre reports on the day'...
96,AAP,16,"Shares of JetBlue (JBLU), which started flying..."
97,AAP,17,The Justice Department is readying a lawsuit c...
98,AAP,18,WASHINGTON (Reuters) -The U.S. Justice Departm...


In [66]:
# applying the cleaning function to text column
df_snp500_news_silver['CleanedText'] = df_snp500_news_silver['Value'].apply(lambda text: function_clean_stop(text))

In [67]:
df_snp500_news_silver.head()

Unnamed: 0,Ticker,NewsNum,Value,CleanedText
0,A,0,"SANTA CLARA, Calif., September 23, 2021--Pfize...",santa clara calif september pfizer executive d...
1,A,1,"SANTA CLARA, Calif., September 22, 2021--Agile...",santa clara calif september agilent announces ...
2,A,2,Hedge fund manager William Ackman's 10% bet on...,hedge fund manager william ackman bet universa...
3,A,3,Smart Beta ETF report for FTCS,smart beta etf report ftc
4,A,4,Stocks Cut Losses Amid Mixed Economic Data,stock cut loss amid mixed economic data


In [76]:
df_snp500_news_silver.iloc[0:1, 3].to_list()

['santa clara calif september pfizer executive dr mikael dolsten appointed agilent board director']

In [77]:
df_snp500_news_silver.iloc[0:1, 2].to_list()

['SANTA CLARA, Calif., September 23, 2021--Pfizer executive Dr. Mikael Dolsten appointed to Agilent board of directors.']

#### Write Data (Gold) <a class="anchor" id="WG"></a>

In [78]:
df_snp500_news_gold = df_snp500_news_silver[['Ticker','NewsNum','CleanedText']]

In [79]:
output_file_name = news_data_gold + '/snp500_all_news.csv'
df_snp500_news_gold.to_csv(output_file_name, index=False)  

In [80]:
df_snp500_news_gold.head()

Unnamed: 0,Ticker,NewsNum,CleanedText
0,A,0,santa clara calif september pfizer executive d...
1,A,1,santa clara calif september agilent announces ...
2,A,2,hedge fund manager william ackman bet universa...
3,A,3,smart beta etf report ftc
4,A,4,stock cut loss amid mixed economic data


In [81]:
df_snp500_news_gold.count()

Ticker         2071500
NewsNum        2071500
CleanedText    2071500
dtype: int64

In [82]:
#Create sub folder for stock news data in gold
news_data_gold_chunks = news_data_gold +"/small_chunks"
os.makedirs(news_data_gold_chunks, exist_ok=True)

In [83]:
#Split into small files
number_of_chunks = 25
for idx, chunk in enumerate(np.array_split(df_snp500_news_gold, number_of_chunks)):
    chunk.to_csv(f'/{news_data_gold_chunks}/snp500_news_{idx}.csv', index=False) 

In [84]:
filename = news_data_gold_chunks + '/snp500_news_2.csv'
test_df = pd.read_csv(filename, index_col=None, header=0)
test_df.count()

Ticker         82860
NewsNum        82860
CleanedText    82860
dtype: int64

In [85]:
test_df.head()

Unnamed: 0,Ticker,NewsNum,CleanedText
0,DD,5,month working remotely due covid pandemic pare...
1,DD,6,first time global scale new research found hea...
2,DD,7,carrier global corporation nyse carr chairman ...
3,DD,8,carrier today announced completed acquisition ...
4,DD,9,aerospace giant leading heating ventilation ai...
