###  ---------------------------------------------------------------------------------------------------------------------------------------
##### Copyright (c) Rajdeep Biswas
##### Licensed under the MIT license.
###### File: get_stock_data.ipynb
###### Date: 09/04/2021
###  ---------------------------------------------------------------------------------------------------------------------------------------

### Table of Contents

* [Initial Configurations](#IC)
    * [Import Libraries](#IL)
    * [Autheticate the AML Workspace](#AML)
* [Get Data (Bronze)](#GD)
    * [Setup Directory Structure](#SD)
    * [Download Daily Stock Data](#DD)
    * [Configure SnP500 List](#SL)
    * [Configure Time Window](#DT)
    * [Download Daily Stock Data](#DSD)
    * [Download Historical Stock Dividend Data](#HD)
    * [Download Historical Stock Split Data](#SS)
    * [Download Stock Sector Data](#SSD)    
    * [Download Stock Financial Data](#SFD)      

### Initial Configurations <a class="anchor" id="IC"></a>

#### Import Libraries <a class="anchor" id="IL"></a>

In [48]:
#Import required Libraries
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.image import imread
import cv2
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

#pip install pandas_datareader
import pandas_datareader.data as web
import pandas as pd
import datetime as dt

import azureml.core
import azureml.automl
from azureml.core import Workspace, Dataset, Datastore

#### Autheticate the AML Workspace <a class="anchor" id="AML"></a>

In [21]:
import azureml.core
from azureml.core import Workspace

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.33.0 to work with houston-techsummit-workspace


### Get Data (Bronze) <a class="anchor" id="GD"></a>
- Raw data extraction for the file, API based and web datasets. Let us call this __Bronze Layer__.
- Data transformation using python from Raw to Processed stage. We will call this __Silver Layer__.
- Finally store the processed data using standard taxonomy in a SQL based serving layer. We will call this __Gold Layer__.

#### Setup Directory Structure <a class="anchor" id="SD"></a>

In [42]:
data_folder = os.path.join(os.getcwd(), 'data')

#Create the data directory
os.makedirs(data_folder, exist_ok=True)

#Create the bronze, silver and gold folders
bronze_data_folder = data_folder +"/bronze"
os.makedirs(bronze_data_folder, exist_ok=True)

silver_data_folder = data_folder +"/silver"
os.makedirs(silver_data_folder, exist_ok=True)

gold_data_folder = data_folder +"/gold"
os.makedirs(gold_data_folder, exist_ok=True)

#Create sub folder for stock daily close data
file_data_bronze = bronze_data_folder +"/snp500"
os.makedirs(file_data_bronze, exist_ok=True)

#Create sub folder for dividends
dividend_data_bronze = bronze_data_folder +"/snp500_dividends"
os.makedirs(dividend_data_bronze, exist_ok=True)

#Create sub folder for splits
split_data_bronze = bronze_data_folder +"/snp500_splits"
os.makedirs(split_data_bronze, exist_ok=True)

#Create sub folder for sectors
sector_data_bronze = bronze_data_folder +"/snp500_sectors"
os.makedirs(sector_data_bronze, exist_ok=True)

#Create sub folder for financials
financial_data_bronze = bronze_data_folder +"/snp500_financial"
os.makedirs(financial_data_bronze, exist_ok=True)

#### Configure SnP500 List<a class="anchor" id="SL"></a>

In [49]:
s_n_p_list = ['AAPL', 'MSFT', 'AMZN', 'FB', 'GOOGL', 'GOOG', 'TSLA', 'NVDA', 'BRK.B', 'JPM', 'JNJ',
              'UNH', 'V', 'PG', 'HD', 'PYPL', 'DIS', 'ADBE', 'BAC', 'MA', 'CMCSA', 'PFE', 'CRM', 'NFLX',
              'CSCO', 'XOM', 'VZ', 'ABT', 'KO', 'INTC', 'TMO', 'PEP', 'ACN', 'ABBV', 'NKE', 'DHR', 'LLY', 
              'WMT', 'AVGO', 'COST', 'T', 'MRK', 'WFC', 'CVX', 'MDT', 'MCD', 'TXN', 'QCOM', 'NEE', 'LIN', 
              'ORCL', 'HON', 'PM', 'MS', 'INTU', 'BMY', 'C', 'LOW', 'UNP', 'UPS', 'GS', 'SBUX', 'AMD', 'BLK',
              'AMT', 'AMGN', 'RTX', 'NOW', 'IBM', 'ISRG', 'AMAT', 'TGT', 'BA', 'DE', 'CAT', 'GE', 'MRNA', 'CVS',
              'MMM', 'CHTR', 'SCHW', 'AXP', 'SPGI', 'PLD', 'ZTS', 'BKNG', 'MO', 'ANTM', 'GILD', 'SYK', 'ADP',
              'LMT', 'TJX', 'ADI', 'MDLZ', 'LRCX', 'CCI', 'CB', 'MU', 'PNC', 'DUK', 'TMUS', 'MMC', 'FIS', 'EL',
              'USB', 'TFC', 'EQIX', 'COF', 'COP', 'CSX', 'BDX', 'EW', 'SHW', 'CI', 'CME', 'REGN', 'FISV', 'SO',
              'ADSK', 'ICE', 'ITW', 'ETN', 'ILMN', 'HCA', 'CL', 'FDX', 'AON', 'BSX', 'ATVI', 'NSC', 'EMR', 'D',
              'GM', 'MCO', 'WM', 'APD', 'NXPI', 'IDXX', 'PGR', 'ECL', 'NOC', 'JCI', 'CMG', 'A', 'FCX', 'DG', 'MSCI',
              'HUM', 'KLAC', 'VRTX', 'TWTR', 'BIIB', 'ALGN', 'DXCM', 'F', 'TROW', 'ROP', 'SNPS', 'TEL', 'IQV',
              'EBAY', 'PSA', 'GPN', 'LHX', 'EXC', 'TT', 'CARR', 'DOW', 'AIG', 'KMB', 'NEM', 'MET', 'DLR', 'APH',
              'GD', 'CDNS', 'INFO', 'BK', 'AEP', 'SPG', 'MCHP', 'FTNT', 'RMD', 'ROST', 'PRU', 'SRE', 'EA', 'ORLY',
              'MSI', 'APTV', 'SYY', 'ALL', 'TRV', 'CTSH', 'DD', 'EOG', 'SBAC', 'SLB', 'DFS', 'YUM', 'BAX', 'PH',
              'XLNX', 'PPG', 'ROK', 'MPC', 'IFF', 'OTIS', 'XEL', 'PAYX', 'CNC', 'WELL', 'WBA', 'MTD', 'MNST', 'MAR',
              'AFL', 'HPQ', 'STZ', 'GIS', 'NUE', 'CTAS', 'KR', 'FRC', 'PXD', 'HLT', 'CMI', 'AZO', 'ADM', 'WST',
              'TDG', 'EFX', 'AWK', 'KEYS', 'VRSK', 'SIVB', 'CTVA', 'STT', 'CBRE', 'MCK', 'PEG', 'FAST', 'AVB',
              'KMI', 'AMP', 'ANSS', 'BLL', 'DHI', 'ZBRA', 'AME', 'ZBH', 'ES', 'SWK', 'PSX', 'GLW', 'SWKS', 'CPRT',
              'WMB', 'WEC', 'LH', 'LEN', 'AJG', 'LUV', 'EQR', 'WLTW', 'PCAR', 'ARE', 'CDW', 'ALB', 'ETSY', 'ODFL',
              'GNRC', 'FITB', 'VLO', 'SYF', 'O', 'WY', 'IT', 'RSG', 'GRMN', 'HSY', 'BBY', 'ED', 'LYB', 'DAL',
              'WAT', 'KSU', 'URI', 'VIAC', 'DOV', 'EXR', 'FTV', 'NTRS', 'VMC', 'XYL', 'VFC', 'HIG', 'PAYC', 'MLM',
              'TRMB', 'IP', 'ENPH', 'OKE', 'KHC', 'DTE', 'TSN', 'CERN', 'HBAN', 'NDAQ', 'PPL', 'AEE', 'TSCO',
              'CTLT', 'ETR', 'COO', 'CRL', 'MAA', 'FLT', 'EIX', 'OXY', 'VRSN', 'TDY', 'ESS', 'MKC', 'QRVO', 'ULTA',
              'CZR', 'FE', 'EXPD', 'DLTR', 'MPWR', 'VTR', 'STE', 'CLX', 'PKI', 'CHD', 'EXPE', 'HOLX', 'KMX', 'ANET',
              'TER', 'DPZ', 'HPE', 'DGX', 'BR', 'POOL', 'AMCR', 'NTAP', 'IR', 'TYL', 'KEY', 'DRI', 'RF', 'DRE',
              'TECH', 'PEAK', 'WDC', 'CCL', 'GWW', 'HES', 'AVY', 'CFG', 'TTWO', 'CINF', 'CMS', 'AKAM', 'TFX', 'RCL',
              'MKTX', 'ABC', 'MTB', 'BBWI', 'CE', 'HAL', 'NVR', 'VTRS', 'GPC', 'J', 'DVN', 'STX', 'MGM', 'BIO', 'RJF',
              'IEX', 'PFG', 'BKR', 'TXT', 'ABMD', 'BXP', 'UDR', 'WAB', 'AES', 'K', 'CAG', 'OMC', 'EVRG', 'NLOK',
              'MAS', 'EMN', 'CAH', 'LNT', 'UAL', 'JBHT', 'LKQ', 'LVS', 'CNP', 'IPG', 'PKG', 'PHM', 'INCY', 'PWR',
              'FANG', 'WHR', 'PTC', 'IRM', 'WRK', 'SJM', 'CBOE', 'XRAY', 'FBHS', 'AAP', 'JKHY', 'KIM', 'LDOS',
              'ALLE', 'BF.B', 'PNR', 'AAL', 'HRL', 'ATO', 'CTXS', 'HAS', 'L', 'LYV', 'HWM', 'LNC', 'FOXA', 'SNA',
              'LUMN', 'FFIV', 'UHS', 'FMC', 'PENN', 'CHRW', 'RHI', 'HST', 'MHK', 'TPR', 'NRG', 'MOS', 'REG', 'DISH',
              'WYNN', 'HSIC', 'RE', 'WRB', 'AIZ', 'CMA', 'BWA', 'AOS', 'CF', 'NWL', 'LW', 'IVZ', 'NI', 'JNPR', 'ZION',
              'NCLH', 'DXC', 'SEE', 'MRO', 'DVA', 'GL', 'TAP', 'WU', 'BEN', 'NWSA', 'PNW', 'ROL', 'OGN', 'CPB',
              'FRT', 'HII', 'DISCK', 'NLSN', 'PVH', 'APA', 'ALK', 'PBCT', 'VNO', 'HBI', 'LEG', 'COG', 'IPGP', 
              'RL', 'GPS', 'PRGO', 'UNM', 'FOX', 'NOV', 'DISCA', 'UAA', 'UA', 'NWS']

#### Configure Time Window<a class="anchor" id="DT"></a>

In [50]:
from datetime import datetime

#download window of 5 years
current_time = datetime.now()
lookback_time = datetime(current_time.year - 5, current_time.month , current_time.day)

#### Download Daily Stock Data <a class="anchor" id="DSD"></a>

In [24]:
failed_list = []
def download_daily_stock_data (stock_name, lookback_time, current_time):
    try:
        this_df = web.DataReader(stock_name, 'yahoo', start=lookback_time, end=current_time)
        output_file_name = file_data_bronze + '/' + stock_name + '_'+ lookback_time.strftime("%m-%d-%Y")+ '_'+ current_time.strftime("%m-%d-%Y")+ '_data.csv'
        this_df.to_csv(output_file_name)
    except Exception as e:
        failed_list.append(stock_name) 

In [18]:
for stock_name in s_n_p_list:
    download_daily_stock_data(stock_name,lookback_time,current_time)

In [19]:
failed_list

['BRK.B', 'BF.B']

#### Download Historical Stock Dividend Data <a class="anchor" id="HD"></a>

In [28]:
failed_list_dividend = []
def download_daily_stock_dividend (stock_name, lookback_time, current_time):
    try:
        this_df = web.DataReader(stock_name, 'yahoo-dividends', start=lookback_time, end=current_time)
        output_file_name = dividend_data_bronze + '/' + stock_name + '_'+ lookback_time.strftime("%m-%d-%Y")+ '_'+ current_time.strftime("%m-%d-%Y")+ '_dividend.csv'
        this_df.to_csv(output_file_name)
    except Exception as e:
        failed_list_dividend.append(stock_name) 

In [29]:
for stock_name in s_n_p_list:
    download_daily_stock_dividend(stock_name,lookback_time,current_time)

In [30]:
failed_list_dividend

['BRK.B', 'BF.B']

#### Download Historical Stock Split Data <a class="anchor" id="SS"></a>

In [31]:
failed_list_split = []
def download_daily_stock_split (stock_name, lookback_time, current_time):
    try:
        this_df = web.DataReader(stock_name, 'yahoo-actions', start=lookback_time, end=current_time)
        output_file_name = split_data_bronze + '/' + stock_name + '_'+ lookback_time.strftime("%m-%d-%Y")+ '_'+ current_time.strftime("%m-%d-%Y")+ '_splits.csv'
        this_df.to_csv(output_file_name)
    except Exception as e:
        failed_list_split.append(stock_name) 

In [32]:
for stock_name in s_n_p_list:
    download_daily_stock_split(stock_name,lookback_time,current_time)

In [34]:
failed_list_split

['BRK.B', 'BF.B']

#### Download Stock Sector Data <a class="anchor" id="SSD"></a>

In [43]:
#pip install datapackage
#Import the sector data
import datapackage
import pandas as pd

data_url = 'https://datahub.io/core/s-and-p-500-companies/datapackage.json'

# to load Data Package into storage
package = datapackage.Package(data_url)

# to load only tabular data
resources = package.resources
for resource in resources:
    if resource.tabular:
        sector_data = pd.read_csv(resource.descriptor['path'])
        #print (data)
output_file_name = sector_data_bronze + '/snp500_sector.csv'
sector_data.to_csv(output_file_name)        

In [44]:
data.head()

Unnamed: 0,Symbol,Name,Sector
0,MMM,3M,Industrials
1,AOS,A. O. Smith,Industrials
2,ABT,Abbott Laboratories,Health Care
3,ABBV,AbbVie,Health Care
4,ABMD,Abiomed,Health Care


#### Download Stock Financial Data <a class="anchor" id="SFD"></a>

In [47]:
#Import the financial data
data_url = 'https://datahub.io/core/s-and-p-500-companies-financials/datapackage.json'

# to load Data Package into storage
package = datapackage.Package(data_url)

# to load only tabular data
resources = package.resources
for resource in resources:
    if resource.tabular:
        financial_data = pd.read_csv(resource.descriptor['path'])
        #print (data)
output_file_name = financial_data_bronze + '/snp500_financial.csv'
financial_data.to_csv(output_file_name)  

In [46]:
data.head()

Unnamed: 0,Symbol,Name,Sector,Price,Price/Earnings,Dividend Yield,Earnings/Share,52 Week Low,52 Week High,Market Cap,EBITDA,Price/Sales,Price/Book,SEC Filings
0,MMM,3M Company,Industrials,222.89,24.31,2.332862,7.92,259.77,175.49,138721055226,9048000000.0,4.390271,11.34,http://www.sec.gov/cgi-bin/browse-edgar?action...
1,AOS,A.O. Smith Corp,Industrials,60.24,27.76,1.147959,1.7,68.39,48.925,10783419933,601000000.0,3.575483,6.35,http://www.sec.gov/cgi-bin/browse-edgar?action...
2,ABT,Abbott Laboratories,Health Care,56.27,22.51,1.908982,0.26,64.6,42.28,102121042306,5744000000.0,3.74048,3.19,http://www.sec.gov/cgi-bin/browse-edgar?action...
3,ABBV,AbbVie Inc.,Health Care,108.48,19.41,2.49956,3.29,125.86,60.05,181386347059,10310000000.0,6.291571,26.14,http://www.sec.gov/cgi-bin/browse-edgar?action...
4,ACN,Accenture plc,Information Technology,150.51,25.47,1.71447,5.44,162.6,114.82,98765855553,5643228000.0,2.604117,10.62,http://www.sec.gov/cgi-bin/browse-edgar?action...
