In [1]:
"""
Created on Sun Feb 21 2021
@author: Sahand-j
"""

import os
from datetime import date
from urllib.request import urlopen, Request

import pandas as pd
import pandas_datareader,datetime
import pandas_datareader.data as web
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from bs4 import BeautifulSoup
from nltk.sentiment import SentimentIntensityAnalyzer
from sqlalchemy import create_engine
import yahoo_fin.stock_info as si

In [2]:
# Comma-separated ticker symbols used throughout the notebook; format_ticker_list upper-cases them.
s = "avgo,voo,jpm"

In [3]:
def format_ticker_list(ticker_list):
    """
    Turns a comma-separated ticker string into a list of upper-case symbols.

    :param ticker_list(str): tickers separated by commas, e.g. 'avgo, voo,jpm'
    :@return(list): upper-case symbols with surrounding whitespace removed;
        empty entries (from '', doubled or trailing commas) are dropped so they
        never reach the scrapers as a blank ticker
    """
    symbols = (symbol.strip() for symbol in ticker_list.upper().split(','))
    return [symbol for symbol in symbols if symbol]

In [4]:
def stock_headline_scraper_dict(ticker_list):
    """
    Web-scrapes the news headline table for each ticker from finviz.com.

    :param ticker_list(list): ticker symbols (str) to look up
    :@return(dict): maps each upper-cased ticker to the parsed element with
        id 'news-table' from its quote page (None when the page has no such element)
    """
    website_url = 'https://finviz.com/quote.ashx?t='
    request_headers = {'user-agent': 'sentiment-analysis-app'}
    news_tables = {}

    for ticker in ticker_list:
        # quote page request for this ticker
        request = Request(url=website_url + ticker, headers=request_headers)

        # the context manager closes the HTTP response once the page has been
        # parsed, even if parsing raises
        with urlopen(request) as response:
            # NOTE(review): 'html' lets bs4 choose among the installed HTML parsers;
            # pin a specific parser if results must be identical across machines
            html = BeautifulSoup(response, 'html')

        # the element that contains all the news article links
        news_tables[ticker.upper()] = html.find(id='news-table')

    return news_tables
    

In [5]:
def stock_sentiment_df(news_tables):
    """
    Scores every scraped headline with VADER and returns the non-neutral ones.

    :param news_tables(dict): ticker -> parsed 'news-table' element, in the shape
        returned by stock_headline_scraper_dict; tickers whose value is None are skipped
    :@return(Dataframe): columns date, ticker, comp_score, title, updated; rows whose
        compound score is exactly 0 are filtered out. An empty input yields an empty
        frame with the same columns.
    """
    parsed_data = []

    for ticker, news_table in news_tables.items():
        # no news table was found on this ticker's page
        if news_table is None:
            continue

        # reset for every ticker so a date can never carry over from the previous one
        current_date = None

        for row in news_table.find_all('tr'):
            # rows without a headline link or a timestamp cell cannot be parsed
            if row.a is None or row.td is None:
                continue

            # title is in the anchor tag 'a'
            title = row.a.text

            # timestamp is in the first 'td'; split() with no argument also
            # discards non-breaking spaces and newlines around the tokens
            timestamp_parts = row.td.text.split()
            if not timestamp_parts:
                continue

            # "<date> <time>" starts a new day; a lone "<time>" reuses the last date seen.
            # The time-of-day token is not kept: the returned frame never included it.
            if len(timestamp_parts) > 1:
                current_date = timestamp_parts[0]

            parsed_data.append([ticker, title, current_date])

    # build and score the frame once, after all tickers have been collected
    df = pd.DataFrame(parsed_data, columns=['ticker', 'title', 'date'])

    # compound score for each article title
    vader = SentimentIntensityAnalyzer()
    df['comp_score'] = df['title'].apply(lambda title: vader.polarity_scores(title)['compound'])

    # converting string date col to date objects
    df['date'] = pd.to_datetime(df['date']).dt.date
    df['updated'] = pd.to_datetime('now')

    # filtering neutral news out and keeping the columns of interest; copy() hands
    # the caller an independent frame it can modify freely
    columns_of_interest = ['date', 'ticker', 'comp_score', 'title', 'updated']
    return df.loc[df['comp_score'] != 0, columns_of_interest].copy()


In [6]:
# The connection URL is read from the STOCKS_DB_URL environment variable so real
# credentials stay out of the notebook; the fallback is the original local
# development database.
engine = create_engine(
    os.environ.get('STOCKS_DB_URL', 'postgresql://postgres:postgres@localhost:5432/Stocks')
)

In [7]:
# Scrape and score the headlines, then index the frame by a datetime 'date'
# (the data still needs to go to the DB to be grouped).
df = stock_sentiment_df(stock_headline_scraper_dict(format_ticker_list(s)))
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
df.index

DatetimeIndex(['2021-02-05', '2021-02-05', '2021-02-04', '2021-02-02',
               '2021-02-02', '2021-01-30', '2021-01-28', '2021-01-26',
               '2021-01-19', '2021-01-19',
               ...
               '2021-02-02', '2021-02-01', '2021-02-01', '2021-01-31',
               '2021-01-28', '2021-01-28', '2021-01-27', '2021-01-27',
               '2021-01-27', '2021-01-27'],
              dtype='datetime64[ns]', name='date', length=141, freq=None)

In [8]:
# Preview of the scored headlines, indexed by date.
df

Unnamed: 0_level_0,ticker,comp_score,title,updated
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-02-05,AVGO,-0.1280,Synaptics Stock Soars. Demand Is Hot for Chips...,2021-02-24 02:19:36.756613
2021-02-05,AVGO,0.5859,10 Best Technology Stocks That Pay Dividends,2021-02-24 02:19:36.756613
2021-02-04,AVGO,0.3400,Broadcom Inc. (AVGO) Outpaces Stock Market Gai...,2021-02-24 02:19:36.756613
2021-02-02,AVGO,-0.3612,Broadcom Inc. Announces Expiration and Final R...,2021-02-24 02:19:36.756613
2021-02-02,AVGO,0.6369,My Best Dividend-Paying Tech Stock for 2021,2021-02-24 02:19:36.756613
...,...,...,...,...
2021-01-28,JPM,-0.1655,Trading frenzy in AMC stock may stave off bank...,2021-02-24 02:19:36.756613
2021-01-27,JPM,0.5106,AMC Stock Tops GameStop's Surge Amid Reddit-He...,2021-02-24 02:19:36.756613
2021-01-27,JPM,-0.0680,"Dont Blame Fed for GameStop Stock Frenzy, Says...",2021-02-24 02:19:36.756613
2021-01-27,JPM,0.5994,Why GameStop is destined to become another Blo...,2021-02-24 02:19:36.756613


In [9]:
# Persist the scored headlines, overwriting any existing 'stock_sentiments' table.
df.to_sql(name='stock_sentiments', con=engine, if_exists='replace')

In [10]:
#loading stock data
def stock_prices_dict(ticker_list, window_fraction=.15):
    """
    Downloads historic prices for each ticker and adds rolling/return statistics.

    :param ticker_list(list): ticker symbols (str) passed to yahoo_fin's get_data
    :param window_fraction(float): share of each ticker's row count used as the
        rolling window length; the default keeps the original 15%
    :@return(dict): returns dictionary of upper-cased stock ticker with Dataframe values
        holding the historic price data plus the rolling_mean, rolling_std,
        cumel_return and updated columns
    """
    dict_of_dfs = {}
    for ticker in ticker_list:
        prices = si.get_data(ticker)
        adjclose = prices['adjclose']

        # one window shared by both statistics; clamped to at least one row so a
        # very short history does not end up with an empty (size 0) window
        window = max(1, round(len(prices) * window_fraction))
        rolling_adjclose = adjclose.rolling(window)

        prices['rolling_mean'] = rolling_adjclose.mean()
        prices['rolling_std'] = rolling_adjclose.std()
        # cumulative product of (1 + daily change); the first row is NaN because
        # it has no previous price to compare against
        prices['cumel_return'] = (1 + adjclose.pct_change(1)).cumprod()
        prices['updated'] = pd.to_datetime('now')
        dict_of_dfs[ticker.upper()] = prices
    return dict_of_dfs


In [11]:
# Price history for AVGO; the returned frame has a datetime index (one row per date).
stock_prices_dict(format_ticker_list(s)).get('AVGO')

Unnamed: 0,open,high,low,close,adjclose,volume,ticker,rolling_mean,rolling_std,cumel_return,updated
2009-08-06,16.500000,16.910000,15.560000,16.180000,12.669686,24197800,AVGO,,,,2021-02-24 02:19:42.882868
2009-08-07,16.150000,16.760000,16.030001,16.430000,12.865452,2454300,AVGO,,,1.015451,2021-02-24 02:19:42.882868
2009-08-10,16.629999,16.629999,15.610000,15.970000,12.505249,2421000,AVGO,,,0.987021,2021-02-24 02:19:42.882868
2009-08-11,15.980000,16.000000,15.500000,15.670000,12.270336,2305400,AVGO,,,0.968480,2021-02-24 02:19:42.882868
2009-08-12,16.150000,16.200001,15.660000,16.000000,12.528741,1451300,AVGO,,,0.988875,2021-02-24 02:19:42.882868
...,...,...,...,...,...,...,...,...,...,...,...
2021-02-17,484.230011,486.119995,475.059998,482.480011,482.480011,1408400,AVGO,310.984810,63.438514,38.081449,2021-02-24 02:19:42.882868
2021-02-18,480.230011,485.410004,475.130005,483.260010,483.260010,1647900,AVGO,311.550866,63.871614,38.143013,2021-02-24 02:19:42.882868
2021-02-19,487.250000,495.140015,486.769989,489.959991,489.959991,1799200,AVGO,312.133056,64.337680,38.671833,2021-02-24 02:19:42.882868
2021-02-22,484.000000,486.609985,475.320007,476.359985,476.359985,1696200,AVGO,312.678726,64.719072,37.598404,2021-02-24 02:19:42.882868


In [12]:
# First 50 scored headlines; written to 'test_duplicate_main_data' further down.
df1 = df.head(50)
df1

Unnamed: 0_level_0,ticker,comp_score,title,updated
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-02-05,AVGO,-0.128,Synaptics Stock Soars. Demand Is Hot for Chips...,2021-02-24 02:19:36.756613
2021-02-05,AVGO,0.5859,10 Best Technology Stocks That Pay Dividends,2021-02-24 02:19:36.756613
2021-02-04,AVGO,0.34,Broadcom Inc. (AVGO) Outpaces Stock Market Gai...,2021-02-24 02:19:36.756613
2021-02-02,AVGO,-0.3612,Broadcom Inc. Announces Expiration and Final R...,2021-02-24 02:19:36.756613
2021-02-02,AVGO,0.6369,My Best Dividend-Paying Tech Stock for 2021,2021-02-24 02:19:36.756613
2021-01-30,AVGO,0.5574,Better Buy: Skyworks Solutions vs. Broadcom,2021-02-24 02:19:36.756613
2021-01-28,AVGO,0.34,Broadcom Inc. (AVGO) Outpaces Stock Market Gai...,2021-02-24 02:19:36.756613
2021-01-26,AVGO,0.5106,Strong Tech Dividend Stocks to Buy with Nasdaq...,2021-02-24 02:19:36.756613
2021-01-19,AVGO,-0.3612,Broadcom Inc. Announces Early Results of Previ...,2021-02-24 02:19:36.756613
2021-01-19,AVGO,0.34,"Russia-Linked Hack Spread Via New Malware, Sec...",2021-02-24 02:19:36.756613


In [13]:
# Seed the main table with df1; if_exists='fail' makes this raise when the table already exists.
df1.to_sql(name='test_duplicate_main_data', con=engine, if_exists='fail')

In [14]:
# First 100 scored headlines: the 50 rows of df1 plus the next 50.
df2 = df.head(100)
df2

Unnamed: 0_level_0,ticker,comp_score,title,updated
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-02-05,AVGO,-0.1280,Synaptics Stock Soars. Demand Is Hot for Chips...,2021-02-24 02:19:36.756613
2021-02-05,AVGO,0.5859,10 Best Technology Stocks That Pay Dividends,2021-02-24 02:19:36.756613
2021-02-04,AVGO,0.3400,Broadcom Inc. (AVGO) Outpaces Stock Market Gai...,2021-02-24 02:19:36.756613
2021-02-02,AVGO,-0.3612,Broadcom Inc. Announces Expiration and Final R...,2021-02-24 02:19:36.756613
2021-02-02,AVGO,0.6369,My Best Dividend-Paying Tech Stock for 2021,2021-02-24 02:19:36.756613
...,...,...,...,...
2021-02-17,JPM,0.2500,Financial Sector Appears Poised to Move Higher,2021-02-24 02:19:36.756613
2021-02-17,JPM,0.4019,"Dow Jones Futures: Apple, Tesla Test Support A...",2021-02-24 02:19:36.756613
2021-02-16,JPM,0.5267,Dow Jones Holds Gain After New High While Nasd...,2021-02-24 02:19:36.756613
2021-02-16,JPM,0.6486,"Bitcoin price hits $50,000 for first time amid...",2021-02-24 02:19:36.756613


In [15]:
# Check whether the 'new_data_with_old' table already exists in the database.
# NOTE(review): Engine.has_table is deprecated in newer SQLAlchemy releases —
# confirm the installed version before upgrading.
check = engine.has_table('new_data_with_old')
print(check)

False


In [16]:
# Drop the staging table if it is there, then (re)load it from df2.
# IF EXISTS makes the statement safe on a fresh database, so the cell no longer
# depends on a `check` value computed earlier; CASCADE also removes objects that
# depend on the table (the new_joined_sentiment_data view selects from it).
engine.execute('DROP TABLE IF EXISTS new_data_with_old CASCADE;')
df2.to_sql('new_data_with_old', engine, if_exists='replace')

In [17]:
# SQL for a view implementing an anti-join: rows of new_data_with_old that have no
# (date, ticker, title) match in test_duplicate_main_data, i.e. headlines that are
# not yet in the main table.
view_query = '''

create view new_joined_sentiment_data as
select
new_data_with_old.date,
new_data_with_old.ticker,
new_data_with_old.comp_score,
new_data_with_old.title,
new_data_with_old.updated

from new_data_with_old
left join test_duplicate_main_data ON
new_data_with_old.date = test_duplicate_main_data.date AND
new_data_with_old.ticker = test_duplicate_main_data.ticker AND
new_data_with_old.title = test_duplicate_main_data.title
WHERE test_duplicate_main_data.date IS null;

'''

In [18]:
# SQL that appends every row of the new_joined_sentiment_data view to
# test_duplicate_main_data. `select *` relies on the view's column order
# matching the table's.
add_new_vals_to_senti_table_query = '''insert into test_duplicate_main_data
select *
from new_joined_sentiment_data;'''

In [19]:
# Create the new_joined_sentiment_data view in the database.
engine.execute(view_query)

<sqlalchemy.engine.result.ResultProxy at 0x7fa234431190>

In [20]:
# Insert the rows selected by the view into test_duplicate_main_data.
engine.execute(add_new_vals_to_senti_table_query)

<sqlalchemy.engine.result.ResultProxy at 0x7fa23d591f40>