In [97]:
import os
import time
# NOTE: the star import below rebinds `time` to datetime.time; order kept as in the original.
from datetime import *
from urllib.request import urlopen, Request

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import psycopg2 as pg2
from bs4 import BeautifulSoup
from nltk.sentiment import SentimentIntensityAnalyzer

In [98]:
# Ticker symbols (lower-case) whose news headlines are scraped and scored below.
s = [
    'voo',
    'jpm',
    'iipr',
    'tgt',
]

In [99]:
def avg_sentiment_df(parsed_df):
    """Average the compound sentiment score per ticker and date.

    Parameters
    ----------
    parsed_df : pandas.DataFrame
        Must expose ``Ticker`` and ``Date`` (each either a column or an index
        level) and a score column named ``Compound_score`` (the name produced
        by ``stock_news_headline_parser_scraper``) or ``compound_score``.

    Returns
    -------
    pandas.DataFrame
        One row per ``Date``, one column per ``Ticker``; each cell is the mean
        compound score, NaN where a ticker has no headline on that date.

    Raises
    ------
    KeyError
        If neither spelling of the score column is present.
    """
    # The scraper emits 'Compound_score'; the lower-case spelling is accepted
    # as well so frames built elsewhere keep working.
    score_col = next(
        (name for name in ('Compound_score', 'compound_score') if name in parsed_df.columns),
        None,
    )
    if score_col is None:
        raise KeyError("parsed_df has no 'Compound_score' / 'compound_score' column")

    # Aggregate only the score column: averaging the whole frame would try to
    # take the mean of string columns such as 'Title' and 'Time'.
    mean_scores = parsed_df.groupby(['Ticker', 'Date'])[score_col].mean()

    # Pivot tickers into columns -> index is Date, columns are Ticker.
    return mean_scores.unstack(level='Ticker')

In [100]:
def stock_news_headline_parser_scraper(stock_ticker_list):
    """Scrape Finviz news headlines for each ticker and score them with VADER.

    Parameters
    ----------
    stock_ticker_list : iterable of str
        Ticker symbols; each one is appended to the Finviz quote URL.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date`` with columns ``Time`` (``HH:MM:SS`` string),
        ``Ticker``, ``Compound_score`` (VADER compound polarity of the
        headline) and ``Title``. The frame is empty when no headline was
        scraped (e.g. an empty ticker list).

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when a page cannot be fetched.
    """
    website_url = 'https://finviz.com/quote.ashx?t='
    news_tables = {}

    for ticker in stock_ticker_list:
        # One quote page per ticker.
        request = Request(url=website_url + ticker,
                          headers={'user-agent': 'sentiment-analysis-app'})

        # BeautifulSoup reads the whole response in its constructor, so the
        # connection can be closed as soon as parsing is done.
        with urlopen(request) as response:
            html = BeautifulSoup(response, 'html')

        # The element with id 'news-table' holds all the article links.
        news_tables[ticker] = html.find(id='news-table')

    parsed_data = []

    for ticker, news_table in news_tables.items():
        # No news table on the page (e.g. unknown ticker): nothing to parse.
        if news_table is None:
            continue

        # Rows that show only a time inherit the date of the closest preceding
        # dated row; reset per ticker so a date never leaks between tickers.
        current_date = None

        # Each <tr> of the news table is one article headline.
        for row in news_table.find_all('tr'):
            # Skip rows without a headline anchor or a timestamp cell.
            if row.a is None or row.td is None:
                continue

            title = row.a.text

            # Whitespace split also discards padding around the timestamp.
            stamp_parts = row.td.text.split()
            if not stamp_parts:
                continue

            if len(stamp_parts) == 1:
                # Time only: the date carries over from an earlier row.
                clock = stamp_parts[0]
            else:
                # Date followed by time.
                current_date, clock = stamp_parts[0], stamp_parts[1]

            parsed_data.append([ticker, title, current_date, clock])

    # Build and score the frame once, after every ticker has been parsed.
    df = pd.DataFrame(parsed_data, columns=['Ticker', 'Title', 'Date', 'Time'])
    vader = SentimentIntensityAnalyzer()
    df['Compound_score'] = df['Title'].apply(
        lambda title: vader.polarity_scores(title)['compound'])

    # Keep the first 7 characters of the time token (e.g. '05:25PM') and
    # render it as a 24-hour HH:MM:SS string.
    df['Time'] = pd.to_datetime(df['Time'].str[:7]).dt.strftime('%H:%M:%S')
    df['Date'] = pd.to_datetime(df['Date']).dt.date
    df = df.set_index('Date')
    return df[['Time', 'Ticker', 'Compound_score', 'Title']]

In [101]:
#Saving to Database
# Connection settings are read from the libpq-style environment variables
# (PGDATABASE / PGUSER / PGPASSWORD) when they are set, so real credentials do
# not have to live in the notebook; the fallbacks are the previous local
# development values.
conn = pg2.connect(
    database=os.environ.get('PGDATABASE', 'Stock_sentiment_analysis_data'),
    user=os.environ.get('PGUSER', 'postgres'),
    password=os.environ.get('PGPASSWORD', 'postgres'),
)
try:
    # The cursor is closed when the with-block exits.
    with conn.cursor() as cur:
        pass  # No statements are executed yet; the save step is not implemented.
finally:
    # Always release the connection, even if a statement above fails.
    conn.close()

In [102]:
# Scrape and score the headlines for every ticker in `s`; as the cell's last
# expression, the returned DataFrame is what gets displayed below.
stock_news_headline_parser_scraper(s)

Unnamed: 0_level_0,Time,Ticker,Compound_score,Title
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-02-10,17:25:00,voo,0.0000,Red-Hot ARK ETFs Add $12.5 Billion in New Cash...
2021-01-27,14:06:00,voo,0.0000,When to Give Up on a Stock
2021-01-15,16:17:00,voo,0.0000,Which S&P 500 companies are changing their pol...
2021-01-06,16:15:00,voo,0.3818,Whats Behind ETF Issuer Growth Gap
2021-01-04,09:38:00,voo,0.0000,These Charts Show How Extreme 2020 Was for Inv...
...,...,...,...,...
2021-01-20,07:30:00,tgt,0.0000,"The Zacks Analyst Blog Highlights: Target, Fiv..."
2021-01-19,17:31:00,tgt,0.0000,Target Rides the New Trend in Retail
2021-01-19,16:51:00,tgt,0.0000,"Is Target Stock A Buy Right Now, After Its Lat..."
2021-01-19,12:53:00,tgt,0.5106,CSAIL 2015-C3 Commercial Mortgage Trust -- Moo...


In [103]:
#avg_sentiment_df(stock_news_headline_parser_scraper(s))