In [None]:
from bs4 import BeautifulSoup
import os

# Parse every saved page under `datasets/` and keep only its headline table,
# keyed by file name (e.g. 'tsla_22sep.html').
html_tables = {}
for table_name in os.listdir('datasets'):
    table_path = f'datasets/{table_name}'
    # The context manager closes the handle as soon as parsing is done; a bare
    # open() here would leave one descriptor open per dataset file.
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed -- consider pinning one for reproducible parsing.
    with open(table_path, 'r') as table_file:
        html = BeautifulSoup(table_file)
    # find() gives None when the page has no element with id="news-table".
    html_table = html.find(id='news-table')
    html_tables[table_name] = html_table

In [None]:
# Preview the first four rows of the Tesla table: for each row, the text of
# its link followed by the text of its first cell.
tsla = html_tables['tsla_22sep.html']
tsla_tr = tsla.find_all('tr')

# Slicing bounds the preview up front instead of breaking out of the loop.
for i, row in enumerate(tsla_tr[:4]):
    # Read both values before printing anything for this row.
    headline, first_cell = row.a.get_text(), row.td.get_text()
    print(f'File number {i+1}:')
    print(headline)
    print(first_cell)

### Extracting the news headlines


In [None]:
# Flatten every table into [ticker, date, time, headline] records.
parsed_news = []

for file_name, news_table in html_tables.items():
    # The ticker is the part of the file name before the first underscore;
    # it is the same for every row of the table, so compute it once.
    ticker = file_name.split("_")[0]
    # Reset per file: a row whose first cell holds only a time reuses the
    # date of the closest preceding row, and that must never be a date left
    # over from the previous file (or an unbound name on the very first row).
    date = None

    for row in news_table.find_all('tr'):
        date_scrape = row.td.text.split()
        if len(date_scrape) == 1:
            # Only a time is present; keep the date seen on an earlier row.
            time = date_scrape[0]
        else:
            date = date_scrape[0]
            time = date_scrape[1]

        parsed_news.append([ticker, date, time, row.a.text])

### Using NLTK
<p>Adding some new words and their sentiment values to the VADER lexicon.</p>

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Create the analyzer first, then overlay the custom vocabulary on its lexicon.
vader = SentimentIntensityAnalyzer()

# Extra headline words with hand-picked signed scores.
# NOTE(review): confirm these magnitudes are on the scale the built-in
# lexicon uses; 'falls' in particular is far larger than the others.
new_words = dict(
    crushes=10,
    beats=5,
    misses=-5,
    trouble=-10,
    falls=-100,
)

vader.lexicon.update(new_words)

### Sentiment estimates
<p>Programmatically predicting sentiment from the news headlines.</p>


In [None]:
import pandas as pd
# Combine the parsed headlines with one row of polarity scores per headline.
columns = ['ticker', 'date', 'time', 'headline']
headlines_df = pd.DataFrame(parsed_news, columns=columns)

# Scores are produced in row order, so both frames share the same default
# integer index and join() pairs each headline with its own scores.
scores = list(map(vader.polarity_scores, headlines_df['headline']))
scores_df = pd.DataFrame(scores)

scored_news = headlines_df.join(scores_df)

# Replace the scraped date strings with datetime.date objects.
scored_news['date'] = pd.to_datetime(scored_news['date']).dt.date

### Plotting all the sentiment scores
<p>Plotting the daily mean sentiment as a time series for each stock we have.</p>

In [None]:
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")
%matplotlib inline


mean_c = scored_news.groupby(['date', 'ticker']).mean()
mean_c = mean_c.unstack('ticker')
mean_c = mean_c.xs("compound", axis="columns")

mean_c.plot.bar(figsize = (10, 6));

### Duplicates
<p>Some headlines appear verbatim more than once because several news outlets carried them. Here we get rid of those verbatim copies.</p>

In [None]:
# Remove rows that repeat the same headline for the same ticker.
scored_news_clean = scored_news.drop_duplicates(subset=['headline', 'ticker'])

# Tally the headlines on either side of the de-duplication.
num_news_before = scored_news['headline'].count()
num_news_after = scored_news_clean['headline'].count()

f"Before we had {num_news_before} headlines, now we have {num_news_after}"

## Sentiment on one single trading day and stock
<p>Focusing on one trading day and one single stock, we make an informative plot that shows the smallest grain possible: each headline and its subscores.</p>

In [None]:
# Setting the index to ticker and date
single_day = scored_news_clean.set_index(['ticker', 'date'])

single_day = single_day.xs('fb')
single_day = single_day.loc['2019-01-03']
single_day['time'] = pd.to_datetime(single_day['time']).dt.time
single_day = single_day.set_index('time')
# Sort it
single_day = single_day.sort_index()

## Visualizing
<p>Visualizing the positive, negative and neutral scores for a single day of trading and a single stock.</p>

In [None]:
TITLE = "Negative, neutral, and positive sentiment for FB on 2019-01-03"
COLORS = ["red","orange", "green"]

# Keep only the three sub-scores. `columns=` is spelled out because pandas 2.0
# no longer accepts the axis as a second positional argument to drop().
plot_day = single_day.drop(columns=['compound', 'headline'])
# Relabels the three remaining columns by position for a readable legend.
# NOTE(review): assumes the remaining column order is negative, neutral,
# positive -- confirm against the scorer's output.
plot_day.columns = ['negative', 'neutral', 'positive']

# Plot a stacked bar chart
ax = plot_day.plot.bar(stacked=True, figsize=(10, 6), title=TITLE, color=COLORS)
# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(bbox_to_anchor=(1.2, 0.5))
ax.set_ylabel("scores");