## An example of using the utilities library in order to create a training dataset for the final classifier.

In [6]:
# The Hugging Face transformers library. Neccessary to use FinBERT

# EXECUTE ONCE, IN ORDER TO INSTALL transformers LIBRARY
!pip install transformers -q
!pip install yfinance -q 
!pip install pandas_datareader -q 

In [1]:
import pandas as pd 
# Our library
from utilities import financial_dataset, read_news, merge_fin_news, sentim_analyzer, merge_dates

### Read headlines for the desired company and store them in a dataframe. Both csv files are parsed.

In [2]:
# Collect the news headlines for the AAPL ticker into a dataframe.
# The summary printed by read_news (see the output below) reports how many
# headlines were found in analyst_ratings_processed.csv and in
# raw_partner_headlines.csv, plus the combined total.
aapl_news = read_news('AAPL')
# Bare last expression: render the dataframe (columns: headline, date, stock).
aapl_news

The bot found 469 headlines from analyst_ratings_processed.csv, regarding AAPL stock
The bot found 32 headlines from raw_partner_headlines.csv, regarding AAPL stock
The bot found 501 headlines in total, regarding AAPL stock


Unnamed: 0,headline,date,stock
0,American Pie,2020-06-02,AAPL
1,Tech Giants Dare Antitrust Deal Watchdogs,2020-06-02,AAPL
2,MoneyGram Shares Jump 50% As Western Union Rep...,2020-06-02,AAPL
3,All Eyes on Market Volatility,2020-06-01,AAPL
4,Warren Buffett's Berkshire Hathaway Turns Up S...,2020-06-01,AAPL
...,...,...,...
496,"UBS Maintains Buy on Apple, Lowers Price Targe...",2020-03-10,AAPL
497,123 Biggest Movers From Yesterday,2020-03-10,AAPL
498,Crude Awakening: Energy Sector Takes A 20% Spi...,2020-03-09,AAPL
499,Investor Movement Index Summary: February 2020,2020-03-09,AAPL


In [3]:
# Build the price dataframe for AAPL (Open, Close, Volume, Price_change, date).
# With num_of_labels = 2 the printed summary below reports only positive (+1)
# and negative (-1) Price_change values and zero "No changes" rows.
# NOTE(review): presumably a different num_of_labels adds a "no change" class --
# confirm against utilities.financial_dataset.
aapl_fin = financial_dataset('AAPL', num_of_labels = 2)
# Bare last expression: render the dataframe.
aapl_fin

[*********************100%***********************]  1 of 1 completed
AAPL financial dataframe dimensions  (2770, 6)
Positive changes : 1468
Negative changes : 1302
No changes : 0


Unnamed: 0_level_0,Open,Close,Volume,Price_change,date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-12-31,7.611786,7.526071,352410800,1,0
2010-01-04,7.622500,7.643214,493729600,1,2010-01-04
2010-01-05,7.664286,7.656429,601904800,1,2010-01-05
2010-01-06,7.656429,7.534643,552160000,-1,2010-01-06
2010-01-07,7.562500,7.520714,477131200,-1,2010-01-07
...,...,...,...,...,...
2020-12-24,131.320007,131.970001,54930100,1,2020-12-24
2020-12-28,133.990005,136.690002,124486200,1,2020-12-28
2020-12-29,138.050003,134.869995,121047300,-1,2020-12-29
2020-12-30,135.580002,133.720001,96452100,-1,2020-12-30


### Headlines published during the weekend are lost when the financial and news dataframes are merged.

In [4]:
# Combine the price data and the headlines into one dataframe. In the output
# below every headline row carries the Open/Close/Volume/Price_change values
# of its date, so a date with several headlines appears on several rows.
merged_apple = merge_fin_news(aapl_fin, aapl_news)
# Bare last expression: render the merged dataframe.
merged_apple

Unnamed: 0,date,stock,Open,Close,Volume,headline,Price_change
0,2020-03-09,AAPL,65.937500,66.542503,286744800,Crude Awakening: Energy Sector Takes A 20% Spi...,-1
1,2020-03-09,AAPL,65.937500,66.542503,286744800,Investor Movement Index Summary: February 2020,-1
2,2020-03-09,AAPL,65.937500,66.542503,286744800,101 Stocks Moving In Monday's Mid-Day Session,-1
3,2020-03-10,AAPL,69.285004,71.334999,285290000,Peloton Shares Tick To Session Low As Hearing ...,1
4,2020-03-10,AAPL,69.285004,71.334999,285290000,Morning Market Stats In 5 Minutes,1
...,...,...,...,...,...,...,...
457,2020-06-10,AAPL,86.974998,88.209999,166651600,10 Biggest Price Target Changes For Wednesday,1
458,2020-06-10,AAPL,86.974998,88.209999,166651600,"Benzinga Pro's Top 5 Stocks To Watch For Wed.,...",1
459,2020-06-10,AAPL,86.974998,88.209999,166651600,"Deutsche Bank Maintains Buy on Apple, Raises P...",1
460,2020-06-10,AAPL,86.974998,88.209999,166651600,Apple To Let Users Trade In Their Mac Computer...,1


#### Initialize transformer model for sentiment analysis

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face Hub identifier of the FinBERT checkpoint. It is defined once so
# that the tokenizer and the model are always loaded from the same checkpoint.
FINBERT_CHECKPOINT = "ProsusAI/finbert"

# Tokenizer matching the FinBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)

# Sequence-classification model; together with `tokenizer` it is handed to
# sentim_analyzer in the next code cell.
model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

### Conduct the sentiment analysis (transformer inference) on the merged dataframe and get the sentiment scores. (Feature Engineering)

In [8]:
# Run the FinBERT tokenizer/model over the merged dataframe. The output below
# shows the result gaining three per-headline score columns: Positive,
# Negative and Neutral, while the other columns are kept.
train_apple_df = sentim_analyzer(merged_apple, tokenizer, model)
# Bare last expression: render the scored dataframe.
train_apple_df

100%|████████████████████████████████████████████████████████████████████████████████| 462/462 [00:33<00:00, 13.69it/s]


Unnamed: 0,date,stock,Open,Close,Volume,headline,Positive,Negative,Neutral,Price_change
0,2020-03-09,AAPL,65.937500,66.542503,286744800,Crude Awakening: Energy Sector Takes A 20% Spi...,0.022984,0.948264,0.028752,-1
1,2020-03-09,AAPL,65.937500,66.542503,286744800,Investor Movement Index Summary: February 2020,0.027477,0.053617,0.918906,-1
2,2020-03-09,AAPL,65.937500,66.542503,286744800,101 Stocks Moving In Monday's Mid-Day Session,0.087919,0.232512,0.679569,-1
3,2020-03-10,AAPL,69.285004,71.334999,285290000,Peloton Shares Tick To Session Low As Hearing ...,0.010256,0.935584,0.054160,1
4,2020-03-10,AAPL,69.285004,71.334999,285290000,Morning Market Stats In 5 Minutes,0.027023,0.063252,0.909725,1
...,...,...,...,...,...,...,...,...,...,...
457,2020-06-10,AAPL,86.974998,88.209999,166651600,10 Biggest Price Target Changes For Wednesday,0.035469,0.150262,0.814269,1
458,2020-06-10,AAPL,86.974998,88.209999,166651600,"Benzinga Pro's Top 5 Stocks To Watch For Wed.,...",0.051103,0.014987,0.933910,1
459,2020-06-10,AAPL,86.974998,88.209999,166651600,"Deutsche Bank Maintains Buy on Apple, Raises P...",0.748401,0.031066,0.220533,1
460,2020-06-10,AAPL,86.974998,88.209999,166651600,Apple To Let Users Trade In Their Mac Computer...,0.039779,0.016893,0.943328,1


### Merge dates by computing the average sentiment score for each date

In [9]:
# Collapse the per-headline rows into one row per date; in the output below the
# headline column is gone and each date appears once with its sentiment scores.
# NOTE(review): this rebinds train_apple_df to the result of a call on itself,
# so re-running this cell alone passes the already-merged frame back into
# merge_dates. Re-run the sentim_analyzer cell first if this cell must be repeated.
train_apple_df = merge_dates(train_apple_df)
# Bare last expression: render the per-date dataframe.
train_apple_df

 Dataframe now contains sentiment score for 65 different dates.


Unnamed: 0,date,stock,Open,Close,Volume,Positive,Negative,Neutral,Price_change
0,2020-03-09,AAPL,65.937500,66.542503,286744800,0.046127,0.411465,0.542409,-1
1,2020-03-10,AAPL,69.285004,71.334999,285290000,0.070845,0.449025,0.480130,1
2,2020-03-11,AAPL,69.347504,68.857498,255598800,0.190995,0.453761,0.355244,-1
3,2020-03-12,AAPL,63.985001,62.057499,418474000,0.204221,0.447518,0.348261,-1
4,2020-03-13,AAPL,66.222504,69.492500,370732000,0.315863,0.218127,0.466010,1
...,...,...,...,...,...,...,...,...,...
60,2020-06-04,AAPL,81.097504,80.580002,87560400,0.375941,0.027481,0.596579,-1
61,2020-06-05,AAPL,80.837502,82.875000,137250400,0.539309,0.035119,0.425571,1
62,2020-06-08,AAPL,82.562500,83.364998,95654400,0.655664,0.034216,0.310120,1
63,2020-06-09,AAPL,83.035004,85.997498,147712400,0.406257,0.030222,0.563521,1


#### Store the training dataframe in a CSV file

In [10]:
# Persist the per-date training dataframe for the 2-label setting.
# The path is relative to the notebook's working directory.
# NOTE(review): assumes the Financial_News/ directory already exists -- confirm.
# NOTE(review): index=False is not passed, so the row index is presumably
# written as an extra unnamed first column -- confirm the downstream reader
# expects that.
train_apple_df.to_csv('Financial_News/train_apple_2_labels.csv')

In [11]:
# Sanity check: list the distinct dates that ended up in the training dataframe.
train_apple_df['date'].unique()

array(['2020-03-09', '2020-03-10', '2020-03-11', '2020-03-12',
       '2020-03-13', '2020-03-16', '2020-03-17', '2020-03-18',
       '2020-03-19', '2020-03-20', '2020-03-23', '2020-03-24',
       '2020-03-25', '2020-03-26', '2020-03-27', '2020-03-30',
       '2020-03-31', '2020-04-01', '2020-04-02', '2020-04-03',
       '2020-04-06', '2020-04-07', '2020-04-08', '2020-04-09',
       '2020-04-13', '2020-04-14', '2020-04-15', '2020-04-16',
       '2020-04-17', '2020-04-20', '2020-04-21', '2020-04-22',
       '2020-04-23', '2020-04-24', '2020-04-27', '2020-04-28',
       '2020-04-29', '2020-04-30', '2020-05-01', '2020-05-04',
       '2020-05-05', '2020-05-06', '2020-05-08', '2020-05-11',
       '2020-05-12', '2020-05-13', '2020-05-14', '2020-05-15',
       '2020-05-18', '2020-05-19', '2020-05-20', '2020-05-21',
       '2020-05-22', '2020-05-26', '2020-05-27', '2020-05-28',
       '2020-05-29', '2020-06-01', '2020-06-02', '2020-06-03',
       '2020-06-04', '2020-06-05', '2020-06-08', '2020-