## Preliminary items

References: 
https://github.com/peanutshawny/lstm-stock-predictor

In [21]:
import re
import string
import unicodedata

# importing
import numpy as np
import pandas as pd
import requests
import spacy
import nltk
nltk.download('stopwords')
nltk.download('vader_lexicon')

nlp = spacy.load('en_core_web_sm')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [22]:
pd.set_option('max_colwidth', 400)

In [23]:
%load_ext google.colab.data_table

The google.colab.data_table extension is already loaded. To reload it, use:
  %reload_ext google.colab.data_table


## Mount Google Drive 

In [24]:
currentWorkingDir = !pwd
defaultWorkingDir = "/content"

if ( currentWorkingDir[0] == defaultWorkingDir ):
  from google.colab import drive

  drive.mount('/content/drive')
      
  %cd "/content/drive/My Drive/Colab Notebooks/stock_portfolio"
else:
  print("Currenting running app from: ")
  !pwd

Currenting running app from: 
/content/drive/My Drive/Colab Notebooks/stock_portfolio


# Import articles to dataframe

In [None]:
df = pd.read_csv("articles.csv")

In [None]:
df.content = df.content.astype('str')

In [None]:
df.dtypes

postid             object
post_date          object
instrument_code    object
title              object
link               object
content            object
content_length      int64
dtype: object

In [None]:
df.head(10)

Unnamed: 0,postid,post_date,instrument_code,title,link,content,content_length
0,post-1104646,2020-12-31 16:16:00,138SL,138 Student Living Jamaica Limited (138SL) Audited Financial Statements for the Year Ended 30 September 2020,https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-audited-financial-statements-for-the-year-ended-30-september-2020/,138 Student Living Jamaica 2020 – Final 138SL Quarterly Report as at 30 September 2020,86
1,post-1101933,2020-11-13 18:13:00,138SL,"138 Student Living Jamaica Limited (138SL) Unaudited Financial Statements for the Twelve Months Ended September 30, 2020",https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-unaudited-financial-statements-for-the-twelve-months-ended-september-30-2020/,138 SL FS Sept qtr – 2020 – Final Qtr,37
2,post-1096087,2020-08-14 09:27:00,138SL,"138 Student Living Jamaica Limited (138SL) Unaudited Financial Statements for the Third Quarter Ended June 30, 2020",https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-unaudited-financial-statements-for-the-third-quarter-ended-june-30-2020/,138SL Q3 Financials June 2020,29
3,post-1094046,2020-07-16 16:21:00,138SL,138 Student Living Jamaica Limited (138SL) – Impact on Business Operation by COVID-19,https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-impact-on-business-operation-by-covid-19/,138 Student Living Jamaica Limited (138SL) – Disclosure of COVID-19 Impact on Business Operation,96
4,post-1090194,2020-05-18 15:49:00,138SL,138 Student Living Jamaica Limited (138SL) Unaudited Financial Statements for the Six Months Ended 31 March 2020,https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-unaudited-financial-statements-for-the-six-months-ended-31-march-2020-2/,138SL FS March Qtr 2 2020 138SL Note to JSE,43
5,post-1085967,2020-03-03 16:35:00,138SL,138 Student Living Jamaica Limited (138SL) – Trading in Shares,https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-trading-in-shares-3/,"138 Student Living Jamaica Limited (138SL) has advised that a Director purchased 9,902 138SL shares on February 28, 2020.",121
6,post-1085350,2020-02-25 19:56:00,138SL,138 Student Living Jamaica Limited 2019 Annual Report,https://www.jamstockex.com/138-student-living-jamaica-limited-2019-annual-report/,138SL Annual Report 2019,24
7,post-1085327,2020-02-25 17:22:00,138SL,138 Student Living Jamaica Limited (138SL) – Executive Appointment,https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-executive-appointment/,138 Student Living Jamaica Limited (138SL) has advised that Renelle Pearson has been designated as an Executive of the Company for the purposes of securities transactions by directors and senior executives due to the nature of her position as Accountant.,254
8,post-1085270,2020-02-24 17:05:00,138SL,138 Student Living Jamaica Limited (138SL)- Trading in Shares,https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-trading-in-shares-2/,"138 Student Living Jamaica Limited (138SL) has advised that a director purchased 990,098 138SL shares on February 20, 2020.",123
9,post-1084747,2020-02-17 13:24:00,138SL,138 Student Living Jamaica Limited (138SL) Unaudited Financial Statements for the Three Months Ended 31 December 2019,https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-unaudited-financial-statements-for-the-three-months-ended-31-december-2019/,138SL – FY 2020 Qtr 1 Financial Report,38


# Clean up data

## Remove articles outside of scope

In [None]:
df_sentiment = df

In [None]:
df_sentiment.post_date.min()

'2014-04-11 00:00:00'

In [None]:
df_sentiment.post_date.max()

'2021-01-21 17:24:00'

In [None]:
df_sentiment.post_date = pd.to_datetime(df_sentiment.post_date, format="%Y-%m-%d").dt.date
df_sentiment.post_date = pd.to_datetime(df_sentiment.post_date)

In [None]:
from datetime import datetime
df_sentiment = df_sentiment [(df_sentiment.post_date >= datetime(2016,1,1)) & (df_sentiment.post_date < datetime(2021,1,1)) ]

In [None]:
df_sentiment.shape

(7400, 7)

In [None]:
df_sentiment.post_date

0      2020-12-31
1      2020-11-13
2      2020-08-14
3      2020-07-16
4      2020-05-18
          ...    
7776   2016-07-29
7777   2016-05-12
7778   2016-05-12
7779   2016-05-12
7780   2016-03-31
Name: post_date, Length: 7400, dtype: datetime64[ns]

## Remove fields that have no content

In [None]:
df_sentiment = df_sentiment[df_sentiment.content_length > 0]

## Clean up content field (remove "click here")

Filter for keywords

In [None]:
df_sentiment[df_sentiment.content.str.contains("|".join(["click here"]),case=False) ]

Unnamed: 0,postid,post_date,instrument_code,title,link,content,content_length
46,post-945093,2017-02-09,138SL,"138 Student Living Unaudited Financial Statements for the First Quarter Ended December 31, 2016",https://www.jamstockex.com/138-student-living-unaudited-financial-statements-first-quarter-ended-december-31-2016/,Click here to open document,27
51,post-943454,2016-12-29,138SL,138 Student Living Jamaica Limited 2016 Audited Financial Statements,https://www.jamstockex.com/138-student-living-jamaica-limited-2016-audited-financial-statements/,Click here to open,18
52,post-941975,2016-11-14,138SL,138 Student Living Jamaica Limited 4th Quarter Financials – Sept 2016,https://www.jamstockex.com/138-sl-4th-quarter-financials-sept-2016/,Click here to open document,27
54,post-938519,2016-08-12,138SL,138 Student Living Jamaica Ltd (138 SL) 3rd Qtr Results – June 2016,https://www.jamstockex.com/138-student-living-jamaica-ltd-138-sl-3rd-qtr-results-june-2016/,Click here to open document,27
57,post-356439,2016-02-12,138SL,138 Student Living Jamaica Limited (138SL) 1st Qtr Results – December 2015,https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-1st-qtr-results-december-2015/,Click here to open document,27
...,...,...,...,...,...,...,...
7176,post-351488,2016-02-04,LASD,"LASCO Distributors Limited (LASD) Unaudited Results for the Third Quarter ended December 31, 2015 (Revised)",https://www.jamstockex.com/lasco-distributors-limited-lasd-unaudited-results-for-the-third-quarter-ended-december-31-2015/,Click here to download the financialsClick here to open document 2,66
7316,post-352686,2016-02-05,LASF,"LASCO Financial Services Limited (LASF) Unaudited Results for the Third Quarter ended December 31, 2015",https://www.jamstockex.com/lasco-financial-services-limited-lasf-unaudited-results-for-the-third-quarter-ended-december-31-2015/,Click here to open document,27
7503,post-355346,2016-02-12,MDS,"Medical Disposables & Supplies Limited (MDS) – Unaudited Financial Results for the Nine Months Ended December 31, 2015",https://www.jamstockex.com/medical-disposables-supplies-limited-mds-unaudited-financial-results-nine-months-ended-december-31-2015/,Click here to open document,29
7571,post-332400,2016-01-12,PTL,"Paramount Trading (Jamaica) Limited (PTL) Unaudited Results for the 2nd quarter ended November 30, 2015",https://www.jamstockex.com/paramount-trading-jamaica-limited-ptl-unaudited-results-for-the-2nd-quarter-ended-november-30-2015/,Click here to open document,27


Use regex to remove phrases

In [None]:
import re

p = re.compile("(Click here to open \\bdocuments?)|(Click here to (download|open))|(Click here to view \\bdocuments?)|(Click here)|(Please click here to download)",re.IGNORECASE)

In [None]:
df_sentiment["content_clean"] = df_sentiment.content.str.replace(p,"",regex=True)

In [None]:
df_sentiment[df_sentiment.content.str.contains("|".join(["click here"]),case=False) ]

Unnamed: 0,postid,post_date,instrument_code,title,link,content,content_length,content_clean
46,post-945093,2017-02-09,138SL,"138 Student Living Unaudited Financial Statements for the First Quarter Ended December 31, 2016",https://www.jamstockex.com/138-student-living-unaudited-financial-statements-first-quarter-ended-december-31-2016/,Click here to open document,27,
51,post-943454,2016-12-29,138SL,138 Student Living Jamaica Limited 2016 Audited Financial Statements,https://www.jamstockex.com/138-student-living-jamaica-limited-2016-audited-financial-statements/,Click here to open,18,
52,post-941975,2016-11-14,138SL,138 Student Living Jamaica Limited 4th Quarter Financials – Sept 2016,https://www.jamstockex.com/138-sl-4th-quarter-financials-sept-2016/,Click here to open document,27,
54,post-938519,2016-08-12,138SL,138 Student Living Jamaica Ltd (138 SL) 3rd Qtr Results – June 2016,https://www.jamstockex.com/138-student-living-jamaica-ltd-138-sl-3rd-qtr-results-june-2016/,Click here to open document,27,
57,post-356439,2016-02-12,138SL,138 Student Living Jamaica Limited (138SL) 1st Qtr Results – December 2015,https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-1st-qtr-results-december-2015/,Click here to open document,27,
...,...,...,...,...,...,...,...,...
7176,post-351488,2016-02-04,LASD,"LASCO Distributors Limited (LASD) Unaudited Results for the Third Quarter ended December 31, 2015 (Revised)",https://www.jamstockex.com/lasco-distributors-limited-lasd-unaudited-results-for-the-third-quarter-ended-december-31-2015/,Click here to download the financialsClick here to open document 2,66,the financials 2
7316,post-352686,2016-02-05,LASF,"LASCO Financial Services Limited (LASF) Unaudited Results for the Third Quarter ended December 31, 2015",https://www.jamstockex.com/lasco-financial-services-limited-lasf-unaudited-results-for-the-third-quarter-ended-december-31-2015/,Click here to open document,27,
7503,post-355346,2016-02-12,MDS,"Medical Disposables & Supplies Limited (MDS) – Unaudited Financial Results for the Nine Months Ended December 31, 2015",https://www.jamstockex.com/medical-disposables-supplies-limited-mds-unaudited-financial-results-nine-months-ended-december-31-2015/,Click here to open document,29,
7571,post-332400,2016-01-12,PTL,"Paramount Trading (Jamaica) Limited (PTL) Unaudited Results for the 2nd quarter ended November 30, 2015",https://www.jamstockex.com/paramount-trading-jamaica-limited-ptl-unaudited-results-for-the-2nd-quarter-ended-november-30-2015/,Click here to open document,27,


Get content_clean length

In [None]:
df_sentiment['content_clean_length'] = df_sentiment.content_clean.str.len()

In [None]:
filter_click_here = df_sentiment.content.str.contains("|".join(["click here"]),case=False) 

In [None]:
df_sentiment[filter_click_here]

Unnamed: 0,postid,post_date,instrument_code,title,link,content,content_length,content_clean,content_clean_length
46,post-945093,2017-02-09,138SL,"138 Student Living Unaudited Financial Statements for the First Quarter Ended December 31, 2016",https://www.jamstockex.com/138-student-living-unaudited-financial-statements-first-quarter-ended-december-31-2016/,Click here to open document,27,,0
51,post-943454,2016-12-29,138SL,138 Student Living Jamaica Limited 2016 Audited Financial Statements,https://www.jamstockex.com/138-student-living-jamaica-limited-2016-audited-financial-statements/,Click here to open,18,,0
52,post-941975,2016-11-14,138SL,138 Student Living Jamaica Limited 4th Quarter Financials – Sept 2016,https://www.jamstockex.com/138-sl-4th-quarter-financials-sept-2016/,Click here to open document,27,,0
54,post-938519,2016-08-12,138SL,138 Student Living Jamaica Ltd (138 SL) 3rd Qtr Results – June 2016,https://www.jamstockex.com/138-student-living-jamaica-ltd-138-sl-3rd-qtr-results-june-2016/,Click here to open document,27,,0
57,post-356439,2016-02-12,138SL,138 Student Living Jamaica Limited (138SL) 1st Qtr Results – December 2015,https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-1st-qtr-results-december-2015/,Click here to open document,27,,0
...,...,...,...,...,...,...,...,...,...
7176,post-351488,2016-02-04,LASD,"LASCO Distributors Limited (LASD) Unaudited Results for the Third Quarter ended December 31, 2015 (Revised)",https://www.jamstockex.com/lasco-distributors-limited-lasd-unaudited-results-for-the-third-quarter-ended-december-31-2015/,Click here to download the financialsClick here to open document 2,66,the financials 2,17
7316,post-352686,2016-02-05,LASF,"LASCO Financial Services Limited (LASF) Unaudited Results for the Third Quarter ended December 31, 2015",https://www.jamstockex.com/lasco-financial-services-limited-lasf-unaudited-results-for-the-third-quarter-ended-december-31-2015/,Click here to open document,27,,0
7503,post-355346,2016-02-12,MDS,"Medical Disposables & Supplies Limited (MDS) – Unaudited Financial Results for the Nine Months Ended December 31, 2015",https://www.jamstockex.com/medical-disposables-supplies-limited-mds-unaudited-financial-results-nine-months-ended-december-31-2015/,Click here to open document,29,,2
7571,post-332400,2016-01-12,PTL,"Paramount Trading (Jamaica) Limited (PTL) Unaudited Results for the 2nd quarter ended November 30, 2015",https://www.jamstockex.com/paramount-trading-jamaica-limited-ptl-unaudited-results-for-the-2nd-quarter-ended-november-30-2015/,Click here to open document,27,,0


Replace fields that contain "click here" with text from URL

In [None]:
df_sentiment['new_content'] = np.where(filter_click_here, df_sentiment.link,df_sentiment.content_clean)

In [None]:
d = df_sentiment.link.str.replace("https://www.jamstockex.com/", "")

In [None]:
e = d.str.replace("-"," ")

In [None]:
link_txt = e.str.replace("/","")

In [None]:
df_sentiment['content'] = np.where(filter_click_here,link_txt ,df_sentiment.content_clean)

In [None]:
df_sentiment['content_length'] = df_sentiment.content.str.len()

In [None]:
df_sentiment[ df_sentiment.content_length <= 7]

Unnamed: 0,postid,post_date,instrument_code,title,link,content,content_length,content_clean,content_clean_length,new_content
2351,post-1102287,2020-11-18,MJE,Mayberry Jamaican Equities Limited (MJE) – Notice of Annual General Meeting,https://www.jamstockex.com/mayberry-jamaican-equities-limited-mje-notice-of-annual-general-meeting/,\n,2,\n,2,\n
3202,post-482184,2016-05-11,PJAM,Pan-Jamaican Investment Trust Limited (PJAM) – Resolution,https://www.jamstockex.com/482184-2/,\n,2,\n,2,\n


Remove the fields with blank values

In [None]:
column_list = ["postid","post_date","instrument_code","title","link","content"] 
filter_content_len = ~(df_sentiment.content_length <= 7)

df_sentiment = df_sentiment.loc[ filter_content_len, column_list ]

In [None]:
df_sentiment

Output hidden; open in https://colab.research.google.com to view.

# Create fields based on nature of article

## Create field for articles that pertain to financial reports

In [None]:
df_sentiment[df_sentiment.content.str.contains("|".join(["quarter","q\d","qtr","audited","report","results","statements","supplementary"]),case=False)]

Unnamed: 0,postid,post_date,instrument_code,title,link,content
0,post-1104646,2020-12-31,138SL,138 Student Living Jamaica Limited (138SL) Audited Financial Statements for the Year Ended 30 September 2020,https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-audited-financial-statements-for-the-year-ended-30-september-2020/,138 Student Living Jamaica 2020 – Final 138SL Quarterly Report as at 30 September 2020
1,post-1101933,2020-11-13,138SL,"138 Student Living Jamaica Limited (138SL) Unaudited Financial Statements for the Twelve Months Ended September 30, 2020",https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-unaudited-financial-statements-for-the-twelve-months-ended-september-30-2020/,138 SL FS Sept qtr – 2020 – Final Qtr
2,post-1096087,2020-08-14,138SL,"138 Student Living Jamaica Limited (138SL) Unaudited Financial Statements for the Third Quarter Ended June 30, 2020",https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-unaudited-financial-statements-for-the-third-quarter-ended-june-30-2020/,138SL Q3 Financials June 2020
4,post-1090194,2020-05-18,138SL,138 Student Living Jamaica Limited (138SL) Unaudited Financial Statements for the Six Months Ended 31 March 2020,https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-unaudited-financial-statements-for-the-six-months-ended-31-march-2020-2/,138SL FS March Qtr 2 2020 138SL Note to JSE
6,post-1085350,2020-02-25,138SL,138 Student Living Jamaica Limited 2019 Annual Report,https://www.jamstockex.com/138-student-living-jamaica-limited-2019-annual-report/,138SL Annual Report 2019
...,...,...,...,...,...,...
7772,post-941661,2016-11-10,TTECH,"tTech Unaudited Financial Statements at September 30, 2016",https://www.jamstockex.com/ttech-unaudited-financial-statements-september-30-2016/,"tTech Unaudited Financial Statements at September 30, 2016"
7774,post-938792,2016-08-19,TTECH,tTech Limited Annual Report 2015,https://www.jamstockex.com/ttech-limited-annual-report-2015/,tTech Limited Annual Report 2015
7775,post-938377,2016-08-11,TTECH,"tTech Limited Unaudited Financial Statements at June 30, 2016",https://www.jamstockex.com/ttech-limited-unaudited-financial-statements-at-june-30-2016/,"tTech Limited Unaudited Financial Statements at June 30, 2016"
7778,post-486495,2016-05-12,TTECH,tTech Limited 2015 Audited Financial Statements,https://www.jamstockex.com/ttech-2015-audited-financial-statements/,tTech 2015 Audited Financial Statements


In [None]:
filter_report = df_sentiment.content.str.contains("|".join(["quarter","q\d","qtr","audited","report","results","statements","supplementary"]),case=False)

In [None]:
df_sentiment['is_regarding_financial_report'] = np.where(filter_report,1,0)

In [None]:
df_sentiment

Output hidden; open in https://colab.research.google.com to view.

## Create field based on when shares are bought/sold by connected parties

In [None]:
filter_shares = df_sentiment.content.str.contains("|".join(["sold|purchased"]),case=False) &  df_sentiment.content.str.contains("|".join(["shares"]),case=False)

In [None]:
df_sentiment[df_sentiment.content.str.contains("|".join(["sold|purchased"]),case=False) &  df_sentiment.content.str.contains("|".join(["shares"]),case=False)]

Unnamed: 0,postid,post_date,instrument_code,title,link,content,is_regarding_financial_report
5,post-1085967,2020-03-03,138SL,138 Student Living Jamaica Limited (138SL) – Trading in Shares,https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-trading-in-shares-3/,"138 Student Living Jamaica Limited (138SL) has advised that a Director purchased 9,902 138SL shares on February 28, 2020.",0
8,post-1085270,2020-02-24,138SL,138 Student Living Jamaica Limited (138SL)- Trading in Shares,https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-trading-in-shares-2/,"138 Student Living Jamaica Limited (138SL) has advised that a director purchased 990,098 138SL shares on February 20, 2020.",0
27,post-988855,2018-09-12,138SL,138 Student Living Jamaica Limited (138SL) Trading in Shares,https://www.jamstockex.com/138-student-living-jamaica-limited-138sl-trading-in-shares/,"138 Student Living Jamaica Limited (138SL) has advised that a connected party sold 67,901,000 138SL shares on September 5, 2018.",0
48,post-944261,2017-01-18,138SL,138 Student Living Jamaica Limited Trading in Shares,https://www.jamstockex.com/138-student-living-jamaica-limited-trading-in-shares/,"138 Student Living Jamaica Limited (138SL) has advised that a Director sold 30,734,795 138SL shares on January 13, 2017.",0
49,post-944232,2017-01-18,138SL,138 Student Living Jamaica Limited Trading in Shares,https://www.jamstockex.com/138-student-living-jamaica-limited-trading-shares/,"138 Student Living Jamaica Limited (138SL) has advised that a Director sold 7,838,090 138SL shares on January 16, 2017.",0
...,...,...,...,...,...,...,...
7755,post-954802,2017-09-05,TTECH,tTech Limited (TTECH) Trading in Shares,https://www.jamstockex.com/ttech-limited-ttech-trading-in-shares-2/,"tTech Limited (TTECH) has advised that a Director sold 29,840 TTECH shares to an Officer of the Company on August 31, 2017",0
7761,post-951349,2017-07-11,TTECH,tTech Limited (TTECH) Trading in Shares,https://www.jamstockex.com/ttech-limited-ttech-trading-shares-2/,"tTech Limited (TTECH) has advised that a Senior Manager and a Director traded shares on July 5, 2017. The senior manager sold 353,000 TTECH shares, which were purchased by a Director.",0
7768,post-946450,2017-03-10,TTECH,tTECH Limited (TTECH) – Trading in Shares,https://www.jamstockex.com/946450-2/,"tTECH Limited (TTECH) has advised that a Director purchased 200,000 TTECH shares on March 8, 2017.",0
7771,post-942336,2016-11-24,TTECH,tTech Limited Trading in Shares,https://www.jamstockex.com/ttech-limited-trading-shares/,"tTech Limited has advised that a Director purchased 32, 000 TTech shares on September 20, 2016.",0


In [None]:
df_sentiment['is_sold_pur_shares'] = np.where(filter_shares,1,0)

In [None]:
df_sentiment

Output hidden; open in https://colab.research.google.com to view.

# Function to perform sentiment analysis

In [None]:
def getSentiment(article):
    """
    function that generates a soup to process text and output sentiment scores
    """

    # empty list for sentiment data
    sentiment_list = np.empty(shape=0, dtype=object)

    # cleaning text by removing punctuation and stopwords, as well as lemmatization
    punctuations = string.punctuation
    sw = stopwords.words('english')

    # converting to unicode
    section = unicodedata.normalize('NFKD', article)
    section = section.replace('\t', ' ').replace('\n', '').replace('/s', '').replace('\'', '')

    # joining, removing unecessary characters, and truncating text
    text = ''.join(section)
    text = re.sub('\s+', ' ', text).strip()
    text = text[:40000]

    # creating spacy nlp variable to tokenize and remove punctuation
    doc = nlp(text)

    doc = [token.lemma_.lower().strip() for token in doc]
    doc = [token for token in doc if token.isalpha()]
    doc = [token for token in doc if token not in punctuations and token not in sw]

    # joining text and getting sentiment
    doc = ' '.join(doc)

    analyzer = SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(doc)

    sentiment_list = np.append(sentiment_list, sentiment)

    # transposing each type of sentiment (pos, neg, neu) into separate features
    sentiment_df = pd.DataFrame({'sentiment': sentiment_list})

    return sentiment_df

# Add column with article sentiment

In [None]:
a = df_sentiment['content'].to_list()

In [None]:
sentimentsList = [getSentiment(x).values[0][0]['compound'] for x in a]

In [None]:
df_sentiment['sentiment'] = sentimentsList

In [None]:
df_sentiment

Output hidden; open in https://colab.research.google.com to view.

# Group sentiment information based on date and instrument_code

In [None]:
df_sentiment_ml = df_sentiment.groupby(["instrument_code","post_date"]).agg({"sentiment":"mean",
                                                                             "is_regarding_financial_report":"min",
                                                                             "is_sold_pur_shares":"min"})

In [None]:
df_sentiment_ml

Unnamed: 0_level_0,Unnamed: 1_level_0,sentiment,is_regarding_financial_report,is_sold_pur_shares
instrument_code,post_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
138SL,2016-01-05,0.4588,0,1
138SL,2016-01-22,0.3182,1,0
138SL,2016-02-12,-0.2263,1,0
138SL,2016-05-11,0.0000,1,0
138SL,2016-06-13,0.2960,0,0
...,...,...,...,...
XFUND,2020-07-22,-0.2263,0,0
XFUND,2020-08-17,-0.2500,1,0
XFUND,2020-09-16,0.4588,0,1
XFUND,2020-09-22,0.4588,0,1


# Save to CSV

In [35]:
df_sentiment.to_csv("articles_sentiment_processed.csv")
df_sentiment_ml.to_csv("articles_sentiment_processed_ML.csv")

NameError: ignored

# Combine sentiments with prices dataset

## Import into dataframes

In [10]:
sen_df = pd.read_csv("articles_sentiment_processed_ML.csv")
prices_df = pd.read_csv("jse_main_price_2016-2020.csv")

prices_df = prices_df[["Symbol","Date","Close_Price","Today_High","Today_Low","Volume_non_block"]]

In [11]:
sen_df

Unnamed: 0,instrument_code,post_date,sentiment,is_regarding_financial_report,is_sold_pur_shares
0,138SL,2016-01-05,0.4588,0,1
1,138SL,2016-01-22,0.3182,1,0
2,138SL,2016-02-12,-0.2263,1,0
3,138SL,2016-05-11,0.0000,1,0
4,138SL,2016-06-13,0.2960,0,0
...,...,...,...,...,...
6522,XFUND,2020-07-22,-0.2263,0,0
6523,XFUND,2020-08-17,-0.2500,1,0
6524,XFUND,2020-09-16,0.4588,0,1
6525,XFUND,2020-09-22,0.4588,0,1


In [12]:
prices_df

Unnamed: 0,Symbol,Date,Close_Price,Today_High,Today_Low,Volume_non_block
0,AFS,2016-01-04,16.000000,16.00,16.00,8510.0
1,BRG,2016-01-04,3.190000,3.20,3.15,85100.0
2,CCC,2016-01-04,20.150000,20.44,20.00,10000.0
3,CAR,2016-01-04,60.110000,62.00,60.00,2261300.0
4,GK,2016-01-04,84.000000,84.00,84.00,2483.0
...,...,...,...,...,...,...
63878,FIRSTROCKJMD,2020-12-31,12.552716,12.70,12.48,45211.0
63879,CABROKERS,2020-12-31,1.884615,1.89,1.75,1300.0
63880,TJH,2020-12-31,1.326428,1.36,1.30,10197113.0
63881,LUMBER,2020-12-31,1.543207,1.58,1.45,224630.0


## Data Preparation: Prices dataframe

###Convert dates from string to datetime to facilite date calculations

In [13]:
prices_df.Date   = pd.to_datetime(prices_df.Date,   format="%Y-%m-%d").dt.date
sen_df.post_date = pd.to_datetime(sen_df.post_date, format="%Y-%m-%d").dt.date

### Field: Previous business day

In [14]:
import datetime
from pandas.tseries.offsets import BDay
# BDay is business day

prices_df["Previous_business_day"] = prices_df.Date.apply(lambda x:(x-pd.tseries.offsets.BDay(1)))
prices_df.Previous_business_day = pd.to_datetime(prices_df.Previous_business_day, format="%Y-%m-%d").dt.date

In [15]:
prices_df

Unnamed: 0,Symbol,Date,Close_Price,Today_High,Today_Low,Volume_non_block,Previous_business_day
0,AFS,2016-01-04,16.000000,16.00,16.00,8510.0,2016-01-01
1,BRG,2016-01-04,3.190000,3.20,3.15,85100.0,2016-01-01
2,CCC,2016-01-04,20.150000,20.44,20.00,10000.0,2016-01-01
3,CAR,2016-01-04,60.110000,62.00,60.00,2261300.0,2016-01-01
4,GK,2016-01-04,84.000000,84.00,84.00,2483.0,2016-01-01
...,...,...,...,...,...,...,...
63878,FIRSTROCKJMD,2020-12-31,12.552716,12.70,12.48,45211.0,2020-12-30
63879,CABROKERS,2020-12-31,1.884615,1.89,1.75,1300.0,2020-12-30
63880,TJH,2020-12-31,1.326428,1.36,1.30,10197113.0,2020-12-30
63881,LUMBER,2020-12-31,1.543207,1.58,1.45,224630.0,2020-12-30


## Data Preparation: Prices dataframe (per Symbol)

In [17]:
### Function: Used for field "stock_not_traded_date_list"
# Function creates an array of dates that the stock was not traded on

date_lookback_limit = 5 # Determines max dates to lookup
def getDatesToSumSentiments (x):
  
  lb = min(date_lookback_limit, x['no_days_not_traded_since_last_traded'] )
  date_list = list()
  for i in range(1,lb+1):
    date_list.append(
      x['Date'] - pd.tseries.offsets.BDay(i)
    )
  return date_list

In [19]:
# Single df with all data
df_list = list()

for s in prices_df.Symbol.unique():
	
  #### Create prices df for each symbol and sort by Date field
  p_df = prices_df[prices_df.Symbol == s]
  p_df = p_df.sort_values(by='Date', ascending=True)

  #### Create sentiment df for each symbol and sort by Date field 
  s_df = sen_df[sen_df.instrument_code == s]
  s_df = s_df.sort_values(by='post_date', ascending=True)	

  ### Field: Determine if traded on previous business day 
  p_df["is_traded_on_Previous_business_day"] = \
    (p_df.Date.shift() == p_df.Previous_business_day )
    
    
  ### Field: Calculate the previous day last traded
  temp 		 				            = p_df.Date.shift()
  temp.iloc[0] 			        	= p_df.Date[p_df.Date.first_valid_index()]
  p_df["Previous_trade_day"]  = temp


  ### Calculate the number of days that the stock did not trade for prior 
  ### to the current date

  # Function to determine no of business days between dates inclusive
  f = lambda x: (len(pd.bdate_range(x['Previous_trade_day'], x['Date'] )))

  p_df['no_days_not_traded_since_last_traded'] = (p_df.apply(f, axis=1))

  p_df['no_days_not_traded_since_last_traded'] =  \
    p_df['no_days_not_traded_since_last_traded'] - 2 # All values were off by 2 days

  # Set the first date as zero as there is no data for it to check against
  p_df.no_days_not_traded_since_last_traded.iloc[0] = 0 


  ## Field: Determine the dates that the stock did not trade for
  p_df['stock_not_traded_date_list'] = p_df.apply(getDatesToSumSentiments, axis=1)

  ### Convert prices dataframe to 1NF by creating a separate row for each value in
  ### 'stock_not_traded_date_list' field

  p_df = p_df.explode('stock_not_traded_date_list')
    
  ### Convert date fields to string to faciliate join
  p_df.stock_not_traded_date_list = p_df.stock_not_traded_date_list.astype(str)
  p_df.Date = p_df.Date.astype(str)
  s_df.post_date = s_df.post_date.astype(str)

  ### Perform join
  c_df = pd.merge(p_df, s_df, how='left', left_on="stock_not_traded_date_list",right_on="post_date")
  c_df = pd.merge(c_df, s_df, how='left', left_on="Date",right_on="post_date")


  # Group columns
  c_df = c_df.groupby(["Symbol","Date"]).agg({'Close_Price'     :'mean',
                                              'Volume_non_block':'mean',
                                              'no_days_not_traded_since_last_traded':'mean',
                                              'Today_High'      :'mean',
                                              'Today_Low'       :'mean',
                                              'sentiment_x'     :'mean',
                                              'is_regarding_financial_report_x' :'max',
                                              'is_sold_pur_shares_x'            :'max',
                                              'sentiment_y'     :'mean',
                                              'is_regarding_financial_report_y' :'max',
                                              'is_sold_pur_shares_y'            :'max'
                                              })
  
  c_df['sentiment'] =  c_df[["sentiment_x","sentiment_y"]].mean(axis=1)
  c_df['is_regarding_financial_report'] = c_df[["is_regarding_financial_report_x", "is_regarding_financial_report_y"]].max(axis=1)
  c_df['is_sold_pur_shares'] = c_df[["is_sold_pur_shares_x", "is_sold_pur_shares_y"]].max(axis=1)

  c_df = c_df[[
               "Close_Price", 
               "Volume_non_block",
               "Today_High",
               "Today_Low",
               "no_days_not_traded_since_last_traded",
               "sentiment",
               "is_regarding_financial_report",
               "is_sold_pur_shares"]]

  # Save individually to file then to list for saving to single file
  c_df.to_csv("prices_sentiment_"+s+".csv")
  df_list.append(c_df)

# Combine then save all to file
dfs = pd.concat(df_list)
dfs.to_csv("prices_sentiment__all.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [20]:
dfs.corr()['Close_Price']

Close_Price                             1.000000
Volume_non_block                       -0.009635
Today_High                              0.999080
Today_Low                               0.999566
no_days_not_traded_since_last_traded    0.081076
sentiment                               0.013378
is_regarding_financial_report          -0.042616
is_sold_pur_shares                      0.004400
Name: Close_Price, dtype: float64

### Field: Determine if traded on previous business day

In [146]:
prices_df["is_traded_on_Previous_business_day"] = (ccc_df.Date.shift() == ccc_df.Previous_business_day )

In [147]:
prices_df

Unnamed: 0,Symbol,Date,Close_Price,Volume_non_block,Previous_business_day,is_traded_on_Previous_business_day
0,AFS,2016-01-04,16.000000,8510.0,2016-01-01,
1,BRG,2016-01-04,3.190000,85100.0,2016-01-01,
2,CCC,2016-01-04,20.150000,10000.0,2016-01-01,False
3,CAR,2016-01-04,60.110000,2261300.0,2016-01-01,
4,GK,2016-01-04,84.000000,2483.0,2016-01-01,
...,...,...,...,...,...,...
63878,FIRSTROCKJMD,2020-12-31,12.552716,45211.0,2020-12-30,
63879,CABROKERS,2020-12-31,1.884615,1300.0,2020-12-30,
63880,TJH,2020-12-31,1.326428,10197113.0,2020-12-30,
63881,LUMBER,2020-12-31,1.543207,224630.0,2020-12-30,


### Field: Calculate the previous day last traded

In [15]:
temp = ccc_df.Date.shift()
temp.iloc[0] = ccc_df.Date[ccc_df.Date.first_valid_index()]

In [16]:
ccc_df["Previous_trade_day"] = temp

### Calculate the number of days that the stock did not trade for prior to the current date

In [17]:
f = lambda x: (len(pd.bdate_range(x['Previous_trade_day'], x['Date'] )))

ccc_df['no_days_not_traded_since_last_traded'] = (ccc_df.apply(f, axis=1))

ccc_df['no_days_not_traded_since_last_traded'] = ccc_df['no_days_not_traded_since_last_traded'] -2
ccc_df.no_days_not_traded_since_last_traded.iloc[0] = 0 # Set the first date as zero as there is no data for it to check against

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [18]:
ccc_df[ccc_df.is_traded_on_Previous_business_day == False]

Unnamed: 0,Symbol,Date,Close_Price,Previous_business_day,is_traded_on_Previous_business_day,Previous_trade_day,no_days_not_traded_since_last_traded
2,CCC,2016-01-04,20.150000,2016-01-01,False,2016-01-04,0
998,CCC,2016-02-11,27.740000,2016-02-10,False,2016-02-09,1
2021,CCC,2016-03-29,23.630000,2016-03-28,False,2016-03-23,3
3176,CCC,2016-05-17,25.870000,2016-05-16,False,2016-05-13,1
3310,CCC,2016-05-24,24.000000,2016-05-23,False,2016-05-19,2
...,...,...,...,...,...,...,...
50668,CCC,2020-04-14,48.565695,2020-04-13,False,2020-04-09,2
52749,CCC,2020-05-26,51.755767,2020-05-25,False,2020-05-22,1
56412,CCC,2020-08-07,43.634515,2020-08-06,False,2020-08-05,1
59986,CCC,2020-10-20,45.010000,2020-10-19,False,2020-10-16,1


## Determine the dates that the stock did not trade for

In [81]:
def getDatesToSumSentiments (x):
  date_list = list()
  for i in range(x['no_days_not_traded_since_last_traded']):
    date_list.append(
        x['Date'] - pd.tseries.offsets.BDay(i)
    )
  return date_list

In [82]:
ccc_df['stock_not_traded_date_list'] = ccc_df.apply(getDatesToSumSentiments, axis=1)

In [79]:
ccc_df

Unnamed: 0,Symbol,Date,Close_Price,Previous_business_day,is_traded_on_Previous_business_day,Previous_trade_day,no_days_not_traded_since_last_traded,stock_not_traded_date_list
2,CCC,2016-01-04,20.150000,2016-01-01,False,2016-01-04,0,[]
30,CCC,2016-01-05,19.500000,2016-01-04,True,2016-01-04,0,[]
69,CCC,2016-01-06,20.240000,2016-01-05,True,2016-01-05,0,[]
105,CCC,2016-01-07,20.870000,2016-01-06,True,2016-01-06,0,[]
143,CCC,2016-01-08,22.030000,2016-01-07,True,2016-01-07,0,[]
...,...,...,...,...,...,...,...,...
63493,CCC,2020-12-24,61.551857,2020-12-23,True,2020-12-23,0,[]
63571,CCC,2020-12-28,60.519899,2020-12-25,False,2020-12-24,1,[2020-12-28 00:00:00]
63649,CCC,2020-12-29,60.008883,2020-12-28,True,2020-12-28,0,[]
63732,CCC,2020-12-30,60.251705,2020-12-29,True,2020-12-29,0,[]


### Convert prices dataframe to 1NF by creating a separate row for each value in 'stock_not_traded_date_list'

In [94]:
ccc_df_exp = ccc_df.explode('stock_not_traded_date_list')

### Convert fields to string to faciliate join

In [128]:
ccc_df_exp.stock_not_traded_date_list = ccc_df_exp.stock_not_traded_date_list.dt.strftime("%Y-%m-%d")

In [127]:
sen_df.post_date = sen_df.post_date.dt.strftime("%Y-%m-%d")

### Perform join

In [129]:
ccc_com = pd.merge(  ccc_df_exp[["Symbol","Date","Close_Price","stock_not_traded_date_list"]],
                          ccc_sen_df[["post_date","sentiment","is_regarding_financial_report","is_sold_pur_shares"]], 
                          how='left',
                          left_on="stock_not_traded_date_list",right_on="post_date")

In [130]:
ccc_com

Unnamed: 0,Symbol,Date,Close_Price,stock_not_traded_date_list,post_date,sentiment,is_regarding_financial_report,is_sold_pur_shares
0,CCC,2016-01-04,20.150000,,,,,
1,CCC,2016-01-05,19.500000,,,,,
2,CCC,2016-01-06,20.240000,,,,,
3,CCC,2016-01-07,20.870000,,,,,
4,CCC,2016-01-08,22.030000,,,,,
...,...,...,...,...,...,...,...,...
1232,CCC,2020-12-24,61.551857,,,,,
1233,CCC,2020-12-28,60.519899,2020-12-28,,,,
1234,CCC,2020-12-29,60.008883,,,,,
1235,CCC,2020-12-30,60.251705,,,,,


## Combine price df and sentiment df

In [67]:
ccc_combined = pd.merge(  ccc_df[["Symbol","Date","Close_Price"]],
                          ccc_sen_df[["post_date","sentiment","is_regarding_financial_report","is_sold_pur_shares"]], 
                          how='left',
                          left_on="Date",right_on="post_date")


In [68]:
ccc_combined[~(ccc_combined.sentiment.isna())].drop(columns=["post_date"])

Unnamed: 0,Symbol,Date,Close_Price,sentiment,is_regarding_financial_report,is_sold_pur_shares
16,CCC,2016-01-26,25.760000,0.8720,0.0,0.0
36,CCC,2016-02-24,31.000000,-0.2263,0.0,0.0
39,CCC,2016-02-29,27.280000,0.0000,1.0,0.0
42,CCC,2016-03-03,23.040000,-0.2263,1.0,0.0
63,CCC,2016-04-06,24.000000,0.6486,0.0,0.0
...,...,...,...,...,...,...
1110,CCC,2020-08-04,44.395822,-0.2263,0.0,0.0
1127,CCC,2020-08-28,44.076766,-0.2263,0.0,0.0
1168,CCC,2020-10-27,53.759697,0.2023,1.0,0.0
1173,CCC,2020-11-03,53.275782,0.0772,0.0,0.0
