----
# Analyzing SEC Filings textual data using NLP
----
----

# Table Of Contents:

1. Download Data
2. Preprocessing / Cleaning
3. Transformation (lemmatization) / Dependency mapping
4. Model Training
5. Model Evaluation
----

## 1.1 Import Packages

In [1]:
#from secedgar.filings import Filing, FilingType
import os
import pickle
import pprint
import re
import sys
from collections import defaultdict, Counter

import alphalens as al
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import jaccard_similarity_score
from tqdm import tqdm

import util


## 1.2 Download Corpora (data required for textual analysis)

In [2]:
# Fetch the NLTK corpora this notebook relies on:
#   'stopwords' - the filler-word list that is stripped from the filings
#   'wordnet'   - the data used for lemmatizing
# nltk.download skips the transfer when the package is already up to date
# (see the captured output below).
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\seanm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\seanm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 1.3 Download SEC Filing Data

Here we will download a series of 10-K documents (annual financial filings) that we can use for our NLP analysis.

In [3]:
# Ticker symbol -> SEC Central Index Key (CIK), kept as a zero-padded
# ten-character string. These are the companies whose filings are fetched.
cik_lookup = dict(
    AMZN='0001018724',
    BMY='0000014272',
    CNP='0001130310',
    CVX='0000093410',
    FL='0000850209',
    FRT='0000034903',
    HON='0000773840',
)

# Further ticker -> CIK pairs in the same zero-padded string format.
# Nothing in the visible notebook reads this mapping yet.
additional_cik = dict(
    AEP='0000004904',
    AXP='0000004962',
    BA='0000012927',
    BK='0001390777',
    CAT='0000018230',
    DE='0000315189',
    DIS='0001001039',
    DTE='0000936340',
    ED='0001047862',
    EMR='0000032604',
    ETN='0001551182',
    GE='0000040545',
    IBM='0000051143',
    IP='0000051434',
    JNJ='0000200406',
    KO='0000021344',
    LLY='0000059478',
    MCD='0000063908',
    MO='0000764180',
    MRK='0000310158',
    MRO='0000101778',
    PCG='0001004980',
    PEP='0000077476',
    PFE='0000078003',
    PG='0000080424',
    PNR='0000077360',
    SYY='0000096021',
    TXN='0000097476',
    UTX='0000101829',
    WFC='0000072971',
    WMT='0000104169',
    WY='0000106535',
    XOM='0000034088',
)

In [4]:
# Shared SEC client: its .get(url) is used below to fetch both the EDGAR
# filing index and the filing text itself.
# NOTE(review): SecHandler comes from the project-local `util` module, so
# what .get() returns and how it throttles requests is not visible here.
sec_api = util.SecHandler()

In [5]:
    def get_sec_data(cik, doc_type, start=0, count=60, date='2018-01-01'):
        """
        List a company's filings from the EDGAR "browse-edgar" Atom feed.

        The feed is fetched through the notebook-level ``sec_api`` handler and
        only filings dated on or before ``date`` are kept.

        :param cik: company identifier placed in the ``CIK`` query parameter
        :param doc_type: filing type placed in the ``type`` query parameter (e.g. '10-k')
        :param start: value of the ``start`` query parameter (offset into the feed)
        :param count: value of the ``count`` query parameter (entries requested)
        :param date: date of the newest pricing data, in any form ``pd.to_datetime``
            accepts; filings dated after it are dropped
        :return entries: list of ``(filing_href, filing_type, filing_date)`` string
            tuples, in the order the feed lists them
        """
        cutoff = pd.to_datetime(date)
        url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' \
            '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom'.format(cik, doc_type, start, count)
        # Named response_text (not sec_data) so it cannot be confused with the
        # notebook-level `sec_data` dict that collects this function's results.
        response_text = sec_api.get(url)

        feed = BeautifulSoup(response_text.encode('ascii'), 'xml').feed

        entries = []
        # recursive=False: only direct <entry> children of <feed> are filings.
        for entry in feed.find_all('entry', recursive=False):
            content = entry.content
            # Read the date once; it is used for both the filter and the result.
            filing_date = content.find('filing-date').getText()
            if pd.to_datetime(filing_date) <= cutoff:
                entries.append((
                    content.find('filing-href').getText(),
                    content.find('filing-type').getText(),
                    filing_date))

        return entries


## 1.4 Pull a ticker from the list and analyze its documents

Here we are going to use AMZN as an example and print the 10-k file information from various years prior to 2018.

In [7]:
# Ticker whose results are printed as the worked example in later cells.
example_ticker = 'AMZN'

# Filing index entries for every company in cik_lookup, keyed by ticker.
sec_data = {
    symbol: get_sec_data(company_cik, '10-k')
    for symbol, company_cik in cik_lookup.items()}

# Show the first five index entries for the example ticker.
pprint.pprint(sec_data[example_ticker][:5])

[('https://www.sec.gov/Archives/edgar/data/1018724/000101872417000011/0001018724-17-000011-index.htm',
  '10-K',
  '2017-02-10'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000101872416000172/0001018724-16-000172-index.htm',
  '10-K',
  '2016-01-29'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000101872415000006/0001018724-15-000006-index.htm',
  '10-K',
  '2015-01-30'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000101872414000006/0001018724-14-000006-index.htm',
  '10-K',
  '2014-01-31'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000119312513028520/0001193125-13-028520-index.htm',
  '10-K',
  '2013-01-30')]


## 1.5 Download the filings using the acquired list of URLs

In [None]:
# raw_filing_by_ticker[ticker][file_date] -> whatever sec_api.get returns for
# that filing's text URL. Filings of one ticker that share a filing date
# overwrite each other because the date is the dict key.
raw_filing_by_ticker = {}

for ticker, data in sec_data.items():
    raw_filing_by_ticker[ticker] = {}
    for index_url, file_type, file_date in tqdm(data, desc='Downloading {} Filings'.format(ticker), unit='filing'):
        # Turn the filing index URL into the filing text URL. An index URL
        # ending in '-index.html' becomes '....txtl' after the first replace,
        # so the second replace trims that stray letter 'l' (the previous
        # pattern used the digit '1' and could never match).
        file_url = index_url.replace('-index.htm', '.txt').replace('.txtl', '.txt')

        raw_filing_by_ticker[ticker][file_date] = sec_api.get(file_url)

# Preview the first 1000 characters of one downloaded filing.
print('Example Download:\n\n{}...'.format(next(iter(raw_filing_by_ticker[example_ticker].values()))[:1000]))

## 1.6 Get Document Types

In [None]:
# Build ten_ks_ticker[ticker]: a list of dicts, one per document whose type is
# '10-k', each holding the company CIK, the document and its filing date.
# NOTE(review): get_document_type is not defined or imported anywhere in the
# visible notebook -- confirm where it is supposed to come from.
# NOTE(review): raw_filing_by_ticker[ticker][file_date] holds the result of
# sec_api.get(file_url). If that is a single string, `for document in
# documents` walks it one character at a time; a step that splits each raw
# filing into its individual documents appears to be missing -- confirm.
ten_ks_ticker = {}

for ticker, filing_documents in raw_filing_by_ticker.items():
    ten_ks_ticker[ticker] = []
    for file_date, documents in filing_documents.items():
        for document in documents:
            if get_document_type(document) == '10-k':
                ten_ks_ticker[ticker].append({
                    'cik': cik_lookup[ticker],
                    'file': document,
                    'file_date': file_date
                })
# Preview the first five collected 10-Ks for the example ticker.
util.print_ten_k_data(ten_ks_ticker[example_ticker][:5], ['cik', 'file', 'file_date'])

----
# Preprocessing
----

## 2.1 Clean Data
Start by removing the messy html and making all the text lowercase

In [None]:
# Add a 'file_clean' entry to every 10-K dict: the document text passed
# through util.clean_text (per the notes above: HTML removed, lower-cased).
for symbol, filings in ten_ks_ticker.items():
    progress = tqdm(filings, desc='Cleaning {} 10-Ks'.format(symbol), unit='10-K')
    for filing in progress:
        filing['file_clean'] = util.clean_text(filing['file'])

# Preview the cleaned text of the first five example filings.
util.print_ten_k_data(ten_ks_ticker[example_ticker][:5], ['file_clean'])

## 2.2 Lemmatize
This is the process of distilling the verbs down and understanding the dependencies of various tokens in the sentence.

In [None]:
# Tokenise on runs of word characters. Raw string so that '\w' reaches the
# regex engine untouched instead of being parsed as a string escape.
word_pattern = re.compile(r'\w+')

# Add a 'file_lemma' entry to every 10-K: the lemmatized tokens of its
# cleaned text.
# NOTE(review): lemmatize_words is not defined in the visible notebook --
# confirm it is provided by an earlier cell before running this one.
for ticker, ten_ks in ten_ks_ticker.items():
    for ten_k in tqdm(ten_ks, desc='Lemmatize {} 10-Ks'.format(ticker), unit='10-K'):
        ten_k['file_lemma'] = lemmatize_words(word_pattern.findall(ten_k['file_clean']))

# Preview the field this cell just produced ('file_name' is never set on the
# 10-K dicts, so it could not be displayed).
util.print_ten_k_data(ten_ks_ticker[example_ticker][:5], ['file_lemma'])


## 2.3 Remove Stop Words

Here we are removing words that are common in sentence structure and add noise to our dataset. Words such as 'and' and 'the' are stop words and should be removed.

In [None]:
# Lemmatize the stop words too, so they match the already-lemmatized tokens.
# NOTE(review): lemmatize_words is not defined in the visible notebook --
# confirm it is provided by an earlier cell.
lemma_english_stopwords = lemmatize_words(stopwords.words('english'))
# Set for O(1) membership tests; every token of every filing is checked.
stopword_lookup = set(lemma_english_stopwords)

for ticker, ten_ks in ten_ks_ticker.items():
    for ten_k in tqdm(ten_ks, desc='Remove Stopwords for {} 10-Ks'.format(ticker), unit='10-K'):
        # keep only the lemmatized tokens that are not stop words
        ten_k['file_lemma'] = [word for word in ten_k['file_lemma'] if word not in stopword_lookup]

print('Stop Words Removed')




----
# Analysis of 10-K
----

## 3.1 Loughran-McDonald Sentiment Word List
This will be used to determine the following sentiment for the filings:

* Negative
* Positive
* Uncertainty
* Litigious
* Constraining
* Superfluous
* Modal

In [None]:
# Sentiment categories to keep; these are the lower-cased column names of
# the word list read below.
sentiments = ['negative', 'positive', 'uncertainty', 'litigious', 'constraining', 'interesting']

# read in the sentiment word list and lower-case its column names
sentiment_df = pd.read_csv(os.path.join('..', '..', 'data', 'project_5_loughran_mcdonald', 'loughran_mcdonald_master_dic_2016.csv'))
sentiment_df.columns = [column.lower() for column in sentiment_df.columns]

# keep only the sentiment flag columns plus the word itself
sentiment_df = sentiment_df[sentiments + ['word']]
sentiment_df[sentiments] = sentiment_df[sentiments].astype(bool)
# Drop words that carry none of the selected sentiments. `axis` is passed by
# keyword because newer pandas no longer accepts it positionally; .copy()
# gives an independent frame so the column assignment below is unambiguous.
sentiment_df = sentiment_df[sentiment_df[sentiments].any(axis=1)].copy()

# Apply the same preprocessing that is applied to the words in the 10-K.
# NOTE(review): lemmatize_words is not defined in the visible notebook --
# confirm it is provided by an earlier cell.
sentiment_df['word'] = lemmatize_words(sentiment_df['word'].str.lower())
# De-duplicate the word list itself. The result must go back into
# sentiment_df: assigning it to `sentiments` would discard the de-duplication
# and replace the category list that later cells iterate over.
sentiment_df = sentiment_df.drop_duplicates('word')

sentiment_df.head()


## 3.2 Bag of Words

With the sentiment words list that we just created we can generate a sentiment bag of words for our 10_k data set. Count the number of 'sentiment' words that we have in each doc (ignore the rest).

In [None]:
# sentiment_bow_ten_ks[ticker][sentiment] -> result of util.get_bag_of_words
# for that sentiment's words over the ticker's lemmatized filings.
sentiment_bow_ten_ks = {}

# The filings live in `ten_ks_ticker`; no `ten_ks_by_ticker` exists.
for ticker, ten_ks in ten_ks_ticker.items():
    # one space-joined string of lemmatized tokens per filing
    lemma_docs = [' '.join(ten_k['file_lemma']) for ten_k in ten_ks]

    sentiment_bow_ten_ks[ticker] = {
        # sentiment_df[sentiment] is a boolean column, so this selects the
        # words flagged with that sentiment
        sentiment: util.get_bag_of_words(sentiment_df[sentiment_df[sentiment]]['word'], lemma_docs)
        for sentiment in sentiments}

util.print_ten_k_data([sentiment_bow_ten_ks[example_ticker]], sentiments)

## 3.3 Jaccard Similarity

Calculate the Jaccard similarity of our bag of words, looking for similarities between consecutive points in time. (The bag of words needs to be turned into a boolean array for the Jaccard similarity calculation.)

In [None]:
# Filing dates per ticker, in the same order as the filings in ten_ks_ticker.
file_dates = {
    ticker: [ten_k['file_date'] for ten_k in ten_ks]
    for ticker, ten_ks in ten_ks_ticker.items()}

# jaccard_similarities[ticker][sentiment] -> result of get_jaccard_similarity
# on that sentiment's bag of words.
# NOTE(review): get_jaccard_similarity is not defined in the visible notebook
# -- confirm it is provided by an earlier cell.
jaccard_similarities = {
    ticker: {
        sentiment_name: get_jaccard_similarity(sentiment_values)
        for sentiment_name, sentiment_values in ten_k_sentiments.items()}
    for ticker, ten_k_sentiments in sentiment_bow_ten_ks.items()}

# The first filing date is skipped on the x axis.
# NOTE(review): presumably each similarity compares a filing with its
# neighbour, giving one value fewer than there are filings -- confirm.
util.plot_similarities(
    [jaccard_similarities[example_ticker][sentiment] for sentiment in sentiments],
    file_dates[example_ticker][1:],
    'Jaccard Similarities for {} Sentiment'.format(example_ticker),
    sentiments)

## 3.4 TFIDF

Generate TF-IDF from the 10-K documents using the sentiment word list.

In [None]:
# cosine_similarities[ticker][sentiment] -> result of get_cosine_similarity
# on that sentiment's TF-IDF values.
# NOTE(review): neither sentiment_tfidf_ten_ks nor get_cosine_similarity is
# defined in the visible notebook -- confirm both are provided by earlier
# cells before running this one.
cosine_similarities = {
    symbol: {
        name: get_cosine_similarity(values)
        for name, values in per_sentiment.items()}
    for symbol, per_sentiment in sentiment_tfidf_ten_ks.items()}

# One similarity series per sentiment for the example ticker, plotted against
# its filing dates with the first date skipped.
example_series = [cosine_similarities[example_ticker][name] for name in sentiments]
plot_title = 'Cosine Similarities for {} Sentiment'.format(example_ticker)
util.plot_similarities(
    example_series,
    file_dates[example_ticker][1:],
    plot_title,
    sentiments)

----
# Evaluate Alpha Factors
----

Analyze the returns of our cosine similarities and Jaccard similarities.

## 4.1 Price Data

Yearly pricing to run the factor model against (10-Ks are annual filings).

In [None]:
# Load the price history in long form; the 'date' column is parsed as dates.
pricing = pd.read_csv('../../data/pricing-data', parse_dates=['date'])
# Reshape to one row per date and one column per ticker, holding adj_close.
# The result is assigned back to `pricing`, the name the later cells use.
pricing = pricing.pivot(index='date', columns='ticker', values='adj_close')

pricing

## 4.2 Dict to DataFrame

Convert to DF since the alphalens library uses DataFrames.

In [None]:
# factor_data[sentiment] -> alphalens factor/forward-returns frame built from
# that sentiment's cosine similarities. Sentiments alphalens cannot process
# are collected in skipped_sentiments instead of aborting the cell.
factor_data = {}
skipped_sentiments = []

# NOTE(review): cosine_similarities_df (long form with 'date', 'ticker',
# 'sentiment' and 'value' columns, judging by its use below) is not built
# anywhere in the visible notebook -- confirm an earlier cell creates it from
# the cosine_similarities dict.
for sentiment in sentiments:
    cs_df = cosine_similarities_df[(cosine_similarities_df['sentiment'] == sentiment)]
    # one row per date, one column per ticker
    cs_df = cs_df.pivot(index='date', columns='ticker', values='value')

    try:
        data = al.utils.get_clean_factor_and_forward_returns(cs_df.stack(), pricing, quantiles=5, bins=None, periods=[1])
    except Exception as error:
        # Best effort: record the skip, but say why, so that a genuine bug is
        # not hidden the way a bare `except:` would hide it.
        skipped_sentiments.append(sentiment)
        print('could not build factor data for {}: {!r}'.format(sentiment, error))
    else:
        factor_data[sentiment] = data

if skipped_sentiments:
    print('\nskipped the following sentiments:\n{}'.format('\n'.join(skipped_sentiments)))
# Raises KeyError if the first sentiment was skipped.
factor_data[sentiments[0]].head()

## 4.3 Alphalens Format with Unix Time

The alphalens mean_return_by_quantile func requires unix timestamp to work (create a factor DF with unix time).

In [None]:
# Copy of factor_data whose (date, asset) index carries the date as a unix
# timestamp (seconds, via Timestamp.timestamp()) instead of a Timestamp.
# The source must be factor_data: unixt_factor_data does not exist until this
# statement has finished, so it cannot be iterated here.
unixt_factor_data = {
    factor: data.set_index(pd.MultiIndex.from_tuples(
        [(x.timestamp(), y) for x, y in data.index.values],
        names=['date', 'asset']))
    for factor, data in factor_data.items()}

## 4.4 Factor Returns

Visualize factor returns vs time.

In [None]:
# One column per factor: the first column of alphalens' factor returns.
ls_factor_returns = pd.DataFrame()

for name, frame in factor_data.items():
    returns = al.performance.factor_returns(frame)
    ls_factor_returns[name] = returns.iloc[:, 0]

# Compound the per-period returns into a cumulative growth curve and plot it.
growth = 1 + ls_factor_returns
growth.cumprod().plot()

## 4.5 Basis Points Per Day per Quantile

Look beyond factor weighted returns (should be monotonic in quantiles). Analyze basis points of the factor returns.

In [None]:
# One column per factor: the first column of the first item returned by
# alphalens' mean_return_by_quantile.
qr_factor_returns = pd.DataFrame()

for name, frame in unixt_factor_data.items():
    by_quantile = al.performance.mean_return_by_quantile(frame)[0]
    qr_factor_returns[name] = by_quantile.iloc[:, 0]

# Scale returns to basis points (x 10000) and draw one bar chart per factor
# on a shared y axis.
basis_points = 10000 * qr_factor_returns
basis_points.plot.bar(
    legend=False,
    figsize=(14, 14),
    layout=(5, 3),
    sharey=True,
    subplots=True)

## 4.6 Turnover Analysis

Here we can analyze how stable the alphas are over time without doing a full on backtest.  This is meant to measure the period to period variance of the alpha factor using Factor Rank Autocorrelation (FRA).

In [None]:
# One column per factor holding its factor rank autocorrelation series.
ls_fra = pd.DataFrame()

for name, frame in unixt_factor_data.items():
    autocorrelation = al.performance.factor_rank_autocorrelation(frame)
    ls_fra[name] = autocorrelation

ls_fra.plot(title='Factor Rank Autocorrelation')

## 4.7 Sharpe Ratio of Alphas

Measuring the Sharpe ratio(s) produced by the extracted alphas. We are looking for Sharpe > 1.

In [None]:
# Annualise with sqrt(252), i.e. 252 trading days in a year.
# NOTE(review): section 4.1 describes the pricing data as yearly; a daily
# annualisation factor may overstate the ratio for yearly returns -- confirm.
daily_annualization_factor = np.sqrt(252)

# Sharpe ratio per factor: annualised mean return over return volatility,
# rounded to two decimals.
mean_return = ls_factor_returns.mean()
return_volatility = ls_factor_returns.std()
(daily_annualization_factor * mean_return / return_volatility).round(2)