# 10-K/Q Section Change Detection and Analysis with the Calcbench API

## Hypothesis:

The cosine distance between Term Frequency - Inverse Document Frequency (TF-IDF) vectors is a useful proxy for semantic change in 10-K sections across time.

## Procedure:
1. Download document section contents from Calcbench
2. Tokenize the sections
3. Build TF-IDF matrices
4. Compute the cosine distance between each section and the same section from the previous filing/period
5. Render the matrix of distances with largest distances highlighted
6. "diff" documents with a distance above a certain threshold.


In [33]:
import itertools

import calcbench as cb
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
from bs4 import BeautifulSoup
from IPython.core.display import display, HTML
from matplotlib import colors
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm_notebook

In [2]:
class NumberNormalizingVectorizer(sklearn.feature_extraction.text.TfidfVectorizer):
    """TF-IDF vectorizer whose tokenizer output is passed through number_normalizer."""

    def build_tokenizer(self):
        """Return a callable that tokenizes a document and normalizes its tokens.

        The parent class's tokenizer splits the document; the resulting tokens
        are then run through ``number_normalizer`` and collected into a list.
        """
        base_tokenizer = super(NumberNormalizingVectorizer, self).build_tokenizer()

        def tokenize_and_normalize(doc):
            raw_tokens = base_tokenizer(doc)
            return list(number_normalizer(raw_tokens))

        return tokenize_and_normalize

In [3]:
def number_normalizer(tokens):
    """Map all numeric tokens to a placeholder.

    For many applications, tokens that begin with a number are not directly
    useful, but the fact that such a token exists can be relevant.  By applying
    this form of dimensionality reduction, some methods may perform better.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A lazy generator that yields ``"#NUMBER"`` in place of every token
        whose first character is a digit and every other token unchanged.
        Empty tokens are passed through as-is.
    """
    # Slice instead of indexing: ``""[:1]`` is ``""`` (not a digit), whereas
    # ``""[0]`` would raise IndexError on an empty token.
    return ("#NUMBER" if token[:1].isdigit() else token for token in tokens)

In [4]:
# Name of the filing section to analyze; the "Review Changes" cell passes it
# as document_name to cb.document_search.
document_section = 'Risk Factors'

In [5]:
# Tickers Calcbench returns for the 'DJIA' index (presumably the Dow Jones
# Industrial Average constituents); they become the rows of the distance table.
tickers = cb.tickers(index='DJIA')

In [6]:
def pairwise(iterable):
    """Return an iterator over consecutive, overlapping pairs of *iterable*.

    s -> (s0,s1), (s1,s2), (s2, s3), ...

    An input with fewer than two items produces no pairs.
    """
    leading, trailing = itertools.tee(iterable, 2)
    # Step the second copy forward by one item (if there is one) so that it
    # runs one element ahead of the first copy.
    for _ in trailing:
        break
    return zip(leading, trailing)

## Download Documents and compute cosine distance over time periods

In [7]:
# Table of filing-over-filing distances: one row per ticker, one column per
# fiscal year from 2018 down to 2009.  Cells with no computed pair stay NaN.
diffs = pd.DataFrame(index=tickers, columns=range(2018, 2008, -1))
for ticker in tqdm_notebook(tickers):
    # Keep only disclosures whose fiscal_period is 'Y' (presumably the annual
    # filings) and order them by fiscal year so consecutive items are
    # consecutive filings.
    yearly_disclosures = (
        d
        for d in cb.document_search(
            company_identifiers=[ticker],
            document_name=document_section,
            all_history=True,
        )
        if d['fiscal_period'] == 'Y'
    )
    sorted_disclosures = sorted(yearly_disclosures, key=lambda d: d['fiscal_year'])
    for last_year, this_year in pairwise(sorted_disclosures):
        fiscal_year = this_year['fiscal_year']
        if fiscal_year not in diffs.columns:
            # Outside the reporting window: skip before downloading anything
            # instead of failing on a column that does not exist.
            continue
        text_last_year = BeautifulSoup(last_year.get_contents(), 'html.parser').text
        text_this_year = BeautifulSoup(this_year.get_contents(), 'html.parser').text
        # Fit on just this pair of documents, so the IDF weights are relative
        # to the two filings being compared.
        vectorizer = NumberNormalizingVectorizer(stop_words='english')
        X = vectorizer.fit_transform([text_this_year, text_last_year])
        # scipy's cosine() expects 1-D vectors; flatten each sparse row into a
        # plain 1-D array rather than passing a 2-D np.matrix.
        distance = cosine(X[0].toarray().ravel(), X[1].toarray().ravel())
        # Single .loc assignment: chained indexing (diffs[col][row] = ...) is
        # not guaranteed to write into diffs itself.
        diffs.loc[ticker, fiscal_year] = distance

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):





In [38]:
def background_gradient(s, m, M, cmap='PuBu', low=0, high=0):
    """Build per-cell CSS background colors on a shared color scale.

    Args:
        s: Object exposing ``.values`` (presumably one column of the styled
            frame, as handed over by ``Styler.apply``).
        m: Minimum value of the data range used for normalization.
        M: Maximum value of the data range used for normalization.
        cmap: Name of the Matplotlib colormap to sample.
        low: Fraction of the range ``M - m`` by which the lower bound of the
            color scale is pushed below ``m``.
        high: Fraction of the range ``M - m`` by which the upper bound of the
            color scale is pushed above ``M``.

    Returns:
        A list of ``'background-color: <hex>'`` strings, one per value in ``s``.
    """
    rng = M - m
    # Widening the bounds keeps the extreme data values away from the very
    # ends of the colormap.
    norm = colors.Normalize(m - (rng * low),
                            M + (rng * high))
    normed = norm(s.values)
    # plt.get_cmap is used instead of plt.cm.get_cmap, which recent Matplotlib
    # releases no longer provide.
    c = [colors.rgb2hex(x) for x in plt.get_cmap(cmap)(normed)]
    return ['background-color: %s' % color for color in c]

def highlight_largest_diffs(diffs):
    """Return a Styler that shades each cell of *diffs* by its magnitude.

    Missing values are replaced with 0 first.  The color scale is shared by
    every cell: it is anchored on the smallest and largest values found
    anywhere in the zero-filled frame.
    """
    zero_filled = diffs.fillna(0)
    styler = zero_filled.style
    smallest = zero_filled.min().min()
    largest = zero_filled.max().max()
    return styler.apply(
        background_gradient,
        cmap='Reds',
        m=smallest,
        M=largest,
        low=0,
        high=2.5,
    )

In [39]:
# Render the highlighted distances for JNJ only.  The list selector keeps the
# selection a one-row DataFrame, so the color scale is computed from this
# row's values alone.
highlight_largest_diffs(diffs.loc[['JNJ']])

Unnamed: 0,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009
JNJ,0,0.0101569,0.607054,0.0201741,0.569617,0,0.0418208,0,0.0263554,0


## Highlight periods which are most different from the previous filing

In [40]:
# Render the full ticker-by-year table with one color scale shared by all cells.
highlight_largest_diffs(diffs)

Unnamed: 0,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009
MMM,0.0,0.00779628,0.00762529,0.0258541,0.00727321,0.008662,0.00506823,0.0435289,0.0260112,0
AXP,0.0,0.00929214,0.01471,0.0321971,0.016149,0.0313577,0.0245556,0.0221019,0.059023,0
AAPL,0.0,0.00395994,0.00278187,0.000551845,0.00426382,0.00349175,0.027321,0.0172534,0.00437213,0
BA,0.0,0.00484915,0.00481619,0.00161179,0.00373701,0.00870041,0.0315051,0.0358225,0.0456911,0
CAT,0.0,0.00214401,0.00873649,0.00352635,0.00924687,0.0109385,0.0179613,0.012855,0.040419,0
CVX,0.0,0.0125657,0.0279247,0.0451385,0.0198717,0.000979089,0.00496076,0.0748053,0.0569165,0
CSCO,0.00443587,0.00206469,0.0038352,0.00174538,0.00588518,0.0119791,0.00310984,0.010243,0.0,0
KO,0.0,0.0129111,0.00665081,0.0138153,0.0120799,0.0107884,0.036683,0.0350291,0.0650325,0
DWDP,0.0,0.240127,0.134602,0.0446338,0.0326869,0.0198903,0.033084,0.00999427,0.0681977,0
XOM,0.0,0.0305509,0.00840683,0.0179265,0.00275424,0.0069008,0.0115724,0.0150352,0.0,0


## Review Changes

In [5]:
# Company and pair of fiscal years whose section text is compared below.
ticker = "JNJ"
year = 2016
previous_year = 2015


def _first_section_contents(fiscal_year):
    """Return the contents of the first search hit for ``ticker``'s section in *fiscal_year*."""
    hits = cb.document_search(
        company_identifiers=[ticker],
        document_name=document_section,
        year=fiscal_year,
    )
    return next(hits).get_contents()


doc = _first_section_contents(year)
previous_doc = _first_section_contents(previous_year)
# Render Calcbench's HTML diff of the two documents inline in the notebook.
diff_markup = cb.html_diff(doc, previous_doc)
display(HTML(diff_markup))