In [475]:
import pandas as pd
import numpy as np
import random
import requests
import warnings
from scipy import stats
from datetime import datetime, timedelta
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')
import re
from bs4 import BeautifulSoup

# Randomly draw ten 8-Ks per quarter from 1995-2020.

In [1]:
# Don't run this. Take lots of time.
import edgar

download_directory = 'tsv/'
since_year = 1995

# The SEC has a rate request limit. So we run several times to make sure we get all data we need.
for i in range(10):
    edgar.download_index(download_directory, since_year, skip_all_present_except_last=False)

In [4]:
# Create name for all files
QTR = ['QTR1','QTR2','QTR3','QTR4']
path = 'tsv/'
file_list = []
for i in range(1995,2021) :
    for j in QTR:
        file_list.append(path + str(i) + '-' + j + '.tsv')
        
# Initialize record
record = pd.DataFrame(columns = ['CIK', 'Name', 'Form', 'Date', 'TXT', 'Link'])
        
# Read in and process files
for i in file_list:
    df = pd.read_csv(i, sep = '|', error_bad_lines=False, header=None)
    df.columns = ['CIK', 'Name', 'Form', 'Date', 'TXT', 'Link']
    # Sample 100 for each year and quarter
    df_sample = df[df['Form']=="8-K"].sample(10)
    # Record
    record = pd.concat([record, df_sample], ignore_index=True)

In [6]:
record

Unnamed: 0,CIK,Name,Form,Date,TXT,Link
0,216228,ITT CORP,8-K,1995-02-06,edgar/data/216228/0000950123-95-000211.txt,edgar/data/216228/0000950123-95-000211-index.html
1,723916,MERIDIAN BANCORP INC,8-K,1995-01-12,edgar/data/723916/0000903594-95-000001.txt,edgar/data/723916/0000903594-95-000001-index.html
2,846972,ADIENCE INC,8-K,1995-02-10,edgar/data/846972/0000846972-95-000004.txt,edgar/data/846972/0000846972-95-000004-index.html
3,853890,KANEB PIPE LINE PARTNERS L P,8-K,1995-03-13,edgar/data/853890/0000950109-95-000665.txt,edgar/data/853890/0000950109-95-000665-index.html
4,790070,EMC CORP,8-K,1995-03-03,edgar/data/790070/0000790070-95-000006.txt,edgar/data/790070/0000790070-95-000006-index.html
...,...,...,...,...,...,...
1035,854800,"MICT, Inc.",8-K,2020-10-07,edgar/data/854800/0001213900-20-030486.txt,edgar/data/854800/0001213900-20-030486-index.html
1036,948320,"CONVERSION LABS, INC.",8-K,2020-11-25,edgar/data/948320/0001493152-20-022570.txt,edgar/data/948320/0001493152-20-022570-index.html
1037,1693577,"MainStreet Bancshares, Inc.",8-K,2020-11-19,edgar/data/1693577/0001564590-20-054603.txt,edgar/data/1693577/0001564590-20-054603-index....
1038,1142596,NUVASIVE INC,8-K,2020-10-29,edgar/data/1142596/0001564590-20-048859.txt,edgar/data/1142596/0001564590-20-048859-index....


In [7]:
record.to_csv("record.csv")

In [None]:
# Download the 8-K files. Download will take lots of time. Don't run this.
files = []

for index, value in record.iterrows():
    url = 'https://www.sec.gov/Archives/' + value['TXT']
    # print(url)
    r = requests.get(url, allow_redirects=True)
    while str(r) != '<Response [200]>': # <Response [200]> means it works fine.
        r = requests.get(url, allow_redirects=True) # Always make sure we get the valid text data
    path = '8-K/' + str(value['CIK']) + '-' + value['Date'] + '.txt' 
    open(path, 'wb').write(r.content)

# Extract relevant info

In [429]:
# Read 8-K files into files
record = pd.read_csv('record.csv')
files = []
for index, value in record.iterrows():
    path = '8-K/' + str(value['CIK']) + '-' + value['Date'] + '.txt'
    f = open(path,'r')
    file = f.read()
    files.append(file)

In [428]:
# Find all 8-K filings with Item
def paragraph_split(file):
    num = 0
    list_ret = list()
    if '</IMS-HEADER>' in file: # old
        soup = BeautifulSoup(file, 'lxml').find('text')
        temp = str(soup).lower().split('signature')[0]
        temp = temp.lower().split('item 9.01')[0]
    elif '</SEC-HEADER>' in file: # new
        soup = BeautifulSoup(file, 'lxml').find_all('text')[0]
        temp = soup.get_text().lower().split('item 9.01')[0]
        temp = temp.lower().split('signature')[0]
    else: 
        num = num+1
    for s_str in temp.lower().split('item')[1:]:
        s_str = s_str.replace("\xa0", "")
#         s_str = s_str.replace("\n", " ")
        list_ret.append(s_str)
    return list_ret

In [415]:
# Add all itmes to the items
items = list()
for file in files:
    for s_str in paragraph_split(file):
        s_str = s_str.replace("\n", "") # Remove '\n'
        s_str = s_str.replace("  ", " ")
        items.append(s_str)
print(items)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [424]:
# Store data into excel
items_df = pd.DataFrame(items)
items_df.to_csv('items.csv',encoding='utf-8')

In [425]:
len(items)

2575

# Data Preprocessing

In [476]:
from __future__ import unicode_literals, print_function
import numpy as np
import pandas as pd
import spacy
import nltk
import torch
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
# from ey_nlp.contractions import CONTRACTION_MAP
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer
import time
from IPython import embed
#import ipdb
import time
import pickle
from spacy.lang.en import English # updated
from transformers import BertTokenizer, BertModel, BertForMaskedLM

In [477]:
nlp = spacy.load('en_core_web_md')
#nlp_vec = spacy.load('en_vecs', parse = True, tag=True, entity=True)
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [478]:
# from the paper
import nltk
nltk.download('averaged_perceptron_tagger')
def remove_proper_nouns(text = 'I am named John Dow'):
    # written by Robert Hatem
    text_tagged = nltk.tag.pos_tag(text.split())
    text_edited = [word for word, tag in text_tagged if tag != 'NNP' and tag != 'NNPS']
    text_new = ' '.join(text_edited)
    return text_new

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/xinyue/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [479]:
def lower_case(text):
    return text.lower()

In [480]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [481]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [482]:
# extra preprocessing
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def remove_extra_newlines(text):
    return re.sub(r'[\r|\n|\r\n]+', ' ', text)
    
def remove_extra_whitespace(text):
    return re.sub(' +', ' ', text)

In [483]:
# for customer tockenizer
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [484]:
def preprocess_text(doc,
                    proper_noun_removal=False,
                    lower_the_case=True,
                    special_char_removal=True,
                    remove_digits=True,
                    stopword_removal=True,
                    html_stripping=True,
                    accented_char_removal=True):
    """
    doc: string
    """
    
    # remove proper nouns
    if proper_noun_removal:
        doc = remove_proper_nouns(doc)
    # lowercase the text    
    if lower_the_case:
        doc = lower_case(doc)
    # expand contractions    
    if special_char_removal:
        # insert spaces between special characters to isolate them    
        special_char_pattern = re.compile(r'([{.(-)!}])')
        doc = special_char_pattern.sub(" \\1 ", doc)
        doc = remove_special_characters(doc, remove_digits=remove_digits)
    # remove stopwords
    if stopword_removal:
        doc = remove_stopwords(doc, is_lower_case=lower_the_case)
    # strip HTML
    if html_stripping:
        doc = strip_html_tags(doc)
    # remove accented characters
    if accented_char_removal:
        doc = remove_accented_chars(doc)
    doc = remove_extra_newlines(doc)
    doc = remove_extra_whitespace(doc)
        
    return doc

In [462]:
def custom_tokenizer(doc,
                     stemmer=True,
                     text_lemmatization=True):
    # stemmer
    if stemmer:
        doc = simple_stemmer(doc)
    # lemmatize text
    if text_lemmatization:
        doc = lemmatize_text(doc)
    return doc.split()


def clean_text(doc):
    """
    Clean then tockenize data
    """
    doc_clean = preprocess_text(doc)
    doc_clean = custom_tokenizer(doc_clean)
    
    doc_as_string = ' '.join(doc_clean)
    return doc_as_string


# DEPRECATED
def _count_words(corpus):
    '''
    Makes document-term matrix  
    corpus: list of strings
    '''
    
    if type(corpus) != list:
        raise TypeError('corpus should be list or nltk corpus')
        
    vectorizer = CountVectorizer(preprocessor=preprocess_text,
                                 tokenizer=custom_tokenizer)
    X = vectorizer.fit_transform(corpus)
    
    vocab = vectorizer.get_feature_names()
    doc_term_mat = X.toarray()

    return vocab, doc_term_mat

In [471]:
items_cleaned = [clean_text(item) for item in items]

In [490]:
# Store data into excel
items_cleaned_df = pd.DataFrame(items_cleaned)
items_cleaned_df.to_csv('items_cleaned.csv',encoding='utf-8')

In [509]:
labeled = pd.read_csv('labeled.csv', encoding = "ISO-8859-1")[0:10][['event', 'labels']]
labeled_cleaned = [clean_text(item) for item in labeled['event']]
labeled_cleaned_df = pd.DataFrame(labeled_cleaned, labeled['labels'])
labeled_cleaned_df.to_csv('labeled_cleaned.csv',encoding='utf-8')