## _Get Financial Data_

**This script crawls through the SEC website to get financial data and documents we are interested in. The final DataFrame includes Company, CIK (unique identifier), 8-K document text, percent change in stock before and after the document is released, the average change in stock price avergaed weekly, monthly, quarterly and yearly. Finally, a signal column with 'up','down' or 'stay' values depending on if there is a more than 1 percent change in stock post the release of a 8-K document.**

### _Import Libraries_

In [1]:
from bs4 import BeautifulSoup
import datetime
import unicodedata
import requests
import pandas as pd
import numpy as np
from time import sleep
import math
# from config import Config
import dateutil.relativedelta
import pandas_market_calendars as mcal
import os
import io
import re
from tqdm import tqdm
import gc
import ast

from bs4 import BeautifulSoup
import datetime
import unicodedata
import requests
import pandas as pd
import numpy as np
from time import sleep
import re

### _Helper functions and classes_

In [3]:
class SEC_Extractor:
    """
    Class to get links and the 8-K documents for a particular company of interest.
    """
    
    def get_doc_links(cik,ticker):
    """
    Method to crawl through the SEC website to get links for the 8-K documents for the companies
    we are interested in.
    
    Args:
        cik (str): Central Index Key for the company of interest
        ticker (ticker): Company of Interest
        
    Returns:
        Pandas DataFrame: DatFrame of Company, CIK, 8-K Document Link and Document Name.
    """
        try:
            base_url = "https://www.sec.gov/cgi-bin/browse-edgar"
            inputted_cik = cik
            payload = {
                "action" : "getcompany",
                "CIK" : inputted_cik,
                "type" : "8-K",
                "output":"xml",
                "dateb" : "20180401",
            }
            sec_response = requests.get(url=base_url,params=payload)
            soup = BeautifulSoup(sec_response.text,'lxml')
            url_list = soup.findAll('filinghref')
            html_list = []
            # Get html version of links
            for link in url_list:
                link = link.string
                if link.split(".")[len(link.split("."))-1] == 'htm':
                    txtlink = link + "l"
                    html_list.append(txtlink)

            doc_list = []
            doc_name_list = []
            # Get links for txt versions of files
            for k in range(len(html_list)):
                txt_doc = html_list[k].replace("-index.html",".txt")
                doc_name = txt_doc.split("/")[-1]
                doc_list.append(txt_doc)
                doc_name_list.append(doc_name)
                # Create dataframe of CIK, doc name, and txt link
            df = pd.DataFrame(
                {
                "cik" : [cik]*len(html_list),
                "ticker" : [ticker]*len(html_list),
                "txt_link" : doc_list,
                "doc_name": doc_name_list
                }
            )
        except requests.exceptions.ConnectionError:
                sleep(.1)
        return df

    def extract_text(link):
    """
    Method to extract text and submission date-time given a document link.
    
    Args:
        link (str): Link to the 8-K document of interest on the SEC website.
        
    Returns:
        str: Text in the 8-K document.
        str: Filing date of the document.
    """
        try:
            r = requests.get(link)
            #Parse 8-K document
            filing = BeautifulSoup(r.content,"html5lib",from_encoding="ascii")
            #Extract datetime
            try:
                submission_dt = filing.find("acceptance-datetime").string[:14]
            except AttributeError:
                    # Flag docs with missing data as May 1 2018 10AM
                submission_dt = "20180501100000"
            
            submission_dt = datetime.datetime.strptime(submission_dt,"%Y%m%d%H%M%S")
            #Extract HTML sections
            for section in filing.findAll("html"):
                try:
                    #Remove tables
                    for table in section("table"):
                        table.decompose()
                    #Convert to unicode
                    section = unicodedata.normalize("NFKD",section.text)
                    section = section.replace("\t"," ").replace("\n"," ").replace("/s"," ").replace("\'","'")            
                except AttributeError:
                    section = str(section.encode('utf-8'))
            filing = "".join((section))
        except requests.exceptions.ConnectionError:
                sleep(10)
        sleep(.1)

        return filing, submission_dt

    def extract_item_no(document):
        """
        Method to extract item number for a 8-K document given text from the document, if available.
        
            
        Args:
            text (str): Text from the 8-K document of interest.
        
        Returns:
            list: A list of item numbers in the 8-K document.
        """
        pattern = re.compile("Item+ +\d+[\:,\.]+\d+\d")
        item_list = re.findall(pattern,document)
        return item_list

In [4]:
# Returns Dataframe of document links for a given CIK
idx = pd.Index
class FinDataExtractor:
    def __init__(self):
        # S&P 500 index data downloaded from Yahoo Finance GSPC
        self.gspc_df = pd.read_csv("./Data/GSPC.csv",parse_dates=['Date'],index_col="Date")
        # Get VIX index data downloaded from Yahoo Finance
        self.vix_df = pd.read_csv("./Data/GSPC.csv",parse_dates=['Date'],index_col="Date")
        nyse = mcal.get_calendar('NYSE')
        self.nyse_holidays = nyse.holidays().holidays
        # Get data scraped from Wikipedia
        self.all_tickers_data = pd.read_pickle("./all_tickers_data.pkl")
        

    def get_historical_movements(self,row,period):
        """
        Method to get opening or closing stock price for a given time period and company of interest.
        
        Args:
            Company, Release Date (Pandas DataFrame): A Pandas DataFrame row of Company and Aggregation Period.
                                                      Aggregation Period can be one of week, month, quarter or year.
        
        Returns:
            float: Normalized price averaged over period of interest.
        """
        ticker,release_date = row[0],row[1]

       #1 Week
        if period == "week":
            e_start = release_date + datetime.timedelta(weeks=-1)
            b_start = e_start
        

            e_end = release_date + dateutil.relativedelta.relativedelta(days=-1)
            b_end = e_end
      

         #1 Month    
        elif period == "month":
            e_start = release_date + dateutil.relativedelta.relativedelta(months=-1)
            b_start = e_start + dateutil.relativedelta.relativedelta(days=-5)
            
            e_end = release_date + dateutil.relativedelta.relativedelta(days=-1)
            b_end = release_date + dateutil.relativedelta.relativedelta(days=-6)
            


        #1 Quarter
        elif period == "quarter":
            e_start = release_date + dateutil.relativedelta.relativedelta(months=-3)
            b_start = e_start + dateutil.relativedelta.relativedelta(days=-10)

            e_end = release_date + dateutil.relativedelta.relativedelta(days=-1)
            b_end = release_date + dateutil.relativedelta.relativedelta(days=-11)

        #1 Year
        elif period == "year":
            e_start = release_date + dateutil.relativedelta.relativedelta(years=-1)
            b_start = e_start + dateutil.relativedelta.relativedelta(days=-20)

            e_end = release_date + dateutil.relativedelta.relativedelta(days=-1)
            b_end = release_date + dateutil.relativedelta.relativedelta(days=-21)
        else:
            raise KeyError

        e_start = self.weekday_check(e_start)
        b_start = self.weekday_check(b_start)
        e_end = self.weekday_check(e_end)
        b_end = self.weekday_check(b_end)


        start_price = self.get_av_data(ticker=ticker,start_date = b_start, end_date = e_start)
        end_price = self.get_av_data(ticker=ticker,start_date = b_end, end_date = e_end)
        stock_change = self.calculate_pct_change(end_price,start_price)

        start_index = self.get_index_price(start_date = b_start, end_date = e_start)
        end_index = self.get_index_price(start_date = e_start, end_date = e_end)
        index_change =  self.calculate_pct_change(end_index,start_index)

        normalized = stock_change - index_change
        return normalized

    def get_av_data(self,ticker,start_date,end_date,market_open=False):
        """
        Method to get average opening or closing stock price for a given start and end date and company of interest.
        
        Args:
            ticker (str): Company of interest.
            start_date (str): Start date for time period of interest.
            end_date (str): Start date for time period of interest.
            martket_open (bool): Flag to choose adjusted close or open stock price depending on time of interest.
                                 (default is False)   
        
        Returns:
            float: Stock price averaged over period of interest.
        """
        start_date = str(start_date.date())
        end_date = str(end_date.date())

        try:
            if market_open == False:
                price = self.all_tickers_data[self.all_tickers_data.ticker == ticker].loc[start_date:end_date,"adjusted_close"].mean()
            else:
                price = self.all_tickers_data[self.all_tickers_data.ticker == ticker].loc[start_date:end_date,"open"].mean()
        except (KeyError,IndexError):
            price = np.nan
        return price

    # Takes ticker, 8K release date, checks time of release and then calculate before and after price change
    def get_change(self,row):
        """
        Method to calculate before and after price change after checking company, release date and time.
        
        Args:
            Company, Release Date (Pandas DataFrame): A Pandas DataFrame row of Company and Release Date.
            
        Returns:
            float: Normalized change (%) in stock price post release of the 8-K document.
            float: Vix 
        """
        release_date = row['release_date']
        ticker = row['ticker']
        market_close = release_date.replace(hour=16,minute=0,second=0)
        market_open = release_date.replace(hour=9,minute=30,second=0)

    # If report is released after market hours, take change of start date close and release date open
        if release_date > market_close:
            start_date = release_date
            end_date = release_date + datetime.timedelta(days=1)
            end_date = self.weekday_check(end_date)

            price_before_release = self.get_av_data(ticker,start_date,start_date,market_open=False)
            price_after_release = self.get_av_data(ticker,end_date,end_date,market_open=True)

            index_before_release = self.get_index_price(start_date,start_date,market_open=False)
            index_after_release = self.get_index_price(end_date,end_date,market_open=True)

            try:
                vix = self.vix_df.loc[self.vix_df.index == np.datetime64(start_date.date()),"Adj Close"][0].item()
            except IndexError:
                vix = np.nan

        # If report is released before market hours, take change of start date's close and release date's open
        elif release_date < market_open:
            start_date = release_date + datetime.timedelta(days=-1)
            start_date = self.weekday_check(start_date)
            end_date = release_date

            price_before_release = self.get_av_data(ticker,start_date,start_date,market_open=False)
            price_after_release = self.get_av_data(ticker,end_date,end_date,market_open=True) 

            index_before_release = self.get_index_price(start_date,start_date,market_open=False)
            index_after_release = self.get_index_price(end_date,end_date,market_open=True)
            try:
                vix = self.vix_df.loc[self.vix_df.index == np.datetime64(start_date.date()),"Adj Close"][0].item()
            except IndexError:
                vix = np.nan
        # If report is released during market hours, use market close
        else:
            start_date = release_date
            end_date = release_date
            price_before_release = self.get_av_data(ticker,start_date,start_date,market_open=True)
            price_after_release = self.get_av_data(ticker,end_date,end_date,market_open=False)

            index_before_release = self.get_index_price(start_date,start_date,market_open=True)
            index_after_release = self.get_index_price(end_date,end_date,market_open=False)
            
            try:
                vix = self.vix_df.loc[self.vix_df.index == np.datetime64(start_date.date()),"Open"][0].item()
            except IndexError:
                vix = np.nan
                
        price_pct_change = self.calculate_pct_change(price_after_release,price_before_release)
        index_pct_change = self.calculate_pct_change(index_after_release,index_before_release)
        normalized_change = price_pct_change - index_pct_change

        return normalized_change, vix

    def get_index_price(self,start_date,end_date,market_open=False):
        """
        Method to get index price given a date range.
        
        Args:
           start_date (str):  Start date of interest.
           end_date (str):  Start date of interest.
           market_open (bool): Flag to choose between adjusted close or open stock price (default is False).
            
        Returns:
            float: Price (stock) averaged over period of interest. 
        """
        try:
            if market_open == True:
                price = self.gspc_df.loc[(self.gspc_df.index >= np.datetime64(start_date.date())) & 
                                 (self.gspc_df.index <= np.datetime64(end_date)),"Open"].mean()
            else:
                price = self.gspc_df.loc[(self.gspc_df.index >= np.datetime64(start_date.date())) & 
                                 (self.gspc_df.index <= np.datetime64(end_date)),"Adj Close"].mean()
        except IndexError:
                price = np.nan
        return price

    def calculate_pct_change(self,end_value,start_value):
        """
        Method to the percent change in price values.
        
        Args:
           end_value (float):  Start price value.
           start_value (float):  End price value.
            
        Returns:
            float: Percent change in values entered.
        """
        pct_change = (end_value - start_value) / start_value
        pct_change = round(pct_change,4) * 100
        return pct_change

    def weekday_check(self,date):  
        """
        Method to check if a given date is a holiday.
        
        Args:
           date (str):  Date of interest.
           
        Returns:
            date (str): Returns previous day if a given day is a weekend or holiday.
        """
        while date.isoweekday() > 5 or date.date() in self.nyse_holidays:
            date = date + datetime.timedelta(days=-1)
        return date


### _S&P 500 Data from Wikipedia_

In [5]:
# Get table of the S&P 500 tickers, CIK, and industry from Wikipedia
wiki_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
cik_df = pd.read_html(wiki_url,header=0,index_col=0)[0]

# Correct Data Types of the DataFrame
cik_df['GICS Sector'] = cik_df['GICS Sector'].astype("category")
cik_df['GICS Sub Industry'] = cik_df['GICS Sector'].astype("category")
cik_df.head()

Unnamed: 0_level_0,Symbol,SEC filings,GICS Sector,GICS Sub Industry,Headquarters Location,Date first added[3][4],CIK,Founded
Security,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3M Company,MMM,reports,Industrials,Industrials,"St. Paul, Minnesota",,66740,1902
Abbott Laboratories,ABT,reports,Health Care,Health Care,"North Chicago, Illinois",1964-03-31,1800,1888
AbbVie Inc.,ABBV,reports,Health Care,Health Care,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
ABIOMED Inc,ABMD,reports,Health Care,Health Care,"Danvers, Massachusetts",2018-05-31,815094,1981
Accenture plc,ACN,reports,Information Technology,Information Technology,"Dublin, Ireland",2011-07-06,1467373,1989


In [6]:
sec_ext = SEC_Extractor
no_parts = 2
part_no = 3

In [7]:

df_list = []
company_list = cik_df['CIK'].to_dict()

# Extract documents link for a particular company and CIK from the S&P 500 list from Wikipedia
for (ticker,cik) in tqdm(company_list.items()):
    df_list.append(sec_ext.get_doc_links(cik,ticker))
doc_links_df = pd.concat(df_list,axis=0)
doc_links_df = doc_links_df.set_index("ticker").join(cik_df['GICS Sector']).join(cik_df['GICS Sub Industry']).reset_index().rename(columns={"index":"ticker"})
doc_links_df.head()

100%|██████████| 505/505 [05:04<00:00,  1.85it/s]


Unnamed: 0,ticker,cik,txt_link,doc_name,GICS Sector,GICS Sub Industry
0,3M Company,66740.0,https://www.sec.gov/Archives/edgar/data/66740/...,0001558370-18-002122.txt,Industrials,Industrials
1,3M Company,66740.0,https://www.sec.gov/Archives/edgar/data/66740/...,0001104659-18-015215.txt,Industrials,Industrials
2,3M Company,66740.0,https://www.sec.gov/Archives/edgar/data/66740/...,0001104659-18-010779.txt,Industrials,Industrials
3,3M Company,66740.0,https://www.sec.gov/Archives/edgar/data/66740/...,0001104659-18-004014.txt,Industrials,Industrials
4,3M Company,66740.0,https://www.sec.gov/Archives/edgar/data/66740/...,0001104659-18-003876.txt,Industrials,Industrials


In [8]:
# Write document links of companies and CIK to a pickle file
doc_links_df.to_pickle("./doc_links_df.pkl")

In [9]:
# Size of Document Links DataFrame
doc_links_df.shape

(19806, 6)

### _Download 8Ks & Stock Movements_

In [28]:
chunksize = 2

# Read in pickled file of document links
crawled_df = pd.read_pickle("./doc_links_df.pkl")

# Filter document links DataFrame for the companies we are interested in
array = ['Apple Inc.','General Electric']
crawled_df = crawled_df.loc[crawled_df['ticker'].isin(array)]

crawled_len = len(crawled_df['txt_link'])
chunks = math.ceil(crawled_len/chunksize)

df_list = []
for i, df in tqdm(enumerate(np.array_split(crawled_df,chunks))):
    df['text'], df['release_date'] = zip(*df['txt_link'].apply(sec_ext.extract_text))
    # Extract item number for each text link and add to items column
    df['items'] = df['text'].map(sec_ext.extract_item_no)    
    df_list.append(df)
    del df
    
    if i % 50 == 0:
        gc.collect()
df = pd.concat(df_list)
df.head()

40it [11:55, 19.42s/it] 


Unnamed: 0,ticker,cik,txt_link,doc_name,GICS Sector,GICS Sub Industry,text,release_date,items
1962,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-18-045761.txt,Information Technology,Information Technology,0001193125-18-045761.txt : 20180214 0001193125...,2018-02-14 16:54:21,[]
1963,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0000320193-18-000005.txt,Information Technology,Information Technology,0000320193-18-000005.txt : 20180201 0000320193...,2018-02-01 16:30:17,"[Item 2.02, Item 9.01]"
1964,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-17-341015.txt,Information Technology,Information Technology,0001193125-17-341015.txt : 20171113 0001193125...,2017-11-13 16:44:57,[]
1965,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0000320193-17-000067.txt,Information Technology,Information Technology,0000320193-17-000067.txt : 20171102 0000320193...,2017-11-02 16:30:16,[]
1966,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-17-308859.txt,Information Technology,Information Technology,0001193125-17-308859.txt : 20171012 0001193125...,2017-10-12 16:31:33,[]


In [65]:
# temp = pd.concat([temp, df], axis = 0)
# temp = temp.reset_index()
# temp.shape

(320, 9)

In [29]:
df_1 = df

In [74]:
temp['txt_link'][0]

'https://www.sec.gov/Archives/edgar/data/320193/000119312518045761/0001193125-18-045761.txt'

In [11]:
df.to_csv("docs_companies_default.csv")

In [21]:
df = pd.read_csv("docs_companies_default.csv")

In [23]:
df['release_date'] = pd.to_datetime(df['release_date'])

### _Load Financial Data_

In [12]:
cik_dict = cik_df['CIK'].to_dict()
cik_dict = {v: k for k, v in cik_dict.items()}

df['ticker'] = df['cik'].map(cik_dict)


In [88]:
df1_gen = pd.read_csv("./Data/texts_example1.csv.gzip",compression="gzip",parse_dates=['release_date'],chunksize=1000,index_col=[0])
#df2_gen = pd.read_csv("Data/texts2.csv.gzip",compression="gzip",parse_dates=['release_date'],chunksize=1000)
df1 = pd.concat([df for df in df1_gen])
#df2 = pd.concat([df for df in df2_gen])
#df2 = pd.read_csv("Data/texts2.csv",parse_dates=['release_date'],encoding="utf_8",index_col=[0])
gc.collect()
# df = pd.concat([df1,df2],axis=0)
df = df1
gc.collect()
# df['items'] = df['items'].map(lambda x: ast.literal_eval(x))

ValueError: malformed node or string: []

In [24]:
df

Unnamed: 0.1,Unnamed: 0,ticker,cik,txt_link,doc_name,GICS Sector,GICS Sub Industry,text,release_date,items
0,1962,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-18-045761.txt,Information Technology,Information Technology,0001193125-18-045761.txt : 20180214 0001193125...,2018-02-14 16:54:21,[]
1,1963,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0000320193-18-000005.txt,Information Technology,Information Technology,0000320193-18-000005.txt : 20180201 0000320193...,2018-02-01 16:30:17,"['Item 2.02', 'Item 9.01']"
2,1964,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-17-341015.txt,Information Technology,Information Technology,0001193125-17-341015.txt : 20171113 0001193125...,2017-11-13 16:44:57,[]
3,1965,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0000320193-17-000067.txt,Information Technology,Information Technology,0000320193-17-000067.txt : 20171102 0000320193...,2017-11-02 16:30:16,[]
4,1966,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-17-308859.txt,Information Technology,Information Technology,0001193125-17-308859.txt : 20171012 0001193125...,2017-10-12 16:31:33,[]
5,1967,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-17-282809.txt,Information Technology,Information Technology,0001193125-17-282809.txt : 20170912 0001193125...,2017-09-12 16:45:59,[]
6,1968,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-17-262261.txt,Information Technology,Information Technology,0001193125-17-262261.txt : 20170818 0001193125...,2017-08-18 16:30:58,[]
7,1969,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0000320193-17-000006.txt,Information Technology,Information Technology,0000320193-17-000006.txt : 20170801 0000320193...,2017-08-01 16:30:15,[]
8,1970,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-17-208226.txt,Information Technology,Information Technology,0001193125-17-208226.txt : 20170620 0001193125...,2017-06-20 16:36:33,[]
9,1971,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-17-181867.txt,Information Technology,Information Technology,0001193125-17-181867.txt : 20170524 0001193125...,2017-05-24 17:19:03,[]


In [61]:
# Find rows flagged where no date was found
# df = df.loc[~(df['release_date'] >= pd.datetime(year=2018,month=5,day=1))]
# df = df.drop_duplicates(subset="doc_name")
# df.index.names = ['ticker']
# df = df.reset_index()

In [81]:
gc.collect()

349

In [30]:

#from FinDataExtractor import FinDataExtractor
fin_data = FinDataExtractor()
## Load pickle of ticker, date, and doc number

df[['price_change','vix']]= pd.DataFrame(df[['ticker','release_date']].apply(fin_data.get_change,axis=1).tolist(), index=df.index)
# df['vix'] = df[['ticker','release_date']].apply(fin_data.get_change,axis=1).iloc[:,1]
df['rm_week'] = df[['ticker','release_date']].apply(fin_data.get_historical_movements,period="week",axis=1)
df['rm_month'] = df[['ticker','release_date']].apply(fin_data.get_historical_movements,period="month",axis=1)
df['rm_qtr'] = df[['ticker','release_date']].apply(fin_data.get_historical_movements,period="quarter",axis=1)
df['rm_year'] = df[['ticker','release_date']].apply(fin_data.get_historical_movements,period="year",axis=1)
df["signal"] = df['price_change'].map(lambda x: "stay" if -1<x<1 else ("up" if x>1 else "down"))
df.head()

Unnamed: 0,ticker,cik,txt_link,doc_name,GICS Sector,GICS Sub Industry,text,release_date,items,price_change,vix,rm_week,rm_month,rm_qtr,rm_year,signal
1962,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-18-045761.txt,Information Technology,Information Technology,0001193125-18-045761.txt : 20180214 0001193125...,2018-02-14 16:54:21,[],2.0,2698.629883,4.96,-8.67,-12.19,21.31,up
1963,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0000320193-18-000005.txt,Information Technology,Information Technology,0000320193-18-000005.txt : 20180201 0000320193...,2018-02-01 16:30:17,"[Item 2.02, Item 9.01]",0.88,2821.97998,-2.26,-4.93,3.1,36.79,stay
1964,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-17-341015.txt,Information Technology,Information Technology,0001193125-17-341015.txt : 20171113 0001193125...,2017-11-13 16:44:57,[],1.23,2584.840088,0.7,11.58,8.45,39.04,up
1965,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0000320193-17-000067.txt,Information Technology,Information Technology,0000320193-17-000067.txt : 20171102 0000320193...,2017-11-02 16:30:16,[],5.36,2579.850098,5.5,6.52,5.45,28.88,up
1966,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-17-308859.txt,Information Technology,Information Technology,0001193125-17-308859.txt : 20171012 0001193125...,2017-10-12 16:31:33,[],2.15,2550.929932,0.82,-4.53,5.59,28.49,up


In [31]:
df.to_csv("docs_fin_default.csv")

### _References_

* Code borrowed and modified from: https://github.com/yiaktan/NLP-Stock-Prediction