# CIK-Date-Form
Original version of "Companies from 1994-2018". This notebook performs the same task with exception of only 3 columns instead of 5. 

- CIK
- Form
- Date or Year

In [3]:
# import our libraries
import requests
import time
import datetime
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import urlopen

## exp_seq_merge_int()
Function that converts any long-written integer into list.
For example, I have integer 123456789101112, the exp_seq_merge_int would convert it into list [1,2,3,4,5,6,7,8,9,10,11,12]. 

It helps the code to correctly parse through right amount of pages per search

In [4]:
def exp_seq_merge_str(digits_str, base):
    combined_list = []
    merge_len = 1
    start_at = 0
    end_at = base
    while start_at < end_at:
        for i in range(start_at, end_at, merge_len):
            combined_str = ''
            for ii in range(i, i + merge_len):
                combined_str += digits_str[ii]
            combined_list.append(combined_str)
        start_at = end_at
        end_at *= base
        end_at = end_at if end_at < len(digits_str) else len(digits_str)
        merge_len += 1

    return combined_list

def exp_seq_merge_int(n, base):
    return list(map(int, exp_seq_merge_str('0'+str(n), base)))[1:]

## get_filing_link()

Function get_filing_link() retrives the URL of text document.

In [5]:
def get_filing_link(table_data):
    base_url = 'https://www.sec.gov/'
    td_href = table_data[comp_row][2].find('a', {'href':True})['href'] 
    url = base_url + td_href
    return url

## get_CIK()

Function get_CIK() scans the content of URL and parses the central index key inside the document.

In [6]:
# New
def get_CIK(url):
    headers = {"User-Agent": "My-User_Agent"}
    req = requests.get(url, headers={'User-Agent': 'My-User_Agent', 'From': 'tnurmanov@drew.edu'})
        
    try:
        contents = req.content.decode('utf-8').split()
        contents.index('KEY:')
        index = contents.index('KEY:')
        cik = contents[index+1]
    except ValueError:
        cik = 'NO CIK FOUND'
    except UnicodeDecodeError:
        cik = 'UnicodeError'
        
    return cik

## Main code

This code has been ran three times. First it parsed data of years 1994 to 2007, 2008 to 2014, and 2015 to 2018. Doing the initial period (1994-2018) all at once, may cause an overload of the code, so the operation will have to be restarted. 

The exectuion may take several hours.

In [14]:
# I wanted to see how long exactly it took me to process the code
startTime = time.time()

# First run, set sequnce 1994 to 2003.
# Second, 2003 to 2011
# Third 2011 to 2019
# Because of the Python's nature we need to end it by 2019, but last number is 2018
year = [i for i in range(2011,2019)]

# Dictionary to store companies
companies = {}

# Lists of a company CIK, type of form, date
company_name = []
company_cik = []
company_form = []
company_date = []
company_url = []

for j in range(len(year)):
    
    print('-'*50)
    print('Year:', str(year[j]), '---', 'Processing...')
    
    # base URL for the SEC EDGAR browser
    endpoint = r"https://www.sec.gov/cgi-bin/srch-edgar"

    ########################################################
    # This piece of code defines a number of pages per search
    # therefore computer 

    # define our parameters dictionary
    param_dict = {'text':'S-1 OR S-1/A OR S-1MEF',
                  'start':'1',
                  'count':'80',
                  'first':str(year[j]),
                  'last':str(year[j])}

    # request the url, and then parse the response.
    response = requests.get(url = endpoint, params = param_dict)
    soup = BeautifulSoup(response.content, 'html.parser')

    # find the page table with our data 
    table = soup.find('div', attrs = {'style':'margin-left: 10px'})
    page_amount = table.findAll('center')[0]
    page_amount = page_amount.text
    page_amount = page_amount.replace(" ", "")
    page_amount = page_amount[1:-6]
    
    if len(page_amount) <= 9:
        base = len(page_amount)+1
    else:
        base = 10
        
    page_list = exp_seq_merge_int(page_amount, base)
    ########################################################
    
    page = 1
    page_info = 1
    
    # PAGE
    for i in range(len(page_list)):
        # define our parameters dictionary
        param_dict = {'text':'S-1 OR S-1/A OR S-1MEF',
                      'start':str(page),
                      'count':'80',
                      'first':str(year[j]),
                      'last':str(year[j])}
        
        print('Page:', page_info, 'out of', len(page_list), end='\r')
        
        # request the url, and then parse the response.
        response = requests.get(url = endpoint, params = param_dict)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # find the page table with our data 
        table = soup.find('div', attrs = {'style':'margin-left: 10px'})
        
        # List of given data of all companies
        table_data = []
        
        # This "for loop" gets rid of all attributes 
        # that don't relate to this task
        for row in table.findAll('tr'):
            data = row.findAll('td')
            # if attribute length is equal to 6, it will be added to table_data
            if len(data) == 6:
                table_data.append(data)
                
        
        # FILES
        comp_row = 0   
        
        for i in range(len(table_data)-1):
            url = get_filing_link(table_data)
            company_url.append(url)
            company_cik.append(get_CIK(url))
            company_name.append(table_data[comp_row][1].text)
            company_date.append(table_data[comp_row][4].text)
            company_form.append(table_data[comp_row][3].text)
            comp_row += 1
            
        page += 80
        page_info += 1
    
    # Last file has to be retrived outside the 'for' loop
    url = get_filing_link(table_data)
    company_url.append(url)
    company_cik.append(get_CIK(url))
    company_name.append(table_data[comp_row][1].text)
    company_date.append(table_data[comp_row][4].text)
    company_form.append(table_data[comp_row][3].text)
    
    print('Year:', str(year[j]), '---', 'Complete!')

companies['Company Name'] = company_name    
companies['CIK'] = company_cik
companies['Form'] = company_form
companies['Filing Date'] = company_date
companies['URL'] = company_url


print('\n')
print('Task complete!')

print('Company Name:', len(companies['Company Name']))
print('CIK:', len(companies['CIK']))
print('Form:', len(companies['Form']))
print('Filing Date:', len(companies['Filing Date']))
print('URL:', len(companies['URL']))

print('\n')
seconds = (time.time() - startTime)
seconds = int(seconds)
execution_time = datetime.timedelta(seconds=seconds)
print('Execution time (hours, minutes, seconds):', execution_time)

--------------------------------------------------
Year: 2003 --- Processing...
Year: 2003 --- Complete!
--------------------------------------------------
Year: 2004 --- Processing...
Year: 2004 --- Complete!
--------------------------------------------------
Year: 2005 --- Processing...
Year: 2005 --- Complete!
--------------------------------------------------
Year: 2006 --- Processing...
Year: 2006 --- Complete!
--------------------------------------------------
Year: 2007 --- Processing...
Year: 2007 --- Complete!
--------------------------------------------------
Year: 2008 --- Processing...
Year: 2008 --- Complete!
--------------------------------------------------
Year: 2009 --- Processing...
Year: 2009 --- Complete!
--------------------------------------------------
Year: 2010 --- Processing...
Year: 2010 --- Complete!


Task complete!
Company Name: 26179
CIK: 26179
Form: 26179
Filing Date: 26179
URL: 26179


Execution time (hours, minutes, seconds): 2:02:49


## 1994-2002

In [10]:
companies_1994_2002 = companies

df = pd.DataFrame(companies_1994_2002, dtype = str)
display(df.head(15))
display(df.info())

df.to_excel('1994-2002.xlsx')

## 2003 -2010

In [15]:
companies_2003_2010 = companies

df = pd.DataFrame(companies_2003_2010, dtype = str)
display(df.head(15))
display(df.info())

df.to_excel('2003-2010.xlsx')

Unnamed: 0,Company Name,CIK,Form,Filing Date,URL
0,21ST CENTURY HOLDING CO,1069996,S-1/A,11/04/2003,https://www.sec.gov//Archives/edgar/data/10699...
1,21ST CENTURY HOLDING CO,1069996,S-1,09/12/2003,https://www.sec.gov//Archives/edgar/data/10699...
2,2ND SWING INC,1098044,S-1/A,08/12/2003,https://www.sec.gov//Archives/edgar/data/10980...
3,2ND SWING INC,1098044,S-1,07/17/2003,https://www.sec.gov//Archives/edgar/data/10980...
4,2ND SWING INC,1098044,S-1/A,06/02/2003,https://www.sec.gov//Archives/edgar/data/10980...
5,3D SYSTEMS CORP,910638,S-1,08/29/2003,https://www.sec.gov//Archives/edgar/data/91063...
6,99 CENT STUFF INC,1176435,S-1/A,11/26/2003,https://www.sec.gov//Archives/edgar/data/11764...
7,99 CENT STUFF INC,1176435,S-1/A,11/19/2003,https://www.sec.gov//Archives/edgar/data/11764...
8,99 CENT STUFF INC,1176435,S-1/A,10/29/2003,https://www.sec.gov//Archives/edgar/data/11764...
9,99 CENT STUFF INC,1176435,S-1/A,10/10/2003,https://www.sec.gov//Archives/edgar/data/11764...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26179 entries, 0 to 26178
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Company Name  26179 non-null  object
 1   CIK           26179 non-null  object
 2   Form          26179 non-null  object
 3   Filing Date   26179 non-null  object
 4   URL           26179 non-null  object
dtypes: object(5)
memory usage: 1022.7+ KB


None

## 2011-2018

In [None]:
companies_2011_2018 = companies

df = pd.DataFrame(companies_2011_2018, dtype = str)
display(df.head(15))
display(df.info())

df.to_excel('2011-2018.xlsx')

## Using Pandas

I used Pandas to concatenate all three documents into one

In [15]:
df1 = pd.read_csv('1997-2007.csv')
df2 = pd.read_csv('2008-2014.csv')
df3 = pd.read_csv('2015-2018.csv')
del df1['Unnamed: 0']
del df2['Unnamed: 0']
del df3['Unnamed: 0']
display(df1)
display(df2)
display(df3)

Unnamed: 0,CIK,Date,Form
0,0000883702,08/04/1994,S-1/A
1,0000883702,07/26/1994,S-1/A
2,0000883702,07/14/1994,S-1/A
3,0000883702,06/10/1994,S-1
4,0000003370,12/19/1994,S-1
...,...,...,...
37324,0001277092,04/19/2007,S-1
37325,0001277092,02/01/2007,S-1/A
37326,0001277092,01/24/2007,S-1
37327,0000890923,03/01/2007,S-1/A


Unnamed: 0,CIK,Date,Form
0,0001437719,07/03/2008,S-1
1,0001446926,10/03/2008,S-1
2,0000038723,04/28/2008,S-1/A
3,0001389415,04/23/2008,S-1
4,0001444756,11/17/2008,S-1
...,...,...,...
27474,0001594178,02/21/2014,S-1
27475,0001378453,12/09/2014,S-1MEF
27476,0001378453,12/08/2014,S-1/A
27477,0001378453,12/04/2014,S-1/A


Unnamed: 0,CIK,Date,Form
0,0000859747,06/25/2015,S-1/A
1,0000859747,06/09/2015,S-1/A
2,0000859747,05/29/2015,S-1/A
3,0000859747,05/12/2015,S-1/A
4,0000859747,04/07/2015,S-1
...,...,...,...
10598,0001713683,02/16/2018,S-1
10599,0001423774,04/11/2018,S-1MEF
10600,0001423774,04/10/2018,S-1/A
10601,0001423774,04/02/2018,S-1/A


In [16]:
df_main = pd.concat([df1, df2, df3])
display(df_main)

display(display(df_main.info()))

Unnamed: 0,CIK,Date,Form
0,0000883702,08/04/1994,S-1/A
1,0000883702,07/26/1994,S-1/A
2,0000883702,07/14/1994,S-1/A
3,0000883702,06/10/1994,S-1
4,0000003370,12/19/1994,S-1
...,...,...,...
10598,0001713683,02/16/2018,S-1
10599,0001423774,04/11/2018,S-1MEF
10600,0001423774,04/10/2018,S-1/A
10601,0001423774,04/02/2018,S-1/A


<class 'pandas.core.frame.DataFrame'>
Int64Index: 75411 entries, 0 to 10602
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   CIK     75411 non-null  object
 1   Date    75411 non-null  object
 2   Form    75411 non-null  object
dtypes: object(3)
memory usage: 2.3+ MB


None

None

In [17]:
display(df_main['Form'].value_counts())

# As previously we remove extra type of values 
# leaving only S-1/A, S-1, and S-1MEF
values = ['S-3/A', 'S-3', 'POS462B', 'X-17A-5', 
          'D/A', 'D', 'N-8F ORDR', '497J', 'N-8F', 
          'N-8F NTC', '485APOS', 'FOCUSN', '497', 
          'PRES14A','DEFS14A', 'DEFA14A', 'N-30D', 
          'NSAR-B', 'POS AM', 'NSAR-A', '24F-2NT', 
          '485B24E', 'N-14AE']
df_main = df_main[df_main.Form.isin(values) == False]

display(df_main['Form'].value_counts())

S-1/A        52229
S-1          21247
S-1MEF        1406
S-3/A          274
497             88
DEFS14A         41
PRES14A         41
S-3             25
DEFA14A         13
POS462B          9
N-30D            5
NSAR-B           4
D/A              4
X-17A-5          4
POS AM           3
NSAR-A           3
497J             3
24F-2NT          2
485B24E          2
D                2
N-14AE           1
N-8F             1
FOCUSN           1
485APOS          1
N-8F ORDR        1
N-8F NTC         1
Name: Form, dtype: int64

S-1/A     52229
S-1       21247
S-1MEF     1406
Name: Form, dtype: int64

In [18]:
# Save into excel file
df_main['CIK'] = df_main['CIK'].astype(str)
df_main.to_excel('CikDateForm.xlsx')

## Or
I saved it with date instead of year because full date has more contains more information rather than just year. But lets make it into the format it supposed to be initially (CIK, YEAR, FORM)

In [19]:
# Turn date into datetime 
df_main['Date'] = pd.to_datetime(df_main['Date'])
print(df_main.info())

df_main['Year'] = df_main['Date'].dt.year

del df_main['Date']

df_main

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74882 entries, 0 to 10602
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   CIK     74882 non-null  object        
 1   Date    74882 non-null  datetime64[ns]
 2   Form    74882 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 2.3+ MB
None


Unnamed: 0,CIK,Form,Year
0,0000883702,S-1/A,1994
1,0000883702,S-1/A,1994
2,0000883702,S-1/A,1994
3,0000883702,S-1,1994
4,0000003370,S-1,1994
...,...,...,...
10598,0001713683,S-1,2018
10599,0001423774,S-1MEF,2018
10600,0001423774,S-1/A,2018
10601,0001423774,S-1/A,2018


In [20]:
# Rearrange columns
df_main = df_main[['CIK', 'Year', 'Form']]
display(df_main)

# Save into Excel
df_main.to_excel('CIKYearForm.xlsx')

Unnamed: 0,CIK,Year,Form
0,0000883702,1994,S-1/A
1,0000883702,1994,S-1/A
2,0000883702,1994,S-1/A
3,0000883702,1994,S-1
4,0000003370,1994,S-1
...,...,...,...
10598,0001713683,2018,S-1
10599,0001423774,2018,S-1MEF
10600,0001423774,2018,S-1/A
10601,0001423774,2018,S-1/A
