# Companies with S-1 form ONLY
This notebook creates a table of years from 1998 to 2018, containing total number of companies that submitted **ONLY** form S-1 and no any other ammenidng type of form.

In [216]:
# import our libraries
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

## exp_seq_merge_int()
Function that converts any long-written integer into list.
For example, I have integer 123456789101112, the exp_seq_merge_int would convert it into list [1,2,3,4,5,6,7,8,9,10,11,12]. 

It helps the code to correctly parse through right amount of pages per search

In [217]:
def exp_seq_merge_str(digits_str, base):
    combined_list = []
    merge_len = 1
    start_at = 0
    end_at = base
    while start_at < end_at:
        for i in range(start_at, end_at, merge_len):
            combined_str = ''
            for ii in range(i, i + merge_len):
                combined_str += digits_str[ii]
            combined_list.append(combined_str)
        start_at = end_at
        end_at *= base
        end_at = end_at if end_at < len(digits_str) else len(digits_str)
        merge_len += 1

    return combined_list

def exp_seq_merge_int(n, base):
    return list(map(int, exp_seq_merge_str('0'+str(n), 10)))[1:]

## Main code

Attention: code takes 5-10 min to process

In [349]:
# Sequence of years from 1998 to 2018
# Because of the Python's nature we need to end it by 2019, but last number is 2018
year = [i for i in range(1998,2019)]

# Dictionary to store companies
companies = {}

# Lists of a company name, type of form, date
company_name = []
company_form = []
company_date = []

for j in range(len(year)):
    
    # base URL for the SEC EDGAR browser
    endpoint = r"https://www.sec.gov/cgi-bin/srch-edgar"

    ########################################################
    # This piece of code defines a number of pages per search
    # therefore computer 

    # define our parameters dictionary
    param_dict = {'text':'S-1 OR S-1/A OR S-1MEF',
                  'start':'1',
                  'count':'80',
                  'first':str(year[j]),
                  'last':str(year[j])}

    # request the url, and then parse the response.
    response = requests.get(url = endpoint, params = param_dict)
    soup = BeautifulSoup(response.content, 'html.parser')

    # find the page table with our data 
    table = soup.find('div', attrs = {'style':'margin-left: 10px'})
    page_amount = table.findAll('center')[0]
    page_amount = page_amount.text
    page_amount = page_amount.replace(" ", "")
    page_amount = page_amount[1:-6]
    page_list = exp_seq_merge_int(page_amount, 10)
    ########################################################
    
    page = 1
    
    # PAGE
    for i in range(len(page_list)):
        # define our parameters dictionary
        param_dict = {'text':'S-1 OR S-1/A OR S-1MEF',
                      'start':str(page),
                      'count':'80',
                      'first':str(year[j]),
                      'last':str(year[j])}
        
        # request the url, and then parse the response.
        response = requests.get(url = endpoint, params = param_dict)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # find the page table with our data 
        table = soup.find('div', attrs = {'style':'margin-left: 10px'})
        
        # List of given data of all companies
        table_data = []
        
        # This "for loop" gets rid of all attributes 
        # that don't relate to this task
        for row in table.findAll('tr'):
            data = row.findAll('td')
            # if statement will remove attributes with length that NOT equal to 6
            if len(data) == 6:
                table_data.append(data)
                
        # FILES
        comp_row = 0
        for i in range(len(table_data)-1):
            company_name.append(table_data[comp_row][1].text)
            company_form.append(table_data[comp_row][3].text)
            company_date.append(table_data[comp_row][4].text)
            comp_row += 1
            
        page += 80
        
    # Last file has to be retrived outside the 'for' loop
    company_name.append(table_data[comp_row][1].text)
    company_form.append(table_data[comp_row][3].text)
    company_date.append(table_data[comp_row][4].text)
    

# Then we merge 'company_name', 'company_form', 'company_date' into dictionary 'companies'
# so we create data frame out of 'companies' 
companies['Name'] = company_name
companies['Form'] = company_form
companies['Date'] = company_date

companies

{'Name': ['1 800 CONTACTS INC',
  '1 800 CONTACTS INC',
  '180 JAMAICA INC',
  '180 JAMAICA INC',
  '180 JAMAICA INC',
  '1855 BANCORP',
  '1855 BANCORP',
  '1ST ATLANTIC GUARANTY CORP',
  '1ST ATLANTIC GUARANTY CORP',
  '1ST ATLANTIC GUARANTY CORP',
  '1ST STATE BANCORP INC',
  '201 WEST SOTELLO REALTY INC',
  '201 WEST SOTELLO REALTY INC',
  '201 WEST SOTELLO REALTY INC',
  '24/7 MEDIA INC',
  '24/7 MEDIA INC',
  '24/7 MEDIA INC',
  '24/7 MEDIA INC',
  '3CI COMPLETE COMPLIANCE CORP',
  '3CI COMPLETE COMPLIANCE CORP',
  '3DFX INTERACTIVE INC',
  '3DFX INTERACTIVE INC',
  '3DFX INTERACTIVE INC',
  '4FRONT TECHNOLOGIES INC',
  '4FRONT TECHNOLOGIES INC',
  'A 55 INC',
  'A 55 INC',
  'A 55 INC',
  'A I RECEIVABLES CORP',
  'A I RECEIVABLES CORP',
  'A I RECEIVABLES CORP',
  'A I RECEIVABLES CORP',
  'ABC DISPENSING TECHNOLOGIES INC',
  'ABC DISPENSING TECHNOLOGIES INC',
  'ABC DISPENSING TECHNOLOGIES INC',
  'ABGENIX INC',
  'ABGENIX INC',
  'ABGENIX INC',
  'ABGENIX INC',
  'ABGENIX INC

## Using Pandas package

In [565]:
df = pd.DataFrame(companies)
display(df)
display(df.info())

Unnamed: 0,Name,Form,Date
0,1 800 CONTACTS INC,S-1/A,02/03/1998
1,1 800 CONTACTS INC,S-1/A,01/16/1998
2,180 JAMAICA INC,S-1/A,01/27/1998
3,180 JAMAICA INC,S-1/A,01/27/1998
4,180 JAMAICA INC,S-1/A,01/21/1998
...,...,...,...
67408,"Zscaler, Inc.",S-1,02/16/2018
67409,ZUORA INC,S-1MEF,04/11/2018
67410,ZUORA INC,S-1/A,04/10/2018
67411,ZUORA INC,S-1/A,04/02/2018


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67413 entries, 0 to 67412
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    67413 non-null  object
 1   Form    67413 non-null  object
 2   Date    67413 non-null  object
dtypes: object(3)
memory usage: 1.5+ MB


None

Here I found another issue with archive of historical EDGAR docs.
Regardless the fact that I set the search EXACTLY for forms S-1, S-1/A and S-1MEF. There're still a leak of results I didn't request. However there's nothing I could do, if only to remove them out of the dataframe. 

Here I found another issue with archive of historical EDGAR docs.
Regardless the fact that I set the search EXACTLY for forms S-1, S-1/A and S-1MEF. There're still a leak of results I didn't request. However there's nothing I could do since we get this issue made by government website. 

All I could do is to remove them out of the dataframe. 

In [566]:
df['Form'].value_counts()

S-1/A        46748
S-1          19155
S-1MEF        1190
S-3/A          272
S-3             23
POS462B          9
X-17A-5          4
D/A              4
D                2
N-8F ORDR        1
497J             1
N-8F             1
N-8F NTC         1
485APOS          1
FOCUSN           1
Name: Form, dtype: int64

Drop rows that contain extra values

In [567]:
# drop any rows with values 'S-3/A', 'S-3', 'POS462B', 'X-17A-5', 'D/A' 
# 'D', 'N-8F ORDR', '497J', 'N-8F', 'N-8F NTC', '485APOS', 'FOCUSN' in the Form column

values = ['S-3/A', 'S-3', 'POS462B', 'X-17A-5', 'D/A', 'D', 'N-8F ORDR', 
                        '497J', 'N-8F', 'N-8F NTC', '485APOS', 'FOCUSN']
df = df[df.Form.isin(values) == False]

df['Form'].value_counts()

S-1/A     46748
S-1       19155
S-1MEF     1190
Name: Form, dtype: int64

Even though the Date column is 'object' type, as long as it is written in a right notation it can be converted into 'datetime' type. With the 'datetime' columns we can manipulate data about days, months and years any way we want.

In [568]:
print("Now Date is 'datetime64[ns]'")
print()
df['Date'] = pd.to_datetime(df['Date'])
print(df.info())

Now Date is 'datetime64[ns]'

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67093 entries, 0 to 67412
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Name    67093 non-null  object        
 1   Form    67093 non-null  object        
 2   Date    67093 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 2.0+ MB
None


Now we create separate column with year only. And remove Date column.

In [569]:
df['Year'] = df['Date'].dt.year
del df['Date']

# Some how one file of 1997 appear in search of 1998
df = df[df.Year != 1997]


Unnamed: 0,Name,Form,Year
0,1 800 CONTACTS INC,S-1/A,1998
1,1 800 CONTACTS INC,S-1/A,1998
2,180 JAMAICA INC,S-1/A,1998
3,180 JAMAICA INC,S-1/A,1998
4,180 JAMAICA INC,S-1/A,1998
...,...,...,...
67408,"Zscaler, Inc.",S-1,2018
67409,ZUORA INC,S-1MEF,2018
67410,ZUORA INC,S-1/A,2018
67411,ZUORA INC,S-1/A,2018


Subset dataframe into three, each by form type. So then we can merge them together by Name and Year columns.

In [570]:
# Dataframe with S-1 
df_s1 = df[df.Form == 'S-1']
# Dataframe with S-1/A
df_s1a = df[df.Form == 'S-1/A']
# Dataframe with S-1MEF 
df_s1mef = df[df.Form == 'S-1MEF']

display(df_s1.head())
display(df_s1a.head())
display(df_s1mef.head())

Unnamed: 0,Name,Form,Year
6,1855 BANCORP,S-1,1998
10,1ST STATE BANCORP INC,S-1,1998
17,24/7 MEDIA INC,S-1,1998
19,3CI COMPLETE COMPLIANCE CORP,S-1,1998
22,3DFX INTERACTIVE INC,S-1,1998


Unnamed: 0,Name,Form,Year
0,1 800 CONTACTS INC,S-1/A,1998
1,1 800 CONTACTS INC,S-1/A,1998
2,180 JAMAICA INC,S-1/A,1998
3,180 JAMAICA INC,S-1/A,1998
4,180 JAMAICA INC,S-1/A,1998


Unnamed: 0,Name,Form,Year
73,ADAMS GOLF INC,S-1MEF,1998
170,AMERICAN DENTAL PARTNERS INC,S-1MEF,1998
194,AMERICAN TOWER CORP /MA/,S-1MEF,1998
210,AMKOR TECHNOLOGY INC,S-1MEF,1998
308,ATG INC,S-1MEF,1998


Merge them together by name of company and year. Also get rid of duplicates

In [571]:
# Merge dataframe S-1 with S-1/A into df_new
df_new = pd.merge(df_s1, df_s1a, on = ['Name', 'Year'], 
                  how = 'outer', suffixes=('_S1','_S1A'))
# Merge dataframe df_new with S-1MEF 
df_new = pd.merge(df_new, df_s1mef, on = ['Name', 'Year'], 
                  how = 'outer')
# After second merge there's column with forms S-1MEF,
# however unlike other two this one carries original name 'Form'.
# It will be better for an eye to rename it
df_new.rename(columns = {'Form':'Form_S1MEF'}, inplace = True)
display(df_new)

# There are a lot of duplicate rows, we don't need them 
df_new.drop_duplicates()

Unnamed: 0,Name,Form_S1,Year,Form_S1A,Form_S1MEF
0,1855 BANCORP,S-1,1998,S-1/A,
1,1ST STATE BANCORP INC,S-1,1998,,
2,24/7 MEDIA INC,S-1,1998,S-1/A,
3,24/7 MEDIA INC,S-1,1998,S-1/A,
4,24/7 MEDIA INC,S-1,1998,S-1/A,
...,...,...,...,...,...
57753,"TRI Pointe Homes, Inc.",,2013,,S-1MEF
57754,"ROYAL HAWAIIAN ORCHARDS, L.P.",,2014,,S-1MEF
57755,"Blue Water Global Group, Inc.",,2015,,S-1MEF
57756,Celcuity Inc.,,2017,,S-1MEF


Unnamed: 0,Name,Form_S1,Year,Form_S1A,Form_S1MEF
0,1855 BANCORP,S-1,1998,S-1/A,
1,1ST STATE BANCORP INC,S-1,1998,,
2,24/7 MEDIA INC,S-1,1998,S-1/A,
5,3CI COMPLETE COMPLIANCE CORP,S-1,1998,S-1/A,
6,3DFX INTERACTIVE INC,S-1,1998,S-1/A,
...,...,...,...,...,...
57753,"TRI Pointe Homes, Inc.",,2013,,S-1MEF
57754,"ROYAL HAWAIIAN ORCHARDS, L.P.",,2014,,S-1MEF
57755,"Blue Water Global Group, Inc.",,2015,,S-1MEF
57756,Celcuity Inc.,,2017,,S-1MEF


In [572]:
# Check for NaN values
display(df_new.isnull().sum())

# Select those rows that contain null values in columns Form_S1A and Form_S1MEF
# therefore we only left with companies that only submitted form S-1, 
# and never any amending form type
df_new = df_new[df_new.Form_S1A.isnull() == True][df_new.Form_S1MEF.isnull() == True]
df_new

Name              0
Form_S1       13343
Year              0
Form_S1A       5450
Form_S1MEF    52323
dtype: int64

  df_new = df_new[df_new.Form_S1A.isnull() == True][df_new.Form_S1MEF.isnull() == True]


Unnamed: 0,Name,Form_S1,Year,Form_S1A,Form_S1MEF
1,1ST STATE BANCORP INC,S-1,1998,,
26,ABLE TELCOM HOLDING CORP,S-1,1998,,
37,ACKERLEY GROUP INC,S-1,1998,,
38,ACREEDO HEALTH INC,S-1,1998,,
50,ADVANCED COMMUNICATION SYSTEMS INC,S-1,1998,,
...,...,...,...,...,...
44367,"WEYLAND TECH, INC.",S-1,2018,,
44372,WINNAN CORP.,S-1,2018,,
44375,"WIZARD ENTERTAINMENT, INC.",S-1,2018,,
44380,World Technology Corp.,S-1,2018,,


In [573]:
df_new["Year"].value_counts().sort_index(ascending=True)

1998    223
1999    191
2000    262
2001    100
2002    190
2003     98
2004    275
2005    155
2006    398
2007    385
2008    522
2009    446
2010    366
2011    255
2012    355
2013    276
2014    191
2015    195
2016    173
2017    162
2018    182
Name: Year, dtype: int64

## Conclusion
Seems like the number of companies with for S-1 only randomly varies each year, and there's no factor that'd cause it. In 2003 there were only 98 companies that didn't submit any amending forms, while the largest number of companies with S-1 form only were in year 2008.