# Scraping the SEC Litigations Page

This project represents all four stages of the data science lifecycle:

(1) gathering data
(2) asking questions
(3) conducting analysis
(4) drawing conclusions

It aims to better organize and understand SEC litigations using a variety of packages and tools.

Importing helpful tools

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import requests

from bs4 import BeautifulSoup
import xml.etree.cElementTree as et

from IPython.display import display, Latex, Markdown


# RSS Feed

#### Let's first start with scraping the RSS feed which is a live, neatly formatted update of litigations of the SEC. Here is the link I'll be using: https://www.sec.gov/rss/litigation/litreleases.xml


In [2]:
parsedXML = et.parse("litigationData/sec11.xml")

In [3]:
for node in parsedXML.getroot():
    item = node.attrib.get('item')
    title = node.find('title')
    link = node.find('link')
    description = node.find('description')
    guid = node.find('guid')
    pubDate = node.find('pubDate')

In [4]:
def getvalueofnode(node):
    """ return node text or None """
    return node.text if node is not None else None

In [5]:
##Had to delete the <CHANNEL> tag from the file but at least it works 

parsed_xml = et.parse("litigationRSS/RSS_oct_4.xml")
dfcols = ['title', 'link', 'description', 'guid', 'pubDate']
df_xml = pd.DataFrame(columns=dfcols)
 
for node in parsed_xml.getroot():
    item = node.attrib.get('item')
    title = node.find('title')
    link = node.find('link')
    description = node.find('description')
    guid = node.find('guid')
    pubDate = node.find('pubDate')

    df_xml = df_xml.append(
        pd.Series([getvalueofnode(title), getvalueofnode(link),
                   getvalueofnode(description), getvalueofnode(guid),
                   getvalueofnode(pubDate)], index=dfcols), ignore_index=True)

In [6]:
df_xml.head(5)

Unnamed: 0,title,link,description,guid,pubDate
0,Christopher J. Spencer and John Busshaus,https://www.sec.gov/litigation/litreleases/201...,SEC Charges CEO and CFO of Digital Entertainme...,LR-24636,"Fri, 04 Oct 2019 11:20:02 EDT"
1,"PlexCorps, Dominic Lacroix, and Sabrina Paradi...",https://www.sec.gov/litigation/litreleases/201...,Defendants Charged in Fraudulent ICO to Pay Ne...,LR-24635,"Wed, 02 Oct 2019 16:54:27 EDT"
2,"Woojae (""Steve"") Jung, et al.",https://www.sec.gov/litigation/litreleases/201...,SEC Obtains Final Judgment Against Investment ...,LR-24634,"Wed, 02 Oct 2019 08:54:06 EDT"
3,"Westport Capital Markets, LLC and Christopher ...",https://www.sec.gov/litigation/litreleases/201...,SEC Obtains Partial Summary Judgment Against I...,LR-24633,"Wed, 02 Oct 2019 08:18:02 EDT"
4,"Bluepoint Investment Counsel, et al.",https://www.sec.gov/litigation/litreleases/201...,SEC Announces Fraud Charges Related to Wiscons...,LR-24632,"Mon, 30 Sep 2019 17:17:42 EDT"


### Discovered that the RSS feed is just a subscription feed and does not include everything - still helpful to know and have the code to parse it in case. Now let's try scraping the HTML table on the webpage.

In [7]:
import urllib
import urllib.request
from bs4 import BeautifulSoup 

## Attempt 1:  Sucesfully scraped; but not ideal format

In [8]:
def make_soup(url):
    the_page = requests.get(url)
    soup_data = BeautifulSoup(the_page.content, "html.parser")
    return soup_data

### Starting with 2010:

In [9]:
soup2010 = make_soup('https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive2010.shtml')

In [10]:
data = []
table = soup2010.find_all('table')[4] # target the specific table
header, *rows = table.find_all('tr')

for row in rows:
    try:
        litigation, date, complaint = row.find_all('td')
    except ValueError:
        continue # ignore header row and quarter rows

    id = litigation.text.strip().split('-')[-1]
    date = date.text.strip()
    desc = complaint.text.strip().split('\t')[0]
    lit_url = litigation.find('a').get('href')

    try:
        comp_url = complaint.find('a').get('href')
    except AttributeError:
        comp_ulr = None # complaint url is optional

    info = dict(id=id, date=date, desc=desc, lit_url=lit_url, comp_url=comp_url)
    data.append(info)

In [11]:
data[0:5]

[{'id': '21795',
  'date': 'Dec 27, 2010',
  'desc': 'Alcatel-Lucent, S.A.',
  'lit_url': '/litigation/litreleases/2010/lr21795.htm',
  'comp_url': '/litigation/complaints/2010/comp21795.pdf'},
 {'id': '21794',
  'date': 'Dec 23, 2010',
  'desc': 'One or More Unknown Purchasers of Options of InterMune, Inc.',
  'lit_url': '/litigation/litreleases/2010/lr21794.htm',
  'comp_url': '/litigation/complaints/2010/comp21794.pdf'},
 {'id': '21793',
  'date': 'Dec 23, 2010',
  'desc': 'Michael E. Kelly, et al.',
  'lit_url': '/litigation/litreleases/2010/lr21793.htm',
  'comp_url': '/litigation/complaints/2010/comp21794.pdf'},
 {'id': '21792',
  'date': 'Dec 23, 2010',
  'desc': 'One or More Unknown Purchasers of Securities of Martek Biosciences Corporation',
  'lit_url': '/litigation/litreleases/2010/lr21792.htm',
  'comp_url': '/litigation/complaints/2010/comp21794.pdf'},
 {'id': '21791',
  'date': 'Dec 22, 2010',
  'desc': 'Pharma Holdings, Inc., Edward Klapp IV and Edward Klapp Jr.',
  'lit

# Attempt 2: Placing into a pandas dataframe; mostly functional

### SEC 2010

In [12]:
from bs4 import BeautifulSoup 
import requests, re

def make_soup(url):
    the_page = requests.get(url)
    soup_data = BeautifulSoup(the_page.content, "html.parser")
    return soup_data

soup2010 = make_soup('https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive2010.shtml')
releases = []
links = []
dates = []
descs = [] 
addit_urls = []

for i in soup2010.select('td:nth-of-type(1):has([href^="/litigation/litreleases/"])'):
    sib_sib = i.next_sibling.next_sibling.next_sibling.next_sibling
    releases+= [i.a.text]
    links+= [i.a['href']]
    dates += [i.next_sibling.next_sibling.text.strip()]
    descs += [re.sub('\t+|\s+',' ',sib_sib.text.strip())]
    addit_urls += ['N/A' if sib_sib.a is None else sib_sib.a['href']]

result = list(zip(releases, links, dates, descs, addit_urls))


In [13]:
fulllink = []
for i in links:
    fulllink += ['https://www.sec.gov'+i]
    
additional_url = []
for i in addit_urls:
    if i == 'N/A':
        additional_url += ['N/A']
    else: 
        additional_url += ['https://www.sec.gov' + i]

In [14]:
sec2010 = pd.DataFrame({'title' : releases,
                   'links' : fulllink,
                   'dates' : dates,
                   'descs' : descs,
                   'additional url' : additional_url})

In [15]:
sec2010.head(5)

Unnamed: 0,title,links,dates,descs,additional url
0,LR-21795,https://www.sec.gov/litigation/litreleases/201...,"Dec 27, 2010","Alcatel-Lucent, S.A. See also: SEC Complaint",https://www.sec.gov/litigation/complaints/2010...
1,LR-21794,https://www.sec.gov/litigation/litreleases/201...,"Dec 23, 2010",One or More Unknown Purchasers of Options of I...,https://www.sec.gov/litigation/complaints/2010...
2,LR-21793,https://www.sec.gov/litigation/litreleases/201...,"Dec 23, 2010","Michael E. Kelly, et al.",
3,LR-21792,https://www.sec.gov/litigation/litreleases/201...,"Dec 23, 2010",One or More Unknown Purchasers of Securities o...,
4,LR-21791,https://www.sec.gov/litigation/litreleases/201...,"Dec 22, 2010","Pharma Holdings, Inc., Edward Klapp IV and Edw...",https://www.sec.gov/litigation/complaints/2010...


# Great - it works for one year lets do it for all years!

In [16]:
from bs4 import BeautifulSoup 
import requests, re



"""Initializing all the lists """
releases = []
incomplete_links = []
dates = []
descs = [] 
addit_urls = []


"""This is a helper function that turns any url to a BSoup object."""
def make_soup(url):
    the_page = requests.get(url)
    soup_data = BeautifulSoup(the_page.content, "html.parser")
    return soup_data



In [17]:
"""" Making a list of soup objects"""
souplist = []
years = ['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']

start = 'https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive'
end = '.shtml'

for i in years:
    souplist += [make_soup(start + i + end)]

In [18]:
"""Gathering the columns"""

for k in souplist:
    for i in k.select('td:nth-of-type(1):has([href^="/litigation/litreleases/"])'):
        sib_sib = i.next_sibling.next_sibling.next_sibling.next_sibling
        releases+= [i.a.text]
        incomplete_links+= [i.a['href']]
        dates += [i.next_sibling.next_sibling.text.strip()]
        descs += [re.sub('\t+|\s+',' ',sib_sib.text.strip())]
        addit_urls += ['N/A' if sib_sib.a is None else sib_sib.a['href']]

In [19]:
"""Adding link headers to make them more functional"""
fulllink = []
for i in incomplete_links:
	fulllink += ['https://www.sec.gov'+i]

additional_url = []
for i in addit_urls:
    if i == 'N/A':
        additional_url += ['N/A']
    else: 
        additional_url += ['https://www.sec.gov' + i]

In [20]:
"""Compiling the dateaframe"""
lit = pd.DataFrame({'title' : releases,
                   'links' : fulllink,
                   'dates' : dates,
                   'descs' : descs,
                   'additional url' : additional_url})

In [21]:
lit.describe()

Unnamed: 0,title,links,dates,descs,additional url
count,3012,3012,3012,3012,3012.0
unique,3012,3011,1556,2634,1390.0
top,LR-23062,https://www.sec.gov/litigation/litreleases/201...,"Sep. 28, 2018","Galleon Management, LP, et al.",
freq,1,2,11,10,1617.0


In [22]:
"""Dropping the 2 duplicate values (ok because we have 3000 entries)"""


lit = lit[lit['links'] != 'https://www.sec.gov/litigation/litreleases/2013/lr22633a.htm']
len(lit)

3010

# III. Converting from DF to Excel

In [23]:
#from pandas import ExcelWriter


#writer = ExcelWriter('seclitigations.xlsx')
#lit.to_excel(writer)
#writer.save()

#DF TO CSV: Only run when necessary to update
#lit.to_csv('litigations.xlsx', sep=',')

# Step 2 of the Data Science Lifecycle (gathering data) is on it way! On to EDA, reasking questions, and regather data if necessary.


# IV. Reading the pdf files themselves

# Next Goals:
    1. Analyze the text per lin
        a. download all the links into pdf that can be parsed
        b. use pypdf2 to analyze all the pdfs
        c. _analysis_
    

In [24]:
linkLists = lit['links'].value_counts().index


In [25]:
#https://www.sec.gov/litigation/complaints/2018/comp24378.pdf

In [32]:
import PyPDF2
import re

# open the pdf file
object = PyPDF2.PdfFileReader("testcomplaint.pdf")

# get number of pages
NumPages = object.getNumPages()

# define keyterms
String = "market"

# extract text and do the search
for i in range(0, NumPages):
    PageObj = object.getPage(i)
    print("this is page " + str(i)) 
    Text = PageObj.extractText() 
    # print(Text)
    ResSearch = re.search(String, Text)
    print(ResSearch)

this is page 0
<re.Match object; span=(690, 696), match='market'>
this is page 1
<re.Match object; span=(348, 354), match='market'>
this is page 2
<re.Match object; span=(615, 621), match='market'>
this is page 3
None
this is page 4
<re.Match object; span=(315, 321), match='market'>
this is page 5
None
this is page 6
<re.Match object; span=(142, 148), match='market'>
this is page 7
<re.Match object; span=(301, 307), match='market'>
this is page 8
None
this is page 9
None
this is page 10
None


In [29]:
NumPages

11

In [None]:
from PyPDF2 import PdfFileMerger
import pdftotext

# Load your PDF
with open("testcomplaint", "rb") as f:
    pdf = pdftotext.PDF(f)

# Save all text to a txt file.
with open('output.txt', 'w') as f:
    f.write("\n\n".join(pdf))

