# Setup

In [92]:
# import web scraping libraries
import pandas as pd
import csv
import time
import requests
from IPython.display import clear_output, display
from bs4 import BeautifulSoup
import datetime
import spacy
from tqdm import tqdm
# Load the spaCy 'en_core_web_sm' pipeline once at module level; lead_creation()
# calls it on every scraped text to pull out ORG entities.
nlp = spacy.load('en_core_web_sm')

In [93]:
# List intended to hold leads in memory (starts empty).
leads_db = []

# Template record that lead_creation() copies for each new lead.
#
# Fields currently disabled in the template (kept here for reference):
#   contact:  name, email, phone, company, title, location, website
#   profiles: linkedin, twitter, snapchat, twitch, tumblr, reddit, quora,
#             medium, github, angel, crunchbase, producthunt, behance,
#             dribbble, flickr
#   address:  country, state, city, zip
#   firm:     industry, revenue, employees
#   misc:     tags, notes, assigned, owner
#
# NOTE: both timestamps are evaluated once, when this cell runs, so every copy
# of the template carries that same moment.
lead = {
    'uid': '1234567890',  # placeholder, overwritten per lead
    'status': 'New',
    'date': datetime.datetime.now(),
    'lead_score': 0,
    'lead_score_reason': '',
    'lead_score_date': datetime.datetime.now(),
    'lead_source': '',
}

In [96]:
# Default block-list for lead_creation(): an ORG entity is discarded when any of
# its whitespace-separated words, lower-cased, appears in this list.
FILTERED_ORGS_KEYWORDS = [
    'securities',
    'sec',
    'court',
    'fbi',
    'ponzi',
    'llc',
    'office',
    'authority',
    'act',
    'commission',
    'department',
    'federal',
    'government',
    'law',
    'legal',
    'police',
    'state',
    'united',
    'states',
    'cyber',
]

# Keyword allow-list; not referenced by any code visible in this notebook.
WHITE_LIST_KEYWORDS = [
    'fraud',
    'corruption',
    'bribe',
    'compliance',
]



# Base Functions

In [None]:
def is_active_url(url, timeout=10):
    """Return True when a GET request to *url* answers with HTTP 200.

    Args:
        url: Address to probe.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot stall the scraper indefinitely.

    Returns:
        True if the request completed with status code 200; False for any
        other status code or when the request itself failed.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best-effort probe: connection errors, timeouts and malformed URLs
        # all simply mean "not active". Unrelated exceptions (e.g.
        # KeyboardInterrupt) are no longer swallowed.
        return False
    return response.status_code == 200

def download_page(url, file_name, timeout=10):
    """Fetch *url* and save the response body to disk as text.

    The page is requested once; a non-200 answer or a failed request is
    reported as 'URL not active'.

    Args:
        url: Address of the page to download.
        file_name: Destination path. It is passed through ``str.format`` with
            the last '/'-separated segment of *url*, so a ``{}`` placeholder
            in it is replaced by that segment.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the page was fetched and written, False otherwise. Failures
        are printed rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        is_ok = response.status_code == 200
    except requests.RequestException:
        is_ok = False

    if not is_ok:
        print('URL not active', url)
        return False

    try:
        target = file_name.format(url.split('/')[-1])
        # save web page to the requested location
        with open(target, 'w') as f:
            f.write(response.text)
    except (OSError, ValueError, LookupError) as e:
        # OSError: path/disk problems; ValueError/LookupError: a malformed
        # format template or text the file's encoding cannot represent.
        print('Error downloading page', url)
        print(e)
        return False
    return True



def write_leads_to_csv(leads, path='data/leads.csv'):
    """Append *leads* to a CSV file, then drop duplicate rows from the file.

    Each lead is written exactly once per call. A header row is emitted only
    when the file is empty. After appending, the whole file is re-read and
    rewritten without duplicates.

    Args:
        leads: Sequence of dicts sharing the same keys; the keys of the first
            lead define the CSV columns.
        path: CSV file to append to. Defaults to 'data/leads.csv'.

    Returns:
        None. Nothing is written when *leads* is empty.
    """
    if not leads:
        return

    # newline='' lets the csv module control line endings itself, which
    # avoids blank rows on platforms that translate '\n'.
    with open(path, 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=list(leads[0].keys()))
        # In append mode the position starts at end-of-file, so 0 means empty.
        if f.tell() == 0:
            writer.writeheader()
        writer.writerows(leads)

    # remove duplicates from csv file
    df = pd.read_csv(path)
    # De-duplicate on (uid, company) when both columns exist; otherwise fall
    # back to whole-row comparison instead of raising KeyError.
    key_columns = ['uid', 'company']
    subset = key_columns if set(key_columns) <= set(df.columns) else None
    df.drop_duplicates(subset=subset, inplace=True)
    df.to_csv(path, index=False)

    

def lead_creation(uid, source, text, filtered_orgs=FILTERED_ORGS_KEYWORDS):
    """Build one lead per organisation named in *text*.

    The text is run through the module-level spaCy pipeline ``nlp`` and every
    entity labelled 'ORG' is considered. An organisation is rejected when any
    of its whitespace-separated words matches a filter keyword
    (case-insensitively). Remaining organisations are de-duplicated and kept
    in the order they first appear, so the result is the same on every run.

    Args:
        uid: Identifier stored in each lead's 'uid' field.
        source: Value stored in each lead's 'lead_source' field.
        text: Free text to search for organisation names.
        filtered_orgs: Iterable of keywords that disqualify an organisation.

    Returns:
        A list of lead dicts (possibly empty). Each is a shallow copy of the
        module-level ``lead`` template with 'uid', 'company', 'lead_source'
        and 'notes' set.
    """
    blocked_words = {keyword.lower() for keyword in filtered_orgs}

    doc = nlp(text)

    # dict.fromkeys removes repeats while preserving first-seen order
    # (a plain set would make the order of the leads arbitrary).
    org_names = dict.fromkeys(
        ent.text for ent in doc.ents if ent.label_ == 'ORG'
    )

    leads = []
    for org in org_names:
        if any(word.lower() in blocked_words for word in org.split()):
            continue

        # NOTE: 'date' and 'lead_score_date' are inherited from the template
        # as-is, i.e. they hold the time the template was built.
        new_lead = lead.copy()
        new_lead['uid'] = uid
        new_lead['company'] = org
        new_lead['lead_source'] = source
        new_lead['notes'] = ''
        leads.append(new_lead)

    return leads






# Country Functions

## United States

In [102]:
def _extract_release_text(html):
    """Return the joined <p> text under the first '.alpha' element, or ''."""
    sections = BeautifulSoup(html, 'html.parser').select('.alpha')
    if not sections:
        return ''
    return ' '.join(p.text for p in sections[0].select('p'))


def scrape_page_us_sec(file_name):
    """Scrape a saved SEC listing page and write leads for every linked case.

    For each table row under '#mainlist' whose cells contain a link, the
    linked page is downloaded to 'temp/<case number>.html', its paragraph text
    is extracted, and leads built from that text are appended to the leads
    CSV through write_leads_to_csv().

    Args:
        file_name: Path of the previously downloaded listing page (HTML).

    Returns:
        None. Rows without a usable link, failed downloads, and pages
        without an '.alpha' section are skipped.
    """
    from urllib.parse import urljoin

    with open(file_name, 'r') as f:
        listing_html = f.read()

    listing = BeautifulSoup(listing_html, 'html.parser').select('#mainlist')
    if not listing:
        return

    rows = listing[0].select('tr')
    for row in tqdm(rows, total=len(rows), desc='Scraping'):
        links = row.select('td a')
        if not links:
            continue

        link = links[0]
        href = link.get('href')
        if not href:
            # anchor without a target: nothing to download
            continue

        case_number = link.text.strip()
        # urljoin copes with both site-relative and absolute hrefs.
        full_url = urljoin('https://www.sec.gov', href)

        # The link text becomes a file name, so replace anything that could
        # act as a path separator or as a str.format placeholder (which
        # download_page applies to the file name).
        safe_name = ''.join(
            ch if ch.isalnum() or ch in '-_.' else '_' for ch in case_number
        )
        case_file = f'temp/{safe_name}.html'

        # download_page checks the response itself and prints
        # 'URL not active' on failure, so no separate probe is needed.
        downloaded = download_page(full_url, file_name=case_file)

        # wait for 1 second between requests
        time.sleep(1)

        if not downloaded:
            continue

        with open(case_file, 'r') as f:
            p_text = _extract_release_text(f.read())
        if not p_text:
            continue

        # create leads and write them to the csv file
        leads = lead_creation(case_number, full_url, p_text)
        write_leads_to_csv(leads)

# Scrape Leads

In [104]:
import hashlib

# Driver cell: read the agency list and, for every agency flagged as active,
# download its listing page and scrape it for leads.
df = pd.read_excel("data/agency_list.xlsx")
display(df.head())

# loop through the excel file and process active agencies
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc='Agencies', unit=' agency', leave=True, colour='green'):
    if not row['Active']:
        continue

    location_name = row['Location'].strip().lower()

    # Built-in hash() on str is salted per interpreter process, so it would
    # yield a different (possibly negative) file name on every run. A digest
    # gives a stable name per location.
    location_hash = hashlib.sha256(location_name.encode('utf-8')).hexdigest()[:16]

    # output file name
    file_name = f'temp/{location_hash}.html'

    # URL to scrape
    url = row['Reviews URL']

    # NOTE(review): every active agency is parsed with the US SEC scraper;
    # other locations will need their own scrape function.
    if download_page(url, file_name=file_name):
        scrape_page_us_sec(file_name=file_name)

Unnamed: 0,Location,URL,Agency,Active,Reviews URL
0,United States,https://www.sec.gov/,Securities and Exchange Commission (SEC),True,https://www.sec.gov/litigation/litreleases.htm
1,European Union,https://www.esma.europa.eu/,European Securities and Markets Authority (ESMA),False,
2,United Kingdom,https://www.fca.org.uk/,Financial Conduct Authority (FCA),False,
3,Canada,https://www.securities,Canadian Securities Administrators (CSA),False,
4,Australia,https://www.asic.gov.au/,Australian Securities and Investments Commissi...,False,


Scraping: 100%|██████████| 25/25 [00:45<00:00,  1.82s/it]
Agencies: 100%|██████████| 69/69 [00:46<00:00,  1.50 agency/s]
