# Locations from CB Insights

#### Load relevant URLs

In [1]:
# Read one URL per line, dropping the newline and any surrounding whitespace.
with open('./cbinsights/cbinsights_relevant_links.txt', 'r') as file:
    urls = [line.strip() for line in file]

print(len(urls))

7258


#### Scrape CB Insights for location information

Location information is listed under **Headquarters Location** in the Overview & Products tab.

We first tried geopy's Nominatim geocoder to determine whether a name is a city or a region, but the requests kept timing out:
- https://www.tutorialspoint.com/how-to-get-geolocation-in-python
- https://geopy.readthedocs.io/en/stable/index.html#nominatim

In [5]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderUnavailable, GeocoderTimedOut, GeocoderRateLimited
from time import sleep

# Shared Nominatim client used by city_or_region_deprecated(); every request
# is made with a timeout of 5 (seconds, per geopy's documentation).
geolocator = Nominatim(user_agent = 'city-or-region', timeout = 5)

def city_or_region_deprecated(name, url):
    """Classify a place name as 'city' or 'region' with the Nominatim geocoder.

    Args:
        name: Place name to look up.
        url: Page of the startup the name came from; only used in the
            messages that are printed.

    Returns:
        'city' when the geocoder reports the address type 'city' or 'town',
        'region' when it reports 'state' or 'administrative', and None when
        the name cannot be classified or every attempt failed.
    """
    max_tries = 5
    timeout = 5  # seconds to wait before the next attempt

    for _ in range(max_tries):
        try:
            loc = geolocator.geocode(name)
        except (GeocoderUnavailable, GeocoderTimedOut, GeocoderRateLimited):
            print(f'Unable to classify {name} as city or region for startup at {url}. Retrying in {timeout} seconds.')
            sleep(timeout)
            continue

        # The lookup itself succeeded, so asking again would return the same
        # answer. Decide now instead of looping forever on an unknown name or
        # on an address type that is neither a city nor a region.
        loc_type = loc.raw.get('addresstype') if loc else None
        if loc_type in ('city', 'town'):
            return 'city'
        if loc_type in ('state', 'administrative'):
            return 'region'
        return None

    print(f'Unable to classify {name} as city or region for startup at {url}. Giving up.')
    return None

Instead we will compare each name to a master city list found at:\
https://github.com/FinNLP/cities-list/blob/master/list.txt

This method will misclassify a few names (any name that is missing from the list is treated as a region). Because only a small number of startups are affected, we can either review the misclassifications afterwards and fix them case by case, or simply ignore them.

In [15]:
import requests

def get_city_set():
    """Download the master city list and return it as a set of names.

    Returns:
        A set with one lower-cased, whitespace-stripped entry per non-blank
        line of the downloaded file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(
        'https://raw.githubusercontent.com/FinNLP/cities-list/refs/heads/master/list.txt',
        timeout=30,
    )
    # Fail loudly here: implicitly returning None would only surface later as
    # a confusing TypeError when the result is used as a set.
    response.raise_for_status()

    names = (line.strip().lower() for line in response.text.splitlines())
    # Skip blank lines so the empty string is never treated as a city.
    return {name for name in names if name}
    
# Download the list once and reuse it for both the lookups and the preview;
# calling get_city_set() twice would fetch the file twice.
city_set = get_city_set()

print(list(city_set)[:5])

['naugatuck', 'le coudray', 'elaine', 'vandenberg air force base', 'wakeman']


In [16]:
def city_or_region(name):
    """Classify a place name by looking it up in ``city_set``.

    The name is stripped and lower-cased before the lookup. Returns 'city'
    when it is found in the set and 'region' otherwise.
    """
    normalized = name.strip().lower()
    return 'city' if normalized in city_set else 'region'

In [3]:
def contains_number(s):
    """Return True if the string ``s`` contains at least one digit."""
    return re.search(r'\d', s) is not None

In [18]:
from bs4 import BeautifulSoup
from time import sleep
from random import randint
import requests
import re
import csv
import sys

# Pre-compiled regular expressions used by extract().
matchers = dict(
    # A signed integer such as "+12" or "-3".
    mosaic_change=re.compile(r'[+-]\d+'),
)

# Converts a Retry-After header value into a number of seconds
def _retry_after_seconds(value):
    """Return the delay requested by a Retry-After header, or None.

    The header may carry either a number of seconds or an HTTP date. None is
    returned when the header is missing or cannot be parsed. The result is
    never negative, so it can be passed straight to sleep().
    """
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    if not value:
        return None

    try:
        return max(0, int(value))
    except ValueError:
        pass  # not a plain number; try the HTTP-date form below

    try:
        when = parsedate_to_datetime(value)
    except (TypeError, ValueError):
        return None
    if when.tzinfo is None:
        # A date without zone information is treated as UTC.
        when = when.replace(tzinfo=timezone.utc)
    return max(0.0, (when - datetime.now(timezone.utc)).total_seconds())


# Fetches and parses page
def scrape(url):
    """Download ``url`` and return it parsed with BeautifulSoup.

    A 429 response is retried after the delay given in its Retry-After
    header; without a usable header the program exits with status 1.

    Args:
        url: Address of the page to fetch.

    Returns:
        The parsed page, or None when the request fails or the server
        answers with anything other than 200.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'
    }

    while True:
        try:
            # Without a timeout a stalled connection would block the run forever.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            # Treat a network failure like any other failed page: report it
            # and let the caller move on to the next URL.
            print(f'Failed to retrieve {url}. Error: {exc}')
            return None

        if response.status_code == 429:
            delay = _retry_after_seconds(response.headers.get('Retry-After'))
            if delay is None:
                print(f'Too many requests: {url}. Status code: {response.status_code}. Aborting.')
                sys.exit(1)
            print(f'Too many requests: {url}. Status code: {response.status_code}. Retrying after {delay:.0f} seconds.')
            sleep(delay)
        elif response.status_code == 404:
            return None
        elif response.status_code != 200:
            print(f'Failed to retrieve {url}. Status code: {response.status_code}')
            return None
        else:
            return BeautifulSoup(response.content, 'html.parser')
        
def clean_soup(full_soup):
    """Reduce a parsed page to the two sections that extract() reads.

    Returns None when ``full_soup`` is falsy. Otherwise returns a fresh
    ``<div>`` holding the title block and the info section, in that order,
    for whichever of the two were found on the page.
    """
    if not full_soup:
        return None

    # Locate both sections before moving either one into the new container.
    wanted = (
        full_soup.find('div', class_='flex flex-col'),
        full_soup.find('div', {'data-test': 'section-component'}),
    )

    container = BeautifulSoup('<div></div>', 'html.parser').div
    for fragment in wanted:
        if fragment:
            container.append(fragment)

    return container

def extract(soup):
    """Extract the overview fields of one company from its cleaned page.

    Args:
        soup: Parsed page fragment, as returned by clean_soup().

    Returns:
        A dict with the keys name, website, cb_description, year_founded,
        mosaic_change, city, region, country and postal. A field that cannot
        be found on the page is left as None.
    """
    data = {
        'name': None,
        'website': None,
        'cb_description': None,
        'year_founded': None,
        'mosaic_change': None,
        'city': None,
        'region': None,
        'country': None,
        'postal': None,
    }

    # Extract name
    name_match = soup.find('h1', class_='cbi-default pr-2 text-2xl font-medium text-black')
    if name_match:
        data['name'] = name_match.text.strip()

    # Extract website URL
    website_match = soup.find('a', class_='color--blue padding--top--s text-sm font-medium')
    if website_match:
        # An anchor without an href must not abort the whole run.
        href = website_match.get('href')
        if href is not None:
            data['website'] = href.strip()

    # Extract year founded
    year_founded_match = soup.find('div', class_='Kpi_kpiItem__2CwXD')
    if year_founded_match:
        # find_next() returns None when no <span> follows the KPI block.
        year_span = year_founded_match.find_next('span')
        if year_span:
            year = year_span.text.strip()
            if contains_number(year):
                data['year_founded'] = year

    # Extract description
    cb_description_match = soup.find('p', {'data-test': 'description'})
    if cb_description_match:
        data['cb_description'] = cb_description_match.text.strip()

    # Extract mosaic change
    mosaic_change_match = soup.find('div', class_='Kpi_mosaic__Ax_II')
    if mosaic_change_match:
        # find_next() returns None when no <p> follows the mosaic block.
        mosaic_p = mosaic_change_match.find_next('p')
        if mosaic_p:
            match = re.search(matchers['mosaic_change'], mosaic_p.text.strip())
            if match:
                data['mosaic_change'] = match.group().strip()

    # Extract location
    location_match = soup.find('address', {'data-test': 'address'})
    if location_match:
        city_region_zip_match = location_match.find('p', {'data-test': 'city-state-zip'})
        if city_region_zip_match:
            city = None
            region = None
            postal = None

            city_region_zip = city_region_zip_match.text.strip()
            if city_region_zip:
                # The last comma-separated piece is discarded. Of the rest,
                # pieces containing a digit are postal codes and the others
                # are city/region names.
                parts = [x.strip() for x in city_region_zip.split(',')[:-1]]
                list_city_region = [x for x in parts if not contains_number(x)]
                list_postal = [x for x in parts if contains_number(x)]
                if list_postal:
                    postal = list_postal[0]

                if len(list_city_region) == 1:
                    # A lone name is ambiguous: let the city-list lookup
                    # decide whether it is a city or a region.
                    only_name = list_city_region[0]
                    result = city_or_region(only_name)
                    if result == 'city':
                        city = only_name
                    elif result == 'region':
                        region = only_name
                elif len(list_city_region) == 2:
                    city, region = list_city_region

                if city:
                    data['city'] = city
                if region:
                    data['region'] = region
                if postal:
                    data['postal'] = postal

        country_match = location_match.find('p', {'data-test': 'country'})
        if country_match:
            country = country_match.text.strip()
            if country:
                data['country'] = country

    return data

# Generator that yields overviews
def process_websites(urls):
    """Yield one result per URL, in the order given.

    Each page is fetched with scrape() and trimmed with clean_soup(). The
    generator yields the dict built by extract(), or None when no usable
    page came back.
    """
    for url in urls:
        soup = clean_soup(scrape(url))
        yield extract(soup) if soup else None
            
# Store overviews in CSV
def write_to_csv(overview, filename='../datasets/overviews.csv'):
    """Append one overview dict to ``filename`` as a CSV row.

    The dict's keys, in order, are the column names. A header row is written
    first when the file position reported after opening in append mode is 0.
    """
    columns = list(overview)

    with open(filename, mode='a', newline='', encoding='utf-8') as handle:
        needs_header = handle.tell() == 0
        writer = csv.DictWriter(handle, fieldnames=columns)
        if needs_header:
            writer.writeheader()

        writer.writerow(overview)

def main(urls):
    """Scrape every URL, append each overview to the CSV and report progress.

    A progress line is printed each time at least 5 more percentage points
    of the URLs have been handled since the last report. After every page
    the loop sleeps for a random 0-2 seconds.
    """
    total_overviews = len(urls)
    last_reported = 0

    for done, overview in enumerate(process_websites(urls), start=1):
        # Pages that produced nothing are counted but not written.
        if overview:
            write_to_csv(overview)

        percent_processed = (done / total_overviews) * 100
        if percent_processed - last_reported >= 5:
            print(f'{(percent_processed):.0f}% of pages processed.')
            last_reported = percent_processed

        # Random pause between requests.
        sleep(randint(0, 2))

    print(f'All {total_overviews} pages processed.')
    
if __name__ == '__main__':
    # Small sample of pages, handy for a quick manual run: main(test_urls)
    test_urls = ['https://www.cbinsights.com/company/unitree',
                 'https://www.cbinsights.com/company/slice',
                 'https://www.cbinsights.com/company/roman-health-ventures',
                 'https://www.cbinsights.com/company/maven-clinic',
                 'https://www.cbinsights.com/company/doctorbox',
                 'https://www.cbinsights.com/company/healthtap',
                ]

    # One URL per line; '/financials' is stripped from every URL before scraping.
    with open('./cbinsights/cbinsights_relevant_links.txt', 'r') as file:
        urls = [line.strip().replace('/financials', '') for line in file]

    # Index of the first URL that still has to be processed. After a crash,
    # set it to the number of pages already handled so the run resumes there
    # instead of starting over (rows are appended to the existing CSV).
    # Progress percentages are then relative to the remaining URLs.
    leftoff = 0
    main(urls[leftoff:])

5% of pages processed.
10% of pages processed.
15% of pages processed.
20% of pages processed.
25% of pages processed.
30% of pages processed.
35% of pages processed.
40% of pages processed.
45% of pages processed.
50% of pages processed.
55% of pages processed.
60% of pages processed.
65% of pages processed.
70% of pages processed.
75% of pages processed.
80% of pages processed.
85% of pages processed.
90% of pages processed.
95% of pages processed.
All 7258 pages processed.
