In this notebook we scrape the list of Y Combinator-funded companies together with their information (e.g. location, team size, description), and then save the results in a CSV file.

# URL of Companies

In this part we check which company ids are available on the Y Combinator website and save the corresponding urls, so that their information can be scraped in the next section.

Each company's page on the Y Combinator website can be reached from the company's id through the url "http://ycombinator.com/companies/{id}".

In [2]:
# Importing all necessary packages
import requests
from bs4 import BeautifulSoup
import urllib.request, urllib.error
import pandas as pd
import numpy as np
from tqdm import tqdm
import re

# Inclusive range of company ids whose pages are probed on the ycombinator website
START_ID = 1
END_ID = 10
# CSV file in which the reachable company urls are stored
ULRS_PATH = "working_urls_{}-{}.csv".format(START_ID, END_ID)

In [10]:
# Candidate company pages: one url per id in the inclusive range START_ID..END_ID
cheking_ulrs = [
    "https://www.ycombinator.com/companies/" + str(company_id)
    for company_id in range(START_ID, END_ID + 1)
]

In [11]:
# Probe every candidate url and keep the ones that can be opened successfully.
# urlopen raises HTTPError for error status codes (e.g. 404) and URLError when
# the request fails for another reason; both are reported and the url is skipped.
working_urls = []
with tqdm(total=len(cheking_ulrs)) as pbar:
    for i, url in enumerate(cheking_ulrs):
        pbar.update(1)
        try:
            # Only reachability matters here: open the url and close the
            # response right away so that no connection is left dangling.
            with urllib.request.urlopen(url):
                pass
        except urllib.error.HTTPError as e:
            # Return code error (HTTPError subclasses URLError, so it is handled first)
            print('iter{} - HTTPError: {}'.format(i, e.code))
        except urllib.error.URLError as e:
            # Url error
            print('iter{} - URLError: {}'.format(i, e.reason))
        else:
            # url is ok
            working_urls.append(url)

# The single column is labelled 0 explicitly so that the csv keeps the same
# two-column layout (row index, url) even when no url turned out to be valid.
df_working_urls = pd.DataFrame(working_urls, columns=[0])
# Since this process is time consuming we save its result
df_working_urls.to_csv(ULRS_PATH)




  0%|          | 2/500 [00:00<02:02,  4.07it/s]

iter0 - HTTPError: 404


  1%|          | 3/500 [00:00<02:32,  3.26it/s]

iter1 - HTTPError: 404


  1%|          | 4/500 [00:01<02:50,  2.92it/s]

iter2 - HTTPError: 404


  1%|          | 5/500 [00:01<03:04,  2.69it/s]

iter3 - HTTPError: 404


 12%|█▏        | 61/500 [00:26<02:43,  2.69it/s]

iter59 - HTTPError: 404


 13%|█▎        | 65/500 [00:28<03:02,  2.38it/s]

iter63 - HTTPError: 404


 14%|█▍        | 69/500 [00:30<03:16,  2.20it/s]

iter67 - HTTPError: 404


 23%|██▎       | 114/500 [00:50<03:31,  1.82it/s]

iter112 - HTTPError: 404


 48%|████▊     | 238/500 [01:36<01:42,  2.55it/s]

iter236 - HTTPError: 404


 76%|███████▋  | 382/500 [02:32<00:45,  2.57it/s]

iter380 - HTTPError: 404


 77%|███████▋  | 383/500 [02:33<00:46,  2.51it/s]

iter381 - HTTPError: 404


 77%|███████▋  | 386/500 [02:34<00:33,  3.40it/s]

iter384 - HTTPError: 404


 87%|████████▋ | 437/500 [02:55<00:33,  1.85it/s]

iter435 - HTTPError: 404


100%|██████████| 500/500 [03:20<00:00,  2.49it/s]


# Scrape

Now that we have the urls of the companies on the Y Combinator website, we scrape each page and extract the interesting information from it.

In [3]:
def get_urls(path=ULRS_PATH):
    """Load the saved company urls from a csv file.

    The file is expected to hold exactly two columns, which are relabelled
    "index" and "links"; only the "links" column is returned.

    Args:
        path: csv file to read; defaults to ULRS_PATH.

    Returns:
        list: the values of the "links" column, in file order.
    """
    saved_urls = pd.read_csv(path)
    saved_urls.columns = ["index", "links"]
    return saved_urls["links"].values.tolist()


urls = get_urls()
print("NUMBER OF COMPANIES: ", len(urls))

NUMBER OF COMPANIES:  6


In [36]:

# Column titles of the resulting csv file, in output order
csv_titles = ["NAME",
              "FOUNDATION YEAR",
              "SIZE",
              "LOCATION",
              "BATCH",
              "SHORT DESC",
              "LONG DESC",
              ]

# One record (column title -> scraped text) per company page
data = []
for url in tqdm(urls):
    request = requests.get(url)
    soup = BeautifulSoup(request.text, "html.parser")

    # The selectors below were chosen by inspecting the html of a company page:
    #   - the company's name is looked up as a div, passing the set
    #     {"h1", "heavy"} as the attribute filter
    #   - the short description is the first h3 tag
    #   - the long description is the first p tag
    name = soup.find("div", {"h1", "heavy"}).get_text(strip=True)
    short_description = soup.find("h3").get_text(strip=True)
    long_description = soup.find("p").get_text(strip=True)

    # The other information is under class="highlight-box"
    details = soup.find("div", "highlight-box").get_text(strip=True)
    # The details text is parsed with a regular expression, since all
    # companies' details are expected to follow the same layout,
    # e.g. CircuitHubFounded2012Team Size30LocationLondon, United Kingdom
    match_obj = re.match(r'(.*)Founded(\d*)Team Size(\d*)Location(.*)', details, re.M | re.I)
    if match_obj is None:
        # Fail with the offending url instead of an opaque AttributeError on None
        raise ValueError(
            "Unexpected details format for {}: {!r}".format(url, details)
        )

    year = match_obj.group(2)
    size = match_obj.group(3)
    location = match_obj.group(4)
    # The batch is looked up as a span with class "pill pill-orange"
    batch = soup.find("span", "pill pill-orange").get_text(strip=True)

    # Named `record` rather than `dict` so the builtin is not shadowed
    record = {
        "NAME": name,
        "FOUNDATION YEAR": year,
        "SIZE": size,
        "LOCATION": location,
        "BATCH": batch,
        "SHORT DESC": short_description,
        "LONG DESC": long_description,
    }
    data.append(record)

# Save result. The frame is built in one step from the collected records
# (DataFrame.append no longer exists in pandas 2.x); taking the columns from
# csv_titles also keeps this working when `urls` is empty.
result = pd.DataFrame.from_records(data, columns=csv_titles)
result.to_csv("yclist_{}.csv".format(str(len(urls))))


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:08<00:00,  1.48s/it]
