In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import time

In [9]:
# Root URL under which every company page lives; a slug is appended verbatim.
site_base_url = 'https://getautopsy.wixsite.com/failed-startups/allcompanies/'

# URL slugs of the company pages to scrape, one page per entry.
# NOTE(review): 'Gumroad-' ends with a hyphen -- presumably this mirrors the
# slug used by the site; confirm before "fixing" it.
companies = [
  'Amiloom',
  'Backfence',
  'Circa-News',
  'DeviceFidelity',
  'Gumroad-',
  'Homejoy',
  'Juicero',
  'Kichit',
  'Lookery',
  'Netscape',
  'Next-step-living',
  'OpTier',
  'Path',
  'Poliana',
  'Shyp',
  'Skully',
  'Thrively',
  'Tutorspree',
  'Zulily',
  'Yik-Yak',
  'eCrowds',
]

# Number of companies, and therefore of rows expected in the output table.
N = len(companies)

# Full page URL for each company, in the same order as `companies`.
us_comp_links = [f'{site_base_url}{slug}' for slug in companies]

### Utility functions for scraping the data

In [3]:
def get_requests(url, delay=1):

  """
  Download ``url`` and return its HTML parsed with BeautifulSoup.

  Parameters
  ----------
  url : str
      Address of the page to fetch.
  delay : int or float, optional
      Seconds to sleep before sending the request. Defaults to 1, so
      back-to-back calls are spaced out.

  Returns
  -------
  BeautifulSoup or None
      The parsed document, or ``None`` if opening or parsing the page raised
      an exception (the exception is printed, not re-raised). Callers must
      check for ``None`` before using the result.
  """

  time.sleep(delay)

  # Browser-like User-Agent header sent with the request.
  hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}

  req = Request(url,headers=hdr)
  try:
    # The context manager closes the response even when parsing fails, so
    # repeated calls do not leak open connections.
    with urlopen(req) as page:
      # Name the parser explicitly: otherwise BeautifulSoup picks whichever
      # parser happens to be installed, so results can vary between machines.
      return BeautifulSoup(page, "html.parser")

  except Exception as exc:
    # Best-effort: report the problem and signal failure with None.
    print(f"The problem is {exc}")
    return None

In [4]:
def get_dataheaders_from_links(url):

  """
  Scrape one company page and return its column headers, name and raw fields.

  The page is fetched with ``get_requests``. The text of every
  ``<p class="font_8">`` and ``<h5 class="font_5">`` element is collected,
  with spaces replaced by underscores.

  Parameters
  ----------
  url : str
      Address of the company page.

  Returns
  -------
  data_headers : list of str
      Column names: the ``<h5>`` texts at the fixed header positions, plus the
      literal names 'Company_Name', 'Company_type', 'Business_type' and
      'Description' inserted at positions 0, 2, 3 and 4.
  company_name : str
      Text of the first ``<p class="font_8">`` element.
  subheaderlist : list of str
      Texts of all ``<h5 class="font_5">`` elements, in document order.

  Raises
  ------
  RuntimeError
      If the page could not be fetched or parsed (``get_requests`` returned
      ``None``).
  IndexError
      If the page has no ``<p class="font_8">`` element, or fewer
      ``<h5 class="font_5">`` elements than the header positions require.
  """

  soup = get_requests(url)
  if soup is None:
    # Fail with the offending URL instead of an AttributeError on None.
    raise RuntimeError(f"Could not fetch or parse {url}")

  headerlist = [
    tag.text.replace(' ', '_')
    for tag in soup.find_all('p',attrs={'class':'font_8'})
  ]
  if not headerlist:
    raise IndexError(f"No <p class='font_8'> element found on {url}")

  company_name = headerlist[0]

  subheaderlist = [
    tag.text.replace(' ', '_')
    for tag in soup.find_all('h5',attrs={'class':'font_5'})
  ]

  # Positions in `subheaderlist` whose text is used as a column header.
  header_indices = [1,8,10,12,14,18,21,25,27,29,31,33,37,39,41,43,45,47,49]

  if len(subheaderlist) <= max(header_indices):
    raise IndexError(
      f"Expected at least {max(header_indices) + 1} <h5 class='font_5'> "
      f"elements on {url}, found {len(subheaderlist)}"
    )

  data_headers = [subheaderlist[i] for i in header_indices]

  # Fixed column names for fields that have no header element on the page.
  data_headers.insert(0,'Company_Name')
  data_headers.insert(2,'Company_type')
  data_headers.insert(3,'Business_type')
  data_headers.insert(4,'Description')

  return data_headers,company_name,subheaderlist

In [5]:
def get_data(company_name,subheaderlist):

  """
  Assemble one output row for a company.

  The row starts with `company_name`, followed by the entries of
  `subheaderlist` found at a fixed set of positions, in that order.
  Raises IndexError when `subheaderlist` is too short for those positions.
  """

  # Positions in `subheaderlist` that hold the values to keep.
  wanted_positions = (2,3,4,5,9,11,13,15,19,22,26,28,30,32,34,38,40,42,44,46,48,50)

  row = [company_name]
  for position in wanted_positions:
    row.append(subheaderlist[position])

  return row

In [10]:
def main():

  """
  Scrape every company page and write the collected rows to a CSV file.

  For each URL in ``us_comp_links`` the page is scraped with
  ``get_dataheaders_from_links`` and turned into a row with ``get_data``.
  The rows are written to 'failure_startup_autopsy.csv' in the current
  working directory. The row index is included, and the column names come
  from the last page scraped.

  Any exception raised while scraping a page propagates and aborts the run
  before the file is written. If there are no links, a message is printed
  and no file is written.
  """

  companies_data_list = []
  data_headers = None

  for link in tqdm(us_comp_links):

    data_headers,company_name,subheaderlist = get_dataheaders_from_links(link)
    companies_data_list.append(get_data(company_name,subheaderlist))

  if not companies_data_list:
    # With no links `data_headers` was never set; nothing meaningful to save.
    print("No company pages were scraped; no CSV written.")
    return

  # Build the frame once from all rows rather than pre-allocating NaNs,
  # assigning row by row and re-sorting the index on every iteration. The
  # default 0..n-1 index matches the number of rows actually collected.
  data_df = pd.DataFrame(companies_data_list, columns=data_headers)

  data_df.to_csv('failure_startup_autopsy.csv')

In [None]:
# Run the scrape: fetches every URL in us_comp_links and writes
# 'failure_startup_autopsy.csv' to the current working directory.
main()