In [1]:
from bs4 import BeautifulSoup
from dateutil.parser import parse

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time

import sys

# --- Browser setup ----------------------------------------------------------
# Two browser sessions are created here and used as module-level globals by the
# functions below: `firefox` loads Crunchbase pages, `driver` (PhantomJS) loads
# TechCrunch search pages.

# Work on a copy: DesiredCapabilities.FIREFOX is a class-level dict shared by
# every user of the class, so editing it in place would leak these settings
# into any other Firefox session created in the same kernel.
firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
firefox_capabilities['marionette'] = True
firefox_capabilities['handleAlerts'] = True
firefox_capabilities['acceptSslCerts'] = True
firefox_capabilities['acceptInsecureCerts'] = True
geckoPath = 'driver/geckodriver.exe'

firefox = webdriver.Firefox(capabilities=firefox_capabilities, executable_path=geckoPath)
driver = webdriver.PhantomJS(executable_path='driver/phantomjs.exe')

# Warm-up loads. They run BEFORE the page-load timeouts are applied, so the
# first visit to each site is not cut short by the (very tight) Firefox timeout.
try:
    firefox.get('https://www.crunchbase.com/organization/apple')
except Exception as exc:
    # Best-effort: a failed warm-up must not prevent the rest of the notebook
    # from running. `Exception` (not a bare except) so Ctrl-C still interrupts.
    print('initial crunchbase load failed:', exc)
driver.get('https://techcrunch.com/')

# From here on, Firefox page loads are abandoned after 2 seconds and PhantomJS
# page loads after 30 seconds; callers are expected to handle the timeout.
firefox.set_page_load_timeout(2)
driver.set_page_load_timeout(30)

def existence_in_crunchbase(name):
    """Look ``name`` up through TechCrunch search and return its Crunchbase URL.

    The TechCrunch search page for ``name`` is loaded in the module-level
    PhantomJS ``driver``. Only the first ``h2.post-title a`` link on the result
    page is considered; it is returned when it points at ``www.crunchbase.com``.

    Args:
        name: Company name to search for.

    Returns:
        The ``href`` of the first search hit if it is a Crunchbase link,
        otherwise ``None`` (no hits, hit without ``href``, or non-Crunchbase hit).

    Raises:
        SystemExit: if the page load is interrupted with Ctrl-C.
    """
    # Local import keeps this cell self-contained (the notebook's import cell
    # does not bring urllib in).
    from urllib.parse import quote

    base_url = 'www.crunchbase.com'
    # Percent-encode the name: characters such as '#', '?', '%' or '/' would
    # otherwise be interpreted as URL syntax and truncate/alter the search term.
    url = 'https://techcrunch.com/search/' + quote(name, safe='')

    try:
        driver.get(url)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        # A load failure/timeout is tolerated: whatever page source the driver
        # currently holds is still parsed below.
        print('timeout of phantomjs')

    soup = BeautifulSoup(driver.page_source, "html.parser")
    first_hit = soup.select_one("h2.post-title a")
    if first_hit is None:
        return None

    # .get() instead of ["href"]: an anchor without href must not raise KeyError.
    href = first_hit.get("href")
    if not href or base_url not in href:
        return None
    return href
            

def _read_crunchbase_profile():
    """Parse the page currently held by ``firefox`` into (name, dt tags, dd tags)."""
    soup = BeautifulSoup(firefox.page_source, "html.parser")
    heading = soup.select_one("#profile_header_heading")
    company_name = heading.getText() if heading is not None else ''
    return company_name, soup.select("div.details dt"), soup.select("div.details dd")


def extract_company_techcrunch(name, url, max_polls=100, poll_interval=0.3):
    """Scrape founding date and employee count from a Crunchbase profile page.

    Despite the function name, the page read here is the Crunchbase profile at
    ``url``, loaded in the module-level ``firefox`` session. After requesting the
    page, the rendered source is polled until the profile heading matches
    ``name`` (case-insensitively) and at least one ``div.details dt`` entry is
    present, or until the poll budget is exhausted.

    Args:
        name: Company name expected in the ``#profile_header_heading`` element.
        url: Crunchbase profile URL to load.
        max_polls: Maximum number of re-reads after the first one (default 100,
            matching the previous hard-coded limit).
        poll_interval: Seconds to sleep before each re-read (default 0.3).

    Returns:
        ``(founded, employees)``: the text of the ``dd`` paired with the ``dt``
        containing "Founded" / "Employees" (employees text is cut at the first
        ``'|'``). Either value is ``None`` when not found or when the heading
        never matched ``name``.

    Raises:
        SystemExit: if interrupted with Ctrl-C while loading or polling.
    """
    try:
        firefox.get(url)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        # Expected: the Firefox session uses a very short page-load timeout, so
        # get() usually raises while the page keeps rendering. Polling below
        # picks the content up once it is available.
        pass

    wanted = name.lower()
    company_name, blocks_dt, blocks_dd = '', [], []

    # Attempt 0 is the immediate read; attempts 1..max_polls sleep first.
    for attempt in range(max_polls + 1):
        if attempt:
            time.sleep(poll_interval)
        try:
            company_name, blocks_dt, blocks_dd = _read_crunchbase_profile()
        except KeyboardInterrupt:
            sys.exit()
        except Exception:
            # Page source may be unavailable mid-navigation; keep the previous
            # snapshot and try again on the next poll.
            pass
        if company_name.lower() == wanted and blocks_dt:
            break
    else:
        # Only reported when no poll ever produced a matching, populated page.
        print('fail to crawl ', name , ' in crunchbase')

    founded = None
    employees = None
    if company_name.lower() == wanted:
        # zip() pairs each label with its value and stops at the shorter list,
        # so a dt without a matching dd cannot raise IndexError.
        for label_tag, value_tag in zip(blocks_dt, blocks_dd):
            label = label_tag.getText()
            if 'Founded' in label:
                founded = value_tag.getText()
            if 'Employees' in label:
                employees = value_tag.getText().split('|')[0]

    print( name , founded , employees)
    return founded, employees



In [2]:
import time
import json
import os

class CompaniesManager:
    """Maintain the list of companies mentioned in the crawled articles.

    Articles are read from ``ARTICLES_PATH`` and companies from
    ``COMPANIES_PATH`` (both JSON). ``extract_companies`` folds the articles'
    company mentions into the company list, ``extend_crunch`` enriches each
    company with Crunchbase data, and ``save_to_disk`` persists the list.

    Attributes:
        companies: list of company dicts (keys written by this class: ``name``,
            ``dateFound``, ``count``, ``sentiment``, ``articles``,
            ``extra_infos``, and later ``search_label``, ``foundationDate``,
            ``number_of_employees``).
        articles: list of article dicts as loaded from disk.
        companies_name: names of the entries in ``companies``, in the same order.
    """

    # File locations, kept as class attributes so they can be overridden.
    ARTICLES_PATH = 'data/raw_articles.json'
    COMPANIES_PATH = 'data/raw_companies.json'
    # extend_crunch checkpoints to disk every SAVE_EVERY companies.
    SAVE_EVERY = 20

    def __init__(self):
        """Create the manager and load any existing articles/companies from disk."""
        self.companies = []
        self.articles = []
        self.companies_name = []
        self.load_articles()
        self.load_companies()

    def load_articles(self):
        """Load ``self.articles`` from ``ARTICLES_PATH`` if that file exists."""
        if os.path.isfile(self.ARTICLES_PATH):
            with open(self.ARTICLES_PATH) as f:
                self.articles = json.load(f)

    def load_companies(self):
        """Load ``self.companies`` from ``COMPANIES_PATH`` if that file exists.

        ``self.companies_name`` is rebuilt from the loaded list, so calling this
        method more than once does not accumulate duplicate names.
        """
        if os.path.isfile(self.COMPANIES_PATH):
            with open(self.COMPANIES_PATH) as f:
                self.companies = json.load(f)
            self.companies_name = [company["name"] for company in self.companies]

    def extract_companies(self):
        """Merge the company mentions of every article into ``self.companies``.

        For each company named in an article: if the company is already known
        and the article has not been counted for it yet, its ``count`` and
        ``sentiment`` are increased and the article is recorded; if the company
        is new, a fresh record is appended. The per-article ``relevance`` and
        ``count`` come from the article's ``extra_infos`` entry whose ``text``
        equals the company name, and are ``None`` when there is no such entry.
        The result is saved to disk at the end.
        """
        # One-time indexes so each mention is resolved without rescanning the
        # whole company list. First record wins if a name appears twice.
        known_names = set(self.companies_name)
        by_name = {}
        for company in self.companies:
            by_name.setdefault(company["name"], company)

        for article in self.articles:
            article_id = article["id"]

            # text -> info; a later entry with the same text overrides an
            # earlier one, matching the previous "last match wins" scan.
            infos_by_text = {}
            for info in article.get("extra_infos") or []:
                infos_by_text[info["text"]] = info

            for company_name in article["companies"]:
                # Resolved per company, so a company without a matching entry
                # gets None instead of a NameError or the values left over from
                # the previously processed company.
                info = infos_by_text.get(company_name)
                relevance = info["relevance"] if info is not None else None
                count_in_article = info["count"] if info is not None else None

                # NOTE: "revelance" is misspelled, but it is the key already
                # stored in existing data files, so it is kept as is.
                mention = {
                    "id": article_id,
                    "count_in_article": count_in_article,
                    "revelance": relevance,
                }

                if company_name in known_names:
                    company = by_name.get(company_name)
                    # Skip articles already counted for this company so the
                    # method can be re-run safely on the same data.
                    if company is not None and article_id not in company["articles"]:
                        company["count"] = company["count"] + 1
                        company["sentiment"] = company["sentiment"] + article["sentiment"]
                        company["articles"].append(article_id)
                        company["extra_infos"].append(mention)
                else:
                    company = {
                        "name": company_name,
                        "dateFound": int(time.time()),  # whole seconds since epoch
                        "count": 1,
                        "sentiment": article["sentiment"],
                        "articles": [article_id],
                        "extra_infos": [mention],
                    }
                    self.companies.append(company)
                    self.companies_name.append(company_name)
                    known_names.add(company_name)
                    by_name[company_name] = company

        self.save_to_disk()

    def extend_crunch(self):
        """Enrich every not-yet-processed company with Crunchbase data.

        A company counts as processed once it has a ``search_label`` key: the
        Crunchbase URL when one was found, or the string ``"0"`` when not.
        ``foundationDate`` and ``number_of_employees`` are only set when the
        scraper returned a value. Progress is checkpointed to disk every
        ``SAVE_EVERY`` companies (when something changed) and once at the end.
        """
        dirty = False  # True when there are changes not yet written to disk
        for index, company in enumerate(self.companies):
            name = company["name"]
            if "search_label" in company:
                print("skip: ", name)
            else:
                url_crunchbase = existence_in_crunchbase(name)

                if url_crunchbase is not None:
                    print(name, ' in crunchbase')
                    founded, employees = extract_company_techcrunch(name, url_crunchbase)
                    company["search_label"] = url_crunchbase
                    if founded is not None:
                        company["foundationDate"] = founded
                    if employees is not None:
                        company["number_of_employees"] = employees
                else:
                    company["search_label"] = str(0)
                    print(name, 'not in crunchbase')
                dirty = True

            # Checkpoint so an interrupted crawl loses at most SAVE_EVERY
            # companies; skipped when nothing changed since the last save.
            if (index + 1) % self.SAVE_EVERY == 0 and dirty:
                self.save_to_disk()
                dirty = False
        self.save_to_disk()

    def save_to_disk(self):
        """Write ``self.companies`` to ``COMPANIES_PATH`` as indented JSON.

        The data is written to a temporary sibling file and then moved over the
        target, so an interruption in the middle of a save cannot leave a
        truncated companies file behind.
        """
        path = self.COMPANIES_PATH
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        tmp_path = path + '.tmp'
        with open(tmp_path, 'w') as company_file:
            json.dump(self.companies, company_file, indent=2)
        os.replace(tmp_path, path)
                
                
                
            
    

In [None]:
# Build/refresh the company list from the loaded articles, then enrich every
# company that has not been looked up yet with Crunchbase data.
CM = CompaniesManager()
CM.extract_companies()
try:
    CM.extend_crunch()
except KeyboardInterrupt:
    sys.exit()
finally:
    # Persist whatever was gathered, however the crawl ended. extend_crunch only
    # checkpoints every 20 companies, and the scraping helpers turn Ctrl-C into
    # SystemExit (which the handler above does not catch), so without this the
    # results since the last checkpoint would be lost on interruption.
    CM.save_to_disk()

skip:  SENSORO
skip:  Microsoft Corp
skip:  Sumitomo
skip:  Manchester City Verve
skip:  SalesWings
skip:  SA tech
skip:  Avito
skip:  Tshimologong Innovation Hub
skip:  Flickr
skip:  Niantic Labs
skip:  Oblong
skip:  Osterhout Design Group
skip:  Niantic
skip:  Oblong Industries
skip:  Mezzanine
skip:  The Pokemon Company
skip:  GamesBeat
skip:  BNJ Marketing
skip:  DormaKaba
skip:  Berkshire companies
skip:  Bezos
skip:  Private Equity
skip:  Utrip
skip:  Greylock
skip:  Elsight
skip:  LiveU
skip:  WAN
skip:  Vodaphone
skip:  Optus
skip:  GoRemote
skip:  Agri Alliance
skip:  Bitbond
skip:  Obotritia
skip:  FiveAI
skip:  GuestReady
skip:  Carspring
skip:  Moneybox
skip:  Drivetribe
skip:  Realla
skip:  Habito
skip:  Sling & Stone
skip:  Crowdcube
skip:  Cali Rice
skip:  Nominet Trust
skip:  3D Robotics
skip:  AgFunder
skip:  Airware
skip:  Arbe Robotics
skip:  Zipline International
skip:  Redbird
skip:  Bloomberg Businessweek
skip:  Yuneec Electric Aviation
skip:  CyPhy Works
skip:  E

Emerge Analytics not in crunchbase
Luno  in crunchbase
Luno 2013 11 - 50 
Lula Lend not in crunchbase
Isazi Consulting not in crunchbase
AlphaCode not in crunchbase
NCR  in crunchbase
NCR 1884 10k + 
Seedstars World  in crunchbase
Seedstars World None None
Fomo Travel not in crunchbase
Brightcap not in crunchbase
Johannesburg Social Network not in crunchbase
GD Industries not in crunchbase
Medium-scale Enterprises not in crunchbase
m.Labs not in crunchbase
Shosholoza Meyl not in crunchbase
SiMODiSA not in crunchbase
Barclays Africa Group not in crunchbase
Landmark Barclays Africa not in crunchbase
Barclay not in crunchbase
Jumo  in crunchbase
Jumo February 1, 2010 1 - 10 
PricewaterhouseCoopers not in crunchbase
MoneyTree  in crunchbase
MoneyTree April 23, 2012 11 - 50 
Rebright Partners  in crunchbase
Rebright Partners January 11, 2008 None
Archimedes Labs  in crunchbase
Archimedes Labs May 1, 2005 None
Edgars not in crunchbase
SA Ignite  in crunchbase
SA Ignite 2006 11 - 50 
TRECC no

Citymapper  in crunchbase
Citymapper 2011 11 - 50 
Microsemi  in crunchbase
Microsemi 1960 5k - 10k 
Altera  in crunchbase
Altera 1983 5 in Crunchbase
Cadence Design Systems  in crunchbase
Cadence Design Systems 1988 5k - 10k 
United Design Systems not in crunchbase
Linley Group not in crunchbase
Synopsys  in crunchbase
Synopsys 1986 5k - 10k 
Atmel  in crunchbase
Atmel January 1, 1984 5k - 10k 
HP Enterprise not in crunchbase
Juniper  in crunchbase
Juniper 1996 51 - 100 
livestreaming not in crunchbase
Deezer  in crunchbase
Deezer August 1, 2006 501 - 1k 
Ultra  in crunchbase
Ultra May 1, 2014 1 - 10 
Goldenvoice  in crunchbase
Goldenvoice 1981 11 - 50 
fiat not in crunchbase
PST Consulting not in crunchbase
TGPR  in crunchbase
TGPR Unknown 1 - 10 
NOLs not in crunchbase
JumpStart Inc.  in crunchbase
JumpStart Inc. 2003 None
Fog Creek Software  in crunchbase
Fog Creek Software September 1, 2000 11 - 50 
Allied Gear not in crunchbase
Internet Corporation not in crunchbase
Continuum Ana

Blue Origin  in crunchbase
Blue Origin 2011 251 - 500 
Peddler Brewing Company not in crunchbase
New Shephard not in crunchbase
TechBargains  in crunchbase
TechBargains 1999 501 - 1k 
HostGator  in crunchbase
HostGator 2002 3 in Crunchbase
PCMag  in crunchbase
PCMag 1982 2 in Crunchbase
AMD  in crunchbase
AMD May 1, 1969 10k + 
Vizio  in crunchbase
Vizio 2002 251 - 500 
DellDell not in crunchbase
Wordpress  in crunchbase
fail to crawl  Wordpress  in crunchbase
Wordpress None None
CloudFlare  in crunchbase
CloudFlare July 1, 2009 251 - 500 
Heinz  in crunchbase
Heinz 1869 10k + 
Sourcify  in crunchbase
Sourcify 2016 None found in Crunchbase
Digitalux not in crunchbase
CNBC  in crunchbase
CNBC April 17, 1989 33 in Crunchbase
TINYpulse  in crunchbase
TINYpulse November 1, 2012 4 in Crunchbase
CareerBuilder  in crunchbase
CareerBuilder 1995 5k - 10k 
Veriato  in crunchbase
Veriato 1998 51 - 100 
Come Recommended  in crunchbase
Come Recommended December 1, 2008 11 - 50 
Red Beach Advisors  