In [331]:
from datetime import date, timedelta
from pathlib import Path
from urllib.parse import quote_plus

import pandas
import requests
from bs4 import BeautifulSoup

from trackers import trackers

In [None]:
# techcrunch.com -> doesn't work properly with Requests;
# should use Selenium instead ('load more' button)

# cnbc.com -> fine

# techstartups.com -> fine

# inc42.com -> doesn't work properly with Requests;
# should use Selenium instead ('load more' button)

# eu-startups.com -> fine

# techstars.com -> pagination doesn't work with Requests

# geekwire.com -> doesn't work without JS; should use Selenium instead

# startupnews.com.au -> website doesn't respond to requests

# pitchbook.com -> doesn't work properly with Requests;
# should use Selenium instead ('load more' button)

# cbinsights.com -> not working, cause unclear (TODO: investigate)

# magazine.startus.cc -> doesn't work properly with Requests;
# should use Selenium instead ('load more' button)

# startupdaily.net -> blocks non-human requests

In [332]:
def iter_daterange(daterange: tuple[date, date]) -> list[date]:
    """Return every calendar day in the half-open interval ``[start, end)``.

    Args:
        daterange: ``(start, end)`` pair of dates; ``end`` itself is excluded.

    Returns:
        Consecutive dates from ``start`` up to the day before ``end``.
        Empty when ``start >= end``.
    """
    start, end = daterange[0], daterange[1]
    days = []
    current = start
    # `<` instead of `!=`: with a reversed range an inequality test never
    # becomes false, so the loop would keep stepping past `end` indefinitely.
    while current < end:
        days.append(current)
        current += timedelta(1)
    return days

def s(url: str, timeout: float = 30) -> BeautifulSoup:
    """Download ``url`` with an HTTP GET and parse the body as HTML.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled server would block
            the whole crawl indefinitely.

    Returns:
        The response body parsed with the built-in ``html.parser``. The HTTP
        status code is not checked, so error pages are parsed like any other
        page (callers inspect the markup themselves).
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')

def month_convert(name_or_abb: str) -> int:
    """Translate an English month name or abbreviation into its number.

    Only the first three characters are looked at, case-insensitively, so
    'January', 'JAN' and 'jan' all give 1. Raises ``KeyError`` when those
    three characters are not a known month abbreviation.
    """
    abbreviations = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                     'jul', 'aug', 'sep', 'oct', 'nov', 'dec')
    number_by_abbreviation = {
        abbreviation: number
        for number, abbreviation in enumerate(abbreviations, start=1)
    }
    key = name_or_abb[:3].lower()
    return number_by_abbreviation[key]

def cringe_pag(page: int) -> int:
    """Write the digit ``1`` after ``page - 1`` and return the result as an int.

    For example 1 -> 1, 2 -> 11, 3 -> 21. The value is built textually, so
    a non-positive page gives e.g. 0 -> -11.
    """
    previous_page = page - 1
    return int(str(previous_page) + '1')

In [366]:
class _m1Parser:
    """Parser for techstartups.com."""

    @classmethod
    def get_article_content(cls, soup: BeautifulSoup) -> str:
        """Join the text of every ``<p>`` inside the ``post_wrapper`` div.

        Args:
            soup: Parsed article page.

        Returns:
            The paragraph texts separated by single spaces, or an empty
            string when the page has no ``post_wrapper`` div (instead of
            failing with ``AttributeError`` on ``None``).
        """
        post = soup.find('div', attrs={'class': 'post_wrapper'})
        if post is None:
            return ''
        return ' '.join(tag.text for tag in post.find_all('p'))

    @classmethod
    def get_articles(cls, daterange: tuple[date]) -> list[tuple[str]]:
        """Collect article links from the per-day archive pages.

        Args:
            daterange: ``(start, end)`` dates, expanded by ``iter_daterange``.

        Returns:
            ``(date, source, link)`` tuples where ``date`` is formatted as
            ``year.month.day`` without zero padding.
        """
        items = []
        # trackers.ForTracker presumably yields the given days while reporting
        # progress under the given label -- confirm against the trackers module.
        for d in trackers.ForTracker('techstartups.com', iter_daterange(daterange)):
            soup = s(f'https://techstartups.com/{d.year}/{d.month}/{d.day}')
            articles = soup.find('div', attrs={'class': 'sidebar_content'})
            if articles is None:
                continue
            # A page without <body> or without a class attribute on it is
            # treated as "not a 404 page" rather than raising.
            body_classes = soup.body.get('class', []) if soup.body is not None else []
            if 'error404' in body_classes:
                continue
            stamp = f'{d.year}.{d.month}.{d.day}'
            for tag in articles.find_all('a'):
                # Only anchors carrying a title are article links; skip any
                # that have no href so the lookup cannot fail.
                if 'title' in tag.attrs and 'href' in tag.attrs:
                    items.append((stamp, 'techstartups.com', tag['href']))
        return items


class _m2Parser:
    """Parser for cnbc.com."""

    @classmethod
    def get_article_content(cls, soup: BeautifulSoup) -> str:
        """Join the text of every ``<p>`` inside the ``Article`` div.

        Args:
            soup: Parsed article page.

        Returns:
            The paragraph texts separated by single spaces, or an empty
            string when the page has no ``Article`` div (instead of failing
            with ``AttributeError`` on ``None``).
        """
        post = soup.find('div', attrs={'class': 'Article'})
        if post is None:
            return ''
        return ' '.join(tag.text for tag in post.find_all('p'))

    @classmethod
    def get_articles(cls, daterange: tuple[date], max_pages: int = 500) -> list[tuple[str]]:
        """Walk the paginated startups listing and collect links in range.

        The article date is read from the link itself (path segments 3-5 of
        the URL are taken as year/month/day). Collection starts with the
        first in-range article and paging stops after the page on which an
        out-of-range article follows.

        Args:
            daterange: ``(start, end)`` dates, expanded by ``iter_daterange``.
            max_pages: Upper bound on listing pages to request, so the loop
                ends even when no article ever falls inside the range.
                Results are cut off at this bound.

        Returns:
            ``(date, source, link)`` tuples where ``date`` is formatted as
            ``year.month.day`` without zero padding.
        """
        wanted = set(iter_daterange(daterange))
        items = []
        if not wanted:
            # Nothing can match an empty range; avoid paging the site at all.
            return items

        started, finished = False, False
        page = 1
        while not finished and page <= max_pages:
            cards = s(f'https://cnbc.com/startups/?page={page}') \
                .find_all('div', attrs={'class': 'Card-textContent'})
            if not cards:
                # Ran past the last listing page.
                break
            for card in cards:
                anchor = card.find('a', attrs={'class': 'Card-title'})
                if anchor is None or 'href' not in anchor.attrs:
                    continue
                uri = anchor['href']
                try:
                    y, m, d = (int(part) for part in uri.split('/')[3:6])
                    published = date(y, m, d)
                except ValueError:
                    # Links without a /yyyy/mm/dd/ path (e.g. /video/...)
                    # carry no usable date and are skipped.
                    continue
                if published in wanted:
                    started = True
                    items.append((f'{y}.{m}.{d}', 'cnbc.com', uri))
                elif started:
                    # First out-of-range article after collection began:
                    # finish this page, then stop paging.
                    finished = True
            page += 1
        return items


class _m3Parser:
    """Parser for eu-startups.com."""

    @classmethod
    def get_article_content(cls, soup: BeautifulSoup) -> str:
        """Return the text of the div with class ``tdi_83``.

        Args:
            soup: Parsed article page.

        Returns:
            The div's text, or an empty string when the page has no such div
            (instead of failing with ``AttributeError`` on ``None``).
        """
        container = soup.find('div', attrs={'class': 'tdi_83'})
        if container is None:
            return ''
        return container.get_text()

    @classmethod
    def get_articles(cls, daterange: tuple[date]) -> list[tuple[str]]:
        """Collect article links from the per-day archive pages.

        Args:
            daterange: ``(start, end)`` dates, expanded by ``iter_daterange``.

        Returns:
            ``(date, source, link)`` tuples where ``date`` is formatted as
            ``year.month.day`` without zero padding. Each link appears once
            per day, in the order it occurs on the page.
        """
        items = []
        # trackers.ForTracker presumably yields the given days while reporting
        # progress under the given label -- confirm against the trackers module.
        for d in trackers.ForTracker('eu-startups.com', iter_daterange(daterange)):
            articles = s(f'https://eu-startups.com/{d.year}/{d.month}/{d.day}') \
                .find('div', attrs={'id': 'tdi_77'})
            if articles is None:
                continue
            # dict.fromkeys removes repeated links like a set would, but keeps
            # page order so the output is the same on every run.
            links = dict.fromkeys(
                tag['href']
                for tag in articles.find_all('a')
                if 'title' in tag.attrs and 'href' in tag.attrs
            )
            stamp = f'{d.year}.{d.month}.{d.day}'
            items.extend((stamp, 'eu-startups.com', link) for link in links)
        return items
            

class Parser:
    """Front end that forwards to the site-specific parser for a domain."""

    # Domain (without a leading 'www.') -> parser class for that site.
    methods = {
        'techstartups.com': _m1Parser,
        'cnbc.com': _m2Parser,
        'eu-startups.com': _m3Parser,
    }

    @classmethod
    def get_article_content(cls, url: str) -> str:
        """Download an article and extract its text with the matching parser.

        Args:
            url: Absolute article URL (``scheme://host/...``).

        Returns:
            The article text as produced by the site parser.

        Raises:
            KeyError: If no parser is registered for the URL's domain.
        """
        host = url.split('/')[2].lower()
        # Strip only a leading 'www.'; a plain replace() would also remove
        # the substring from the middle of a host name.
        domain = host.removeprefix('www.')
        return cls.methods[domain].get_article_content(s(url))

    @classmethod
    def get_articles(cls, domain: str, daterange: tuple[date] = None) -> list[tuple[str]]:
        """List article links for ``domain`` within ``daterange``.

        Args:
            domain: Key of ``methods``.
            daterange: ``(start, end)`` dates; defaults to the 30 days
                before today.

        Returns:
            ``(date, source, link)`` tuples from the site parser.

        Raises:
            KeyError: If no parser is registered for ``domain``.
        """
        if not daterange:
            # Read the clock once so both ends refer to the same day.
            today = date.today()
            daterange = (today - timedelta(30), today)
        return cls.methods[domain].get_articles(daterange)

In [368]:
# Sites to crawl; each one is a key of Parser.methods.
sources = ('techstartups.com', 'cnbc.com', 'eu-startups.com')

# Crawl window: from 90 days ago up to today.
daterange = (date.today() - timedelta(90), date.today())

# Gather the (date, source, link) tuples of every site into one flat list.
urls = []
for source in sources:
    urls.extend(Parser.get_articles(source, daterange))

(90/90) techstartups.com - 93.10s
(90/90) eu-startups.com - 106.69s


In [369]:
# Report how many links were collected and preview the first few, rather
# than echoing the whole list into the notebook output.
print(len(urls))
urls[:10]

680


[('2022.5.29',
  'techstartups.com',
  'https://techstartups.com/2022/05/29/duckduckgo-caught-giving-microsoft-trackers-permission-track-users-third-party-sites-despite-strong-privacy-protection-claim/'),
 ('2022.5.30',
  'techstartups.com',
  'https://techstartups.com/2022/05/30/12-traffic-originating-twitter-made-bots-study-israeli-cybersecurity-startup-cheq-found/'),
 ('2022.5.30',
  'techstartups.com',
  'https://techstartups.com/2022/05/30/austrian-tech-startup-nxrt-raises-extra-2-3-million-expand-simulation-platform-automotive-railway-sectors/'),
 ('2022.5.30',
  'techstartups.com',
  'https://techstartups.com/2022/05/30/candela-unveils-electric-flying-passenger-boat-zero-emissions-debuts-p-8-voyager-venice/'),
 ('2022.5.31',
  'techstartups.com',
  'https://techstartups.com/2022/05/31/meta-apes-becomes-worlds-first-blockchain-game-go-live-bnb-application-sidechain-offers-best-web2-web3-gaming/'),
 ('2022.5.31',
  'techstartups.com',
  'https://techstartups.com/2022/05/31/tech-bi

In [None]:
# NN part

In [334]:
import spacy

In [370]:
# One row per collected link. `content` starts as None and is filled in by
# the download step. A named list is used instead of `_`, which IPython
# reserves for the last cell output.
article_rows = [(*item, None) for item in urls]
df = pandas.DataFrame(data=article_rows, columns=['date', 'source', 'link', 'content'])

In [371]:
# Download the text of every article that has no content yet.
for row_id, row in trackers.ForTracker('downloading content', list(df.iterrows())):
    # isna() covers both None (fresh table) and NaN (table reloaded from CSV).
    if pandas.isna(row.content):
        # .at writes straight into `df`; the chained form
        # `df.content[row_id] = ...` may assign to a temporary copy instead.
        df.at[row_id, 'content'] = Parser.get_article_content(row.link)

(680/680) downloading content - 868.98s


In [325]:
# or import from file
# NOTE(review): the frame read here is only displayed, not assigned. To
# actually continue from the saved file, bind it:
# df = pandas.read_csv('content.csv')
pandas.read_csv('content.csv')

Unnamed: 0,date,source,link,content
0,2022.5.27,techstartups.com,https://techstartups.com/2022/05/27/layoffs-hi...,The current bear market has sent ripples aroun...
1,2022.5.27,techstartups.com,https://techstartups.com/2022/05/27/oracle-red...,We covered Bybit back in March after the crypt...
2,2022.5.28,techstartups.com,https://techstartups.com/2022/08/25/australia-...,The world’s food supply chain is broken and Sy...
3,2022.5.28,techstartups.com,https://techstartups.com/2022/08/25/amazon-shu...,"In July, we wrote about Amazon after the retai..."
4,2022.5.28,techstartups.com,https://techstartups.com/2022/08/25/bitcoin-at...,"Bitcoin Depot, the largest Bitcoin ATM operato..."
...,...,...,...,...
791,2022.8.24,eu-startups.com,https://www.eu-startups.com/2022/08/eu-funding...,The EU is running several large funding progra...
792,2022.8.24,eu-startups.com,https://www.eu-startups.com/2022/08/polish-fir...,Polish venture capital firm Inovo VC has just ...
793,2022.8.24,eu-startups.com,https://www.eu-startups.com/2022/08/the-weroad...,The travel industry has been on a rollercoaste...
794,2022.8.24,eu-startups.com,https://www.eu-startups.com/2022/08/german-sta...,Berlin-based Dryad has raised €10.5 million in...


In [372]:
# Normalise whitespace in every article text: newlines and non-breaking
# spaces become ordinary spaces (deleting the latter glued neighbouring words
# together), zero-width spaces are removed and both ends are trimmed.
# Assigning the whole column avoids the chained `df.content[id] = ...` form,
# which may write to a temporary copy.
df['content'] = (
    df['content']
    .str.replace('\n', ' ', regex=False)
    .str.replace('\xa0', ' ', regex=False)
    .str.replace('\u200b', '', regex=False)
    .str.strip()
)

In [373]:
# Show the article table after the whitespace clean-up.
df

Unnamed: 0,date,source,link,content
0,2022.5.29,techstartups.com,https://techstartups.com/2022/05/29/duckduckgo...,For many of you who have been following us ove...
1,2022.5.30,techstartups.com,https://techstartups.com/2022/05/30/12-traffic...,"About two weeks ago, billionaire and Tesla CEO..."
2,2022.5.30,techstartups.com,https://techstartups.com/2022/05/30/austrian-t...,While many startups are still struggling to ra...
3,2022.5.30,techstartups.com,https://techstartups.com/2022/05/30/candela-un...,Candela is a Swedish tech startup and maker of...
4,2022.5.31,techstartups.com,https://techstartups.com/2022/05/31/meta-apes-...,"Last week, we wrote about Ankr after theWeb3 i..."
...,...,...,...,...
675,2022.8.24,eu-startups.com,https://www.eu-startups.com/2022/08/lisbon-bas...,Portuguese startup SocialTalk has raised €770k...
676,2022.8.25,eu-startups.com,https://www.eu-startups.com/2022/08/danish-sta...,Neurons has secured over €6 million in one of ...
677,2022.8.25,eu-startups.com,https://www.eu-startups.com/2022/08/thirdweb-r...,Pioneering tech platform for building NFT and ...
678,2022.8.25,eu-startups.com,https://www.eu-startups.com/2022/08/10-greente...,Greentech is one of Europe’s biggest growth ar...


In [374]:
# Save the article table; index=False keeps the row index out of the file.
df.to_csv('content.csv', index=False)

In [375]:
# Loaded spaCy pipelines by model name, so the model is read from disk once
# per kernel instead of once per article.
_SPACY_MODELS = {}


def _get_spacy_model(name: str = 'en_core_web_sm'):
    """Return the spaCy pipeline ``name``, loading it on first use only."""
    if name not in _SPACY_MODELS:
        _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]


def _search_hit_count(company: str):
    """Return the result count Yahoo shows for ``company``, or None if unreadable."""
    # Collapse whitespace, then URL-encode so characters such as '&' stay
    # inside the query value instead of starting a new parameter.
    query = quote_plus(' '.join(company.split()))
    soup = s(f'https://search.yahoo.com/search;?p={query}')
    try:
        res = soup.find('span', {'class': ['fz-14', 'lh-22']}).get_text()
        # The count is taken from the second whitespace-separated token.
        return int(res.split()[1].replace(",", ""))
    except (AttributeError, IndexError, ValueError):
        # Span missing, or its text does not have a number in that position.
        return None


def extract_startups(article: str, max_results: int = 30000) -> list[str]:
    """Pick organisation names from ``article`` that have few search results.

    Entities labelled ``ORG`` by spaCy are collected (using their lemma, with
    a possessive ``’s`` removed) and de-duplicated case-insensitively. Each
    remaining name is looked up on Yahoo search; names whose reported result
    count is at most ``max_results`` are returned.

    Args:
        article: Article text.
        max_results: Largest search-result count a name may have to be kept.

    Returns:
        The kept names in order of first appearance in the article. Names
        whose result count could not be read are left out.
    """
    doc = _get_spacy_model()(article.strip())

    # lower-cased name -> first spelling seen; dict order keeps the output
    # stable between runs.
    unique_orgs = {}
    for entity in doc.ents:
        if entity.label_ != "ORG":
            continue
        name = str(entity.lemma_).replace('’s', '')
        if not name.strip():
            continue
        unique_orgs.setdefault(name.lower(), name)

    out = []
    for company in unique_orgs.values():
        hits = _search_hit_count(company)
        if hits is not None and hits <= max_results:
            out.append(company)
    return out

In [337]:
# Topic filter: an article is processed only if its text contains at least
# one of these words.
keywords = (
    'transport',
    'logistics',
    'transportation',
    'mobility',
    'vehicle',
    'car',
    'bus',
    'tram',
    'underground',
    'metro',
)

In [376]:
# Empty result table: one row per (article, extracted entity) pair.
_df = pandas.DataFrame(columns=['date', 'source', 'link', 'entity'])

In [384]:
# or load from file
# Resume from an earlier run only when its results file exists; otherwise the
# empty table created before this cell is kept, so a fresh run does not fail.
if Path('data.csv').exists():
    _df = pandas.read_csv('data.csv')

In [385]:
# Show the entity table currently held in `_df`.
_df

Unnamed: 0,date,source,link,entity
0,2022.5.30,techstartups.com,https://techstartups.com/2022/05/30/austrian-t...,eQventure
1,2022.5.30,techstartups.com,https://techstartups.com/2022/05/30/austrian-t...,nxrt CEOLukas Stranger
2,2022.5.31,techstartups.com,https://techstartups.com/2022/05/31/finnish-cy...,Hoxhunt
3,2022.6.2,techstartups.com,https://techstartups.com/2022/06/02/gemini-cry...,ConnectU
4,2022.6.2,techstartups.com,https://techstartups.com/2022/06/02/gemini-cry...,HarvardConnection
5,2022.6.2,techstartups.com,https://techstartups.com/2022/06/02/65788/,zkSNARK
6,2022.6.2,techstartups.com,https://techstartups.com/2022/06/02/marketacro...,FactBlock
7,2022.6.2,techstartups.com,https://techstartups.com/2022/06/02/marketacro...,UNOPND
8,2022.6.2,techstartups.com,https://techstartups.com/2022/06/02/marketacro...,Factblock & Korea
9,2022.6.6,techstartups.com,https://techstartups.com/2022/06/06/british-te...,cybsafe


In [379]:
# Resume support: when `_df` already holds results, skip articles until the
# one whose link is in the last row of `_df`, then continue from that article.
skipQ = len(_df.index) != 0
resume_link = _df.iloc[-1].link if skipQ else None

# NOTE(review): keywords are matched as case-sensitive substrings, so 'car'
# also matches 'care' and 'bus' matches 'business' -- confirm this is intended.
new_rows = []
try:
    for i, row in trackers.ForTracker('extracting startups', list(df.iterrows())):
        if skipQ and row.link != resume_link:
            continue
        skipQ = False
        if not isinstance(row.content, str):
            # e.g. NaN for an empty cell after reloading the table from CSV.
            continue
        if any(kw in row.content for kw in keywords):
            for su in extract_startups(row.content):
                new_rows.append(
                    {'date': row.date, 'source': row.source,
                     'link': row.link, 'entity': su}
                )
finally:
    # Append everything in one concat instead of once per entity; running it
    # in `finally` keeps the rows gathered so far if the loop is interrupted.
    if new_rows:
        _df = pandas.concat(
            [_df, pandas.DataFrame(new_rows)], ignore_index=True, axis=0
        )

(680/680) extracting startups - 1992.05s


In [383]:
# Save the entity table without exact duplicate rows. Duplicates can arise
# because a resumed run processes the last saved article again.
_df.drop_duplicates(ignore_index=True).to_csv('data.csv', index=False)