In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
import openai
import time
import json
from threading import Lock
import tiktoken

In [10]:
base_url = 'https://www.finsmes.com/category/usa'

urls_to_scrape = ['https://www.finsmes.com/category/usa']

openai.api_key = 'API-Key' 

# Generate URLs based on the known pattern
other_url = [f"{base_url}/page/{n}" for n in range(2, 5000)]  # You can adjust the range as needed

urls_to_scrape.extend(other_url)


## Scrapping the articles

In [4]:
%%time

# Start with the initial URL
# current_url = 'https://www.finsmes.com/category/usa'
# urls_to_scrape= [current_url]        

error_url= []

def get_page_name(soup):
    nav_bar = soup.find('div', class_='header-navigation')
    if nav_bar:
        current_page = nav_bar.find('a', {'aria-current': 'page'})
        if current_page:
            return current_page.text.strip()
    return "Unknown"  # Return "Unknown" or some default value if the page name cannot be found




def scrape_articles_from_page(url):
    # Initialize lists to store data
    dates = []
    headlines = []
    contents = []
    page_names = []

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract page name using the previously defined function
    page_name = get_page_name(soup)
    
    # Find all article elements
    articles = soup.find_all('article', class_='post-module')
    
    for article in articles:
        # Extract the headline
        headline_tag = article.find('h2')  # or whatever tag contains the headline
        if headline_tag:
            headline = headline_tag.text.strip()
            headlines.append(headline)

            # Find the link to the full article
            link = article.find('a', href=True)
            if link:
                full_url = urljoin(base_url, link['href'])

                # Fetch the full article content
                article_response = requests.get(full_url)
                article_soup = BeautifulSoup(article_response.text, 'html.parser')

                # Extract the article content (adjust the selector as needed)
                content_tag = article_soup.find('div', class_='entry-content')  # adjust if needed
                if content_tag:
                    content = content_tag.text.strip()
                    contents.append(content)

                    # Extract the date from the content
                    date_match = re.search(r'\d{2}/\d{2}/\d{4}$', content)
                    date = date_match.group() if date_match else 'Date not found'
                    dates.append(date)

                    # Append the dynamically extracted page name for each article
                    page_names.append(page_name)
                else:
                    contents.append('Content not found')
                    dates.append('Date not found')
                    page_names.append(page_name)
            else:
                headlines.pop()  # Remove the last headline since no link was found
        else:
            continue  # Skip this article
         
    # Return the BeautifulSoup object to find the "previous" link
    return soup,dates,headlines,contents,page_names


# New function to handle scraping of a single page
def scrape_page(url):
    page_data = {'Date': [], 'Heading': [], 'Content': [], 'Page Name': []}
    try:
        soup, dates, headlines, contents, page_names = scrape_articles_from_page(url)
        page_data['Date'].extend(dates)
        page_data['Heading'].extend(headlines)
        page_data['Content'].extend(contents)
        page_data['Page Name'].extend(page_names)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        error_url.append(url)
        
    return page_data


# Use ThreadPoolExecutor to scrape pages in parallel
def scrape_all_pages(urls_to_scrape,worker):
    all_page_data = {'Date': [], 'Heading': [], 'Content': [], 'Page Name': []}
    with ThreadPoolExecutor(max_workers=worker) as executor:
        futures = {executor.submit(scrape_page, url) for url in urls_to_scrape}
        for future in as_completed(futures):
            page_data = future.result()
            if page_data is None:
                # If a page returns None, it's an indication that we've reached an empty page or end of available content
                break
            for key in all_page_data:
                all_page_data[key].extend(page_data[key])
    return all_page_data



CPU times: user 6 µs, sys: 1e+03 ns, total: 7 µs
Wall time: 8.82 µs


In [3]:
%%time
# Scraping all the articles 

# Scrape all pages
collected_data = scrape_all_pages(urls_to_scrape,300)

# Create a DataFrame
df = pd.DataFrame(collected_data)


In [4]:
%%time
# Scraping remaining error articles 

collected_data = scrape_all_pages(error_url,300)

# Create a DataFrame

df1 = pd.DataFrame(collected_data)

# combining the data

df2 = df.append(df1)

In [100]:
df3 = df2.head(1000)


In [97]:
%%time
# Calssifying without threading takes around 8 to 9 mins

df3['Classification'] = df3['Content'].apply(classify_startup)


CPU times: user 7.83 s, sys: 1.42 s, total: 9.25 s
Wall time: 8min 20s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [98]:
df3

Unnamed: 0,Date,Heading,Content,Page Name,Classification
0,05/10/2023,Mach Industries Raises $79M in Series A Funding,"Mach Industries, an Austin, TX-based defense t...",USA,"(B2B, 240)"
1,05/10/2023,SuperOrdinary Closes $58M Series B Funding,"SuperOrdinary, a Shanghai, New York City, and ...",USA,"(B2B, 411)"
2,05/10/2023,Saleo Raises $13M in Funding,"Saleo, a San Francisco, CA-based startup provi...",USA,"(B2B, 229)"
3,05/10/2023,SafeRide Health Raises Series C Funding,"SafeRide Health, a San Antonio, CA-based techn...",USA,"(B2B, 334)"
4,05/10/2023,Konect.ai Raises $5.5M in Seed Funding,"Konect.ai, a Houston, TX-based provider of a S...",USA,"(B2B, 325)"
...,...,...,...,...,...
995,25/09/2023,Allient Acquires Sierramotion,"Allient (Nasdaq: ALNT), a Buffalo, NYC-based d...",USA,"(B2B, 294)"
996,25/09/2023,Actionstep Acquires Soluno,"Actionstep, a Denver, CO-based provider of a c...",USA,"(B2B, 296)"
997,25/09/2023,Osteal Therapeutics Closes $23M Series C Finan...,"Osteal Therapeutics, a Dallas, TX-based clinic...",USA,"(B2B, 349)"
998,25/09/2023,InformedDNA Acquires gWell Health,"gWell’s founder and CEO Surya Singh, MD, has t...",USA,"(B2B, 504)"


## Exporting the dataframe

In [36]:
df2 = pd.read_csv('us_stock_data.csv')
df3 = df2.head(100)

In [38]:
df3

Unnamed: 0.1,Unnamed: 0,Date,Heading,Content,Page Name
0,0,05/10/2023,Mach Industries Raises $79M in Series A Funding,"Mach Industries, an Austin, TX-based defense t...",USA
1,1,05/10/2023,SuperOrdinary Closes $58M Series B Funding,"SuperOrdinary, a Shanghai, New York City, and ...",USA
2,2,05/10/2023,Saleo Raises $13M in Funding,"Saleo, a San Francisco, CA-based startup provi...",USA
3,3,05/10/2023,SafeRide Health Raises Series C Funding,"SafeRide Health, a San Antonio, CA-based techn...",USA
4,4,05/10/2023,Konect.ai Raises $5.5M in Seed Funding,"Konect.ai, a Houston, TX-based provider of a S...",USA
...,...,...,...,...,...
95,95,12/10/2023,Trait Biosciences Closes Financing Round,"Trait Biosciences, a Santa Fe, NM-based cannab...",USA
96,96,12/10/2023,MedaSystems Raises Seed Financing,"MedaSystems, a Menlo Park, CA-based developer ...",USA
97,97,12/10/2023,Alethia Venture Partners Launches $50M Debut Fund,"Alethia Venture Partners, a Los Angeles, CA-ba...",USA
98,98,12/10/2023,Harmony Receives Investment from Connetic Vent...,"Harmony, a Minneapolis, MN-based digital platf...",USA


## Classification using Open AI

In [43]:
%%time
# Calssifying with threading takes around 1 min


lock = Lock()

enc = tiktoken.get_encoding("cl100k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("gpt-3.5-turbo-instruct")


def count_tokens(text, encoding):
    return len(enc.encode(text))

def classify_startup(description):
    try:
        prompt = f"Determine if the following startup is a B2B or B2C company based on its description. Give only one word answer B2B or B2C else Not sure :\n\nDescription: {description}\n\nIs this a B2B or B2C startup?"
        token_count = count_tokens(prompt, enc)
        response = openai.Completion.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=60,
            temperature=0.7
        )
        return response.choices[0].text.strip(), token_count
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return 'Error', 0

def save_progress(classes):
    with open('progress.json', 'w') as f:
        json.dump(classes, f)

token_limit_per_minute = 240000  # Set your token limit
tokens_used = 0
start_time = time.time()

classes = []

with ThreadPoolExecutor(max_workers=20) as executor:
    entry = 0
    futures = {executor.submit(classify_startup, description):idx for idx,description in enumerate(df3['Content'])}
    classes = [None] * len(futures)
    
    for future in as_completed(futures):
        entry +=1
        if entry%10==0:
            print(f'counting {entry}')
            
        cls, token_count = future.result()
        with lock:
            tokens_used += token_count
            current_time = time.time()
            
        index = futures[future]
        
        
        # Check if token limit is reached
        if tokens_used >= token_limit_per_minute:
            time_to_wait = 60 - (current_time - start_time)
            if time_to_wait > 0:
                time.sleep(time_to_wait)
            tokens_used = 0
            start_time = time.time()

        if cls != 'Error':
            classes[index] = cls
            save_progress(classes)

        



counting 10
counting 20
counting 30
counting 40
counting 50
counting 60
counting 70
counting 80
counting 90
counting 100
CPU times: user 681 ms, sys: 253 ms, total: 934 ms
Wall time: 3.11 s


In [42]:
# Adding all the catergories 

df3['Category'] = classes
df3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['Category'] = classes


Unnamed: 0.1,Unnamed: 0,Date,Heading,Content,Page Name,Category
0,0,05/10/2023,Mach Industries Raises $79M in Series A Funding,"Mach Industries, an Austin, TX-based defense t...",USA,B2B
1,1,05/10/2023,SuperOrdinary Closes $58M Series B Funding,"SuperOrdinary, a Shanghai, New York City, and ...",USA,B2B
2,2,05/10/2023,Saleo Raises $13M in Funding,"Saleo, a San Francisco, CA-based startup provi...",USA,B2B
3,3,05/10/2023,SafeRide Health Raises Series C Funding,"SafeRide Health, a San Antonio, CA-based techn...",USA,B2B
4,4,05/10/2023,Konect.ai Raises $5.5M in Seed Funding,"Konect.ai, a Houston, TX-based provider of a S...",USA,B2B
...,...,...,...,...,...,...
95,95,12/10/2023,Trait Biosciences Closes Financing Round,"Trait Biosciences, a Santa Fe, NM-based cannab...",USA,Not sure.
96,96,12/10/2023,MedaSystems Raises Seed Financing,"MedaSystems, a Menlo Park, CA-based developer ...",USA,B2B
97,97,12/10/2023,Alethia Venture Partners Launches $50M Debut Fund,"Alethia Venture Partners, a Los Angeles, CA-ba...",USA,B2B
98,98,12/10/2023,Harmony Receives Investment from Connetic Vent...,"Harmony, a Minneapolis, MN-based digital platf...",USA,B2B


In [None]:
# df2.to_csv('us_stock_data.csv')

In [3]:
df2 = pd.read_csv('us_stock_data.csv')
df3 = df2.head(100)

In [6]:
df3['Content'][0].replace('\n', ' ').replace('\xa0', ' ')

'Mach Industries, an Austin, TX-based defense tech startup, reportedly raised $79M in Series A funding at a post-money valuation of $335M.  The round was led by Bedrock Capital with participation other backers including DCVC, Marque and others.  The company intends to use the funds to expand operations and its development efforts to build defense hardware. Led by Ethan Thornton, Mach CEO, Mach Industries is advancing a suite of oxyhydrogen defense platforms, including unmanned aerial vehicles (UAVs) and aerial protection devices. The company uses hydrogen that can be manufactured in the field using readily available resources, such as electricity or aluminum, and water to spearhead a hardware-first approach to defense and enable advancements in range and power for projectiles and loiter time and speed for aerial systems. Mach Industries also has a hardware facility in Boston, MA.  FinSMEs 05/10/2023'

In [None]:
original_string = 'Your original string here'

# Replace newline characters and '\xa0' with spaces in a single line
cleaned_string = original_string.replace('\n', ' ').replace('\xa0', ' ')

print(cleaned_string)
