In [1]:
'''
Imports for testing purposes
'''

import requests
from bs4 import BeautifulSoup
import urllib
from tqdm.notebook import tqdm
import pandas as pd
import os
from ast import literal_eval
from datetime import datetime
from utils.base_templates import NewsArticle, ArticleCollection, Company, Insider

import database.database_creator as dc
import database.database_utils as du

In [None]:
input_file = "database/insiders_2024-10-22.csv"
output_file = "database/full_insiders_info_2024-10-22.csv"

dc.save_insiders_to_csv(input_file, output_file)
dc.scrape_all_companies()

In [32]:
df = pd.read_csv('database/companies_info_2024-11-01.csv')
df.columns

Index(['name', 'isin', 'ticker', 'industry', 'sector', 'profile', 'executives',
       'link', 'country'],
      dtype='object')

In [2]:
def find_company_and_insiders_from_article(article_link: str):
    # Step 1: Fetch the article from the Articles collection
    articles_collection = du.get_db_collection(du.DB_NAME, 'Articles')
    article = articles_collection.find_one({"link": article_link})
    
    if not article:
        article = dc.get_article_from_link(article_link)
        if not article:
            return None
    # Step 2: Fetch the company linked to this article
    company_link = 'https://www.marketscreener.com' + article.get('company_link', '')
    companies_collection = du.get_db_collection(du.DB_NAME, 'Companies')
    company = companies_collection.find_one({"link": company_link})
    
    if not company:
        # If the company is not found, scrape it
        ticker, isin, company_profile, executives = dc.extract_company_info(company_link)
        if not ticker:
            print("Company not found and could not be scraped.")
            return None
        company = {
            'link': company_link,
            'ticker': ticker,
            'isin': isin,
            'profile': company_profile,
            'executives': executives
        }
        # Save the scraped company data to the database
        companies_collection.insert_one(company)

    # Step 3: Get executive 'href' links to query the Insiders collection
    executives = dc.str_to_dict_expansion(company.get('executives', '{}'))
    insider_links = [
        exec_info['href'] 
        for exec_category in executives.values() 
        for exec_info in exec_category.values()
    ]

    # Step 4: Fetch insiders based on the collected links
    insiders_collection = du.get_db_collection(du.DB_NAME, 'Insiders')
    insiders = list(insiders_collection.find({"link": {"$in": insider_links}}))

    if not insiders:
        # If insiders are not found, scrape them
        for link in insider_links:
            # Retrieve the name from the collection
            exec_info = next(
                (info for exec_category in executives.values() for info in exec_category.values() if info['href'] == link),
                None
            )
            name = exec_info.get('name') if exec_info else 'Unknown'

            # Pass the name as an additional argument
            insider = dc.extract_insider_info(name, link)  # Assume this function is defined in database_creator.py
            if insider:
                insiders.append(insider)
                # Save the scraped insider data to the database
                insiders_collection.insert_one(insider.dict())

    # Return the collected data as a dictionary
    return {
        "article": article,
        "company": company,
        "insiders": insiders
    }

def generate_markdown_report(article_link: str, output_dir: str = "reports"):
    # Fetch article, company, and insiders data
    data = find_company_and_insiders_from_article(article_link)
    if data is None:
        return

    article = data["article"]
    company = data["company"]
    insiders = data["insiders"]

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Generate the markdown file name using the article headline and date
    headline = article.get('headline', 'unknown_headline').replace(' ', '_')
    publication_date = article.get('publication_date', 'unknown_date')
    file_name = f"{headline}_{publication_date}.md"
    file_path = os.path.join(output_dir, file_name)

    # Write the markdown report
    with open(file_path, 'w') as md_file:
        md_file.write(f"# {article.get('headline')}\n")
        md_file.write(f"**Publication Date:** {article.get('publication_date')}\n")
        md_file.write(f"**Category:** {article.get('category')}\n")
        md_file.write(f"**Source:** {article.get('source')}\n\n")
        md_file.write("## Article Content\n")
        md_file.write(f"{article.get('content')}\n\n")
        md_file.write("## Company Information\n")
        md_file.write(f"- **Name:** {company.get('name')}\n")
        md_file.write(f"- **ISIN:** {company.get('isin')}\n")
        md_file.write(f"- **Ticker:** {company.get('ticker')}\n")
        md_file.write(f"- **Industry:** {company.get('industry')}\n")
        md_file.write(f"- **Sector:** {company.get('sector')}\n")
        md_file.write(f"- **Country:** {company.get('country')}\n")
        profile = company.get('profile', '').replace('-', '\\-')
        md_file.write(f"- **Profile:**\n {profile}\n\n")
        md_file.write("## Linked Insiders\n")
        for insider in insiders:
            md_file.write(f"- **Name:** {insider.get('name')}\n")
            md_file.write(f"- **Current Position:** {insider.get('current_position')}\n")
            md_file.write(f"- **Current Company:** {insider.get('current_company')}\n")
            md_file.write(f"- **Company URL:** {insider.get('company_url')}\n")
            md_file.write(f"- **Net Worth:** {insider.get('net_worth', 'N/A')}\n")
            known_holdings = dc.str_to_dict_expansion(insider.get('known_holdings', '{}'))
            md_file.write(f"- **Known Holdings:**\n")
            for company, details in known_holdings.items():
                md_file.write(f"  - **{company}:**\n")
                md_file.write(f"    - **Link:** {details.get('link', 'N/A')}\n")
                md_file.write(f"    - **Date:** {details.get('date', 'N/A')}\n")
                md_file.write(f"    - **Number of Shares:** {details.get('number_of_shares', 'N/A')}\n")
                md_file.write(f"    - **Valuation:** {details.get('valuation', 'N/A')}\n")
                md_file.write(f"    - **Valuation Date:** {details.get('valuation_date', 'N/A')}\n")
            md_file.write(f"- **Age:** {insider.get('age', 'N/A')}\n")
            md_file.write(f"- **Industries:** {', '.join(literal_eval(insider.get('industries', [])))}\n")
            md_file.write(f"- **Summary:** {insider.get('summary', 'N/A')}\n")
            md_file.write("- **Active Positions:**\n")
            active_positions = dc.str_to_dict_expansion(insider.get('active_positions', '{}'))
            for position, date in active_positions.items():
                md_file.write(f"    - {position}: {date}\n")
            md_file.write("- **Former Positions:**\n")
            former_positions = dc.str_to_dict_expansion(insider.get('former_positions', '{}'))
            for position, date in former_positions.items():
                md_file.write(f"    - {position}: {date}\n")
            md_file.write("- **Education:**\n")
            trainings = dc.str_to_dict_expansion(insider.get('trainings', '{}'))
            for training, details in trainings.items():
                md_file.write(f"    - {training}: {details}\n")
            md_file.write("\n---\n\n")

    print(f"Markdown report saved to {file_path}")

In [4]:
example_link = 'https://www.marketscreener.com/quote/stock/BELSHIPS-ASA-1413091/news/Norne-Securities-lowers-target-price-for-Belships-to-NOK-28-30-reiterates-Buy-BN-48242972/'
generate_markdown_report(example_link)



  soup = BeautifulSoup(soup)


AttributeError: 'Insider' object has no attribute 'get'

In [19]:
# Usage
endpoint_list = ['IPO', 'mergers-acquisitions', 'rumors']
base_url = 'https://www.marketscreener.com/news/companies'
csv_file_name = f"marketscreener_articles_{datetime.now().strftime('%Y-%m-%d')}.csv"
dc.save_articles_to_csv(endpoint_list, base_url, csv_file_name)

Data saved to marketscreener_articles_2024-11-01.csv


In [None]:
'''
Test using archive.is to access paywalled news sites
'''

import requests
from bs4 import BeautifulSoup
import urllib.parse

def search_archive_is(original_url: str) -> str:
    """
    Searches archive.is for an existing archived URL of the given original URL.
    """
    search_url = f"https://archive.is/{original_url}"
    response = requests.get(search_url, headers=HEADERS)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        # The search results usually have <a> tags pointing to the archived URL
        result = soup.find_all('a', href=True)
        if result:
            return result
    return None

url = 'https://www.reuters.com/business/healthcare-pharmaceuticals/bicara-therapeutics-targets-265-mln-proceeds-upsized-us-ipo-2024-09-11/'
test = search_archive_is(url)
for item in test:
    print(item['href'])