In [1]:
import requests
import json
from bs4 import BeautifulSoup
from datetime import datetime
from script.helper.models import Article
import re

In [1]:
import nest_asyncio
# Patch asyncio so the asyncio.run() calls in later cells can be used from this
# notebook (presumably because the kernel already runs an event loop - confirm).
nest_asyncio.apply()

In [1]:

import confluent_kafka

# The label must name the package whose version is actually printed
# (it previously said "openai" while printing the confluent_kafka version).
print(f"confluent_kafka version: {confluent_kafka.__version__}")


openai version: 2.5.0


## SEEKING ALPHA

In [2]:
import os

# Find them at https://rapidapi.com/apidojo/api/seeking-alpha/playground
API_KEY = os.environ['SEEKING_ALPHA_API_KEY']

# Never echo the key itself: cell outputs are saved together with the notebook,
# so displaying API_KEY leaks the credential. Only confirm that a value was loaded.
# NOTE(review): any key that was ever displayed in a saved output should be rotated.
print(f"SEEKING_ALPHA_API_KEY loaded: {bool(API_KEY)}")

In [2]:
# Scratch check: walking over a dict visits its keys in insertion order.
diz = dict(peppa=[1, 2], pig=[3, 4], papa=[5, 6])

for i in diz.keys():
    print(i)

peppa
pig
papa


In [2]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from script.helper.models import Article
from typing import List
from datetime import datetime
import os

# RapidAPI credentials for the Seeking Alpha endpoints. The key is read from the
# environment so that it does not have to be written in the notebook source.
# Find them at https://rapidapi.com/apidojo/api/seeking-alpha/playground
API_KEY = os.environ['SEEKING_ALPHA_API_KEY']  # raises KeyError when the variable is not set
API_HOST = "seeking-alpha.p.rapidapi.com"  # sent as the "x-rapidapi-host" request header

# Retrieves the article content asynchronously
async def fetch_seeking_alpha(session, id, timestp, title, ticket):
    """Fetch one Seeking Alpha news item and wrap it in an Article.

    Args:
        session: Open aiohttp client session used to perform the request.
        id: Seeking Alpha news identifier, sent as the "id" query parameter.
        timestp: Timestamp stored on the returned Article as-is.
        title: Title stored on the returned Article as-is.
        ticket: Ticker symbol stored on the returned Article as-is.

    Returns:
        An Article whose body is the plain text of the item's HTML content, or
        None when the request fails, the status code is not 200, or the
        response does not have the expected structure. Returning None (rather
        than raising) lets the caller keep the articles that did succeed.
    """
    url = "https://seeking-alpha.p.rapidapi.com/news/get-details"
    querystring = {"id": id}
    headers = {
        "x-rapidapi-key": API_KEY,
        "x-rapidapi-host": API_HOST
    }

    try:
        async with session.get(url, headers=headers, params=querystring) as response:
            if response.status != 200:
                print(f"Failed to fetch details for ID {id}, status code {response.status}")
                return None
            data = await response.json()
    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        # Connection problems, timeouts and non-JSON bodies must not abort the whole batch
        print(f"Request failed for ID {id}: {e}")
        return None

    try:
        html_content = data['data']['attributes']['content']
        canonical_url = data['data']['links']['canonical']
    except (KeyError, TypeError) as e:
        print(f"Unexpected response structure for ID {id}: {e!r}")
        return None

    # Keep just the text content, without the HTML components
    soup = BeautifulSoup(html_content, 'html.parser')
    article_content = soup.get_text()

    return Article(
        ticket=ticket,
        timestp=timestp,
        url=canonical_url,
        title=title,
        article_body=article_content
    )


# Get the list of articles about a specific stock
async def seeking_alpha_get_links(ticket, num):
    """List the latest Seeking Alpha news for a symbol and fetch each item.

    Args:
        ticket: Ticker symbol, sent as the "id" query parameter and stored on
            every returned Article.
        num: Page size, sent as the "size" query parameter.

    Returns:
        The truthy results of fetch_seeking_alpha, in completion order. An
        empty list when the listing request does not answer with status 200.
    """
    # For details https://rapidapi.com/apidojo/api/seeking-alpha/playground
    endpoint = "https://seeking-alpha.p.rapidapi.com/news/v2/list-by-symbol"
    request_params = {"size": num, "number": "1", "id": ticket}
    request_headers = {"x-rapidapi-key": API_KEY, "x-rapidapi-host": API_HOST}

    def to_unix(iso_date):
        # Convert the ISO publication date to a unix timestamp (whole seconds)
        return int(datetime.fromisoformat(iso_date).timestamp())

    async with aiohttp.ClientSession() as session:
        async with session.get(endpoint, headers=request_headers, params=request_params) as response:
            if response.status != 200:
                print(f"Failed to fetch list, status code {response.status}")
                return []

            payload = await response.json()

            # Prepare one detail request per listed entry; nothing runs until awaited
            pending = [
                fetch_seeking_alpha(
                    timestp=to_unix(entry['attributes']['publishOn']),
                    id=entry['id'],
                    title=entry['attributes']['title'],
                    session=session,
                    ticket=ticket,
                )
                for entry in payload['data']
            ]

            # Collect the results as they complete and drop the failed (falsy) ones
            fetched = [await finished for finished in asyncio.as_completed(pending)]
            return [item for item in fetched if item]

# Fetch 2 news items for MARA and keep them in `articles` for the next cell.
# NOTE(review): asyncio.run() is called from a notebook cell; this presumably relies on
# the nest_asyncio.apply() executed in an earlier cell - confirm on a fresh kernel.
articles = asyncio.run(seeking_alpha_get_links("MARA", 2))


In [3]:
# Show every fetched article, one per line, using the Article's own string form.
for article in articles:
    print(article)

ticket='MARA' url='https://seekingalpha.com/news/4140691-bitcoin-mining-is-a-survival-game-at-this-point-halving-didnt-do-any-favors' title='Bitcoin mining is a ‘survival game’ at this point; halving didn’t do any favors' article_body='    luza studios “Bitcoin miners are caught in a vice, and the pressure’s only intensifying. The April 2024 halving didn’t just tighten the screws; it flipped the industry on its head,” said David Materazzi, CEO of automated trading platform Galileo FX. Following the halving event, miners’ block rewards were slashed by 50%, cutting into bitcoin (BTC-USD) production and underlying revenues. And with elevated operating costs, many miners experienced dwindling profits during the quarter ended June 30, 2024. The 4% decline in bitcoin’s price during the three-month period only exacerbated the financial pressure. As such, Materazzi expects "the weak to be picked off, leaving only the leanest operations standing tall.” While most miners are struggling to stay a

## GOOD FUNCTION FETCH YAHOO (no Selenium)

In [5]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from script.helper.models import Article
from typing import List
from datetime import datetime

# Configuration of the Yahoo Finance scraper
TICKET = 'ACMR'
url = 'https://finance.yahoo.com/quote/' + TICKET + '/news/'

# Browser-like User-Agent sent with every request
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/127.0.0.0 Safari/537.36'
    )
}

# NOTE(review): these look like cookies copied from a real browser session; they
# will presumably expire - consider loading them from the environment instead.
# The A1, A3 and A1S cookies all carry the same value.
_YAHOO_A_COOKIE = 'd=AQABBBbSSWYCEMGNccbgJJ6fo37cGXpNRK4FEgABCAHDwWbzZudVb2UBAiAAAAcIFNJJZkCw6_E&S=AQAAAhcGLnTIGaPvJkY30Nu1ux4'
cookies = {
    'GUC': 'AQABCAFmwcNm80IfWgSU&s=AQAAAGoqDQ7v&g=ZsB0FA',
    **dict.fromkeys(('A1', 'A3', 'A1S'), _YAHOO_A_COOKIE),
    'PRF': 't%3DACMR%26newChartbetateaser%3D0%252C1725284010825',
}


# Download a page and hand back its text
async def fetch_page(session, url):
    """Return the body text of `url`, or None when the download fails.

    Args:
        session: Client session whose get() performs the request.
        url: Address of the page to download.

    Returns:
        The response text on success. Any failure (an error status raised by
        raise_for_status, a client error, or any other exception) is printed
        and reported as None instead of being raised.
    """
    page_text = None
    try:
        async with session.get(url) as resp:
            resp.raise_for_status()
            page_text = await resp.text()
    except aiohttp.ClientError as err:
        failure = f"Request failed: {err}"
    except Exception as err:
        failure = f"Something went wrong while fetching {url}: {err}"
    else:
        return page_text

    print(failure)
    return None


async def fetch_yahoo_article_content(session, link, title, ticket):
    """Download one Yahoo Finance article and wrap its text in an Article.

    Args:
        session: Client session passed through to fetch_page.
        link: URL of the article; also stored on the returned Article.
        title: Headline stored on the returned Article as-is.
        ticket: Ticker symbol stored on the returned Article as-is.

    Returns:
        An Article whose body is the newline-joined text of the <p> elements
        inside the element with class "caas-body", stamped with the current
        time; None when the page could not be fetched or has no such element.
    """
    page_html = await fetch_page(session, link)

    if page_html:
        # Only report success once fetch_page actually returned some content
        print(f'Page fetched for the link {link}.')
        article_soup = BeautifulSoup(page_html, 'html.parser')
        body = article_soup.find(class_="caas-body")  # This finds the body of the article

        if body:
            print('Article body found!')
            # Concatenate the text of all the paragraphs of the article
            article_content = "\n".join(p.get_text(strip=False) for p in body.find_all('p'))
            return Article(
                ticket=ticket,
                # The page's publication date is not parsed; the fetch time is used instead
                timestp=int(datetime.now().timestamp()),
                url=link,
                title=title,
                article_body=article_content
            )

    print('No article body found.')
    return None


async def fetch_yahoo(ticket, headers, cookies):
    """Scrape the Yahoo Finance news page of a ticker and fetch the linked articles.

    Args:
        ticket: Ticker symbol; used to build the news URL and to filter headlines.
        headers: HTTP headers applied to the aiohttp session.
        cookies: Cookies applied to the aiohttp session.

    Returns:
        A list with the truthy results of fetch_yahoo_article_content, in
        completion order. The list is empty when the news page cannot be
        fetched or when no headline mentions the ticker.
    """
    # Build the URL from the argument, not from a notebook-level constant
    url = f'https://finance.yahoo.com/quote/{ticket}/news/'
    async with aiohttp.ClientSession(headers=headers, cookies=cookies) as session:

        # Fetch the main page
        page_content = await fetch_page(session, url)
        if not page_content:
            print('No page content found.')
            # Always hand back a list so callers can iterate over the result
            return []
        print(f'Page fetched for the link {url}.')

        soup = BeautifulSoup(page_content, 'html.parser')

        tasks = []

        # This locates the section on the main page with all the article links
        for news in soup.find_all(class_='js-stream-content Pos(r)'):

            # Skip stream entries that have no headline link
            heading = news.find('h3')
            article = heading.find('a') if heading is not None else None
            if article is None:
                continue

            title = article.get_text(strip=True)
            link = article.get('href')

            # Keep only headlines that mention the ticker, to leave out ads
            if link and ticket in title:
                tasks.append(fetch_yahoo_article_content(session, link, title, ticket))

        # Instantiate the list of Article entities
        articles: List[Article] = []

        # Process the tasks as they complete
        for task in asyncio.as_completed(tasks):
            result = await task
            if result:
                articles.append(result)

        return articles
        
# Scrape the Yahoo Finance news of TICKET with the headers and cookies defined above.
# NOTE(review): asyncio.run() is called from a notebook cell; this presumably relies on
# the nest_asyncio.apply() executed in an earlier cell - confirm on a fresh kernel.
articles = asyncio.run(fetch_yahoo(TICKET, HEADERS, cookies))

Page fetched for the link https://finance.yahoo.com/quote/ACMR/news/.
Page fetched for the link https://finance.yahoo.com/news/acm-research-inc-acmr-trending-130017800.html.
Article body found!
Page fetched for the link https://finance.yahoo.com/news/invest-acm-research-acmr-based-133015686.html.
Article body found!


In [1]:
# Scratch check: upper-casing a string
ap = "papa"
str.upper(ap)

'PAPA'