## Web Scraping Functions

In [31]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from datetime import datetime   

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def parse_published_datetime(raw_text: str):
    """
    Convert 'Wed, Dec 03, 2025 02:05pm' into datetime.
    Removes trailing ' - 4 days' if present.
    """

    # Remove " - X days" suffix
    text = raw_text.split(" - ")[0].strip()

    # KLSE Screener format examples:
    # "Wed, Dec 03, 2025 02:05pm"
    # "Thu, Nov 28, 2024 11:10am"
    # Format: "%a, %b %d, %Y %I:%M%p"

    formats = [
        "%a, %b %d, %Y %I:%M%p",
        "%a, %b %d, %Y %I:%M %p",   # handles space before AM/PM
        "%b %d, %Y %I:%M%p",       # rare variant without weekday
    ]

    for fmt in formats:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue

    # If all fail, return None instead of crashing
    return None


def scrape_article(url: str) -> dict:
    response = requests.get(url, headers=HEADERS)
    if response.status_code != 200:
        return "Failed to retrieve the article."

    soup = BeautifulSoup(response.content, 'html.parser')
    headline_node = soup.find('h2')
    headline = headline_node.get_text(strip=True) if headline_node else "No headline found."

    info_node = soup.select_one("h2 + div")
     # Defaults
    source = None
    published = None

    if info_node:
        spans = info_node.find_all("span")

        # span[0] ‚Üí Source
        if len(spans) >= 1:
            source = spans[0].get_text(strip=True)

        # span[1] ‚Üí Datetime (full string including the ‚Äú- 4 days‚Äù part)
        if len(spans) >= 2:
            published_raw = spans[1].get_text(strip=True)

            # Remove trailing " - X days"
            published = parse_published_datetime(published_raw.split(" - ")[0].strip())

    body_nodes = soup.find_all('p')
    body_text = "\n".join([p.get_text(strip=True) for p in body_nodes])

    # Remove known noise lines
    CLEAN_PATTERNS = [
        r"^Contact us\s*",
        r"The content is a snapshot from Publisher. Refer to the original content for accurate info. Contact us for any changes\.",
        r"Related Stocks.*",
        r"Comments.*",
    ]
    for pat in CLEAN_PATTERNS:
        body_text = re.sub(pat, "", body_text, flags=re.IGNORECASE | re.DOTALL).strip()

    return {
        "url": url,
        "headline": headline,
        "source": source,
        "published": published,
        "body": body_text
    }

In [74]:
BASE = "https://www.klsescreener.com"
LANG_SETTINGS_URL = f"{BASE}/v2/news/lang_settings"

def set_english_only(session):
    # disable Chinese
    session.post(LANG_SETTINGS_URL, headers=HEADERS, data={
        "language": "zh",
        "value": "false"
    })

    # disable Malay
    session.post(LANG_SETTINGS_URL, headers=HEADERS, data={
        "language": "ms",
        "value": "false"
    })

In [None]:
from datetime import datetime, timedelta
from langdetect import detect, LangDetectException

def is_english(text: str) -> bool:
    """Return True if langdetect detects English, else False."""
    try:
        lang = detect(text)
        return lang == "en"
    except LangDetectException:
        return False  # Unable to detect ‚Üí treat as not English

def parse_datetime(dt_string):
    """Parse datetime from data-date attribute."""
    return datetime.strptime(dt_string, "%Y-%m-%d %H:%M:%S")

def get_articles_last_n_days(stock_code: str, days: int = 180):
    """
    Scrape articles for a stock from the last N days.
    Default is 180 days (~6 months).
    """
    session = requests.Session()

    base_url = f"https://www.klsescreener.com/v2/news/stock/{stock_code}"
    cutoff = datetime.now() - timedelta(days=days)

    all_articles = []
    page = 1
    until_param = None

    while True:

        # Build page URL
        if until_param:
            url = f"{base_url}/{page}?until={until_param}"
        else:
            url = base_url

        print(f"üîé Fetching: {url}")
        r = session.get(url, headers=HEADERS)
        if r.status_code != 200:
            print("‚ö†Ô∏è Failed to fetch page")
            break

        soup = BeautifulSoup(r.text, "html.parser")
        article_blocks = soup.select("div.article")

        if not article_blocks:
            print("No more articles found.")
            break

        oldest_dt_in_page = None

        for block in article_blocks:
            # ---- Extract link ----
            a = block.select_one("a[href^='/v2/news/view/']")
            if not a:
                continue

            link = "https://www.klsescreener.com" + a["href"]
            title = a.get_text(strip=True)

            # Skip non-English titles
            if not is_english(title):
                # print(f"‚è≠ Skipped non-English: {title}")
                continue

            # ---- Extract datetime ----
            dt_span = block.select_one("span[data-date]")
            if not dt_span:
                continue

            dt_str = dt_span["data-date"]
            dt = parse_datetime(dt_str)

            # Track oldest dt in this batch
            if oldest_dt_in_page is None or dt < oldest_dt_in_page:
                oldest_dt_in_page = dt

            # Stop if older than cutoff
            if dt < cutoff:
                print(f"‚õî Older than {days} days reached. Stopping.")
                return all_articles

            all_articles.append({
                "title": title,
                "url": link,
                "published": dt_str
            })

        # Prepare next page
        if oldest_dt_in_page:
            until_param = int(oldest_dt_in_page.timestamp())
            page += 1
            time.sleep(0.5)  # be nice to server
        else:
            break

    return all_articles

In [None]:
# Test with 6 months (180 days) - you can adjust the days parameter
df = get_articles_last_n_days("1155", days=180)
print(f"Found {len(df)} articles for ticker 1155")

In [20]:
df = pd.DataFrame(df)
df

Unnamed: 0,title,url,published
0,Firm fundamentals to bolster banks next year,https://www.klsescreener.com/v2/news/view/1631...,2025-12-04 00:00:00
1,Opportunities aplenty for digital banks but no...,https://www.klsescreener.com/v2/news/view/1631...,2025-12-03 14:05:00
2,Banking sector to navigate tighter liquidity i...,https://www.klsescreener.com/v2/news/view/1631...,2025-12-03 10:19:14
3,Three sectors delivered strong Q3 performances...,https://www.klsescreener.com/v2/news/view/1631...,2025-12-03 08:01:08
4,"Building materials, plantation top 3Q earnings...",https://www.klsescreener.com/v2/news/view/1630...,2025-12-02 14:18:33
5,Foreign outflows cross RM20bil but local suppo...,https://www.klsescreener.com/v2/news/view/1630...,2025-12-02 10:04:13
6,"Is the FBM KLCI finally ready for 1,700 again?",https://www.klsescreener.com/v2/news/view/1629...,2025-11-29 09:18:23
7,Local bourse ends easier on consolidation mode,https://www.klsescreener.com/v2/news/view/1626...,2025-11-26 00:00:00
8,Local banks offer¬†flood relief assistance to a...,https://www.klsescreener.com/v2/news/view/1626...,2025-11-25 19:16:59
9,Against the odds: Maybank's margins edge up de...,https://www.klsescreener.com/v2/news/view/1625...,2025-11-25 08:00:14


In [34]:
article = scrape_article("https://www.klsescreener.com/v2/news/view/1631309/banking-sector-to-navigate-tighter-liquidity-in-2026-after-strong-finish-this-year-analysts")
df = pd.DataFrame([article])
print(df['headline'])
print("Body:\n", article["body"])

0    Banking sector to navigate tighter liquidity i...
Name: headline, dtype: object
Body:
 KUALA LUMPUR (Dec 3): The Malaysian banking sector is poised for tightening liquidity and heightened deposit competition in 2026, after a strong finish this year.
MBSB Research maintained a positive outlook on the sector, citing strong fundamentals and attractive dividends as key drivers to a solid 4Q2025.
"Bolstered by multiple tailwinds, the banking sector is in a good place ‚Äî so we expect share prices to continue their uptrend," MBSB said in a note on Wednesday.
According to the firm‚Äôs note, elevated dividend yields, improving loan growth, stable net interest margins (NIMs), stronger fee income, and further recoveries in gross impaired loans (GIL) are expected to support continued share price growth.
Although potential headwinds like asset quality and liquidity pressures persist, they are currently seen as secondary concerns.
Citing takeaways from "multiple banks" on the tightening liquid

## Data Archiving

In [None]:
from pymongo import MongoClient
from urllib.parse import quote_plus 
from datetime import timezone
from tqdm import tqdm
# Connect to MongoDB
username = quote_plus("Wrynaft")
password = quote_plus("Ryan@120104")

client = MongoClient(f"mongodb+srv://{username}:{password}@cluster0.bjjt9fa.mongodb.net/?appName=Cluster0")
db = client['roundtable_ai']
print("Connected to MongoDB")

col = db["articles"]

def store_articles_for_all_tickers(df, days: int = 180):
    """
    Loop through df['ticker'], scrape each ticker's articles for the last N days,
    and store them into MongoDB with upsert on URL.
    
    Args:
        df: DataFrame with 'ticker' column
        days: Number of days to look back (default 180 = ~6 months)
    """
    tickers_with_no_articles = []
    
    for ticker in tqdm(df['ticker'], desc="Processing tickers", unit="ticker"):
        ticker = str(ticker).upper().replace(".KL", "")
        print(f"\n===============================")
        print(f"üìå Processing ticker: {ticker}")
        print(f"===============================\n")

        # Step 1: Get all English article links (last N days)
        links = get_articles_last_n_days(str(ticker), days=days)

        print(f"üîó Found {len(links)} article links")

        if len(links) == 0:
            print(f"‚ö†Ô∏è No English articles found for ticker {ticker}")
            tickers_with_no_articles.append(ticker)
            continue  # Skip scraping step

        # Step 2: Scrape each article
        for item in links:
            url = item["url"]

            print(f"üì∞ Scraping article: {url}")

            article_data = scrape_article(url)
            if isinstance(article_data, str):
                print(f"‚ö†Ô∏è Error: {article_data}")
                continue

            # Step 3: Build MongoDB document
            doc = {
                "ticker": str(ticker),
                "url": article_data["url"],
                "headline": article_data["headline"],
                "source": article_data["source"],
                "published": article_data["published"],     # datetime object
                "body": article_data["body"],
                "scraped_at": datetime.now(timezone.utc)
            }

            # Step 4: Insert or update (avoid duplicates)
            result = col.update_one(
                {"url": article_data["url"],"ticker": str(ticker)},   # unique key
                {"$set": doc},                  # update data if exists
                upsert=True
            )

            if result.upserted_id:
                print(f"‚úÖ Stored new article.")
            else:
                print(f"‚ôªÔ∏è Article already exists. Updated instead.")

            time.sleep(0.3)  # be gentle to server

        print(f"‚úî Completed ticker {ticker}\n")
        time.sleep(1.0)  # small delay between tickers

    print("\n===============================")
    print("üì¢ SUMMARY: Tickers with 0 articles")
    print("===============================\n")

    if len(tickers_with_no_articles) == 0:
        print("üéâ All tickers had at least one English article!")
    else:
        for t in tickers_with_no_articles:
            print(f"‚ùå {t}")

In [None]:
# Scrape articles for the last 6 months (180 days)
# Existing articles will be updated (not duplicated) thanks to upsert
ticker_list = pd.read_csv("ticker_list.csv")  # assumes a 'ticker' column
store_articles_for_all_tickers(ticker_list, days=180)

In [43]:
tickers_with_articles = col.aggregate([
    {"$group": {"_id": "$ticker", "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}}
])

# Convert cursor to list so we can iterate twice
ticker_list = list(tickers_with_articles)

for t in tickers_with_articles:
    print(t["_id"], "->", t["count"], "articles")

# Print total number of tickers
print("\nTotal tickers with at least 1 article:", len(ticker_list))


Total tickers with at least 1 article: 390


## Sentiment Analysis

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

LABELS = ["positive", "negative", "neutral"]  # FinBERT order is fixed

def analyze_sentiment(text: str) -> str:
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits[0].cpu().numpy()
        
        probs = torch.softmax(torch.tensor(logits), dim=0).numpy()
        max_index = int(np.argmax(probs))
        sentiment_score = float(probs[0]-probs[1])  # positive - negative

        return {
            "label": LABELS[max_index],
            "score": sentiment_score,
            "confidence": {
                "positive": float(probs[0]),
                "negative": float(probs[1]),
                "neutral": float(probs[2])
            }
        }
    except Exception as e:
        return {"error": str(e)}

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [2]:
from pymongo import MongoClient
from urllib.parse import quote_plus

# Connect to MongoDB
username = quote_plus("Wrynaft")
password = quote_plus("Ryan@120104")

client = MongoClient(f"mongodb+srv://{username}:{password}@cluster0.bjjt9fa.mongodb.net/?appName=Cluster0")
db = client['roundtable_ai']
print("Connected to MongoDB")

col = db["articles"]

Connected to MongoDB


In [4]:
from tqdm import tqdm
cursor = col.find({"sentiment": {"$exists": False}})
count = col.count_documents({"sentiment": {"$exists": False}})

print(f"Processing {count} articles for sentiment analysis...\n")

for doc in tqdm(cursor, desc="Analyzing articles"):
    article_id = doc["_id"]

    headline = doc.get("headline", "") or ""
    body = doc.get("body", "") or ""

    full_text = headline + "\n" + body

    sentiment = analyze_sentiment(full_text)

    col.update_one(
        {"_id": article_id},
        {"$set": {"sentiment": sentiment}}
    )

print("Sentiment analysis completed and stored in MongoDB.")

Processing 2648 articles for sentiment analysis...



Analyzing articles: 2648it [01:32, 28.64it/s]

Sentiment analysis completed and stored in MongoDB.



