### Imports

In [84]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import trafilatura
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import reuters
import numpy as np
import pandas as pd
from collections import Counter


### Reddit Scrapper

### Web Crawler

In [32]:
news_seeds = [
    "https://www.theguardian.com/world", # 54
    "https://www.aljazeera.com/news/", # 19
    "https://www.nytimes.com/section/world",# forbidden url
    "https://www.nbcnews.com/world",# keyword: world , 22
]

visited = set()
articles = []

In [33]:
def extract_text(html, url):
    """Try to extract main article text using trafilatura, fallback to <p> tags."""
    text = trafilatura.extract(html, url=url)
    if text:
        return text
    else:
        soup = BeautifulSoup(html, "html.parser")
        return " ".join([p.get_text() for p in soup.find_all("p")])

def crawl(url, index, depth=1):
    if depth == 0 or url in visited:
        return
    
    if True:
        print(f"Crawling: {url}")
        visited.add(url)
        
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except Exception as e:
            print(f"Failed to fetch {url}: {e}")
            return
        
        html = response.text
        
        text = extract_text(html, url)
        
        if text:
            if "2025" in url:
                articles.append({"url": url, "id": 0, "title": 0, "writer": 0,  "raw_text": text})
                print(f"Saved article ({len(text)} chars)")
        else:
            print(f"Not an article, but checking links...")
        
        # Parse links
        soup = BeautifulSoup(html, "html.parser")
        for link in soup.find_all("a", href=True):
            new_url = urljoin(url, link["href"])
            
            # Filter: must be within same domain as seed
            if new_url not in visited and url.split("/")[2] in new_url:
                crawl(new_url, index, depth - 1)


crawl(news_seeds[0], 0, depth=2)

print(f"Collected {len(articles)} articles")

Crawling: https://www.theguardian.com/world
Crawling: https://www.theguardian.com/world#maincontent
Crawling: https://www.theguardian.com/world#navigation
Crawling: https://www.theguardian.com/preference/edition/int
Crawling: https://www.theguardian.com/preference/edition/uk
Crawling: https://www.theguardian.com/preference/edition/us
Crawling: https://www.theguardian.com/preference/edition/au
Crawling: https://www.theguardian.com/preference/edition/eur
Crawling: https://www.theguardian.com/
Crawling: https://www.theguardian.com/commentisfree
Crawling: https://www.theguardian.com/sport
Crawling: https://www.theguardian.com/culture
Crawling: https://www.theguardian.com/lifeandstyle
Crawling: https://www.theguardian.com/us-news/us-politics
Crawling: https://www.theguardian.com/uk-news
Crawling: https://www.theguardian.com/environment/climate-crisis
Crawling: https://www.theguardian.com/world/middleeast
Crawling: https://www.theguardian.com/world/ukraine
Crawling: https://www.theguardian.c

In [34]:
print(articles[10]['raw_text'])

A fugitive father who had been hiding in New Zealand’s rugged wilderness with his three children for nearly four years has been shot dead by police investigating an armed burglary, police said on Monday.
The whereabouts of Tom Phillips has attracted headlines around the world since just before Christmas 2021, when he fled into the Waikato wilderness with his children Ember, now 9, Maverick, 10, and Jayda, 12, following a custody dispute with their mother.
Police, alerted to a burglary early on Monday, pursued two suspects on a quad bike before coming under fire from one of them from a high-powered rifle, authorities said. Police returned fire, killing the shooter, who they believe to be Phillips, pending formal identification.
Hours later, authorities located Phillips’s children, two of whom were believed to be in the wilderness, amid concerns they would be alone in temperatures that were forecast to reach freezing overnight.
Police deputy commissioner Jill Rogers said they were found 

### Preprocessing

In [59]:
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("reuters")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...


True

In [60]:
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [37]:
def normalize(articles):
    for article in articles:
        text = article['raw_text']
        text = text.lower()
        text = text.replace("\n", " ")

        # Remove URLs
        text = re.sub(r'http\S+|www\.\S+', '', text)
        # Remove special characters (keep only words and numbers)
        text = re.sub(r'[^a-z0-9\s]', ' ', text)

        article['normalize_text'] = text

# normalize(articles)
# print(articles[10]['normalize_text'])

In [38]:
def tokenize(articles):
    for article in articles:
        tokens = nltk.word_tokenize(article['normalize_text'])
    
        # Remove stopwords
        article["tokens"] = [t for t in tokens if t not in stop_words] 

# tokenize(articles)
# print(articles[10]['tokens'])

In [39]:
def stem(articles):
    for article in articles:
        article["tokens"] = [stemmer.stem(t) for t in article["tokens"]]

# stem(articles)
# print(articles[10]['tokens'])

In [40]:
def lemmatize(articles):
    for article in articles:
        article["tokens"] = [lemmatizer.lemmatize(t) for t in article["tokens"]]

# lemmatize(articles)
# print(articles[10]['tokens'])

In [None]:
def preprocess(articles):
    normalize(articles)
    tokenize(articles)
    stem(articles)
    # lemmatize(articles)
    
preprocess(articles)

### Classification

In [99]:
dataset_size = 200000

dataset_articles = []
for fileid in reuters.fileids()[:dataset_size]:
    text = reuters.raw(fileid)
    cats = reuters.categories(fileid)  # some docs have multiple categories
    if cats:
        dataset_articles.append({"raw_text": text, "label": cats[0]})

dataset_articles[10]["label"]

'tin'

In [100]:
preprocess(dataset_articles)

In [101]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
X_reuters = vectorizer.fit_transform([" ".join(dataset_article["tokens"]) for dataset_article in dataset_articles])
X_crawled = vectorizer.transform([" ".join(article["tokens"]) for article in articles])

In [102]:
category_vocab = {}
labels = [dataset_article["label"] for dataset_article in dataset_articles]
labels = list(set(labels))  # unique labels

for label in labels:
    # find indices of docs belonging to this label
    idx = [i for i, art in enumerate(dataset_articles) if art["label"] == label]
    
    if not idx:  # skip empty categories
        continue
    
    # average TF-IDF weights for this category
    avg_weights = np.asarray(X_reuters[idx].mean(axis=0)).ravel()
    
    # get indices of top 50 highest-weighted terms
    top_idx = avg_weights.argsort()[::-1][:50]
    
    # get actual term strings
    terms = vectorizer.get_feature_names_out()[top_idx]
    
    # get their corresponding weights
    weights = avg_weights[top_idx]
    
    # save into dictionary
    category_vocab[label] = dict(zip(terms, weights))

In [103]:
def score_article(article_vector):
    scores = {}
    feature_names = vectorizer.get_feature_names_out()
    vec_array = article_vector.toarray().ravel()
    for label, vocab in category_vocab.items():
        score = sum(vec_array[feature_names.tolist().index(term)] * w
                    for term, w in vocab.items() if term in feature_names)
        scores[label] = score
    return scores

for article, x in zip(articles, X_crawled):
    scores = score_article(x)  # returns a dict of {label: score}
    article["scores"] = scores
    article["predicted"] = max(scores, key=scores.get)


In [104]:
articles[10]["predicted"]
print(Counter(a["predicted"] for a in articles))



Counter({'rice': 10, 'nzdlr': 9, 'rand': 6, 'yen': 5, 'dfl': 4, 'naphtha': 4, 'castor-oil': 3, 'hog': 2, 'cotton-oil': 2, 'tea': 2, 'tin': 1, 'jobs': 1, 'groundnut': 1, 'coconut': 1, 'veg-oil': 1, 'cocoa': 1, 'sun-oil': 1, 'rape-oil': 1, 'coconut-oil': 1})


In [105]:
for article in articles[:5]:
    print(article["raw_text"])
    print(f"### predicted class: {article['predicted']}")
    print("*"*20)

The first signs of a takeoff of Africa’s green economy are raising hopes that a transformation of the continent’s fortunes may be under way, driven by solar power and an increase in low-carbon investment.
African leaders are meeting this week in Addis Ababa, Ethiopia, for the Africa Climate Summit, a precursor to the global UN Cop30 in November. They will call for an increase in support from rich countries for Africa’s green resurgence, without which they will warn it could be fragile and spread unevenly.
Richard Muyungi, the climate envoy and adviser to the president of Tanzania, said: “Africa is ready to be part of [the global fight to stave off climate breakdown] provided we are supported with finance, technology and capacity-building.
“There has been an increase of investments in some areas but Africa still needs a lot of financing to be able to be part of the global solution, and to address the challenges we are facing.”
Green energy is booming in Africa, with 20 countries breakin