In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import collections
import os

In [2]:
# Download the NLTK resources this notebook relies on.
# 'stopwords' backs stopwords.words('english'); the punkt resources back
# nltk.word_tokenize. Recent NLTK releases load the tokenizer from 'punkt_tab'
# rather than the older 'punkt' package, so both are fetched to keep the
# notebook runnable on old and new NLTK installs alike.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\athan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\athan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Constants
# Landing page whose outgoing links are crawled.
BASE_URL = "https://www.who.int/emergencies/diseases/novel-coronavirus-2019"
# Substrings that mark a link as relevant. crawl_links lower-cases both the
# keyword and the href before comparing, so the differently-cased variants
# listed here select the same links.
KEYWORDS = ['coronavirus', 'Covid', 'SARS-CoV-2', 'Coronavirus', 'COVID']
# English stop words from NLTK; preprocess_text drops any token whose
# lower-cased form is in this set.
STOPWORDS = set(stopwords.words('english'))

In [4]:
# Function to clean tokens
def clean_token(token):
    """Tell whether a token should be kept.

    Returns False when the token is made up solely of characters that are
    neither word characters nor whitespace (pure punctuation or symbols),
    and True for every other token.
    """
    punctuation_only = re.match(r'[^\w\s]+$', token)
    if punctuation_only:
        return False
    return True

In [5]:
# Step 1: Crawl the main page and collect relevant links
def crawl_links(base_url, keywords, timeout=30):
    """Collect the distinct links on a page whose href mentions a keyword.

    Args:
        base_url: URL of the page to fetch; relative hrefs are resolved
            against it.
        keywords: Iterable of substrings. A link is kept when any keyword
            occurs in its href, compared case-insensitively.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A list of absolute http(s) URLs with any ``#fragment`` removed,
        free of duplicates, in the order they first appear on the page.

    Raises:
        Exception: If the page does not answer with HTTP status 200.
            Errors raised by ``requests.get`` itself are not caught here
            and propagate unchanged.
    """
    # Without a timeout the request can block indefinitely on a stalled server.
    response = requests.get(base_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch URL: {base_url} (Status code: {response.status_code})")

    soup = BeautifulSoup(response.content, 'lxml')

    # Lower-case the keywords once instead of once per link.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    # A dict keeps first-seen order, so the result is deterministic
    # (a set would return the links in arbitrary order).
    unique_links = {}
    for link in soup.find_all('a'):
        href = link.get('href')
        if not href:
            continue

        href_lower = href.lower()
        if not any(keyword in href_lower for keyword in lowered_keywords):
            continue

        # Fragments only address a position inside a page; dropping them
        # stops the same page from being collected (and fetched) twice.
        absolute_url = urljoin(base_url, href).split('#', 1)[0]

        # Skip mailto:, javascript:, tel: and similar non-fetchable links.
        if not absolute_url.lower().startswith(('http://', 'https://')):
            continue

        unique_links.setdefault(absolute_url, None)

    return list(unique_links)

In [6]:
# Step 2: Extract text from each link
def extract_text_from_links(links, timeout=30):
    """Download each link and return the text of the pages that load.

    Args:
        links: Iterable of URLs to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A list with one string per page that answered with HTTP status 200,
        in the order of ``links``. Each string is the page text with pieces
        stripped and joined by single spaces. Pages that fail to download,
        fail to parse, or answer with another status are reported via
        ``print`` and left out.
    """
    all_texts = []

    for link in links:
        try:
            # Without a timeout one stalled server would block the whole crawl.
            response = requests.get(link, timeout=timeout)
            if response.status_code != 200:
                # Report the skip so a gap between links found and pages
                # extracted can be explained.
                print(f"Skipping {link}: HTTP status {response.status_code}")
                continue

            soup = BeautifulSoup(response.content, 'lxml')
            all_texts.append(soup.get_text(separator=' ', strip=True))
        except Exception as e:
            # Best-effort crawl: one bad page must not abort the others.
            print(f"Failed to fetch or parse {link}: {e}")

    return all_texts

In [7]:
# Step 3: Preprocess text
def preprocess_text(texts):
    """Tokenize every text and return one flat list of cleaned tokens.

    Each text is split with ``nltk.word_tokenize``. A token is kept, in
    lower-cased form, when its lower-cased form is not in ``STOPWORDS`` and
    ``clean_token`` accepts the original token. Tokens from all texts are
    concatenated in input order.
    """
    flat_tokens = []

    for document in texts:
        for raw_token in nltk.word_tokenize(document):
            lowered = raw_token.lower()
            if lowered in STOPWORDS:
                continue
            if clean_token(raw_token):
                flat_tokens.append(lowered)

    return flat_tokens

In [8]:
# Step 4: Vectorize text using TF-IDF
def vectorize_text(tokens, top_n=20):
    """Build a TF-IDF vector over the ``top_n`` most frequent tokens.

    Args:
        tokens: Flat sequence of already-cleaned tokens. The whole sequence
            is treated as a single document.
        top_n: Vocabulary size, taken from the most common tokens.

    Returns:
        A ``(feature_names, tfidf_matrix)`` tuple: the vocabulary as a list,
        and the matrix returned by ``fit_transform`` for the single joined
        document.

    Raises:
        ValueError: If ``tokens`` is empty, since no vocabulary can be built.
    """
    tokens = list(tokens)
    if not tokens:
        raise ValueError("Cannot vectorize an empty token list.")

    vocab = [word for word, _ in collections.Counter(tokens).most_common(top_n)]

    # The tokens are already split and normalised. Re-tokenising the joined
    # text with the vectorizer's default pattern would break up entries such
    # as 'covid-19' and drop one-character tokens such as '5', leaving their
    # columns at zero. Splitting on whitespace and disabling lower-casing
    # keeps the document's tokens identical to the vocabulary entries.
    vectorizer = TfidfVectorizer(
        vocabulary=vocab,
        tokenizer=str.split,
        token_pattern=None,
        lowercase=False,
    )
    tfidf_matrix = vectorizer.fit_transform([" ".join(tokens)])

    # get_feature_names() no longer exists in current scikit-learn; prefer
    # its replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    return feature_names, tfidf_matrix

In [9]:
# Main Workflow
def main():
    """Run the crawl, extract, preprocess and vectorize steps, then save results.

    Progress is reported with ``print``. The collected links are written to
    ``output/crawled_links.txt`` (one per line) and the tokens to
    ``output/tokens.txt`` (space-separated), both encoded as UTF-8.
    """
    print("Crawling links from the WHO website...")
    links = crawl_links(BASE_URL, KEYWORDS)
    print(f"Found {len(links)} relevant links.")

    print("Extracting text from links...")
    texts = extract_text_from_links(links)
    print(f"Extracted text from {len(texts)} pages.")

    print("Preprocessing text...")
    tokens = preprocess_text(texts)
    print(f"Tokenized {len(tokens)} words.")

    if tokens:
        print("Vectorizing text using TF-IDF...")
        vocab, tfidf_matrix = vectorize_text(tokens)
        print("TF-IDF Vocabulary:", vocab)
        print("TF-IDF Matrix Shape:", tfidf_matrix.shape)
    else:
        # No vocabulary can be built from zero tokens; still save what was
        # collected instead of failing inside the vectorizer.
        print("No tokens extracted; skipping TF-IDF vectorization.")

    # Save results for further analysis
    os.makedirs("output", exist_ok=True)

    # Explicit UTF-8 for both files so the result does not depend on the
    # platform's default encoding.
    with open("output/crawled_links.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(links))

    with open("output/tokens.txt", "w", encoding="utf-8") as f:
        f.write(" ".join(tokens))

    print("Results saved in the 'output' directory.")

# Run the pipeline when this code is executed directly (including as a
# notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()

Crawling links from the WHO website...
Found 37 relevant links.
Extracting text from links...
Extracted text from 35 pages.
Preprocessing text...
Tokenized 47860 words.
Vectorizing text using TF-IDF...
TF-IDF Vocabulary: ['covid-19', 'health', 'episode', 'data', 'vaccines', 'disease', 'vaccine', 'coronavirus', 'global', '2023', 'country', 'countries', '2020', 'world', 'update', 'science', 'public', 'epidemiological', '5', 'response']
TF-IDF Matrix Shape: (1, 20)
Results saved in the 'output' directory.
