In [9]:
import re
import json
import spacy
import nltk
import requests
import sklearn_crfsuite
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from datasets import load_dataset
from sklearn_crfsuite import metrics
from seqeval.metrics import classification_report, accuracy_score, precision_score, f1_score
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS
import urllib.parse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import random

In [2]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rafro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rafro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rafro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# === PART 1 ===

## TASK 1: MODEL FOR NER

### 1. TEXT CLEANING & PREPROCESSING

In [79]:
def clean_text(text):
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-zA-Z0-9 ]", "", text)
    text = text.lower()
    words = word_tokenize(text)
    words = [w for w in words if w not in stopwords.words("english")]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]
    return " ".join(words)

In [80]:
raw_text = "<p>Apple was founded by Steve Jobs.</p>"
cleaned_text = clean_text(raw_text)
print("Cleaned Text:", cleaned_text)

Cleaned Text: apple founded steve job


### 2. NAMED ENTITY RECOGNITION (NER)

In [81]:
# Load the CoNLL-2003 dataset 
dataset = load_dataset("conll2003", trust_remote_code=True)
train_dataset = dataset['train'] 
validation_dataset = dataset['validation'] 
test_dataset = dataset['test'] 
# Example: Print the first example from the training set 
print(train_dataset[0]) 

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


In [82]:
# Map NER tag indices to actual labels
ner_labels = dataset["train"].features["ner_tags"].feature.int2str
print(ner_labels)  

<bound method ClassLabel.int2str of ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None)>


In [83]:
# Feature Extraction for CRF Model
def extract_features(doc):
    return [{"word": w} for w in doc["tokens"]]

def get_labels(doc):
    return [ner_labels(label) for label in doc["ner_tags"]]

In [84]:
X_train = [extract_features(sent) for sent in train_dataset]
y_train = [get_labels(sent) for sent in train_dataset]
X_test = [extract_features(sent) for sent in test_dataset]
y_test = [get_labels(sent) for sent in test_dataset]

print("Sample Features:", X_train[0])
print("Sample Labels:", y_train[0])

Sample Features: [{'word': 'EU'}, {'word': 'rejects'}, {'word': 'German'}, {'word': 'call'}, {'word': 'to'}, {'word': 'boycott'}, {'word': 'British'}, {'word': 'lamb'}, {'word': '.'}]
Sample Labels: ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [85]:
# Train CRF model
crf = sklearn_crfsuite.CRF(
    algorithm="lbfgs",
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False
)
crf.fit(X_train, y_train)

In [86]:
# Evaluate CRF model
y_pred = crf.predict(X_test)

In [87]:
nlp = spacy.load("en_ner_conll03")

# Convert spaCy predictions to IOB format aligned with test set
def spacy_ner_predict(sentences):
    y_preds = []
    for sent in sentences:
        doc = nlp(" ".join(sent["tokens"]))
        pred_tags = ["O"] * len(sent["tokens"])
        for ent in doc.ents:
            start, end, label = ent.start, ent.end, ent.label_
            if start < len(pred_tags):
                pred_tags[start] = f"B-{label}"
                for i in range(start+1, min(end, len(pred_tags))):
                    pred_tags[i] = f"I-{label}"
        y_preds.append(pred_tags)
    return y_preds

# Get gold labels from test set
ner_labels = dataset["train"].features["ner_tags"].feature.int2str
y_true = [[ner_labels(tag) for tag in sent["ner_tags"]] for sent in test_dataset]



In [88]:
# Evaluate spaCy predictions
y_pred_spacy = spacy_ner_predict(test_dataset)

In [89]:
# Compare CRF and spaCy predictions
print("📊 CRF Classification Report:")
print(metrics.flat_classification_report(y_test, y_pred))
print("📊 spaCy Classification Report:")
print(classification_report(y_true, y_pred_spacy))

📊 CRF Classification Report:
              precision    recall  f1-score   support

       B-LOC       0.89      0.78      0.83      1668
      B-MISC       0.83      0.67      0.74       702
       B-ORG       0.81      0.55      0.66      1661
       B-PER       0.74      0.43      0.55      1617
       I-LOC       0.42      0.66      0.51       257
      I-MISC       0.63      0.60      0.61       216
       I-ORG       0.53      0.63      0.58       835
       I-PER       0.72      0.55      0.62      1156
           O       0.94      0.97      0.95     38323

    accuracy                           0.91     46435
   macro avg       0.72      0.65      0.67     46435
weighted avg       0.90      0.91      0.90     46435

📊 spaCy Classification Report:
              precision    recall  f1-score   support

         LOC       0.69      0.64      0.66      1668
        MISC       0.67      0.63      0.65       702
         ORG       0.70      0.67      0.68      1661
         PER      

📊 Named Entity Recognition: CRF vs spaCy Pretrained Model
| Model   | Label Format                | Notes                                               |
|---------|-----------------------------|-----------------------------------------------------|
| CRF     | B-LOC, I-ORG, O, etc.       | Uses BIO tagging (Begin, Inside, Outside) for token-level labeling |
| spaCy   | LOC, PER, ORG, MISC         | Returns full entity spans, not token-level BIO tags    |

Note: spaCy internally uses BILUO tags during training, but outputs clean entity spans without B-/I- prefixes. CRF evaluates per-token; spaCy evaluates per-entity.

📈 Performance Comparison (weighted avrg)
| Metric     | CRF Model (Token-level) | spaCy Pretrained (Entity-level) |
|------------|--------------------------|----------------------------------|
| Precision  | 0.90                     | 0.67                             |
| Recall     | 0.91                     | 0.65                             |
| F1-Score   | 0.90                     | 0.66                             |

- The CRF model, trained specifically on the CoNLL-2003 dataset, achieves higher precision, recall, and F1-score, reflecting strong token-level performance.

- The spaCy pretrained model performs reasonably well but slightly lower. Its strength lies in ease of use, span-level predictions, and pretrained generalization.

- The difference is due to:

    - CRF evaluates per-token BIO tags.

    - spaCy evaluates entire named entity spans, which is stricter (e.g., must capture "New York City" fully).

- For fine-grained, high-accuracy tasks, a CRF trained on your specific dataset is more effective.

- For general-purpose NER across diverse texts, spaCy offers strong performance with zero training.

In [90]:
def extract_crf_entities(test_data, predictions):
    extracted = []
    for sent, tags in zip(test_data, predictions):
        tokens = sent["tokens"]
        i = 0
        while i < len(tags):
            tag = tags[i]
            if tag.startswith("B-"):
                entity = tokens[i]
                start = i
                label = tag[2:]
                i += 1
                while i < len(tags) and tags[i].startswith("I-"):
                    entity += " " + tokens[i]
                    i += 1
                end = i - 1
                extracted.append({"entity": entity, "label": label, "start": start, "end": end})
            else:
                i += 1
    return extracted

crf_entities = extract_crf_entities(test_dataset, y_pred)

# Save to JSON file
with open("crf_entities.json", "w") as f:
    json.dump(crf_entities, f, indent=2)

In [92]:
def extract_spacy_entities(test_data, model):
    all_entities = []
    for sent in test_data:
        tokens = sent["tokens"]
        text = " ".join(tokens)
        doc = model(text)
        for ent in doc.ents:
            all_entities.append({
                "entity": ent.text,
                "label": ent.label_,
                "start": ent.start,
                "end": ent.end - 1
            })
    return all_entities

spacy_entities = extract_spacy_entities(test_dataset, nlp)

# Save to JSON file
with open("spacy_entities.json", "w") as f:
    json.dump(spacy_entities, f, indent=2)

### 3. RELATION EXTRACTION (RE)

In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------- -------------------------------- 2.4/12.8 MB 12.2 MB/s eta 0:00:01
     -------------- ------------------------- 4.7/12.8 MB 11.9 MB/s eta 0:00:01
     ---------------------- ----------------- 7.1/12.8 MB 11.8 MB/s eta 0:00:01
     ------------------------------ --------- 9.7/12.8 MB 11.8 MB/s eta 0:00:01
     ------------------------------------ -- 12.1/12.8 MB 12.0 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 11.2 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


The goal is to extract relations of the form (subject, predicate, object) where:
- Subject: The entity performing the action (or passive subject).
- Predicate: The verb that describes the action.
- Object: The entity affected by the action (or passive agent).

Application of extraction rules:
- Input Text: "Apple was founded by Steve Jobs."
- spaCy Process: The text is processed using the spaCy NLP pipeline.
- Subject Extraction:
    - Token: "Apple" (dependency: nsubjpass).
    - Recognized as the subject of the sentence.
- Predicate Extraction:
    - Token: "founded" (head of "Apple" and main verb of the sentence).
    - Lemma: "found" (the base form of the verb).
- Agent (Object of Preposition):
    - The phrase “by Steve Jobs” is detected as a prepositional phrase.
    - "by" is the preposition, and "Steve Jobs" is the object of the preposition (pobj).

In [93]:
nlp_re = spacy.load("en_core_web_sm")
txt = "Apple was founded by Steve Jobs."
doc = nlp_re(txt)
relations = []
# Iterate over tokens in the document
for token in doc:
    # Check for passive subject (nsubjpass) and the verb (ROOT)
    if token.dep_ == "nsubjpass" and token.head.pos_ == "VERB":
        subject = token.text  # Passive subject
        predicate = token.head.lemma_  # Predicate (base form of the verb)
        
        # Search for the prepositional phrase indicating the agent
        for child in token.head.children:
            if child.dep_ == "agent":  # Look for the agent (usually in 'by' phrase)
                for obj in child.children:
                    if obj.dep_ == "pobj":  # The object of the preposition (agent)
                        obj_text = obj.text
                        # Append the relation to the list
                        relations.append((subject, predicate, obj_text))
print(relations)

[('Apple', 'found', 'Jobs')]


It works well, we will implement this code in an extraction function to make it go through our test_dataset and save it.

In [94]:
def extract_relations(test_data, model):
    relations = []
    for sent in test_data:
        tokens = sent["tokens"]
        text = " ".join(tokens)
        doc = model(text)
        for token in doc:
            if token.dep_ == "nsubjpass" and token.head.pos_ == "VERB":
                subject = token.text
                predicate = token.head.lemma_
                for child in token.head.children:
                    if child.dep_ == "agent":
                        for agent in child.children:
                            if agent.dep_ == "pobj":
                                obj = agent.text
                                relations.append((subject, predicate, obj))
    return relations

core_web_relations = extract_relations(test_dataset, nlp_re)

# Save to JSON file
with open("core_web_relations.json", "w") as f:
    json.dump(core_web_relations, f, indent=2)

### 4. KNOWLEDGE GRAPH BUILDING

We will work with the entities from the crf extraction, saved in crf_entities.json as well as with the relations saved in core_web_relations.json.

In [None]:
EX = Namespace("http://example.org/")
g = Graph()

with open("core_web_relations.json", "r") as relations_file:
    relations = json.load(relations_file)

with open("crf_entities.json", "r") as entities_file:
    entities = json.load(entities_file)

In [None]:
# Helper function to make entity names URI-safe
def make_uri_safe(name):
    return urllib.parse.quote(name.replace(" ", "_"))

This function will resolve the entities name problem as there are spaces in the .json

In [None]:
for relation in relations:
    subject, predicate, obj = relation

    # Add triples to the graph
    g.add((URIRef(EX[subject]), URIRef(EX[predicate]), URIRef(EX[obj])))

for entity in entities:
    entity_name = entity.get("entity", None)
    entity_label = entity.get("label", None)

    if entity_name and entity_label:
        # Make the entity name URI-safe
        safe_entity_name = make_uri_safe(entity_name)
        # Add the entity with its type (label) to the graph
        g.add((URIRef(EX[safe_entity_name]), RDF.type, URIRef(EX[entity_label])))

In [None]:
query = """
SELECT ?subject ?predicate ?object
WHERE {
    ?subject ?predicate ?object .
}
"""
results = g.query(query)
for row in results:
    print(f"Subject: {row.subject}, Predicate: {row.predicate}, Object: {row.object}")

Subject: http://example.org/JACKSONVILLE, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://example.org/LOC
Subject: http://example.org/Healy, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://example.org/PER
Subject: http://example.org/Aleksander_Kwasniewski, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://example.org/PER
Subject: http://example.org/Andersson, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://example.org/PER
Subject: http://example.org/Marcelo, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://example.org/PER
Subject: http://example.org/Gabon, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://example.org/LOC
Subject: http://example.org/Dean_Holdsworth, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://example.org/PER
Subject: http://example.org/26%3D_Michaela, Predicate: http://www.w3.org/1999/

We decided to serialize the graph to a .ttl file

In [None]:
g.serialize("knowledge_graph.ttl", format="turtle")
num_triples = len(g)
print(f"The graph contains {num_triples} triples.")

The graph contains 1912 triples.


Now we want to run the programm on the following text : Star Wars IV is a Movie where there are different kinds of creatures, like humans and wookies. Some creatures are Jedis; for instance, the human Luke is a Jedi, and Master Yoda – for whom the species is not known – is also a Jedi. The wookie named Chewbacca is Han’s co-pilot on the Millennium Falcon starship. The speed of Millennium Falcon is 1.5 (above the speed of light!)

In [None]:
# Define a namespace
EX = Namespace("http://sw.org/")

# Create an RDF graph
g = Graph()

# Add entities and their types
g.add((URIRef(EX["Star_Wars_IV"]), RDF.type, URIRef(EX["Movie"])))
g.add((URIRef(EX["humans"]), RDF.type, URIRef(EX["Species"])))
g.add((URIRef(EX["wookies"]), RDF.type, URIRef(EX["Species"])))
g.add((URIRef(EX["Luke"]), RDF.type, URIRef(EX["Human"])))
g.add((URIRef(EX["Luke"]), RDF.type, URIRef(EX["Jedi"])))
g.add((URIRef(EX["Master_Yoda"]), RDF.type, URIRef(EX["Jedi"])))
g.add((URIRef(EX["Master_Yoda"]), RDF.type, URIRef(EX["Unknown_Species"])))
g.add((URIRef(EX["Chewbacca"]), RDF.type, URIRef(EX["Wookie"])))
g.add((URIRef(EX["Millennium_Falcon"]), RDF.type, URIRef(EX["Starship"])))

# Add relations
g.add((URIRef(EX["Chewbacca"]), URIRef(EX["is_co_pilot_of"]), URIRef(EX["Millennium_Falcon"])))
g.add((URIRef(EX["Millennium_Falcon"]), URIRef(EX["has_speed"]), Literal(1.5)))
g.add((URIRef(EX["Millennium_Falcon"]), URIRef(EX["speed_unit"]), Literal("above the speed of light")))

# Serialize the graph to a file (optional)
g.serialize("star_wars_knowledge_graph.ttl", format="turtle")

# SPARQL query to verify the graph
query = """
SELECT ?subject ?predicate ?object
WHERE {
    ?subject ?predicate ?object .
}
"""

# Execute the query
results = g.query(query)

# Print the results
for row in results:
    print(f"Subject: {row.subject}, Predicate: {row.predicate}, Object: {row.object}")

Subject: http://sw.org/Luke, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://sw.org/Human
Subject: http://sw.org/Master_Yoda, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://sw.org/Unknown_Species
Subject: http://sw.org/Star_Wars_IV, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://sw.org/Movie
Subject: http://sw.org/wookies, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://sw.org/Species
Subject: http://sw.org/Master_Yoda, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://sw.org/Jedi
Subject: http://sw.org/Millennium_Falcon, Predicate: http://sw.org/has_speed, Object: 1.5
Subject: http://sw.org/Millennium_Falcon, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://sw.org/Starship
Subject: http://sw.org/humans, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://sw.org/Species
Subject: http://sw.org/Luke, Predic

## TASK 2: PIPELINE FOR KNOWLEDGE GRAPH CONSTRUCTION

### 1. FETCH NEWS ARTICLE

#### Failed attempts at scrapping reuters

In [10]:
def fetch_reuters_articles():
    url = "https://www.reuters.com/world/"
    articles = []

    # Fetch the HTML content of the category page
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    session = requests.Session()
    response = session.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch the category page. Status code: {response.status_code}")
        return articles
    
    # Parse the category page
    soup = BeautifulSoup(response.text, "html.parser")
    for article in soup.find_all("a", attrs={"data-testid": "Link"}, limit=10): #We begin by finding 10 links
        href = article.get("href")
        if href and href.startswith("/"):  # Ensure it's a relative link
            article_url = "https://www.reuters.com" + href

            # Fetch the article page
            article_response = requests.get(article_url)
            if article_response.status_code != 200:
                print(f"Failed to fetch article: {article_url}")
                continue

            # Parse the article page
            article_soup = BeautifulSoup(article_response.text, "html.parser")

            # Extract the title
            title = article_soup.find("h1").get_text(strip=True) if article_soup.find("h1") else "No Title Found"

            # Extract the publication date
            date = article_soup.find("time")
            publication_date = date.get("datetime") if date else "No Date Found"

            # Extract the main content
            content = []
            for paragraph in article_soup.find_all("p"):
                content.append(paragraph.get_text(strip=True))
            main_content = " ".join(content)

            # Store the article data
            articles.append({
                "url": article_url,
                "title": title,
                "publication_date": publication_date,
                "content": main_content
            })

    return articles

In [11]:
articles = fetch_reuters_articles()

Failed to fetch the category page. Status code: 401


We can't seem to succeed at fetching reuters so we will try with selenium and a chrome driver

In [12]:
def fetch_reuters_articles_with_selenium():
    url = "https://www.reuters.com/world/"
    articles = []

    # Set up Selenium WebDriver
    chrome_driver_path = "C:\Program Files\chromedriver-win64\chromedriver.exe"
    service = Service(chrome_driver_path)
    driver = webdriver.Chrome(service=service)

    try:
        # Step 1: Open the category page
        driver.get(url)
        time.sleep(3)  # Wait for the page to load
    
        # Step 2: Extract article links
        soup = BeautifulSoup(driver.page_source, "html.parser")
        for article in soup.find_all("a", attrs={"data-testid": "Link"}, limit=11):  # Adjust the selector if needed
            href = article.get("href")
            if href and href.startswith("/"):  # Ensure it's a relative link
                article_url = "https://www.reuters.com" + href

                # Step 3: Open the article page
                driver.get(article_url)
                time.sleep(3)  # Wait for the page to load

                # Parse the article page
                article_soup = BeautifulSoup(driver.page_source, "html.parser")

                # Extract the title
                title = article_soup.find("h1").get_text(strip=True)

                # Extract the publication date
                date = article_soup.find("time")
                publication_date = date.get("datetime") if date else "No Date Found"

                # Extract the main content
                content = []
                for paragraph in article_soup.find_all("p"):
                    content.append(paragraph.get_text(strip=True))
                main_content = " ".join(content)

                # Store the article data
                articles.append({
                    "url": article_url,
                    "title": title,
                    "publication_date": publication_date,
                    "content": main_content
                })

                # Go back to the category page
                driver.back()
                time.sleep(2)  # Wait for the page to reload

    finally:
        # Close the browser
        driver.quit()

    return articles

In [14]:
def fetch_reuters_articles_selenium():
    # Configure Chrome options for headless browsing (optional)
    options = Options()
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--disable-gpu")  # Disable GPU acceleration
    options.add_argument("--no-sandbox")  # Bypass OS security model
    options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
    # Set up Selenium WebDriver
    chrome_driver_path = "C:\Program Files\chromedriver-win64\chromedriver.exe"
    service = Service(chrome_driver_path)

    # Initialize the Chrome webdriver
    driver = webdriver.Chrome(service=service, options=options)
    
    # URL of Reuters World section
    url = "https://www.reuters.com/world/"

    # Load the page using Selenium
    driver.get(url)

    # Get the page source after JavaScript execution
    page_source = driver.page_source

    # Close the browser
    driver.quit()

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(page_source, "html.parser")
    print(soup.prettify())
    articles = []  # Initialize an empty list to store articles

    # Extract the first 10 articles
    for article in soup.find_all("a", attrs={"data-testid": "Link"}, limit=11):  # Adjust the selector if needed
        href = article.get("href")
        if href and href.startswith("/"):  # Ensure it's a relative link
            article_url = "https://www.reuters.com" + href

            # Step 3: Open the article page
            driver.get(article_url)
            time.sleep(3)  # Wait for the page to load

            # Parse the article page
            article_soup = BeautifulSoup(driver.page_source, "html.parser")

            # Extract the title
            title = article_soup.find("h1").get_text(strip=True)

            # Extract the publication date
            date = article_soup.find("time")
            publication_date = date.get("datetime") if date else "No Date Found"

            # Extract the main content
            content = []
            for paragraph in article_soup.find_all("p"):
                content.append(paragraph.get_text(strip=True))
            main_content = " ".join(content)

            # Store the article data
            articles.append({
                "url": article_url,
                "title": title,
                "publication_date": publication_date,
                "content": main_content
            })

    return articles

In [15]:
articles = fetch_reuters_articles_selenium()

<html lang="fr">
 <head>
  <title>
   reuters.com
  </title>
  <style>
   #cmsg{animation: A 1.5s;}@keyframes A{0%{opacity:0;}99%{opacity:0;}100%{opacity:1;}}
  </style>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
 </head>
 <body style="margin:0">
  <script data-cfasync="false">
   var dd={'rt':'c','cid':'AHrlqAAAAAMAq6OZXDLPCSQATYg8_Q==','hsh':'2013457ADA70C67D6A4123E0A76873','t':'fe','qp':'','s':46743,'e':'bdbff913f6051f687c9ab6f5c8bc9a678f613c05f01ada95bb02880540c43283','host':'geo.captcha-delivery.com','cookie':'SD8S~rnct2IzXM3psZE3MjqsCK7eOZVm3wHgB2UiMh2c1fiZ3a5Zgll1qgopDKLCPHOB5m7X_g8N_H_hhrYTwrDXnzN_g4dyJDpdFyD4hmodyfZgxe~5Ni3Ul6AWQ3I0'}
  </script>
  <script data-cfasync="false" src="https://ct.captcha-delivery.com/c.js">
  </script>
  <iframe border="0" frameborder="0" height="100%" sandbox="allow-scripts allow-same-origin allow-forms" scrolling="yes" src="https://geo.captcha-delivery.com/captcha/?initialCid=AHrlqAAAAAMAq6OZXDLPCSQATYg8_Q%3D%3D&am

In [16]:
def fetch_reuters_articles_selenium2():
    # Configure Chrome options
    options = Options()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("start-maximized")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    
    # Set up Selenium WebDriver
    chrome_driver_path = "C:\\Program Files\\chromedriver-win64\\chromedriver.exe"
    service = Service(chrome_driver_path)

    # Initialize the Chrome webdriver
    driver = webdriver.Chrome(service=service, options=options)
    
    # Open the page
    driver.get("https://www.reuters.com/world/")
    time.sleep(10)  # Wait for CAPTCHA to be solved manually

    # Extract page source after CAPTCHA is solved
    page_source = driver.page_source
    print(page_source)

    driver.quit()
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(page_source, "html.parser")
    print(soup.prettify())
    articles = []  # Initialize an empty list to store articles

    # Extract the first 10 articles
    for article in soup.find_all("a", attrs={"data-testid": "Link"}, limit=11):  # Adjust the selector if needed
        href = article.get("href")
        if href and href.startswith("/"):  # Ensure it's a relative link
            article_url = "https://www.reuters.com" + href

            # Step 3: Open the article page
            driver.get(article_url)
            time.sleep(3)  # Wait for the page to load

            # Parse the article page
            article_soup = BeautifulSoup(driver.page_source, "html.parser")

            # Extract the title
            title = article_soup.find("h1").get_text(strip=True)

            # Extract the publication date
            date = article_soup.find("time")
            publication_date = date.get("datetime") if date else "No Date Found"

            # Extract the main content
            content = []
            for paragraph in article_soup.find_all("p"):
                content.append(paragraph.get_text(strip=True))
            main_content = " ".join(content)

            # Store the article data
            articles.append({
                "url": article_url,
                "title": title,
                "publication_date": publication_date,
                "content": main_content
            })

    return articles

#### Succeded scrapping bbc

We tried a lot of thing to bypass the captcha and other various problem encountered while scrapping reuters.
Thus, we will now try to scrap www.bbc.com

In [156]:
def fetch_bbc_articles():
    chrome_options = Options() 
    chrome_options.add_argument("--headless")  # Run Chrome in headless mode
    chrome_options.add_argument("--no-sandbox")  # Bypass OS security model
    chrome_options.add_argument("--disable-dev-shm-usage") # Overcome limited resource problems

    driver = webdriver.Chrome(options=chrome_options) 
    url = "https://www.bbc.com/sport"
    driver.get(url)

    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")

    articles = []

    # Find all article links
    for article in soup.find_all("a", class_="ssrcss-sxweo-PromoLink exn3ah95", limit=10):
        href = article.get("href")
        if href and href.startswith("/sport"):  # Ensure it's a valid BBC News link
            article_url = f"https://www.bbc.com{href}"

            # Fetch the article page
            article_response = requests.get(article_url)
            article_soup = BeautifulSoup(article_response.content, "html.parser")

            # Extract the title
            title = article_soup.find("h1").get_text(strip=True) if article_soup.find("h1") else "No Title Found"

            # Extract the publication date
            date = article_soup.find("time")
            publication_date = date.get("datetime") if date else "No Date Found"

            # Extract the main content
            content = []
            for paragraph in article_soup.find_all("p"):
                content.append(paragraph.get_text(strip=True))
            main_content = " ".join(content)

            # Store the article data
            articles.append({
                "title": title,
                "url": article_url,
                "publication_date": publication_date,
                "content": main_content,
            })

    driver.quit()
    return articles

In [157]:
articles = fetch_bbc_articles()

In [158]:
len(articles)

10

In [159]:
for article in articles:
    print("Title:", article['title'])
    print("Publication Date:", article['publication_date'])
    print("Content:", article['content'][:200])
    print("URL:", article['url'])

Title: Liverpool title push hits a bump - but when can they win it?
Publication Date: 2025-04-06T17:50:19.811Z
Content: Liverpool's loss at Fulham was their first away league defeat under Arne Slot Liverpool's unbeaten run has come to an end but - barring a remarkable collapse - they will end the season as Premier Leag
URL: https://www.bbc.com/sport/football/articles/cp91x70y209o
Title: Leishman wins first individual LIV title in Miami
Publication Date: 2025-04-06T22:25:04.251Z
Content: Australian Marc Leishman joined LIV Golf in August 2022 LIV Miami, final leaderboard -6M Leishman (Aus);-5C Schwartzel (SA);-4S Garcia (Spa);-3C Ortiz (Mex);-2BDeChambeau (US);-1P Mickelson (US) Selec
URL: https://www.bbc.com/sport/golf/articles/c230n9nn40po
Title: 'Top-tier' Scotland beat Switzerland to take World Curling gold
Publication Date: 2025-03-30T19:58:26.263Z
Content: Bruce Mouat's rink claimed their second world title after their 2023 triumph in Ottawa Scotland took gold at the men's World C

### 2. USE METHODS FORM TASK 1

##### Reorganization and reworking of previous functions

In [164]:
def process_articles(articles):
    # Load spaCy models
    nlp_ner = spacy.load("en_ner_conll03")
    nlp_re = spacy.load("en_core_web_sm")

    # Perform minimal cleaning to preserve the structure of the text
    def clean_text(text):
        print("Original Text:", text)
        # Remove HTML tags
        text = re.sub(r"<.*?>", "", text)
        # Remove excessive whitespace
        text = re.sub(r"\s+", " ", text).strip()
        print("Cleaned Text:", text)
        return text

    # Helper function to extract entities using spaCy NER
    def extract_entities(text, model):
        doc = model(text)
        entities = [{"entity": ent.text, "label": ent.label_} for ent in doc.ents]
        print(entities)
        return entities

    # Helper function to extract relations using spaCy RE
    def extract_relations(text, model):
        doc = model(text)
        relations = []

        for token in doc:
            # Check for a subject-verb-object structure
            if token.dep_ in ("nsubj", "nsubjpass") and token.head.pos_ == "VERB":
                subject = token.text
                predicate = token.head.lemma_
                obj = None

                # Look for the object of the verb
                for child in token.head.children:
                    if child.dep_ in ("dobj", "pobj", "attr"):
                        obj = child.text
                        break

                if obj:
                    relations.append({"subject": subject, "predicate": predicate, "object": obj})

        return relations

    # Process each article
    processed_data = []
    for article in articles:
        cleaned_content = clean_text(article["content"])
        entities = extract_entities(cleaned_content, nlp_ner)
        #entities = extract_entities(article["content"], nlp_ner)
        relations = extract_relations(cleaned_content, nlp_re)
        #relations = extract_relations(article["content"], nlp_re)
        processed_data.append({"title": article["title"], "entities": entities, "relations": relations})

    return processed_data


def build_knowledge_graph(processed_data):
    # Define RDF namespace
    EX = Namespace("http://example.org/")
    g = Graph()

    # Helper function to make entity names URI-safe
    def make_uri_safe(name):
        return urllib.parse.quote(name.replace(" ", "_"))

    # Add entities and relations to the graph
    for data in processed_data:
        for entity in data["entities"]:
            entity_name = make_uri_safe(entity["entity"])
            entity_label = make_uri_safe(entity["label"])
            g.add((URIRef(EX[entity_name]), RDF.type, URIRef(EX[entity_label])))

        for relation in data["relations"]:
            subject = make_uri_safe(relation["subject"])
            predicate = make_uri_safe(relation["predicate"])
            obj = make_uri_safe(relation["object"])
            g.add((URIRef(EX[subject]), URIRef(EX[predicate]), URIRef(EX[obj])))

    # Serialize the graph to a Turtle file
    g.serialize("knowledge_graph.ttl", format="turtle")
    print(f"The graph contains {len(g)} triples.")
    return g

##### Processing, building graph

In [165]:
# Process articles
processed_data = process_articles(articles)



Original Text: Liverpool's loss at Fulham was their first away league defeat under Arne Slot Liverpool's unbeaten run has come to an end but - barring a remarkable collapse - they will end the season as Premier League champions. Defeat at Fulham was the Reds' first in 26 league games and just their second of the campaign. But with seven games remaining, Arne Slot's side require a maximum of 11 points more to secure the title. "If you look at the goals we conceded, I think all three, we could have prevented them," Slot told BBC Match of the Day. "That is not what we usually see and that is probably why we are where we are, because if you make these mistakes more often in this league, you would not have the amount of points we have. "The second half is much more like the team we are. Conceding three goals in the manner we did is not of the standards of Liverpool. That's clear." Despite the setback at Craven Cottage,  a record-equalling 20th top-flight crown is surely a formality now? The

In [166]:
print("Processed Data:", json.dumps(processed_data, indent=2))

Processed Data: [
  {
    "title": "Liverpool title push hits a bump - but when can they win it?",
    "entities": [
      {
        "entity": "Liverpool",
        "label": "ORG"
      },
      {
        "entity": "Fulham",
        "label": "PER"
      },
      {
        "entity": "Arne Slot Liverpool",
        "label": "LOC"
      },
      {
        "entity": "Fulham",
        "label": "PER"
      },
      {
        "entity": "Reds",
        "label": "ORG"
      },
      {
        "entity": "Arne Slot",
        "label": "PER"
      },
      {
        "entity": "Slot",
        "label": "PER"
      },
      {
        "entity": "BBC Match of the Day",
        "label": "ORG"
      },
      {
        "entity": "Liverpool",
        "label": "ORG"
      },
      {
        "entity": "Craven Cottage",
        "label": "LOC"
      },
      {
        "entity": "Premier League",
        "label": "MISC"
      },
      {
        "entity": "Reds",
        "label": "ORG"
      },
      {
        "ent

In [167]:
# Build the knowledge graph
knowledge_graph = build_knowledge_graph(processed_data)

The graph contains 611 triples.


# PART 2