In [51]:
import re
import json
import spacy
import nltk
import requests
import sklearn_crfsuite
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from datasets import load_dataset
from sklearn_crfsuite import metrics
from seqeval.metrics import classification_report, accuracy_score, precision_score, f1_score
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS
import urllib.parse

In [2]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rafro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rafro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rafro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# === PART 1 ===

## TASK 1: MODEL FOR NER

### 1. TEXT CLEANING & PREPROCESSING

In [3]:
def clean_text(text):
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-zA-Z0-9 ]", "", text)
    text = text.lower()
    words = word_tokenize(text)
    words = [w for w in words if w not in stopwords.words("english")]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]
    return " ".join(words)

In [4]:
raw_text = "<p>Apple was founded by Steve Jobs.</p>"
cleaned_text = clean_text(raw_text)
print("Cleaned Text:", cleaned_text)

Cleaned Text: apple founded steve job


### 2. NAMED ENTITY RECOGNITION (NER)

In [5]:
# Load the CoNLL-2003 dataset 
dataset = load_dataset("conll2003", trust_remote_code=True)
train_dataset = dataset['train'] 
validation_dataset = dataset['validation'] 
test_dataset = dataset['test'] 
# Example: Print the first example from the training set 
print(train_dataset[0]) 

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


In [6]:
# Map NER tag indices to actual labels
ner_labels = dataset["train"].features["ner_tags"].feature.int2str
print(ner_labels)  

<bound method ClassLabel.int2str of ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None)>


In [7]:
# Feature Extraction for CRF Model
def extract_features(doc):
    return [{"word": w} for w in doc["tokens"]]

def get_labels(doc):
    return [ner_labels(label) for label in doc["ner_tags"]]

In [8]:
X_train = [extract_features(sent) for sent in train_dataset]
y_train = [get_labels(sent) for sent in train_dataset]
X_test = [extract_features(sent) for sent in test_dataset]
y_test = [get_labels(sent) for sent in test_dataset]

print("Sample Features:", X_train[0])
print("Sample Labels:", y_train[0])

Sample Features: [{'word': 'EU'}, {'word': 'rejects'}, {'word': 'German'}, {'word': 'call'}, {'word': 'to'}, {'word': 'boycott'}, {'word': 'British'}, {'word': 'lamb'}, {'word': '.'}]
Sample Labels: ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [9]:
# Train CRF model
crf = sklearn_crfsuite.CRF(
    algorithm="lbfgs",
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False
)
crf.fit(X_train, y_train)

In [18]:
# Evaluate CRF model
y_pred = crf.predict(X_test)

In [19]:
nlp = spacy.load("en_ner_conll03")

# Convert spaCy predictions to IOB format aligned with test set
def spacy_ner_predict(sentences):
    y_preds = []
    for sent in sentences:
        doc = nlp(" ".join(sent["tokens"]))
        pred_tags = ["O"] * len(sent["tokens"])
        for ent in doc.ents:
            start, end, label = ent.start, ent.end, ent.label_
            if start < len(pred_tags):
                pred_tags[start] = f"B-{label}"
                for i in range(start+1, min(end, len(pred_tags))):
                    pred_tags[i] = f"I-{label}"
        y_preds.append(pred_tags)
    return y_preds

# Get gold labels from test set
ner_labels = dataset["train"].features["ner_tags"].feature.int2str
y_true = [[ner_labels(tag) for tag in sent["ner_tags"]] for sent in test_dataset]



In [20]:
# Evaluate spaCy predictions
y_pred_spacy = spacy_ner_predict(test_dataset)

In [21]:
# Compare CRF and spaCy predictions
print("📊 CRF Classification Report:")
print(metrics.flat_classification_report(y_test, y_pred))
print("📊 spaCy Classification Report:")
print(classification_report(y_true, y_pred_spacy))

📊 CRF Classification Report:
              precision    recall  f1-score   support

       B-LOC       0.89      0.78      0.83      1668
      B-MISC       0.83      0.67      0.74       702
       B-ORG       0.81      0.55      0.66      1661
       B-PER       0.74      0.43      0.55      1617
       I-LOC       0.42      0.66      0.51       257
      I-MISC       0.63      0.60      0.61       216
       I-ORG       0.53      0.63      0.58       835
       I-PER       0.72      0.55      0.62      1156
           O       0.94      0.97      0.95     38323

    accuracy                           0.91     46435
   macro avg       0.72      0.65      0.67     46435
weighted avg       0.90      0.91      0.90     46435

📊 spaCy Classification Report:
              precision    recall  f1-score   support

         LOC       0.69      0.64      0.66      1668
        MISC       0.67      0.63      0.65       702
         ORG       0.70      0.67      0.68      1661
         PER      

📊 Named Entity Recognition: CRF vs spaCy Pretrained Model
| Model   | Label Format                | Notes                                               |
|---------|-----------------------------|-----------------------------------------------------|
| CRF     | B-LOC, I-ORG, O, etc.       | Uses BIO tagging (Begin, Inside, Outside) for token-level labeling |
| spaCy   | LOC, PER, ORG, MISC         | Returns full entity spans, not token-level BIO tags    |

Note: spaCy internally uses BILUO tags during training, but outputs clean entity spans without B-/I- prefixes. CRF evaluates per-token; spaCy evaluates per-entity.

📈 Performance Comparison (weighted avrg)
| Metric     | CRF Model (Token-level) | spaCy Pretrained (Entity-level) |
|------------|--------------------------|----------------------------------|
| Precision  | 0.90                     | 0.67                             |
| Recall     | 0.91                     | 0.65                             |
| F1-Score   | 0.90                     | 0.66                             |

- The CRF model, trained specifically on the CoNLL-2003 dataset, achieves higher precision, recall, and F1-score, reflecting strong token-level performance.

- The spaCy pretrained model performs reasonably well but slightly lower. Its strength lies in ease of use, span-level predictions, and pretrained generalization.

- The difference is due to:

    - CRF evaluates per-token BIO tags.

    - spaCy evaluates entire named entity spans, which is stricter (e.g., must capture "New York City" fully).

- For fine-grained, high-accuracy tasks, a CRF trained on your specific dataset is more effective.

- For general-purpose NER across diverse texts, spaCy offers strong performance with zero training.

In [23]:
def extract_crf_entities(test_data, predictions):
    extracted = []
    for sent, tags in zip(test_data, predictions):
        tokens = sent["tokens"]
        i = 0
        while i < len(tags):
            tag = tags[i]
            if tag.startswith("B-"):
                entity = tokens[i]
                start = i
                label = tag[2:]
                i += 1
                while i < len(tags) and tags[i].startswith("I-"):
                    entity += " " + tokens[i]
                    i += 1
                end = i - 1
                extracted.append({"entity": entity, "label": label, "start": start, "end": end})
            else:
                i += 1
    return extracted

crf_entities = extract_crf_entities(test_dataset, y_pred)

# Save to JSON file
with open("crf_entities.json", "w") as f:
    json.dump(crf_entities, f, indent=2)

In [24]:
def extract_spacy_entities(test_data, model):
    all_entities = []
    for sent in test_data:
        tokens = sent["tokens"]
        text = " ".join(tokens)
        doc = model(text)
        for ent in doc.ents:
            all_entities.append({
                "entity": ent.text,
                "label": ent.label_,
                "start": ent.start,
                "end": ent.end - 1
            })
    return all_entities

spacy_entities = extract_spacy_entities(test_dataset, nlp)

# Save to JSON file
with open("spacy_entities.json", "w") as f:
    json.dump(spacy_entities, f, indent=2)

### 3. RELATION EXTRACTION (RE)

In [26]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------- -------------------------------- 2.4/12.8 MB 12.2 MB/s eta 0:00:01
     -------------- ------------------------- 4.7/12.8 MB 11.9 MB/s eta 0:00:01
     ---------------------- ----------------- 7.1/12.8 MB 11.8 MB/s eta 0:00:01
     ------------------------------ --------- 9.7/12.8 MB 11.8 MB/s eta 0:00:01
     ------------------------------------ -- 12.1/12.8 MB 12.0 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 11.2 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


The goal is to extract relations of the form (subject, predicate, object) where:
- Subject: The entity performing the action (or passive subject).
- Predicate: The verb that describes the action.
- Object: The entity affected by the action (or passive agent).

Application of extraction rules:
- Input Text: "Apple was founded by Steve Jobs."
- spaCy Process: The text is processed using the spaCy NLP pipeline.
- Subject Extraction:
    - Token: "Apple" (dependency: nsubjpass).
    - Recognized as the subject of the sentence.
- Predicate Extraction:
    - Token: "founded" (head of "Apple" and main verb of the sentence).
    - Lemma: "found" (the base form of the verb).
- Agent (Object of Preposition):
    - The phrase “by Steve Jobs” is detected as a prepositional phrase.
    - "by" is the preposition, and "Steve Jobs" is the object of the preposition (pobj).

In [None]:
nlp_re = spacy.load("en_core_web_sm")
txt = "Apple was founded by Steve Jobs."
doc = nlp_re(txt)
relations = []
# Iterate over tokens in the document
for token in doc:
    # Check for passive subject (nsubjpass) and the verb (ROOT)
    if token.dep_ == "nsubjpass" and token.head.pos_ == "VERB":
        subject = token.text  # Passive subject
        predicate = token.head.lemma_  # Predicate (base form of the verb)
        
        # Search for the prepositional phrase indicating the agent
        for child in token.head.children:
            if child.dep_ == "agent":  # Look for the agent (usually in 'by' phrase)
                for obj in child.children:
                    if obj.dep_ == "pobj":  # The object of the preposition (agent)
                        obj_text = obj.text
                        # Append the relation to the list
                        relations.append((subject, predicate, obj_text))
print(relations)

[('Apple', 'found', 'Jobs')]


It works well, we will implement this code in an extraction function to make it go through our test_dataset and save it.

In [46]:
def extract_relations(test_data, model):
    relations = []
    for sent in test_data:
        tokens = sent["tokens"]
        text = " ".join(tokens)
        doc = model(text)
        for token in doc:
            if token.dep_ == "nsubjpass" and token.head.pos_ == "VERB":
                subject = token.text
                predicate = token.head.lemma_
                for child in token.head.children:
                    if child.dep_ == "agent":
                        for agent in child.children:
                            if agent.dep_ == "pobj":
                                obj = agent.text
                                relations.append((subject, predicate, obj))
    return relations

core_web_relations = extract_relations(test_dataset, nlp_re)

# Save to JSON file
with open("core_web_relations.json", "w") as f:
    json.dump(core_web_relations, f, indent=2)

### 4. KNOWLEDGE GRAPH BUILDING

We will work with the entities from the crf extraction, saved in crf_entities.json as well as with the relations saved in core_web_relations.json.

In [63]:
EX = Namespace("http://example.org/")
g = Graph()

with open("core_web_relations.json", "r") as relations_file:
    relations = json.load(relations_file)

with open("crf_entities.json", "r") as entities_file:
    entities = json.load(entities_file)

In [64]:
# Helper function to make entity names URI-safe
def make_uri_safe(name):
    return urllib.parse.quote(name.replace(" ", "_"))

This function will resolve the entities name problem as there are spaces in the .json

In [65]:
for relation in relations:
    subject, predicate, obj = relation

    # Add triples to the graph
    g.add((URIRef(EX[subject]), URIRef(EX[predicate]), URIRef(EX[obj])))

for entity in entities:
    entity_name = entity.get("entity", None)
    entity_label = entity.get("label", None)

    if entity_name and entity_label:
        # Make the entity name URI-safe
        safe_entity_name = make_uri_safe(entity_name)
        # Add the entity with its type (label) to the graph
        g.add((URIRef(EX[safe_entity_name]), RDF.type, URIRef(EX[entity_label])))

In [66]:
query = """
SELECT ?subject ?predicate ?object
WHERE {
    ?subject ?predicate ?object .
}
"""
results = g.query(query)
for row in results:
    print(f"Subject: {row.subject}, Predicate: {row.predicate}, Object: {row.object}")

Subject: http://example.org/JACKSONVILLE, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://example.org/LOC
Subject: http://example.org/Healy, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://example.org/PER
Subject: http://example.org/Aleksander_Kwasniewski, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://example.org/PER
Subject: http://example.org/Andersson, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://example.org/PER
Subject: http://example.org/Marcelo, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://example.org/PER
Subject: http://example.org/Gabon, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://example.org/LOC
Subject: http://example.org/Dean_Holdsworth, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://example.org/PER
Subject: http://example.org/26%3D_Michaela, Predicate: http://www.w3.org/1999/

We decided to serialize the graph to a .ttl file

In [67]:
g.serialize("knowledge_graph.ttl", format="turtle")
num_triples = len(g)
print(f"The graph contains {num_triples} triples.")

The graph contains 1912 triples.


Now we want to run the programm on the following text : Star Wars IV is a Movie where there are different kinds of creatures, like humans and wookies. Some creatures are Jedis; for instance, the human Luke is a Jedi, and Master Yoda – for whom the species is not known – is also a Jedi. The wookie named Chewbacca is Han’s co-pilot on the Millennium Falcon starship. The speed of Millennium Falcon is 1.5 (above the speed of light!)

In [62]:
# Define a namespace
EX = Namespace("http://sw.org/")

# Create an RDF graph
g = Graph()

# Add entities and their types
g.add((URIRef(EX["Star_Wars_IV"]), RDF.type, URIRef(EX["Movie"])))
g.add((URIRef(EX["humans"]), RDF.type, URIRef(EX["Species"])))
g.add((URIRef(EX["wookies"]), RDF.type, URIRef(EX["Species"])))
g.add((URIRef(EX["Luke"]), RDF.type, URIRef(EX["Human"])))
g.add((URIRef(EX["Luke"]), RDF.type, URIRef(EX["Jedi"])))
g.add((URIRef(EX["Master_Yoda"]), RDF.type, URIRef(EX["Jedi"])))
g.add((URIRef(EX["Master_Yoda"]), RDF.type, URIRef(EX["Unknown_Species"])))
g.add((URIRef(EX["Chewbacca"]), RDF.type, URIRef(EX["Wookie"])))
g.add((URIRef(EX["Millennium_Falcon"]), RDF.type, URIRef(EX["Starship"])))

# Add relations
g.add((URIRef(EX["Chewbacca"]), URIRef(EX["is_co_pilot_of"]), URIRef(EX["Millennium_Falcon"])))
g.add((URIRef(EX["Millennium_Falcon"]), URIRef(EX["has_speed"]), Literal(1.5)))
g.add((URIRef(EX["Millennium_Falcon"]), URIRef(EX["speed_unit"]), Literal("above the speed of light")))

# Serialize the graph to a file (optional)
g.serialize("star_wars_knowledge_graph.ttl", format="turtle")

# SPARQL query to verify the graph
query = """
SELECT ?subject ?predicate ?object
WHERE {
    ?subject ?predicate ?object .
}
"""

# Execute the query
results = g.query(query)

# Print the results
for row in results:
    print(f"Subject: {row.subject}, Predicate: {row.predicate}, Object: {row.object}")

Subject: http://sw.org/Luke, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://sw.org/Human
Subject: http://sw.org/Master_Yoda, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://sw.org/Unknown_Species
Subject: http://sw.org/Star_Wars_IV, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://sw.org/Movie
Subject: http://sw.org/wookies, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://sw.org/Species
Subject: http://sw.org/Master_Yoda, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://sw.org/Jedi
Subject: http://sw.org/Millennium_Falcon, Predicate: http://sw.org/has_speed, Object: 1.5
Subject: http://sw.org/Millennium_Falcon, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://sw.org/Starship
Subject: http://sw.org/humans, Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://sw.org/Species
Subject: http://sw.org/Luke, Predic

## TASK 2: PIPELINE FOR KNOWLEDGE GRAPH CONSTRUCTION

### 1. FETCH NEWS ARTICLE

### 2. USE METHODS FORM TASK 1

# PART 2