In [14]:
import re
import spacy
import nltk
import requests
import sklearn_crfsuite
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from datasets import load_dataset
from sklearn_crfsuite import metrics
from seqeval.metrics import classification_report, accuracy_score, precision_score, f1_score
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF

In [2]:
# Fetch the NLTK resources used by clean_text: tokenizer models, the English
# stopword list and WordNet (for the lemmatizer).
# "punkt_tab" is the tokenizer data that word_tokenize loads on NLTK 3.9 and
# later; "punkt" is kept so older NLTK installs keep working.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rafro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rafro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rafro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# === 1. TEXT CLEANING & PREPROCESSING ===

In [3]:
def clean_text(text):
    """Normalize raw text/HTML into a lowercase, lemmatized, stopword-free string.

    Steps, in order: strip HTML-like tags, normalize whitespace, drop every
    character that is not an ASCII letter, digit or space, lowercase, tokenize,
    remove English stopwords, lemmatize each remaining token.

    Args:
        text: Input string, possibly containing HTML tags.

    Returns:
        The surviving lemmas joined by single spaces ("" if nothing survives).
    """
    # Replace tags with a space rather than "" so that adjacent elements such
    # as "<p>a</p><p>b</p>" do not fuse into the single word "ab".
    text = re.sub(r"<.*?>", " ", text)
    # Turn newlines/tabs into plain spaces first; otherwise the character
    # filter below deletes them and glues the neighbouring words together.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-zA-Z0-9 ]", "", text)
    text = text.lower()
    words = word_tokenize(text)
    # Load the stopword list once (as a set) instead of once per token.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

In [4]:
# Sanity-check clean_text on a one-sentence HTML snippet.
# Note: the lemmatizer treats the surname "Jobs" like a plural noun, so it
# comes out as "job" in the printed result.
raw_text = "<p>Apple was founded by Steve Jobs.</p>"
cleaned_text = clean_text(raw_text)
print("Cleaned Text:", cleaned_text)

Cleaned Text: apple founded steve job


# === 2. NAMED ENTITY RECOGNITION (NER) ===

In [5]:
# Fetch CoNLL-2003 and pull out its three standard splits.
# NOTE(review): trust_remote_code=True allows the dataset's loading script to
# run locally — confirm this is intended for this environment.
dataset = load_dataset("conll2003", trust_remote_code=True)
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)
# Show one training record to confirm the schema (tokens + integer tag ids).
print(train_dataset[0])

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


In [6]:
# Map NER tag indices to actual labels.
# ner_labels is the ClassLabel feature's bound int2str method, not a list:
# call it with an integer tag id to get the label string (e.g. 3 -> 'B-ORG').
ner_labels = dataset["train"].features["ner_tags"].feature.int2str
# Printing the bound method shows the ClassLabel repr, which lists the label
# names in tag-id order.
print(ner_labels)

<bound method ClassLabel.int2str of ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None)>


In [7]:
# Feature Extraction for CRF Model
def extract_features(doc):
    """Build one feature dict per token of a sentence record.

    Args:
        doc: Mapping with a "tokens" entry holding the sentence's tokens.

    Returns:
        A list, parallel to doc["tokens"], of {"word": token} dicts.
    """
    token_features = []
    for token in doc["tokens"]:
        token_features.append({"word": token})
    return token_features

def get_labels(doc):
    """Translate a sentence record's integer NER tags into label strings.

    Args:
        doc: Mapping with a "ner_tags" entry holding integer tag ids.

    Returns:
        A list of label strings, one per tag id, produced by the
        notebook-level ner_labels callable.
    """
    return list(map(ner_labels, doc["ner_tags"]))

In [8]:
# Per-sentence feature sequences (X_*) and gold label sequences (y_*) for the
# train and test splits.
X_train = list(map(extract_features, train_dataset))
y_train = list(map(get_labels, train_dataset))
X_test = list(map(extract_features, test_dataset))
y_test = list(map(get_labels, test_dataset))

# Show the first training sentence as a quick sanity check.
print("Sample Features:", X_train[0])
print("Sample Labels:", y_train[0])

Sample Features: [{'word': 'EU'}, {'word': 'rejects'}, {'word': 'German'}, {'word': 'call'}, {'word': 'to'}, {'word': 'boycott'}, {'word': 'British'}, {'word': 'lamb'}, {'word': '.'}]
Sample Labels: ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [9]:
# Train CRF model on the per-sentence feature/label sequences built above.
crf = sklearn_crfsuite.CRF(
    algorithm="lbfgs",  # L-BFGS gradient-based training
    c1=0.1,  # L1 regularization coefficient
    c2=0.1,  # L2 regularization coefficient
    max_iterations=100,  # cap on optimizer iterations
    all_possible_transitions=False  # only label transitions seen in training get features
)
crf.fit(X_train, y_train)

In [18]:
# Predict label sequences for the test split with the trained CRF.
# (This cell only generates predictions; the scores are printed in the
# comparison cell further down.)
y_pred = crf.predict(X_test)

In [19]:
# Load the spaCy pipeline used for NER.
# NOTE(review): "en_ner_conll03" is not one of spaCy's stock models — presumably
# a custom pipeline trained on CoNLL-2003 that must be installed in this
# environment; confirm where it comes from.
nlp = spacy.load("en_ner_conll03")

# Convert spaCy predictions to IOB format aligned with test set
def spacy_ner_predict(sentences):
    """Tag pre-tokenized sentences with the spaCy pipeline and return IOB labels.

    Args:
        sentences: Iterable of records, each with a "tokens" list of strings.

    Returns:
        A list with one tag list per sentence. Each tag list has exactly one
        entry per input token: "B-<label>" for the first token of an entity,
        "I-<label>" for the following tokens, and "O" elsewhere.
    """
    y_preds = []
    for sent in sentences:
        tokens = sent["tokens"]
        # Build the Doc from the dataset's own tokens instead of joining them
        # into a string and letting spaCy re-tokenize it. Re-tokenizing can
        # split tokens differently, and then ent.start/ent.end no longer index
        # into the dataset's token list, so tags land on the wrong tokens.
        doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
        # Run every pipeline component on the pre-built Doc (the tokenizer is
        # deliberately skipped).
        for _, component in nlp.pipeline:
            doc = component(doc)
        pred_tags = ["O"] * len(tokens)
        for ent in doc.ents:
            # Entity offsets are token indices into `tokens`, so no bounds
            # clamping is needed.
            pred_tags[ent.start] = f"B-{ent.label_}"
            for i in range(ent.start + 1, ent.end):
                pred_tags[i] = f"I-{ent.label_}"
        y_preds.append(pred_tags)
    return y_preds

# Gold-standard label strings for the test split, one list per sentence.
# ner_labels is re-bound here so this cell does not rely on the earlier
# label-mapping cell having been run.
ner_labels = dataset["train"].features["ner_tags"].feature.int2str
y_true = [list(map(ner_labels, sent["ner_tags"])) for sent in test_dataset]



In [None]:
# Generate spaCy IOB predictions for every test sentence.
# (This cell only generates predictions; the scores are printed in the
# comparison cell below.)
y_pred_spacy = spacy_ner_predict(test_dataset)

In [None]:
# Compare CRF and spaCy predictions
# Both models are scored with seqeval's entity-level classification_report so
# the two tables are directly comparable. Previously the CRF was scored
# token-by-token (including the dominant "O" tag) while spaCy was scored per
# entity, which makes the numbers incomparable.
print("📊 CRF Classification Report:")
# seqeval expects plain lists of tag strings, so normalize each predicted
# sequence to a list first.
print(classification_report(y_test, [list(tags) for tags in y_pred]))
print("📊 spaCy Classification Report:")
print(classification_report(y_true, y_pred_spacy))

In [None]:
# Run the spaCy pipeline on a single raw sentence and list the entities it
# finds as (entity text, entity label) pairs.
txt = "Apple was founded by Steve Jobs."
doc = nlp(txt)
entities = [(ent.text, ent.label_) for ent in doc.ents]
print("Entities:", entities)



Entities: [('Apple', 'ORG'), ('Steve Jobs', 'PER')]


# === 3. RELATION EXTRACTION (RE) ===

# === 4. KNOWLEDGE GRAPH BUILDING ===