# Project Web Datamining & Semantics
## Tom Delahaye - Gabriel Carlotti - DIA 3

### Criterion 1 :  Basic Implementation


#### Functional web scraping script with at least 10 articles

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

def fetch_random_wikipedia_articles(limit=100):
    base_url = "https://en.wikipedia.org/wiki/Special:Random"
    headers = {"User-Agent": "Mozilla/5.0"}
    articles = []

    for i in range(limit):
        try:
            response = requests.get(base_url, headers=headers, allow_redirects=True)
            article_url = response.url  # Final redirected URL
            print(f"[{i+1}] Fetched URL: {article_url}")

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract the title
            title_tag = soup.find('h1', id='firstHeading')
            if not title_tag:
                print(f"[{i+1}] ERROR: No title found, skipping article.")
                continue
            title = title_tag.get_text(strip=True)

            # Extract main content from the article body
            content_div = soup.find('div', class_='mw-parser-output')
            if not content_div:
                print(f"[{i+1}] ERROR: No content container found, skipping article.")
                continue
            paragraphs = content_div.find_all('p', recursive=False)
            content = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
            if len(content) < 200:
                print(f"[{i+1}] INFO: Article content too short, skipping: {title}")
                continue

            # Extract the last modification date from the footer
            last_modified_tag = soup.find('li', id='footer-info-lastmod')
            if last_modified_tag:
                last_modified_text = last_modified_tag.get_text(strip=True)
                # Remove the common prefix using regex if possible
                match = re.search(r'last edited on (.+)', last_modified_text, re.IGNORECASE)
                last_modified = pd.to_datetime(match.group(1).split(',')[0].strip(), format="%d %B %Y").date() if match else 'N/A'

            else:
                last_modified = 'N/A'

            articles.append({
                'title': title,
                'url': article_url,
                'content': content,
                'last_modified': last_modified
            })
            print(f"[{i+1}] SUCCESS: Article '{title}' scraped.")

        except Exception as e:
            print(f"[{i+1}] ERROR: Exception occurred: {e}")
            continue

    print(f"\nSUCCESS: Collected {len(articles)} article(s) successfully.")
    return pd.DataFrame(articles)




In [2]:
# Example execution
df = fetch_random_wikipedia_articles(limit=20)
print(df.head())

[1] Fetched URL: https://en.wikipedia.org/wiki/Doug_Woodward_(American_football)
[1] SUCCESS: Article 'Doug Woodward (American football)' scraped.
[2] Fetched URL: https://en.wikipedia.org/wiki/Lesbian,_Gay,_Bisexual_%26_Transgender_Community_Center
[2] SUCCESS: Article 'Lesbian, Gay, Bisexual & Transgender Community Center' scraped.
[3] Fetched URL: https://en.wikipedia.org/wiki/Mobile_Suit_Gundam:_MS_Sensen_0079
[3] INFO: Article content too short, skipping: Mobile Suit Gundam: MS Sensen 0079
[4] Fetched URL: https://en.wikipedia.org/wiki/Chronicon_Faventinum
[4] SUCCESS: Article 'Chronicon Faventinum' scraped.
[5] Fetched URL: https://en.wikipedia.org/wiki/Sugano_Dam
[5] INFO: Article content too short, skipping: Sugano Dam
[6] Fetched URL: https://en.wikipedia.org/wiki/Neocollyris_coapteroides
[6] INFO: Article content too short, skipping: Neocollyris coapteroides
[7] Fetched URL: https://en.wikipedia.org/wiki/Koko,_Benin
[7] INFO: Article content too short, skipping: Koko, Benin
[

In [3]:
df

Unnamed: 0,title,url,content,last_modified
0,Doug Woodward (American football),https://en.wikipedia.org/wiki/Doug_Woodward_(A...,"Doug Woodward(born September 12, 1958) is a fo...",2024-10-03
1,"Lesbian, Gay, Bisexual & Transgender Community...","https://en.wikipedia.org/wiki/Lesbian,_Gay,_Bi...","TheLesbian, Gay, Bisexual & Transgender Commun...",2024-09-24
2,Chronicon Faventinum,https://en.wikipedia.org/wiki/Chronicon_Favent...,TheChronicon Faventinum[1]is aLatinchronicle o...,2025-02-15
3,Innovative Hockey,https://en.wikipedia.org/wiki/Innovative_Hockey,"Innovative Hockey, Inc.was an American manufac...",2025-02-12
4,Chester Alan Arthur II,https://en.wikipedia.org/wiki/Chester_Alan_Art...,"Chester Alan Arthur II(July 25, 1864 – July 18...",2025-03-17
5,Hot Jazz Saturday Night,https://en.wikipedia.org/wiki/Hot_Jazz_Saturda...,Hot Jazz Saturday Nightis a weeklyjazzprogram ...,2025-03-02
6,Cornelis Symonsz van der Schalcke,https://en.wikipedia.org/wiki/Cornelis_Symonsz...,"Cornelis Symonsz van der Schalcke, orSchalcken...",2022-07-26
7,2000 Heineken Trophy – Men's doubles,https://en.wikipedia.org/wiki/2000_Heineken_Tr...,Since the 1999 final was cancelled due to rain...,2023-06-14


In [4]:
print(df.url[0])

https://en.wikipedia.org/wiki/Doug_Woodward_(American_football)


#### Clean text preprocessing (no HTML tags, proper sentence structure)

In [32]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from google import genai
from google.genai import types

# Download necessary NLTK data (only run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def tokenize_text(text):
    """
    Tokenizes the input text and returns a list of tokens.
    Each token is either a word or a punctuation mark.
    
    Example:
    Input: "Oleg Shatskiku made sure of the win in injury time, hitting an unstoppable left foot shot from just outside the area."
    Output: ['Oleg', 'Shatskiku', 'made', 'sure', 'of', 'the', 'win', 'in', 'injury', 'time', ',', 'hitting', 'an', 'unstoppable', 'left', 'foot', 'shot', 'from', 'just', 'outside', 'the', 'area', '.']
    """
    # This regex matches one or more word characters OR a single character that is not a word character or whitespace.
    tokens = re.findall(r"\w+|[^\w\s]", text)
    return tokens


def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation except the hyphen (-)
    punctuation_to_remove = string.punctuation.replace('-', '')
    translator = str.maketrans('', '', punctuation_to_remove)
    text = text.translate(translator)
    
    # Remove numbers (optional)
    text = re.sub(r'\d+', '', text)
    
    # Remove extra whitespace
    text = " ".join(text.split())
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Re-join tokens into a single string
    processed_text = " ".join(tokens)
    return processed_text

def preprocess_llm(text):
    """
    Takes a DataFrame with a 'content' column and adds a new column 
    'text_processed' that contains the cleaned text.
    """
    sys_prompt="""
    You are a specialist in preprocessing text data. You will be given an article from Wikipedia and youre goal is to clean the text data by 
    keeping only the english words and separing eventualy the words that has been sticked together for instance Guiananbotanist must be Guinan botanist.
    You also need to preserve the ponctuation and correct it if necessary.
    You only return the text cleaned without commentaries, nothing else ! even if the text is already clean.
    """

    client = genai.Client(api_key="AIzaSyC_FCK8XmvLUKJ5y3K7m0dBk6u0J__44cg")

    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=f"System prompt :{sys_prompt}. Here is the text you have to clean :{text}"
        
    )
    return response.text
    

def preprocess_dataframe(df):
    """
    Takes a DataFrame with a 'content' column and adds a new column 
    'text_processed' that contains the cleaned text.
    """
    df['text_processed_llm'] = df['content'].apply(preprocess_llm)
    # df['tokens'] = df['text_processed_llm'].apply(tokenize_text)
    # df['text_processed'] = df['text_processed_llm'].apply(preprocess_text)
    return df




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gabriel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gabriel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gabriel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
df_propre = preprocess_dataframe(df)

In [7]:
df_propre

Unnamed: 0,title,url,content,last_modified,text_processed_llm,tokens
0,Doug Woodward (American football),https://en.wikipedia.org/wiki/Doug_Woodward_(A...,"Doug Woodward(born September 12, 1958) is a fo...",2024-10-03,Doug Woodward born September is a former Ameri...,"[Doug, Woodward, born, September, is, a, forme..."
1,"Lesbian, Gay, Bisexual & Transgender Community...","https://en.wikipedia.org/wiki/Lesbian,_Gay,_Bi...","TheLesbian, Gay, Bisexual & Transgender Commun...",2024-09-24,The Lesbian Gay Bisexual Transgender Community...,"[The, Lesbian, Gay, Bisexual, Transgender, Com..."
2,Chronicon Faventinum,https://en.wikipedia.org/wiki/Chronicon_Favent...,TheChronicon Faventinum[1]is aLatinchronicle o...,2025-02-15,The Chronicon Faventinum is a Latin chronicle ...,"[The, Chronicon, Faventinum, is, a, Latin, chr..."
3,Innovative Hockey,https://en.wikipedia.org/wiki/Innovative_Hockey,"Innovative Hockey, Inc.was an American manufac...",2025-02-12,Innovative Hockey Inc was an American manufact...,"[Innovative, Hockey, Inc, was, an, American, m..."
4,Chester Alan Arthur II,https://en.wikipedia.org/wiki/Chester_Alan_Art...,"Chester Alan Arthur II(July 25, 1864 – July 18...",2025-03-17,Chester Alan Arthur July July also known as Al...,"[Chester, Alan, Arthur, July, July, also, know..."
5,Hot Jazz Saturday Night,https://en.wikipedia.org/wiki/Hot_Jazz_Saturda...,Hot Jazz Saturday Nightis a weeklyjazzprogram ...,2025-03-02,Hot Jazz Saturday Night is a weekly jazz progr...,"[Hot, Jazz, Saturday, Night, is, a, weekly, ja..."
6,Cornelis Symonsz van der Schalcke,https://en.wikipedia.org/wiki/Cornelis_Symonsz...,"Cornelis Symonsz van der Schalcke, orSchalcken...",2022-07-26,Cornelis Symonsz van der Schalcke or Schalcken...,"[Cornelis, Symonsz, van, der, Schalcke, or, Sc..."
7,2000 Heineken Trophy – Men's doubles,https://en.wikipedia.org/wiki/2000_Heineken_Tr...,Since the 1999 final was cancelled due to rain...,2023-06-14,Since the final was cancelled due to rain no d...,"[Since, the, final, was, cancelled, due, to, r..."


#### Complete pipeline from scraping to graph construction

### Criterion 2 : Named Entity Recognition

#### Comparison between model CRF and Spacy conll23

In [8]:
import re
import string
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from datasets import load_dataset
import spacy
from spacy.training import Example
from spacy.scorer import Scorer

# Download required NLTK resources
nltk.download('punkt')

#########################################
# 1. Data Preparation for CRF Training
#########################################

# Load the CoNLL-2003 dataset
dataset = load_dataset("conll2003",trust_remote_code=True)
train_data = dataset["train"]
test_data = dataset["test"]

# Get label names for NER and POS tags
ner_names = train_data.features["ner_tags"].feature.names
pos_names = train_data.features["pos_tags"].feature.names

# Prepare sentences as a list of tuples: (token, pos, ner)
def prepare_sent(sent):
    tokens = sent["tokens"]
    pos_tags = [pos_names[tag] for tag in sent["pos_tags"]]
    ner_tags = [ner_names[tag] for tag in sent["ner_tags"]]
    return list(zip(tokens, pos_tags, ner_tags))

train_sents = [prepare_sent(sent) for sent in train_data]
test_sents = [prepare_sent(sent) for sent in test_data]

# Feature extraction functions for CRF

def word2features(sent, i):
    token, pos, ner = sent[i]
    features = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'pos': pos,
        'pos[:2]': pos[:2],
    }
    if i > 0:
        token1, pos1, _ = sent[i-1]
        features.update({
            '-1:word.lower()': token1.lower(),
            '-1:word.istitle()': token1.istitle(),
            '-1:word.isupper()': token1.isupper(),
            '-1:pos': pos1,
            '-1:pos[:2]': pos1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        token1, pos1, _ = sent[i+1]
        features.update({
            '+1:word.lower()': token1.lower(),
            '+1:word.istitle()': token1.istitle(),
            '+1:word.isupper()': token1.isupper(),
            '+1:pos': pos1,
            '+1:pos[:2]': pos1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for _, _, label in sent]

def sent2tokens(sent):
    return [token for token, _, _ in sent]

# Prepare training and test features/labels for CRF
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

#########################################
# 2. Train and Evaluate the CRF Model
#########################################

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False
)

print("Training CRF model...")
crf.fit(X_train, y_train)
print("CRF training complete.\n")

y_pred = crf.predict(X_test)
print("CRF Model Evaluation:")
print(metrics.flat_classification_report(y_test, y_pred, digits=3))

#########################################
# 3. Evaluate spaCy's Pre-trained NER Model on the Test Dataset
#########################################

# Load spaCy's pre-trained NER model 
# (Ensure the model has been unzipped in "./en_ner_conll03")
print("\nLoading spaCy's pre-trained NER model...")
nlp = spacy.load("./best_ner_model")

# Helper function to join tokens into a single text string
def tokens_to_text(tokens):
    return " ".join(tokens)

# Function to extract entity spans (with character offsets) from token-level BIO labels.
# Returns a list of tuples: (entity_text, start_char, end_char, label)
def extract_entities(tokens, labels):
    entities = []
    current_entity = None
    current_label = None
    start_idx = None
    text = tokens_to_text(tokens)
    char_offset = 0
    token_offsets = []
    for token in tokens:
        start = text.find(token, char_offset)
        end = start + len(token)
        token_offsets.append((start, end))
        char_offset = end

    for i, (token, label) in enumerate(zip(tokens, labels)):
        if label.startswith("B-"):
            if current_entity is not None:
                entities.append((current_entity, token_offsets[start_idx][0], token_offsets[i-1][1], current_label))
            current_entity = token
            current_label = label[2:]
            start_idx = i
        elif label.startswith("I-") and current_entity is not None:
            current_entity += " " + token
        else:
            if current_entity is not None:
                entities.append((current_entity, token_offsets[start_idx][0], token_offsets[i-1][1], current_label))
                current_entity = None
                current_label = None
    if current_entity is not None:
        entities.append((current_entity, token_offsets[start_idx][0], token_offsets[-1][1], current_label))
    return entities

# Evaluate spaCy on the test dataset using spaCy's Scorer and Example classes.
def evaluate_spacy_model(test_sents, nlp):
    examples = []
    for sent in test_sents:
        tokens = sent2tokens(sent)
        text = tokens_to_text(tokens)
        gold_labels = sent2labels(sent)
        gold_entities = extract_entities(tokens, gold_labels)
        # Convert to spaCy format: list of (start, end, label)
        gold_spans = [(start, end, label) for _, start, end, label in gold_entities]
        doc = nlp.make_doc(text)
        gold_dict = {"entities": gold_spans}
        example = Example.from_dict(doc, gold_dict)
        examples.append(example)
    # Process all examples to get predictions
    for example in examples:
        example.predicted = nlp(example.reference.text)
    scorer = Scorer()
    scores = scorer.score(examples)
    return scores

spacy_scores = evaluate_spacy_model(test_sents, nlp)
print("\nspaCy Model Evaluation on Test Dataset:")
print(f"Precision: {spacy_scores.get('ents_p', 0):.3f}")
print(f"Recall:    {spacy_scores.get('ents_r', 0):.3f}")
print(f"F1-score:  {spacy_scores.get('ents_f', 0):.3f}")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gabriel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Training CRF model...
CRF training complete.

CRF Model Evaluation:
              precision    recall  f1-score   support

       B-LOC      0.860     0.814     0.836      1668
      B-MISC      0.810     0.758     0.783       702
       B-ORG      0.771     0.730     0.750      1661
       B-PER      0.830     0.856     0.843      1617
       I-LOC      0.768     0.607     0.678       257
      I-MISC      0.667     0.667     0.667       216
       I-ORG      0.665     0.740     0.701       835
       I-PER      0.872     0.952     0.910      1156
           O      0.988     0.989     0.989     38323

    accuracy                          0.956     46435
   macro avg      0.803     0.790     0.795     46435
weighted avg      0.956     0.956     0.956     46435


Loading spaCy's pre-trained NER model...





spaCy Model Evaluation on Test Dataset:
Precision: 0.785
Recall:    0.772
F1-score:  0.779


In [69]:
import re
import string
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from datasets import load_dataset
import spacy
from spacy.training import Example
from spacy.scorer import Scorer

def fit_crf():


    # Download required NLTK resources
    nltk.download('punkt')

    #########################################
    # 1. Data Preparation for CRF Training
    #########################################

    # Load the CoNLL-2003 dataset
    dataset = load_dataset("conll2003",trust_remote_code=True)
    train_data = dataset["train"]
    test_data = dataset["test"]

    # Get label names for NER and POS tags
    ner_names = train_data.features["ner_tags"].feature.names
    pos_names = train_data.features["pos_tags"].feature.names

    # Prepare sentences as a list of tuples: (token, pos, ner)
    def prepare_sent(sent):
        tokens = sent["tokens"]
        pos_tags = [pos_names[tag] for tag in sent["pos_tags"]]
        ner_tags = [ner_names[tag] for tag in sent["ner_tags"]]
        return list(zip(tokens, pos_tags, ner_tags))

    train_sents = [prepare_sent(sent) for sent in train_data]
    test_sents = [prepare_sent(sent) for sent in test_data]

    # Feature extraction functions for CRF

    def word2features(sent, i):
        token, pos, ner = sent[i]
        features = {
            'bias': 1.0,
            'word.lower()': token.lower(),
            'word[-3:]': token[-3:],
            'word[-2:]': token[-2:],
            'word.isupper()': token.isupper(),
            'word.istitle()': token.istitle(),
            'word.isdigit()': token.isdigit(),
            'pos': pos,
            'pos[:2]': pos[:2],
        }
        if i > 0:
            token1, pos1, _ = sent[i-1]
            features.update({
                '-1:word.lower()': token1.lower(),
                '-1:word.istitle()': token1.istitle(),
                '-1:word.isupper()': token1.isupper(),
                '-1:pos': pos1,
                '-1:pos[:2]': pos1[:2],
            })
        else:
            features['BOS'] = True

        if i < len(sent)-1:
            token1, pos1, _ = sent[i+1]
            features.update({
                '+1:word.lower()': token1.lower(),
                '+1:word.istitle()': token1.istitle(),
                '+1:word.isupper()': token1.isupper(),
                '+1:pos': pos1,
                '+1:pos[:2]': pos1[:2],
            })
        else:
            features['EOS'] = True

        return features

    def sent2features(sent):
        return [word2features(sent, i) for i in range(len(sent))]

    def sent2labels(sent):
        return [label for _, _, label in sent]

    def sent2tokens(sent):
        return [token for token, _, _ in sent]

    # Prepare training and test features/labels for CRF
    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]

    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]

    #########################################
    # 2. Train and Evaluate the CRF Model
    #########################################

    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=False
    )

    print("Training CRF model...")
    crf.fit(X_train, y_train)
    print("CRF training complete.\n")

    return crf

#### Number of unique entities identified (>50) 

In [9]:
def extract_entities_from_text_crf(text, crf_model):
    """
    Given a preprocessed text string and a trained CRF model, this function:
      - Tokenizes the text.
      - Obtains POS tags for each token.
      - Builds features for each token.
      - Uses the CRF model to predict BIO labels.
      - Converts the token-level predictions into entity spans (with positions).
    Returns:
      - A list of tuples (entity_text, start_char, end_char, label)
      - The number of entities extracted.
    """
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk import pos_tag

    # Tokenize the text.
    tokens = word_tokenize(text)
    
    # Attempt to get POS tags. If not found, download the required resource.
    try:
        pos_tags = [tag for (_, tag) in pos_tag(tokens)]
    except LookupError:
        print("POS tagger resource not found. Downloading 'averaged_perceptron_tagger_eng' now...")
        nltk.download('averaged_perceptron_tagger_eng')
        pos_tags = [tag for (_, tag) in pos_tag(tokens)]
    
    # Create a sentence structure: (token, pos, dummy_ner)
    sent = list(zip(tokens, pos_tags, ["O"] * len(tokens)))
    
    # Extract features using the previously defined sent2features function.
    features = sent2features(sent)
    
    # Predict BIO labels with the CRF model.
    predicted_labels = crf_model.predict([features])[0]
    
    # Extract entities using the previously defined extract_entities function.
    entities = extract_entities(tokens, predicted_labels)
    
    return entities, len(entities)


In [21]:
preprocessed_text = df_propre.text_processed_llm[3]
entities, num_entities = extract_entities_from_text_crf(preprocessed_text, crf)
print("Extracted Entities:", entities)
print("Number of Entities:", num_entities)


Extracted Entities: [('Innovative Hockey Inc', 0, 21, 'ORG'), ('American', 29, 37, 'MISC'), ('California', 121, 131, 'LOC'), ('Ronald H Kunisaki', 135, 152, 'PER'), ('Tijuana Mexico Innovative', 179, 204, 'LOC'), ('Later Innovative', 285, 301, 'PER'), ('Detroit Red Wings', 346, 363, 'ORG'), ('Warrior Sports', 473, 487, 'ORG'), ('Innovative Hockey', 551, 568, 'PER'), ('Innovative Sports Technologies Inc', 583, 617, 'ORG'), ('Kunisaki Richard Carr and Kirk S Oshinomi', 638, 679, 'ORG'), ('Los Angeles Kings', 782, 799, 'ORG'), ('Dave Taylor', 807, 818, 'PER'), ('Kunisaki', 823, 831, 'PER'), ('Kunisaki', 916, 924, 'PER'), ('Innovative Hockey Inc', 933, 954, 'ORG'), ('Alexei Kovalev', 1023, 1037, 'PER'), ('Sergei Fedorov', 1061, 1075, 'PER'), ('Innovative', 1196, 1206, 'PER'), ('Bauer', 1226, 1231, 'PER'), ('Bauer', 1374, 1379, 'PER'), ('Innovative', 1440, 1450, 'PER'), ('Tijuana Innovative', 1514, 1532, 'LOC'), ('Detroit Red Wings', 1585, 1602, 'ORG'), ('Stanley Cup', 1674, 1685, 'MISC'), 

### Criterion 3 :  Relation Extraction


#### Basic relation extraction implementation & custom rules


In [None]:
def extract_relations_and_render(text):
    """
    Extracts relation triples from the input text and renders the dependency parse using spaCy's displacy.
    
    Extraction Rules:
    
    Base Rule:
      - Identify tokens with dependency 'nsubj' or 'nsubjpass' whose head is the ROOT verb.
      - The subject is taken as the token (with possible compound modifiers) and the predicate is the head's text.
    
    Custom Rule 1 – Prepositional Relation:
      - For each subject, search among the predicate's children for tokens with dependency 'prep' or 'agent'.
      - Then, for each such token, check for children with dependency 'pobj' (object of the preposition) and extract them as the object.
    
    Custom Rule 2 – Direct Object:
      - If no object is found via a prepositional phrase, look for a direct object (dependency 'dobj') of the predicate.
    
    Custom Rule 3 – Compound Entity:
      - Merge a token with its left-sibling tokens with a 'compound' dependency.
      - This is implemented in the helper function get_compound_phrase.
    
    Custom Rule 4 – Multiple Objects:
      - If multiple objects are found, each is added as a separate relation.
    
    Args:
         text (str): The input text to process.
    
    Returns:
         list: A list of relation triples, where each triple is (subject, predicate, object).
    """
    # Load spaCy's pre-trained model.
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    relations = []
    
    # Helper function (Custom Rule 3) to merge compound tokens with the main token.
    def get_compound_phrase(token):
        compounds = [child.text for child in token.lefts if child.dep_ == "compound"]
        return " ".join(compounds + [token.text]) if compounds else token.text
    
    # Iterate over tokens in the doc to extract relations.
    for token in doc:
        # Base extraction: token is a subject with head being the main verb.
        if token.dep_ in ("nsubj", "nsubjpass") and token.head.dep_ == "ROOT":
            subject = get_compound_phrase(token)
            predicate = token.head.text
            objects = []
            
            # Custom Rule 1: Look for objects in prepositional or agent phrases.
            for child in token.head.children:
                if child.dep_ in ("prep", "agent"):
                    for obj in child.children:
                        if obj.dep_ == "pobj":
                            objects.append(get_compound_phrase(obj))
            
            # Custom Rule 2: If no objects were found, check for a direct object.
            if not objects:
                for child in token.head.children:
                    if child.dep_ == "dobj":
                        objects.append(get_compound_phrase(child))
            
            # Custom Rule 4: Add each object as a separate relation.
            for obj in objects:
                relations.append((subject, predicate, obj))
    
    # Render the dependency parse for visualization.
    displacy.render(doc, style="dep", jupyter=True)
    
    return relations



Extracted Relations: [('Apple', 'founded', 'Steve Jobs')]


In [45]:
df_propre

Unnamed: 0,title,url,content,last_modified,text_processed_llm,tokens
0,Doug Woodward (American football),https://en.wikipedia.org/wiki/Doug_Woodward_(A...,"Doug Woodward(born September 12, 1958) is a fo...",2024-10-03,Doug Woodward born September is a former Ameri...,"[Doug, Woodward, born, September, is, a, forme..."
1,"Lesbian, Gay, Bisexual & Transgender Community...","https://en.wikipedia.org/wiki/Lesbian,_Gay,_Bi...","TheLesbian, Gay, Bisexual & Transgender Commun...",2024-09-24,The Lesbian Gay Bisexual Transgender Community...,"[The, Lesbian, Gay, Bisexual, Transgender, Com..."
2,Chronicon Faventinum,https://en.wikipedia.org/wiki/Chronicon_Favent...,TheChronicon Faventinum[1]is aLatinchronicle o...,2025-02-15,The Chronicon Faventinum is a Latin chronicle ...,"[The, Chronicon, Faventinum, is, a, Latin, chr..."
3,Innovative Hockey,https://en.wikipedia.org/wiki/Innovative_Hockey,"Innovative Hockey, Inc.was an American manufac...",2025-02-12,Innovative Hockey Inc was an American manufact...,"[Innovative, Hockey, Inc, was, an, American, m..."
4,Chester Alan Arthur II,https://en.wikipedia.org/wiki/Chester_Alan_Art...,"Chester Alan Arthur II(July 25, 1864 – July 18...",2025-03-17,Chester Alan Arthur July July also known as Al...,"[Chester, Alan, Arthur, July, July, also, know..."
5,Hot Jazz Saturday Night,https://en.wikipedia.org/wiki/Hot_Jazz_Saturda...,Hot Jazz Saturday Nightis a weeklyjazzprogram ...,2025-03-02,Hot Jazz Saturday Night is a weekly jazz progr...,"[Hot, Jazz, Saturday, Night, is, a, weekly, ja..."
6,Cornelis Symonsz van der Schalcke,https://en.wikipedia.org/wiki/Cornelis_Symonsz...,"Cornelis Symonsz van der Schalcke, orSchalcken...",2022-07-26,Cornelis Symonsz van der Schalcke or Schalcken...,"[Cornelis, Symonsz, van, der, Schalcke, or, Sc..."
7,2000 Heineken Trophy – Men's doubles,https://en.wikipedia.org/wiki/2000_Heineken_Tr...,Since the 1999 final was cancelled due to rain...,2023-06-14,Since the final was cancelled due to rain no d...,"[Since, the, final, was, cancelled, due, to, r..."


In [48]:

# Example usage:
example_text = df_propre.content[1]
relations = extract_relations_and_render(example_text)
print("Extracted Relations:", relations)

Extracted Relations: [('center', 'located', '13th'), ('center', 'located', 'building')]


### Criterion 3 :  Knowledge Graph Quality

In [49]:
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS

def build_knowledge_graph(entities, relations):
    """
    Converts extracted entities and relations into RDF triples and builds a knowledge graph.
    
    Parameters:
        entities (list of tuples): Each tuple is (entity_text, start_char, end_char, label)
                                   For example: [("Apple", 0, 5, "ORG"), ("Steve Jobs", 20, 30, "PERSON")]
        relations (list of tuples): Each tuple is (subject, predicate, object)
                                    For example: [("Apple", "founded_by", "Steve Jobs")]
    
    The function creates a new RDF graph using RDFLib with a custom namespace (EX).
    For each entity, it adds:
        - A triple assigning an RDF type based on the entity's label.
        - A triple to store the entity's label.
    For each relation, it adds a triple linking the subject and object via the predicate.
    
    Finally, it serializes the graph in RDF/XML format and performs a sample SPARQL query.
    
    Returns:
        rdflib.Graph: The populated knowledge graph.
    """
    # Create a new RDF graph
    g = Graph()
    
    # Define a custom namespace
    EX = Namespace("http://example.org/")
    
    # Mapping for entity types based on the label
    type_mapping = {
        "PERSON": "Person",
        "ORG": "Organization",
        "COMPANY": "Organization",
        # Fallback type for other labels:
    }
    
    # Add entities to the graph
    for ent in entities:
        entity_text, start_char, end_char, label = ent
        # Create a URI for the entity by replacing spaces with underscores
        entity_uri = URIRef(EX[entity_text.replace(" ", "_")])
        
        # Determine the type URI based on label; default to generic "Entity" if not found
        entity_type = type_mapping.get(label.upper(), "Entity")
        type_uri = URIRef(EX[entity_type])
        
        # Add the RDF type and label triples
        g.add((entity_uri, RDF.type, type_uri))
        g.add((entity_uri, RDFS.label, Literal(entity_text)))
    
    # Add relations to the graph
    for rel in relations:
        subj, pred, obj = rel
        subj_uri = URIRef(EX[subj.replace(" ", "_")])
        obj_uri = URIRef(EX[obj.replace(" ", "_")])
        pred_uri = URIRef(EX[pred.replace(" ", "_")])
        g.add((subj_uri, pred_uri, obj_uri))
    
    # Serialize the graph in RDF/XML format and print it
    serialized = g.serialize(format="xml")
    # Depending on your version of RDFLib, serialization may return bytes or a string:
    if isinstance(serialized, bytes):
        print(serialized.decode("utf-8"))
    else:
        print(serialized)
    
    # Perform a sample SPARQL query
    query = """
    SELECT ?subject ?predicate ?object
    WHERE {
      ?subject ?predicate ?object .
    }
    """
    print("\nSPARQL Query Results:")
    for row in g.query(query):
        print(f"{row.subject} {row.predicate} {row.object}")
    
    return g

# Example usage:
entities = [("Apple", 0, 5, "ORG"), ("Steve Jobs", 20, 30, "PERSON")]
relations = [("Apple", "founded_by", "Steve Jobs")]
kg = build_knowledge_graph(entities, relations)


<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
   xmlns:ns1="http://example.org/"
   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
   xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
>
  <rdf:Description rdf:about="http://example.org/Apple">
    <rdf:type rdf:resource="http://example.org/Organization"/>
    <rdfs:label>Apple</rdfs:label>
    <ns1:founded_by rdf:resource="http://example.org/Steve_Jobs"/>
  </rdf:Description>
  <rdf:Description rdf:about="http://example.org/Steve_Jobs">
    <rdf:type rdf:resource="http://example.org/Person"/>
    <rdfs:label>Steve Jobs</rdfs:label>
  </rdf:Description>
</rdf:RDF>


SPARQL Query Results:
http://example.org/Apple http://example.org/founded_by http://example.org/Steve_Jobs
http://example.org/Steve_Jobs http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://example.org/Person
http://example.org/Steve_Jobs http://www.w3.org/2000/01/rdf-schema#label Steve Jobs
http://example.org/Apple http://www.w3.org/2000/01/rdf-schema#label App

In [51]:
text_test=df_propre.text_processed_llm[1]
text_content=df_propre.content[1]

In [57]:
# Example usage:
entities,num_entities = extract_entities_from_text_crf(text_test, crf)
relations = extract_relations_and_render(text_content)
kg = build_knowledge_graph(entities, relations)


<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
   xmlns:ns1="http://example.org/"
   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
   xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
>
  <rdf:Description rdf:about="http://example.org/Center">
    <rdf:type rdf:resource="http://example.org/Entity"/>
    <rdfs:label>Center</rdfs:label>
  </rdf:Description>
  <rdf:Description rdf:about="http://example.org/Lesbian">
    <rdf:type rdf:resource="http://example.org/Entity"/>
    <rdfs:label>Lesbian</rdfs:label>
  </rdf:Description>
  <rdf:Description rdf:about="http://example.org/center">
    <ns1:located rdf:resource="http://example.org/13th"/>
    <ns1:located rdf:resource="http://example.org/building"/>
  </rdf:Description>
  <rdf:Description rdf:about="http://example.org/Gay_Bisexual_Transgender_Community_Center">
    <rdf:type rdf:resource="http://example.org/Organization"/>
    <rdfs:label>Gay Bisexual Transgender Community Center</rdfs:label>
  </rdf:Description>
  <rdf:De

In [55]:
entities

([('Lesbian', 4, 11, 'MISC'),
  ('Gay Bisexual Transgender Community Center', 12, 53, 'ORG'),
  ('Lesbian', 63, 70, 'MISC'),
  ('Gay Community Services Center', 75, 104, 'ORG'),
  ('Center', 125, 131, 'LOC'),
  ('lesbian', 172, 179, 'MISC'),
  ('New York City', 229, 242, 'LOC'),
  ('West Village', 297, 309, 'LOC'),
  ('West th Street', 313, 327, 'LOC'),
  ('Lower Manhattan', 331, 346, 'PER'),
  ('Food Trades', 437, 448, 'PER')],
 11)

In [56]:
relations

[('center', 'located', '13th'), ('center', 'located', 'building')]

In [58]:
kg

<Graph identifier=N4067e261dd3f4113b38ed6d86f40c252 (<class 'rdflib.graph.Graph'>)>

#### Pipeline :

In [73]:
def pipeline_articles_to_graph(limit=10):
    df_articles=fetch_random_wikipedia_articles(limit)
    df_preprocessed=preprocess_dataframe(df_articles)
    text_complet_preprocessed=df_preprocessed.text_processed_llm.sum()
    text_complet_content=df_preprocessed.content.sum()
    crf=fit_crf()
    entities,len_entitites=extract_entities_from_text_crf(text_complet_preprocessed, crf)
    relations=extract_relations_and_render(text_complet_content)
    kg=build_knowledge_graph(entities, relations)
    return kg,df_preprocessed,entities,relations

In [74]:
graph,df,entités,relations=pipeline_articles_to_graph(10)

[1] Fetched URL: https://en.wikipedia.org/wiki/Made-Up_Lovesong_43
[1] SUCCESS: Article 'Made-Up Lovesong 43' scraped.
[2] Fetched URL: https://en.wikipedia.org/wiki/Naor_Shiri
[2] SUCCESS: Article 'Naor Shiri' scraped.
[3] Fetched URL: https://en.wikipedia.org/wiki/1991_NCAA_Division_III_men%27s_ice_hockey_tournament
[3] SUCCESS: Article '1991 NCAA Division III men's ice hockey tournament' scraped.
[4] Fetched URL: https://en.wikipedia.org/wiki/Parsadih
[4] SUCCESS: Article 'Parsadih' scraped.
[5] Fetched URL: https://en.wikipedia.org/wiki/Antony_Perumbavoor
[5] INFO: Article content too short, skipping: Antony Perumbavoor
[6] Fetched URL: https://en.wikipedia.org/wiki/Danish_Auxiliary_Corps_in_Anglo-Dutch_service_1701%E2%80%931714
[6] SUCCESS: Article 'Danish Auxiliary Corps in Anglo-Dutch service 1701–1714' scraped.
[7] Fetched URL: https://en.wikipedia.org/wiki/Perugu_Siva_Reddy
[7] SUCCESS: Article 'Perugu Siva Reddy' scraped.
[8] Fetched URL: https://en.wikipedia.org/wiki/%C5%A0a

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gabriel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Training CRF model...
CRF training complete.



<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
   xmlns:ns1="http://example.org/"
   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
   xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
>
  <rdf:Description rdf:about="http://example.org/Aam_Aadmi_Party">
    <rdf:type rdf:resource="http://example.org/Organization"/>
    <rdfs:label>Aam Aadmi Party</rdfs:label>
  </rdf:Description>
  <rdf:Description rdf:about="http://example.org/Shiri">
    <rdf:type rdf:resource="http://example.org/Entity"/>
    <rdfs:label>Shiri</rdfs:label>
    <ns1:placed rdf:resource="http://example.org/Atid_list"/>
  </rdf:Description>
  <rdf:Description rdf:about="http://example.org/Sun">
    <rdf:type rdf:resource="http://example.org/Entity"/>
    <rdf:type rdf:resource="http://example.org/Organization"/>
    <rdfs:label>Sun</rdfs:label>
  </rdf:Description>
  <rdf:Description rdf:about="http://example.org/Ramillies">
    <rdf:type rdf:resource="http://example.org/Entity"/>
    <rdfs:label>Ramillies</

In [75]:
df

Unnamed: 0,title,url,content,last_modified,text_processed_llm
0,Made-Up Lovesong 43,https://en.wikipedia.org/wiki/Made-Up_Lovesong_43,"""Made-Up Lovesong #43"" is a song byGuillemots,...",2023-11-28,"""Made-Up Lovesong #43"" is a song by Guillemots..."
1,Naor Shiri,https://en.wikipedia.org/wiki/Naor_Shiri,Naor Shiri(Hebrew:נאור שירי; born 1 May 1985) ...,2025-03-05,Naor Shiri born 1 May 1985 is an Israeli polit...
2,1991 NCAA Division III men's ice hockey tourna...,https://en.wikipedia.org/wiki/1991_NCAA_Divisi...,The1991 NCAA Division III men's ice hockey tou...,2024-12-13,The 1991 NCAA Division III men's ice hockey to...
3,Parsadih,https://en.wikipedia.org/wiki/Parsadih,Parsadihis a small Village/hamlet inBaloda Baz...,2018-08-25,Parsadih is a small Village/hamlet in Baloda B...
4,Danish Auxiliary Corps in Anglo-Dutch service ...,https://en.wikipedia.org/wiki/Danish_Auxiliary...,Having been forced to sue for peace with Swede...,2024-08-01,Having been forced to sue for peace with Swede...
5,Perugu Siva Reddy,https://en.wikipedia.org/wiki/Perugu_Siva_Reddy,Perugu Siva Reddy(12 September 1920 – 6 Septem...,2025-01-23,Perugu Siva Reddy (12 September 1920 – 6 Septe...
6,"Solar eclipse of March 4, 1802",https://en.wikipedia.org/wiki/Solar_eclipse_of...,A totalsolar eclipseoccurred at the Moon'sasce...,2025-02-15,A total solar eclipse occurred at the Moon's a...
7,Kanti Balar,https://en.wikipedia.org/wiki/Kanti_Balar,Kantibhai Balar(born 1955)[1]is an Indian poli...,2025-03-21,Kantibhai Balar (born 1955) is an Indian polit...


In [77]:
relations

[('Shiri', 'placed', 'Atid list'),
 ('He', 'was', 'jointBlue'),
 ('He', 'was', 'elections'),
 ('he', 'placed', 'Atid list,[1]and'),
 ('ice hockey', 'tournamentwas', 'culmination'),
 ('It', 'concluded', 'Stevens PointdefeatingMankato Statein'),
 ('Quarterfinals matchups', 'held', 'team venues'),
 ('soldiers', 'were', '1701'),
 ('corps', 'fought', 'Marlborough'),
 ('corps', 'fought', 'battles'),
 ('It', 'returned', 'Denmark'),
 ('It', 'returned', '1713'),
 ('It', 'returned', 'September'),
 ('He', 'completed', 'fromMadras'),
 ('He', 'completed', 'ophthalmology'),
 ('He', 'completed', 'fromAndhra University'),
 ('He', 'presented', 'papers'),
 ('He', 'held', 'eye camps'),
 ('he', 'holds', 'doctor'),
 ('Bhushanfrom theGovernment', 'established', '1990'),
 ('government eye hospital', 'established', '1990'),
 ('Asolar eclipseoccurs', 'obscuring', 'viewer'),
 ('Totality', 'occurs', 'path'),
 ('eclipse', 'was', 'parts ofAntarctica'),
 ('He', 'won', 'the2022 Gujarat Legislative Assembly'),
 ('Bal

In [78]:
entités

[('Made-Up Lovesong', 3, 19, 'PER'),
 ('Guillemots', 41, 51, 'PER'),
 ('Windowpane', 85, 95, 'ORG'),
 ('UK Singles Chart', 189, 205, 'LOC'),
 ('My', 275, 277, 'LOC'),
 ('Cliffs', 300, 306, 'LOC'),
 ('Naor Shiri', 309, 319, 'PER'),
 ('Israeli', 342, 349, 'MISC'),
 ('Knesset for Yesh Atid', 401, 422, 'ORG'),
 ('Shiri', 425, 430, 'PER'),
 ('Yesh Atid', 461, 470, 'LOC'),
 ('Yesh Atid', 709, 718, 'LOC'),
 ('Knesset', 755, 762, 'ORG'),
 ('NCAA Division III', 800, 817, 'MISC'),
 ('NCAA', 918, 922, 'LOC'),
 ('Wisconsin-Stevens Point', 951, 974, 'PER'),
 ('Mankato State', 985, 998, 'PER'),
 ('Elmira', 1131, 1137, 'LOC'),
 ('New York', 1140, 1148, 'LOC'),
 ('NCAA', 1155, 1159, 'ORG'),
 ('Tournament Most Outstanding Player', 1181, 1215, 'ORG'),
 ('Parsadih', 1290, 1298, 'PER'),
 ('Baloda Bazar Tehsil', 1328, 1347, 'LOC'),
 ('Chhattisgarh State', 1364, 1382, 'LOC'),
 ('India', 1386, 1391, 'LOC'),
 ('Bhushan Lal Jangde', 1404, 1422, 'PER'),
 ('Rajya Sabha', 1429, 1440, 'LOC'),
 ('Chhattisgarh', 147

In [79]:
graph

<Graph identifier=Ne85cc8859a484d2cb8287a8d703bf1f2 (<class 'rdflib.graph.Graph'>)>

### Criterion 4 :  Entity Linking

In [94]:
import numpy as np

def convert_rdflib_to_pykeen(graph):
    """
    Convertit un graphe RDFLib en np.ndarray de triples au format (str, str, str) compatible PyKEEN.
    """
    triples = [(str(s), str(p), str(o)) for s, p, o in graph]
    return np.array(triples, dtype=str)



In [95]:
triples=convert_rdflib_to_pykeen(graph)

In [96]:
triples

array([['http://example.org/Aam_Aadmi_Party',
        'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
        'http://example.org/Organization'],
       ['http://example.org/Shiri',
        'http://www.w3.org/2000/01/rdf-schema#label', 'Shiri'],
       ['http://example.org/Sun',
        'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
        'http://example.org/Entity'],
       ['http://example.org/Ramillies',
        'http://www.w3.org/2000/01/rdf-schema#label', 'Ramillies'],
       ['http://example.org/MLA',
        'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
        'http://example.org/Organization'],
       ['http://example.org/Lathi',
        'http://www.w3.org/2000/01/rdf-schema#label', 'Lathi'],
       ['http://example.org/Kachhadiya_Dineshbhai_Manubhai',
        'http://www.w3.org/2000/01/rdf-schema#label',
        'Kachhadiya Dineshbhai Manubhai'],
       ['http://example.org/Lathi',
        'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
        'http://exampl

In [83]:
from SPARQLWrapper import SPARQLWrapper, JSON

def enrich_with_dbpedia(entity_name):
    """
    Tente de récupérer des relations depuis DBpedia pour enrichir une entité.
    """
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    query = f"""
    SELECT DISTINCT ?related ?relation WHERE {{
        ?s rdfs:label "{entity_name}"@en .
        ?s ?relation ?related .
        ?related rdfs:label ?label .
        FILTER(LANG(?label) = 'en')
    }} LIMIT 10
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return [(entity_name, r['relation']['value'], r['related']['value']) for r in results["results"]["bindings"]]


In [102]:
from tqdm import tqdm   
def augment_graph_with_dbpedia(graph, entity_list, limit_per_entity=3):
    """
    Enrichit un graphe RDFLib avec des relations issues de DBpedia à partir d'entités du graphe existant.
    
    Args:
        graph (rdflib.Graph): le graphe RDF à enrichir.
        entity_list (list[str]): liste de noms d'entités à linker.
        limit_per_entity (int): max relations à extraire par entité.
    
    Returns:
        rdflib.Graph: graphe enrichi.
    """
    from rdflib import URIRef
    from rdflib.namespace import RDFS
    from SPARQLWrapper import SPARQLWrapper, JSON

    DBP = "http://dbpedia.org/resource/"
    DBP_PRED = "http://dbpedia.org/ontology/"

    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    
    for entity in tqdm(entity_list):
        query = f"""
        SELECT DISTINCT ?related ?relation WHERE {{
            ?s rdfs:label "{entity}"@en .
            ?s ?relation ?related .
            ?related rdfs:label ?label .
            FILTER(LANG(?label) = 'en')
        }} LIMIT {limit_per_entity}
        """
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)

        try:
            results = sparql.query().convert()
            for res in results["results"]["bindings"]:
                subj = URIRef(f"http://example.org/{entity.replace(' ', '_')}")
                pred = URIRef(res["relation"]["value"])
                obj = URIRef(res["related"]["value"])
                graph.add((subj, pred, obj))
        except Exception as e:
            print(f"❌ Failed for {entity}: {e}")

    return graph


In [103]:
# Extraire les entités du graphe RDF actuel
def get_unique_entities(graph):
    return list(set(str(s).split('/')[-1].replace('_', ' ')
                    for s, _, _ in graph if 'example.org' in str(s)))

entities = get_unique_entities(graph)
graph_augmented = augment_graph_with_dbpedia(graph, entities)


100%|██████████| 93/93 [07:06<00:00,  4.59s/it]


In [106]:
entities_augmented = get_unique_entities(graph_augmented)

In [108]:
triples_augmented=convert_rdflib_to_pykeen(graph_augmented)

In [84]:
from pykeen.triples import TriplesFactory

def create_triples_factory(triples):
    tf = TriplesFactory.from_labeled_triples(triples)
    return tf

def split_dataset(tf):
    return tf.split([0.8, 0.1, 0.1])  # train, valid, test


In [129]:
from pykeen.pipeline import pipeline

def train_embedding_model(model_name, training, validation, testing):
    results = pipeline(
        training=training,
        validation=validation,
        testing=testing,
        model=model_name,
        model_kwargs=dict(embedding_dim=50),  # ✅ Here is the fix
        epochs=100,
        training_kwargs=dict(batch_size=32),
        random_seed=42,
        
    )
    return results


In [132]:
def evaluate_model(eval_results, model_name):
    metrics = eval_results.to_dict()
    print(f"\n📊 Results for {model_name}:")
    print(f"Mean Rank: {metrics['both']['mean_rank']:.2f}")
    print(f"Mean Reciprocal Rank: {metrics['both']['mean_reciprocal_rank']:.4f}")
    print(f"Hits@1: {metrics['both']['hits_at_1']:.4f}")
    print(f"Hits@3: {metrics['both']['hits_at_3']:.4f}")
    print(f"Hits@10: {metrics['both']['hits_at_10']:.4f}")


In [87]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def analyze_entity_neighborhood(model, entity_id, k=5):
    entity_embeddings = model.entity_embeddings.weight.detach().numpy()
    entity_labels = model.triples_factory.entity_labeling.label_to_id
    id_to_label = {v: k for k, v in entity_labels.items()}

    similarities = cosine_similarity([entity_embeddings[entity_id]], entity_embeddings)[0]
    most_similar = np.argsort(similarities)[-k-1:-1][::-1]

    print(f"\nEntities most similar to {id_to_label[entity_id]}:")
    for idx in most_similar:
        print(f"{id_to_label[idx]}: {similarities[idx]:.4f}")


In [88]:
def predict_tail_entities(model, head_id, relation_id, k=5):
    predictions = model.predict_scores(
        heads=torch.tensor([head_id]),
        relations=torch.tensor([relation_id]),
    )
    top_tails = torch.topk(predictions, k=k, dim=1)
    entity_labels = model.triples_factory.entity_labeling.label_to_id
    id_to_label = {v: k for k, v in entity_labels.items()}
    
    return [(id_to_label[idx.item()], score.item())
            for idx, score in zip(top_tails.indices[0], top_tails.values[0])]


In [90]:
from sklearn.manifold import TSNE
import plotly.express as px
import numpy as np

def visualize_embeddings_plotly(model):
    """
    Visualise les embeddings des entités du modèle PyKEEN en 2D avec t-SNE + Plotly.
    """
    # Récupérer les embeddings
    embeddings = model.entity_embeddings.weight.detach().numpy()

    # Réduction de dimension avec t-SNE
    tsne = TSNE(n_components=2, random_state=42)
    reduced = tsne.fit_transform(embeddings)

    # Récupérer les labels des entités
    label_to_id = model.triples_factory.entity_labeling.label_to_id
    id_to_label = {v: k for k, v in label_to_id.items()}
    labels = [id_to_label[i] for i in range(len(reduced))]

    # Créer le DataFrame
    import pandas as pd
    df = pd.DataFrame({
        'Entity': labels,
        'x': reduced[:, 0],
        'y': reduced[:, 1],
    })

    # Visualisation interactive avec Plotly
    fig = px.scatter(df, x='x', y='y', text='Entity', hover_name='Entity',
                     title="Entity Embedding Space (t-SNE)", width=900, height=700)
    fig.update_traces(textposition='top center')
    fig.show()


In [135]:
def run_kg_embedding_pipeline(rdflib_graph):
    # Step 1: Convert RDF to PyKEEN triples
    raw_triples = convert_rdflib_to_pykeen(rdflib_graph)
    tf = create_triples_factory(raw_triples)
    
    # Step 2: Data Splitting
    training, validation, testing = split_dataset(tf)
    
    # Step 3: Train models
    model_results = {}
    for model_name in ['TransE', 'DistMult']:
        print(f"\n🚀 Training {model_name}...")
        results = train_embedding_model(model_name, training, validation,testing)
        
        # Manually evaluate on the testing set
        eval_results = results.model.evaluate(testing)
        
        # Store both training results and test evaluation
        model_results[model_name] = {
            "training": results,
            "evaluation": eval_results,
        }
    
    # Step 4: Evaluation
    for model_name, results in model_results.items():
        evaluate_model(results["evaluation"], model_name)
    
    return model_results


In [136]:
run_kg_embedding_pipeline(graph_augmented)

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [79, 79, 80]
INFO:pykeen.pipeline.api:Using device: None



🚀 Training TransE...


Training epochs on cpu: 100%|██████████| 100/100 [00:49<00:00,  2.04epoch/s, loss=0.0108, prev_loss=0.0131]
Evaluating on cpu: 100%|██████████| 80.0/80.0 [00:00<00:00, 1.95ktriple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 0.05s seconds


AttributeError: 'TransE' object has no attribute 'evaluate'