In [43]:
pip install flask flask-cors

Note: you may need to restart the kernel to use updated packages.


In [10]:
################################################################################
# CELL 1: Imports and setup
################################################################################

# %pip install nltk
import nltk
import json
import re
from nltk import pos_tag, word_tokenize, RegexpParser
import os

print("Imports done.")


Imports done.


In [3]:
################################################################################
# CELL 2: Download the NLTK resources (if needed)
################################################################################

# Each call requests one NLTK data package by name; the recorded output of
# this cell shows NLTK reporting already-present packages as up-to-date.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

print("Vérifie que tu as téléchargé les corpora NLTK (tokenizers, taggers, etc.).")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\calar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\calar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\calar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\calar\AppData\Roaming\nltk_data...


Vérifie que tu as téléchargé les corpora NLTK (tokenizers, taggers, etc.).


[nltk_data]   Package omw-1.4 is already up-to-date!


In [31]:
def normalize_entity_name(text):
    """Return ``text`` with whitespace collapsed and every word capitalized.

    The input is split on runs of whitespace, each piece goes through
    ``str.capitalize`` (first character upper-cased, the remainder
    lower-cased) and the pieces are re-joined with single spaces.
    """
    capitalized_words = map(str.capitalize, text.split())
    return " ".join(capitalized_words)

In [32]:
def validate_relation(relation_text):
    """Tell whether ``relation_text`` contains at least one known relation verb.

    The text is lower-cased and split on whitespace; the check is done on
    whole words, not on substrings.
    """
    # Common verbs expected in a relation
    known_verbs = frozenset(("was", "is", "lived", "died", "born", "located", "founded"))
    relation_words = relation_text.lower().split()
    return not known_verbs.isdisjoint(relation_words)

In [33]:
def get_named_entity(doc, text):
    """Return the first entity of ``doc`` whose text occurs inside ``text``.

    Entities are examined in the order given by ``doc.ents``; when none of
    them is a substring of ``text``, ``text`` itself is returned unchanged.
    """
    contained_entities = (entity.text for entity in doc.ents if entity.text in text)
    return next(contained_entities, text)

In [34]:
from nltk import word_tokenize, pos_tag
from nltk.chunk import RegexpParser
import spacy

# Load the spaCy model; `nlp` is the module-level pipeline used by the
# functions defined below.
nlp = spacy.load("en_core_web_sm")

def tokenize_and_tag(text):
    """Tokenize ``text`` with spaCy and return NLTK-style ``(token, tag)`` pairs.

    The tag of each pair is spaCy's fine-grained ``token.tag_``. For the
    English pipeline these are Penn-Treebank-style tags (``NN``, ``NNP``,
    ``VBD``, ``IN``...), the vocabulary the chunk ``GRAMMAR`` of this
    notebook is written in. The coarse ``token.pos_`` values (``NOUN``,
    ``VERB``, ``ADP``...) never match patterns such as ``<NN.*>`` or
    ``<VB.*>``, so a list tagged with them produced no chunks at all.

    Args:
        text: The raw sentence or text to analyse.

    Returns:
        A list of ``(token_text, tag)`` tuples, one per spaCy token.
    """
    # Use spaCy for the analysis
    doc = nlp(text)
    # Convert to the NLTK (word, tag) format for compatibility
    return [(token.text, token.tag_) for token in doc]

# Improved chunk grammar meant to capture more cases.
# The patterns are written with Penn-Treebank-style tags (DT, JJ, NN, NNP,
# VB, IN, TO), so the parser expects input tagged with that tag set.
# Stages are applied in order: NP chunks, then single-verb VP chunks, then
# RELATION = a VP followed by any number of IN/TO tokens.
GRAMMAR = r"""
    NP: {<DT|PRP\$>?<JJ.*>*<NN.*>+}
        {<NNP>+}
        {<NN>+}
    VP: {<VB.*>}
    RELATION: {<VP><IN|TO>*}
"""
chunk_parser = RegexpParser(GRAMMAR)

def extract_np_phrases(tagged_text):
    """Return the lower-cased text of every NP chunk found in ``tagged_text``.

    ``tagged_text`` is a list of ``(token, tag)`` pairs handed to the
    module-level ``chunk_parser``. Each subtree labelled ``NP`` contributes
    its tokens, lower-cased and joined with single spaces.
    """
    parsed = chunk_parser.parse(tagged_text)
    candidates = (
        # Clean up and normalize the chunk text
        " ".join(word.lower() for word, _tag in np_chunk.leaves()).strip()
        for np_chunk in parsed.subtrees()
        if np_chunk.label() == 'NP'
    )
    # Drop chunks that end up empty
    return [phrase for phrase in candidates if phrase]

def extract_triplet_simple(sentence):
    """Extract a naive ``(subject, relation, object)`` triplet from ``sentence``.

    The subject is the first noun chunk reported by spaCy and the object is
    the last one; both go through ``get_named_entity`` and then
    ``normalize_entity_name``. The relation is built from the VERB / ADP /
    PART tokens located between the two chunks.

    Args:
        sentence: The statement to analyse.

    Returns:
        A ``(subject, relation_text, object_)`` tuple of strings, or ``None``
        when the sentence has no noun chunk, when subject and object are
        empty or identical, or when the relation is empty or rejected by
        ``validate_relation``.
    """
    doc = nlp(sentence)

    noun_chunks = list(doc.noun_chunks)
    if not noun_chunks:
        return None

    # Subject: first noun chunk; object: last noun chunk
    subject_chunk = noun_chunks[0]
    object_chunk = noun_chunks[-1]
    subject = get_named_entity(doc, subject_chunk.text)
    object_ = get_named_entity(doc, object_chunk.text)

    if not (subject and object_) or subject == object_:
        return None

    # Normalize the entities
    subject = normalize_entity_name(subject)
    object_ = normalize_entity_name(object_)

    # Delimit the relation with the chunks' own token offsets. Locating the
    # boundaries with substring tests on the entity strings picked unrelated
    # tokens (e.g. "in" is a substring of "Rodin", "is" of "Paris") and was
    # also sensitive to the re-capitalisation done by normalize_entity_name.
    tokens_between = doc[subject_chunk.end:object_chunk.start]
    relation = [
        token.text
        for token in tokens_between
        if token.pos_ in ('VERB', 'ADP', 'PART')
    ]
    relation_text = " ".join(relation).strip()

    if not relation_text or not validate_relation(relation_text):
        return None

    return (subject, relation_text, object_)

In [35]:
################################################################################
# CELL 4: Load the local JSON file
################################################################################

# The location can be overridden with the DBPEDIA_JSON_PATH environment
# variable so the notebook can run on another machine; the original absolute
# path is kept as the default.
json_path = os.environ.get(
    "DBPEDIA_JSON_PATH",
    r"C:\Users\calar\OneDrive\Bureau\M2\NLP\NLP PROJECT\backend\sparql_2025-01-23_17-04-11Z.json",
)
with open(json_path, "r", encoding="utf-8") as f:
    data_json = json.load(f)

print("JSON chargé. Nombre de ressources:", len(data_json))


JSON chargé. Nombre de ressources: 64912


In [None]:
!pip install spacy fuzzywuzzy python-Levenshtein requests
!python -m spacy download en_core_web_sm

In [37]:
from fuzzywuzzy import fuzz

# Maps a natural-language relation phrase to the list of candidate DBpedia
# predicate URIs (from both the /ontology/ and /property/ namespaces) that
# are looked up when checking a fact expressed with that phrase.
RELATION_MAP = {
    "was born in": [
        "http://dbpedia.org/property/birthPlace",
        "http://dbpedia.org/ontology/birthPlace"
    ],
    "lived in": [
        "http://dbpedia.org/ontology/residence",
        "http://dbpedia.org/property/residence",
        "http://dbpedia.org/ontology/livingPlace"
    ],
    "died in": [
        "http://dbpedia.org/property/deathPlace",
        "http://dbpedia.org/ontology/deathPlace",
        "http://dbpedia.org/property/placeOfDeath"
    ],
    "is set in": [
        "http://dbpedia.org/ontology/location",
        "http://dbpedia.org/property/location",
        "http://dbpedia.org/ontology/wikiPageWikiLink",
        "http://dbpedia.org/property/setting"
    ],
    "is the capital of": [
        "http://dbpedia.org/ontology/capital",
        "http://dbpedia.org/property/capital",
        "http://dbpedia.org/ontology/capitalOf",
        "http://dbpedia.org/property/capitalCity"
    ],
    "was founded in": [
        "http://dbpedia.org/ontology/foundingLocation",
        "http://dbpedia.org/property/foundingPlace"
    ],
    "is located in": [
        "http://dbpedia.org/ontology/location",
        "http://dbpedia.org/property/location",
        "http://dbpedia.org/property/locationCity"
    ]
}

def find_best_relation_match(relation_text):
    """Return the ``RELATION_MAP`` key most similar to ``relation_text``.

    Similarity is ``fuzz.ratio`` on the lower-cased strings. The first key
    reaching the highest score wins, provided that score is strictly above
    70; otherwise ``None`` is returned.
    """
    needle = relation_text.lower()
    scored = [
        (fuzz.ratio(needle, candidate.lower()), candidate)
        for candidate in RELATION_MAP
    ]
    if not scored:
        return None

    # max() keeps the first of several equally scored candidates
    top_score, top_key = max(scored, key=lambda pair: pair[0])
    # Similarity threshold
    return top_key if top_score > 70 else None

In [38]:
from urllib.parse import quote
import requests

def _dbpedia_lookup(keyword, timeout=10):
    """Query the DBpedia Lookup keyword search and return the URIs it reports.

    This is a best-effort call: a non-200 status, a network error, a timeout
    or an unexpected payload all yield an empty list, so that callers can
    fall back to the local data.

    Args:
        keyword: The text to search for.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of URI strings (possibly empty).
    """
    lookup_url = f"https://lookup.dbpedia.org/api/search/KeywordSearch?QueryString={quote(keyword)}"
    try:
        # Without a timeout, a stalled connection would block the caller
        # (including the Flask request handler) indefinitely.
        response = requests.get(
            lookup_url,
            headers={'Accept': 'application/json'},
            timeout=timeout,
        )
        if response.status_code != 200:
            return []
        results = response.json().get('results', [])
        return [r['uri'] for r in results if r.get('uri')]
    except Exception:
        # Deliberately broad to keep the lookup best-effort, but unlike a
        # bare ``except:`` it lets KeyboardInterrupt / SystemExit through.
        return []


def _local_uri_matches(fragment):
    """Return the keys of ``data_json`` whose lower-cased form contains ``fragment``."""
    return [uri for uri in data_json.keys() if fragment in uri.lower()]


def find_subject_uri(subject_text):
    """
    Resolve a subject name to candidate URIs.

    The DBpedia Lookup API is tried first; when it returns nothing (error,
    timeout or empty result list) the local ``data_json`` keys are searched
    for the lower-cased, underscore-joined name.

    Returns:
        A list of URI strings; empty when ``subject_text`` is blank or when
        nothing matches.
    """
    clean_text = subject_text.strip()
    if not clean_text:
        # An empty fragment is a substring of every key and would return
        # the whole local dataset.
        return []

    uris = _dbpedia_lookup(clean_text)
    if uris:
        return uris

    # Fall back to the original method
    return _local_uri_matches(clean_text.lower().replace(" ", "_"))


def find_object_uri(object_text, context=None):
    """
    Resolve an object name to candidate URIs.

    Resolution order: the table of direct matches, then the DBpedia Lookup
    API, then a substring search over the local ``data_json`` keys.

    Args:
        object_text: The object name extracted from the statement.
        context: Accepted for interface compatibility; currently unused.

    Returns:
        A list of URI strings; empty when ``object_text`` is blank or when
        nothing matches.
    """
    clean_text = object_text.strip()
    if not clean_text:
        return []
    obj_clean = clean_text.lower().replace(" ", "_")

    # Table of direct matches
    direct_matches = {
        "paris": ["http://dbpedia.org/resource/Paris"],
        "france": ["http://dbpedia.org/resource/France"],
        # Add other common matches here
    }

    if obj_clean in direct_matches:
        return direct_matches[obj_clean]

    # DBpedia Lookup first, local data as the last resort
    return _dbpedia_lookup(clean_text) or _local_uri_matches(obj_clean)

In [39]:
################################################################################
# CELL 7: Final comment
################################################################################

# Prints a recap of the notebook; the text is runtime output and is kept as is.
print("""
Dans ce notebook:

1) On a chargé un fichier JSON local (sparql_2025-01-23_17-04-11Z.json) 
   contenant des ressources DBpedia mentionnant 'Paris'.

2) On a défini un pipeline simple de NLP (extraction naive de triplets).
3) On a illustré comment parcourir le JSON en Python pour trouver des 
   triplets du style (sujet, predicate, Paris).

Pour aller plus loin, on pourrait :
- Implémenter une désambiguïsation du subject => <http://dbpedia.org/resource/...>
- Chercher d'autres villes que Paris
- Ajouter une API Flask ou intégrer du SPARQLWrapper vers un endpoint local

Fin de démonstration !
""")



Dans ce notebook:

1) On a chargé un fichier JSON local (sparql_2025-01-23_17-04-11Z.json) 
   contenant des ressources DBpedia mentionnant 'Paris'.

2) On a défini un pipeline simple de NLP (extraction naive de triplets).
3) On a illustré comment parcourir le JSON en Python pour trouver des 
   triplets du style (sujet, predicate, Paris).

Pour aller plus loin, on pourrait :
- Implémenter une désambiguïsation du subject => <http://dbpedia.org/resource/...>
- Chercher d'autres villes que Paris
- Ajouter une API Flask ou intégrer du SPARQLWrapper vers un endpoint local

Fin de démonstration !



In [40]:
def check_fact(statement):
    """
    Check a natural-language statement against the local DBpedia data.

    Steps: extract a (subject, relation, object) triplet, fuzzy-match the
    relation to a ``RELATION_MAP`` key, resolve subject and object to
    candidate URIs, then look for a matching triple in ``data_json``.

    Returns:
        A dict. On failure: ``success`` False, ``error``, ``statement``,
        optionally ``extracted_triplet`` / ``relation_found``, and
        ``confidence`` 0.0. Otherwise: ``success`` True, ``result``
        (whether a triple was found), ``message``, ``extracted_triplet``,
        ``relation_found`` and ``confidence`` (1.0 when found, else 0.0).
        Any exception is reported as a failure dict rather than raised.
    """
    def _failure(error, **details):
        # Key order mirrors the payload layout: success, error, statement,
        # optional details, confidence.
        return {
            "success": False,
            "error": error,
            "statement": statement,
            **details,
            "confidence": 0.0
        }

    try:
        trip = extract_triplet_simple(statement)
        if not trip:
            return _failure("Impossible d'extraire un triplet")

        subject_np, rel_text, object_np = trip

        # Fuzzy-match the extracted relation
        matched_rel_key = find_best_relation_match(rel_text)
        if not matched_rel_key:
            return _failure("Relation non reconnue", extracted_triplet=trip)

        dbpedia_props = RELATION_MAP[matched_rel_key]
        subj_uris = find_subject_uri(subject_np)
        obj_uris = find_object_uri(object_np)

        if not subj_uris or not obj_uris:
            unresolved = subject_np if not subj_uris else object_np
            return _failure(
                f"URI non trouvé pour {unresolved}",
                extracted_triplet=trip,
                relation_found=matched_rel_key,
            )

        # Every (subject URI, property, object URI) present in the local
        # data, in lookup order; an exact match scores a confidence of 1.0.
        matches = [
            (s_uri, prop, obj_val["value"])
            for s_uri in subj_uris
            if s_uri in data_json
            for prop in dbpedia_props
            if prop in data_json[s_uri]
            for obj_val in data_json[s_uri][prop]
            if obj_val["value"] in obj_uris
        ]

        shared = {
            "extracted_triplet": trip,
            "relation_found": matched_rel_key,
        }

        if matches:
            found_subject, found_prop, found_object = matches[0]
            return {
                "success": True,
                "result": True,
                "message": f"Fact found: {found_subject} {found_prop} => {found_object}",
                **shared,
                "confidence": 1.0
            }

        return {
            "success": True,
            "result": False,
            "message": "Aucun triple correspondant trouvé",
            **shared,
            "confidence": 0.0
        }

    except Exception as e:
        return _failure(f"Erreur inattendue: {str(e)}")

In [41]:
################################################################################
# CELL 8: A few example statements
################################################################################

statements = [
    "Madame Roland was born in Paris",        # In the JSON: Madame_Roland => birthPlace => Paris
    "Jacques Hébert lived in Paris",          # Jacques_Hébert => residence => Paris
    "Heartland is set in Paris",              # Heartland => wikiPageWikiLink => Paris
    "Paris was the capital of France",        # No capital triple in the data => ?
    "Voltaire died in Paris",                 # Voltaire => deathPlace => Paris
    "Louis Figo was born in Paris",           # negative test: Figo was not born in Paris
    "Auguste Rodin was born in Paris"         # Rodin => birthPlace => Paris
]

# Run the checker on each statement and print the raw result dict
for st in statements:
    result = check_fact(st)
    print("Statement:", st)
    print("Result:", result)
    print("--------")


Statement: Madame Roland was born in Paris
Result: {'success': True, 'result': True, 'message': 'Fact found: http://dbpedia.org/resource/Madame_Roland http://dbpedia.org/property/birthPlace => http://dbpedia.org/resource/Paris', 'extracted_triplet': ('Madame Roland', 'born in', 'Paris'), 'relation_found': 'was born in', 'confidence': 1.0}
--------
Statement: Jacques Hébert lived in Paris
Result: {'success': True, 'result': True, 'message': 'Fact found: http://dbpedia.org/resource/Jacques_Hébert http://dbpedia.org/ontology/residence => http://dbpedia.org/resource/Paris', 'extracted_triplet': ('Jacques Hébert', 'lived in', 'Paris'), 'relation_found': 'lived in', 'confidence': 1.0}
--------
Statement: Heartland is set in Paris
Result: {'success': False, 'error': "Impossible d'extraire un triplet", 'statement': 'Heartland is set in Paris', 'confidence': 0.0}
--------
Statement: Paris was the capital of France
Result: {'success': False, 'error': "Impossible d'extraire un triplet", 'statemen

In [42]:
################################################################################
# CELL 8: A few example statements
################################################################################
# NOTE(review): this cell repeats the previous example cell verbatim.

statements = [
    "Madame Roland was born in Paris",        # In the JSON: Madame_Roland => birthPlace => Paris
    "Jacques Hébert lived in Paris",          # Jacques_Hébert => residence => Paris
    "Heartland is set in Paris",              # Heartland => wikiPageWikiLink => Paris
    "Paris was the capital of France",        # No capital triple in the data => ?
    "Voltaire died in Paris",                 # Voltaire => deathPlace => Paris
    "Louis Figo was born in Paris",           # negative test: Figo was not born in Paris
    "Auguste Rodin was born in Paris"         # Rodin => birthPlace => Paris
]

# Run the checker on each statement and print the raw result dict
for st in statements:
    result = check_fact(st)
    print("Statement:", st)
    print("Result:", result)
    print("--------")


Statement: Madame Roland was born in Paris
Result: {'success': True, 'result': True, 'message': 'Fact found: http://dbpedia.org/resource/Madame_Roland http://dbpedia.org/property/birthPlace => http://dbpedia.org/resource/Paris', 'extracted_triplet': ('Madame Roland', 'born in', 'Paris'), 'relation_found': 'was born in', 'confidence': 1.0}
--------
Statement: Jacques Hébert lived in Paris
Result: {'success': True, 'result': True, 'message': 'Fact found: http://dbpedia.org/resource/Jacques_Hébert http://dbpedia.org/ontology/residence => http://dbpedia.org/resource/Paris', 'extracted_triplet': ('Jacques Hébert', 'lived in', 'Paris'), 'relation_found': 'lived in', 'confidence': 1.0}
--------
Statement: Heartland is set in Paris
Result: {'success': False, 'error': "Impossible d'extraire un triplet", 'statement': 'Heartland is set in Paris', 'confidence': 0.0}
--------
Statement: Paris was the capital of France
Result: {'success': False, 'error': "Impossible d'extraire un triplet", 'statemen

In [17]:
################################################################################
# CELL 9: Final comments and possible improvements
################################################################################

"""
Ce code montre un pipeline plus robuste que la version minimale:
- On a un RELATION_MAP pour faire le lien phrase => propriété(s) DBpedia
- On a un naive subject/object matching (ex. "Madame Roland" => "http://dbpedia.org/resource/Madame_Roland")
- On check si la triple (subjectURI, propertyURI, objectURI) existe dans le data_json

Limites:
1) Le mapping de relation est artisanal et partiel.
2) Le matching de la forme "Madame Roland" => "http://dbpedia.org/resource/Madame_Roland" est naïf:
   - on fait un simple "sub_clean in uri.lower()".
3) On ne gère pas la langue, les majuscules, les homonymes, etc.
4) On se limite à "Paris" ou "France" pour l'objet => "http://dbpedia.org/resource/Paris"/"France".
5) On se base sur ton JSON local, qui ne contient que des mentions de "Paris" (et pas d'autres villes).

Améliorations possibles:
- Utiliser un vrai parseur sémantique (spaCy / stanza) pour extraire des triplets plus fiables.
- Faire du fuzzy matching plus avancé via label => URI (ex. en local, on peut indexer "Madame Roland" => "Madame_Roland" si on avait rdfs:label).
- Charger un triple store local (Virtuoso/Fuseki) et faire des requêtes SPARQL => plus fiable, plus complet.
- Gérer un mapping plus large: "capital" => dbo:capitalCity + dbp:capital + etc.

Mais ce notebook illustre déjà un "Automated Fact Checking" en version simplifiée, 
adapté à ton dataset JSON mentionnant Paris.
"""


'\nCe code montre un pipeline plus robuste que la version minimale:\n- On a un RELATION_MAP pour faire le lien phrase => propriété(s) DBpedia\n- On a un naive subject/object matching (ex. "Madame Roland" => "http://dbpedia.org/resource/Madame_Roland")\n- On check si la triple (subjectURI, propertyURI, objectURI) existe dans le data_json\n\nLimites:\n1) Le mapping de relation est artisanal et partiel.\n2) Le matching de la forme "Madame Roland" => "http://dbpedia.org/resource/Madame_Roland" est naïf:\n   - on fait un simple "sub_clean in uri.lower()".\n3) On ne gère pas la langue, les majuscules, les homonymes, etc.\n4) On se limite à "Paris" ou "France" pour l\'objet => "http://dbpedia.org/resource/Paris"/"France".\n5) On se base sur ton JSON local, qui ne contient que des mentions de "Paris" (et pas d\'autres villes).\n\nAméliorations possibles:\n- Utiliser un vrai parseur sémantique (spaCy / stanza) pour extraire des triplets plus fiables.\n- Faire du fuzzy matching plus avancé via l

In [46]:
from flask import Flask, request, jsonify
from flask_cors import CORS
import random
from threading import Thread

# Create the Flask application and wrap it with flask-cors' CORS.
app = Flask(__name__)
CORS(app)

def generate_response(fact_result):
    """
    Turn a fact-check result dict into a natural-language reply.

    One sentence is drawn at random from the pool matching the outcome
    (verification failed, fact confirmed, fact not confirmed). A confirmed
    fact carrying a ``message`` gets a DBpedia attribution line appended; an
    unconfirmed one carrying ``alternative_facts`` gets those appended.

    Returns:
        A dict with ``natural_response`` (the text) and ``details`` (the
        ``fact_result`` passed in).
    """
    def _reply(text):
        return {
            "natural_response": text,
            "details": fact_result
        }

    # The verification itself could not be carried out
    if not fact_result['success']:
        return _reply(random.choice([
            "Je suis désolé, mais je ne peux pas analyser cette phrase correctement.",
            "Cette question est un peu compliquée pour moi.",
            "Pourriez-vous reformuler votre question différemment ?",
        ]))

    subject, relation, object_ = fact_result.get('extracted_triplet', ('', '', ''))

    if fact_result['result']:
        # Positive answers
        reply_text = random.choice([
            f"Oui, c'est exact ! {subject} {relation} {object_}.",
            "En effet, j'ai trouvé cette information dans mes données.",
            "C'est correct ! Cette information est vérifiée.",
        ])
        # Add the source note when a match message is available
        if fact_result.get('message'):
            reply_text += "\nCette information provient de DBpedia."
        return _reply(reply_text)

    # Negative answers
    reply_text = random.choice([
        f"Je ne peux pas confirmer que {subject} {relation} {object_}.",
        "Je n'ai pas trouvé cette information dans mes données.",
        "Cette affirmation ne semble pas être correcte selon mes sources.",
    ])
    # Suggest a correction when one is available
    alternatives = fact_result.get('alternative_facts')
    if alternatives:
        reply_text += f"\nCependant, voici ce que je sais : {alternatives}"
    return _reply(reply_text)

@app.route('/verify-fact', methods=['POST'])
def verify_fact():
    """
    Route that checks a statement and returns a natural-language answer.

    Expects a JSON object body with a "statement" key. Responds with 400
    when the body is missing, is not a JSON object or carries no statement,
    with 500 when an unexpected error occurs, and otherwise with the
    generated answer together with the verification details.
    """
    try:
        # silent=True makes a missing, malformed or non-JSON body come back
        # as None instead of raising, so it is reported below as a client
        # error (400) rather than as a 500 by the generic handler.
        data = request.get_json(silent=True)
        # A valid JSON body that is not an object (list, string, null...)
        # has no "statement" either.
        statement = data.get('statement') if isinstance(data, dict) else None

        if not statement:
            return jsonify({
                "error": "Aucune question fournie",
                "success": False
            }), 400

        # Check the fact
        fact_result = check_fact(statement)

        # Build the natural-language answer
        response = generate_response(fact_result)

        return jsonify({
            "success": True,
            "response": response["natural_response"],
            "verification_details": response["details"],
            "extracted_information": {
                "triplet": fact_result.get('extracted_triplet'),
                "confidence": fact_result.get('confidence'),
                "relation_found": fact_result.get('relation_found')
            }
        })

    except Exception as e:
        return jsonify({
            "error": str(e),
            "success": False
        }), 500

@app.route('/get-sample-questions', methods=['GET'])
def get_sample_questions():
    """
    Return a fixed list of example statements as a JSON payload.
    """
    examples = [
        "Madame Roland was born in Paris",
        "Voltaire died in Paris",
        "Auguste Rodin was born in Paris",
        "Paris was the capital of France",
        "Jacques Hébert lived in Paris"
    ]
    payload = {"success": True, "sample_questions": examples}
    return jsonify(payload)

def run_flask():
    """
    Run the Flask application; meant to be used as the target of a thread.
    """
    server_options = {"debug": False, "port": 5000, "use_reloader": False}
    app.run(**server_options)

# Start Flask in a separate thread
flask_thread = Thread(target=run_flask)
flask_thread.daemon = True  # The thread will stop when the notebook is closed
flask_thread.start()

print("Le serveur Flask est démarré sur http://localhost:5000")

# Test cell (to be run in a separate cell)
"""
import requests

# Test de la route get-sample-questions
response = requests.get('http://localhost:5000/get-sample-questions')
print("Sample Questions Response:", response.json())

# Test de la route verify-fact
test_data = {"statement": "Voltaire died in Paris"}
response = requests.post('http://localhost:5000/verify-fact', json=test_data)
print("\nVerify Fact Response:", response.json())
"""

Le serveur Flask est démarré sur http://localhost:5000


'\nimport requests\n\n# Test de la route get-sample-questions\nresponse = requests.get(\'http://localhost:5000/get-sample-questions\')\nprint("Sample Questions Response:", response.json())\n\n# Test de la route verify-fact\ntest_data = {"statement": "Voltaire died in Paris"}\nresponse = requests.post(\'http://localhost:5000/verify-fact\', json=test_data)\nprint("\nVerify Fact Response:", response.json())\n'

 * Serving Flask app '__main__'


 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [23/Jan/2025 20:07:54] "OPTIONS /verify-fact HTTP/1.1" 200 -
127.0.0.1 - - [23/Jan/2025 20:07:54] "POST /verify-fact HTTP/1.1" 400 -
127.0.0.1 - - [23/Jan/2025 20:08:56] "OPTIONS /verify-fact HTTP/1.1" 200 -
127.0.0.1 - - [23/Jan/2025 20:08:56] "POST /verify-fact HTTP/1.1" 400 -
127.0.0.1 - - [23/Jan/2025 20:09:00] "GET /get-sample-questions HTTP/1.1" 200 -
127.0.0.1 - - [23/Jan/2025 20:09:41] "OPTIONS /verify-fact HTTP/1.1" 200 -
127.0.0.1 - - [23/Jan/2025 20:09:44] "POST /verify-fact HTTP/1.1" 200 -
127.0.0.1 - - [23/Jan/2025 20:10:35] "GET /get-sample-questions HTTP/1.1" 200 -
127.0.0.1 - - [23/Jan/2025 20:10:37] "OPTIONS /verify-fact HTTP/1.1" 200 -
127.0.0.1 - - [23/Jan/2025 20:10:37] "POST /verify-fact HTTP/1.1" 200 -
127.0.0.1 - - [23/Jan/2025 20:10:42] "GET /get-sample-questions HTTP/1.1" 200 -
127.0.0.1 - - [23/Jan/2025 20:10:43] "OPTIONS /verify-fact HTTP/1.1" 200 -
127.0.0.1 - - [23/Jan/2025 20:10:46] "PO

In [47]:
import os
import signal
# Send SIGINT to the current process, i.e. the process running this cell.
# NOTE(review): presumably used to stop the background Flask server — confirm.
os.kill(os.getpid(), signal.SIGINT)

: 