# NER Pipeline

...
## 📄 Overview

...


---

## 🛠 Dependencies

To run this notebook, make sure you have the following:

### ✅ Requirements

- **Python 3.8 or higher**
- **Jupyter Notebook** or **JupyterLab** (to run `.ipynb` files)

Install Jupyter Notebook:
```bash
pip install notebook
```
### 📦 Python Packages
```bash
pip install beautifulsoup4 pandas openpyxl matplotlib
```
---

In [21]:
# -------------- IMPORTS ----------------
import time
import pandas as pd
import spacy
import numpy as np
from spacy import displacy
from time import sleep
import bs4
from bs4 import BeautifulSoup
import os
import requests
import re
import lxml
from pathlib import Path
# ---------------------------------------

# Load The Data

In [22]:
def load_excel_file(file_path:str):


    """
        Load a Excel file into a DataFrame

        Parameters:
            file_path (str): The file path

        Returns:
            df (pd.Dataframe) : the result of Excel file
    """
    # Open the Excel file
    df = pd.read_excel(file_path)

    # Clean
    df.drop(df.columns[0], axis=1, inplace=True) # Drop the Column
    #print(df.isnull().sum())
    df = df.fillna('') # clean the missing description

    return df

df = load_excel_file("ressources/20231101_raw.xlsx")
df.head()

Unnamed: 0,titles,sub_title,days,channel,category,desc,length,start_hour,start_mins,stop_hour,stop_mins,clean_titles
0,Faster than fear,Série TV\nSérie policière\nRéalisateur :\nFlor...,20231101,13eme RUE,Série TV,Ralf a pu prouver son innocence et Sunny a été...,50,1,30,2,20,Faster than fear
1,Commissaire Magellan (S1-E30),Série TV\nSérie policière\nDurée : 1h40min\nRé...,20231101,13eme RUE,Série TV,L'oeuvre du talentueux photographe Tristan Gar...,105,2,20,4,5,Commissaire Magellan
2,Einstein : équations criminelles (S3-E1),Série TV\nSérie policière\nDurée : 42min\nRéal...,20231101,13eme RUE,Série TV,Un châtelain féru de chasse et avec la gâchett...,45,4,5,4,50,Einstein : équations criminelles
3,La mort du Père Noël,Cinéma\nCourt métrage\nDurée : 15min\nRéalisat...,20231101,13eme RUE,Cinéma,Le Père Noël est mort. Qui l'a tué ?,10,4,50,5,0,La mort du Père Noël
4,La belle affaire,Cinéma\nCourt métrage\nDurée : 25min\nRéalisat...,20231101,13eme RUE,Cinéma,"A la frontière suisse, une détective est charg...",25,5,0,5,25,La belle affaire


# Use Spacy 

In [23]:
# Choose the Spacy modele 
NER = spacy.load("fr_core_news_sm")

In [24]:
ner_spacy : ("text_entite", "label") = []
ner_spacy_id : ("start", "end") = []


def apply_spacy(df):
    """
        Apply Spacy

        Parameters:
            df (pd.Dataframe): the dataframe 
    """
    # Start Chrono
    start_chrono = time.time()
    # foreach row , use spacy on the description
    for idx, row in df.iterrows():
        desc = str(row["desc"]) # extract the description
        entities = NER(desc) # use spacy 
        entities_founds = set() # Manage duplicates automatically
        entities_locations = []

        for word in entities.ents:
            entities_founds.add((word.text, word.label_))
            entities_locations.append((word.start, word.end))

        ner_spacy.append(entities_founds)
        ner_spacy_id.append(entities_locations)

    # End Chrono
    end_chrono = time.time()
    chrono = end_chrono - start_chrono
    print(f"Execute in : {chrono:.2f}s")

apply_spacy(df)

Execute in : 70.09s


In [25]:
def build_ner_dataframe(df, ner_spacy, ner_spacy_id):
    """
    Build a new DataFrame from the original DataFrame and precomputed NER outputs.
    
    The original DataFrame must contain at least two columns: 'titles' and 'desc'.
    For each row, the function uses the provided NER results (entity text/label and their corresponding indices)
    to extract a portion of the description text with some context around each entity.
    
    Parameters:
        df (pd.DataFrame): The original DataFrame containing a 'titles' column and a 'desc' (description) column.
        ner_spacy (list): A list (one per row) of lists of tuples, each tuple containing the entity text and its label.
                          For example: [[("Barack Obama", "PERSON"), ("Washington", "GPE")], ...]
        ner_spacy_id (list): A list (one per row) of lists of tuples, each tuple containing the start and end indices of the entity.
                             For example: [[(0, 12), (20, 30)], ...]
                             
    Returns:
        pd.DataFrame: A new DataFrame with the following columns:
                      - "titles": the original title repeated for each extracted entity.
                      - "NER": the extracted entity text.
                      - "NER_label": the label/type of the entity.
                      - "desc": a snippet of the original description around the entity.
    """
    # Convert the 'titles' and 'desc' columns to lists
    titles_list = df["titles"].tolist()
    descriptions_list = df["desc"].tolist()
    
    # Initialize lists to store the new data
    title_list = []
    text_list = []
    label_list = []
    location_list = []
    
    # Loop through each entry in the original DataFrame (by index)
    for i in range(len(titles_list)):
        current_desc = descriptions_list[i]
        # Process the description using the NER function to obtain a full processed document.
        # Assume that NER() returns an object where converting it to a string gives its full text.
        current_NER_doc = NER(current_desc)
        
        # Get the precomputed NER entities and their indices for this row
        current_NER = ner_spacy[i]
        indices = ner_spacy_id[i]
        
        # Get the title associated with the current row
        current_title = titles_list[i]
        
        # Loop over each entity and its corresponding indices
        for entity, idx in zip(current_NER, indices):
            entity_text, entity_label = entity
            
            # Append data to the corresponding list
            title_list.append(current_title)
            text_list.append(entity_text)
            label_list.append(entity_label)
            
            # Extract a snippet from the full document text around the entity for context.
            # If the entity starts near the beginning, we extract from the start.
            if idx[0] < 10:
                # If entity is close to the end of the document, include all text until the end.
                if idx[1] + 8 > len(current_NER_doc):
                    location_list.append(current_NER_doc)
                else:
                    location_list.append(current_NER_doc[:idx[1] + 8])
            else:
                # Else extract 10 characters before the entity start.
                # If the entity is near the end, extract until the end.
                if idx[1] + 8 > len(current_NER_doc):
                    location_list.append(current_NER_doc[idx[0] - 10:])
                else:
                    location_list.append(current_NER_doc[idx[0] - 10: idx[1] + 8])
    
    # Build a new DataFrame using the collected lists.
    new_df = pd.DataFrame(zip(title_list, text_list, label_list, location_list))
    
    # Rename the columns to be more informative.
    new_df.rename(columns={0: "titles", 1: "NER", 2: "NER_label", 3: "desc"}, inplace=True)
    
    return new_df

df_ner = build_ner_dataframe(df, ner_spacy, ner_spacy_id)
df_ner.head()


Unnamed: 0,titles,NER,NER_label,desc
0,Faster than fear,Sunny,LOC,"(Ralf, a, pu, prouver, son, innocence, et, Sun..."
1,Faster than fear,Nora,LOC,"(Elle, n', a, plus, rien, à, voir, avec, l', a..."
2,Faster than fear,Sunny,PER,"(Haffner, ,, mais, celui-ci, demande, à, ne, p..."
3,Faster than fear,Haffner,PER,"(à, Sunny, ., D', ailleurs, ,, elle, est, pers..."
4,Faster than fear,Marcel,PER,"(adresser, à, ..., elle, ., En, garde, à, vue,..."


# CasEN

In [5]:
# Generate text file 
text_location = "Result/Corpus/"
for count, desc in enumerate(df["desc"]):
    file_path = os.path.join(text_location, f"np{count}.txt")
    with open(file_path, 'w', encoding="utf-8") as f:
        f.write(str(desc))

In [7]:
# Run CasEN

casen_ipynb_path = "CasEN_fr/CasEN_fr.2.0/CasEN.ipynb"
get_ipython().run_line_magic('run', str(casen_ipynb_path))

Sat May  3 17:42:33 2025
####
Folder to process :  D:\travail\Stage\Stage_NER\Result\Corpus 
Script :  CasEN_Analyse_synthese_grf.uniscript
np0.txt
np1.txt
np10.txt
np100.txt
np1000.txt
np1001.txt
np1002.txt
np1003.txt
np1004.txt
np1005.txt
np1006.txt
np1007.txt
np1008.txt
np1009.txt
np101.txt
np1010.txt
np1011.txt
np1012.txt
np1013.txt
np1014.txt
np1015.txt
np1016.txt
np1017.txt
np1018.txt
np1019.txt
np102.txt
np1020.txt
np1021.txt
np1022.txt
np1023.txt
np1024.txt
np1025.txt
np1026.txt
np1027.txt
np1028.txt
np1029.txt
np103.txt
np1030.txt
np1031.txt
np1032.txt
np1033.txt
np1034.txt
np1035.txt
np1036.txt
np1037.txt
np1038.txt
np1039.txt
np104.txt
np1040.txt
np1041.txt
np1042.txt
np1043.txt
np1044.txt
np1045.txt
np1046.txt
np1047.txt
np1048.txt
np1049.txt
np105.txt
np1050.txt
np1051.txt
np1052.txt
np1053.txt
np1054.txt
np1055.txt
np1056.txt
np1057.txt
np1058.txt
np1059.txt
np106.txt
np1060.txt
np1061.txt
np1062.txt
np1063.txt
np1064.txt
np1065.txt
np1066.txt
np1067.txt
np1068.txt
np1069

In [13]:
# Extract data From CasEN file

def extract_entities_from_element(element : BeautifulSoup):
    """
        Extract each entities from a beautiful soup element

        Parameters:
            element (BeautifulSoup): BeautifulSoup element

        Returns:
            dict : containing entity tag, text, and graph used (grf).
    
    """
    # with open("CasEn_Result.txt", 'a', encoding="utf-8") as f:
    #     f.write(str(element) + "\n")

    tag = element.name
    text = element.get_text()
    grf = element.attrs.get("grf")
    
    data = {
        "tag": tag,
        "grf": grf,
        "text": text,
    }
    
    # Check if there is child
    children = element.find_all(recursive=False)

    if children:
        children_data = {}
        for child in children:
            child_data = extract_entities_from_element(child)
            # add same taged child into a list
            child_tag = child_data["tag"]
            if child_tag in children_data:
                if not isinstance(children_data[child_tag], list):
                    children_data[child_tag] = [children_data[child_tag]]
                children_data[child_tag].append(child_data)
            else:
                children_data[child_tag] = child_data
        data.update(children_data)
    
    return data

def extract_entities_from_file(file_path : str):
    """
        Read and extract entities from one file
        
        Parameters:
            file_path (str) : The path of the file

        Returns:
            list[dict] : entities founds in the files
    """
    entities_from_this_file = []

    with open(file_path, 'r', encoding="utf-8") as f:
        file_content = f.read()
        soup = BeautifulSoup(file_content, "html.parser")
        
        excluded_tags = ["s", "p"] # list of tag that we don't care 

        for element in soup.find_all(lambda tag: tag.name not in excluded_tags):
            entities_from_this_file.append(extract_entities_from_element(element))

    return entities_from_this_file

def extract_entities(folder_path : str):
    """
        Extract entities from CasEN result in all txt file in the folder

        Parameters:
            folder_path (str): The path for the folder

        Returns:
            list[list] : list of every entities founds for each file in the folder
    """

    ner_casEN = []

    path = Path(folder_path)
    if path.is_dir():
        txt_files = list(path.glob("*.txt"))
        if len(txt_files) != 0:
            for file in txt_files:
                ner_casEN.append(
                    {
                        "filename" : file.name,
                        "entities" : extract_entities_from_file(file)
                    }
                )
        else:
            print(f"No .txt file were found in {folder_path}")
    else:
        print(f"{folder_path} is not a folder")

    return ner_casEN


In [17]:
ner_casEN = extract_entities("Result/CasEN_Result/Res_CasEN_Analyse_synthese_grf")
ner_casEN

[{'filename': 'np0.result.txt', 'entities': []},
 {'filename': 'np1.result.txt',
  'entities': [{'tag': 'preinclude',
    'grf': 'grfpersContextePersonne',
    'text': 'photographe',
    'rolename': {'tag': 'rolename',
     'grf': 'grfroleName',
     'text': 'photographe'}},
   {'tag': 'rolename', 'grf': 'grfroleName', 'text': 'photographe'},
   {'tag': 'persname',
    'grf': 'grfpersPrenomNom',
    'text': 'Tristan Garil',
    'forename': {'tag': 'forename', 'grf': 'grftagPrenom', 'text': 'Tristan'},
    'surname': {'tag': 'surname', 'grf': 'grftagNomFamille', 'text': 'Garil'}},
   {'tag': 'forename', 'grf': 'grftagPrenom', 'text': 'Tristan'},
   {'tag': 'surname', 'grf': 'grftagNomFamille', 'text': 'Garil'},
   {'tag': 'orgname',
    'grf': 'grforgSeule',
    'text': 'galerie Delandin',
    'org': {'tag': 'org', 'grf': 'grforgProximite', 'text': 'galerie'},
    'persname': {'tag': 'persname',
     'grf': 'grfpersGenerique',
     'text': 'Delandin',
     'surname': {'tag': 'surname', 

In [19]:
def transform_casen_entities(casen_results):
    """
    Transform the CasEN results into a list of sets containing (entity_text, entity_label) tuples.
    
    Parameters:
        casen_results (list): List of dictionaries, each with keys 'filename' and 'entities'.
                              The 'entities' value is a list of dictionaries containing at least
                              'tag' and 'text'.
    
    Returns:
        list of set: One set per file, containing tuples (text, label) for the relevant entities.
    """
    transformed_results = []
    # Define a mapping between tag and desired label.
    tag_to_label = {
        "persname": "PER",
        "placename": "LOC",
        "geoname": "LOC",
        "orgname": "ORG"
    }
    
    for file_result in casen_results:
        entity_set = set()
        for entity in file_result.get("entities", []):
            tag = entity.get("tag")
            text = entity.get("text")
            # If the tag is one of those we care about.
            if tag in tag_to_label:
                label = tag_to_label[tag]
                # On peut éventuellement nettoyer ou normaliser le texte ici, par exemple .strip()
                entity_set.add((text.strip(), label))
        transformed_results.append(entity_set)
    
    return transformed_results

# Exemple d'utilisation :
# Supposez que 'ner_casEN_raw' est votre liste d'extractions CasEN telle que vous l'avez indiquée.
ner_casEN = transform_casen_entities(ner_casEN)

In [15]:
def compare_entity_extractions(spacy_entities, casen_entities):
    """
    Compare the entity extractions from spaCy and CasEN for a series of documents.

    Parameters:
        spacy_entities (list of iterables): Each element contains entities (e.g., tuples like (text, label))
                                            extracted by spaCy for a document.
        casen_entities (list of iterables): Each element contains entities extracted by CasEN for a document.
        
    Returns:
        intersections (list of sets): For each document, the set of common entities between spaCy and CasEN.
        intersection_counts (list of int): The number of common entities per document.
        spacy_counts (list of int): The total number of entities extracted by spaCy per document.
        casen_counts (list of int): The total number of entities extracted by CasEN per document.
    """
    intersections = []
    intersection_counts = []
    spacy_counts = []
    casen_counts = []
    
    for spacy_result, casen_result in zip(spacy_entities, casen_entities):
        # Convert spaCy results to a set if not already one
        spacy_set = set(spacy_result)
        
        # Compute the common entities (intersection)
        common_entities = spacy_set.intersection(casen_result)
        
        intersections.append(common_entities)
        intersection_counts.append(len(common_entities))
        spacy_counts.append(len(spacy_set))
        casen_counts.append(len(casen_result))
    
    return intersections, intersection_counts, spacy_counts, casen_counts

In [29]:
intersections, inter_counts, spacy_total, casen_total = compare_entity_extractions(ner_spacy, ner_casEN)
df_results = pd.DataFrame({
    "Document": range(len(spacy_total)),
    "spaCy_Total": spacy_total,
    "CasEN_Total": casen_total,
    "Common_Count": inter_counts,
    "Common_Entities": [list(items) for items in intersections]
})

# Afficher les premières lignes
df_results.head(10)

Unnamed: 0,Document,spaCy_Total,CasEN_Total,Common_Count,Common_Entities
0,0,5,0,0,[]
1,1,5,6,3,"[(Delandin, PER), (Selma Berrayah, PER), (Este..."
2,2,0,0,0,[]
3,3,1,3,0,[]
4,4,0,1,0,[]
5,5,0,2,0,[]
6,6,0,1,0,[]
7,7,7,1,0,[]
8,8,6,1,0,[]
9,9,5,1,0,[]
