In [1]:
import pandas as pd 
import numpy as np 
from PyPDF2 import PdfReader
from datetime import datetime 
import dateparser
import re
import json
import os
import cv2
import pytesseract
from pdf2image import convert_from_path
from dateparser import parse as dateparser
import matplotlib.pyplot as plt

# Extracting the text 

In [18]:
def extract_pdf_to_txt(src_dir, dest_dir):
    # List all files in the source directory
    all_files = os.listdir(src_dir)

    # Filter PDF files
    pdf_files = [f for f in all_files if f.lower().endswith('.pdf')]

    # Loop through all the PDF files
    for pdf_file in pdf_files:
        pdf_path = os.path.join(src_dir, pdf_file)
        reader = PdfReader(pdf_path)

        # Extract text from the first page
        first_page_text = reader.pages[0].extract_text()

        # Search for a date in the format "DD MONTH YYYY" using a regular expression
        date_match = re.search(r'\d{1,2} (JANVIER|FÉVRIER|MARS|AVRIL|MAI|JUIN|JUILLET|AOÛT|SEPTEMBRE|OCTOBRE|NOVEMBRE|DÉCEMBRE)  \d{4}', first_page_text)

        # If a date is found, use it as a filename
        if date_match:
            # Get the matched date string
            date_str = date_match.group()

            # Remove extra spaces from the date string
            date_str = ' '.join(date_str.split())

            # Convert the date string to a datetime object
            date = dateparser.parse(date_str, languages=['fr'])

            # Format the date as "YY-MM-DD"
            formatted_date = date.strftime('%y-%m-%d')

            # Create a filename using the formatted date
            filename = f"{formatted_date}.txt"
        # If a date is not found, use the original PDF filename as a base
        else:
            filename = f"{os.path.splitext(pdf_file)[0]}.txt"

        txt_path = os.path.join(dest_dir, filename)

        # Open the output file for writing with 'utf-8' encoding
        with open(txt_path, "w", encoding='utf-8') as f:
            # Loop through all the pages in the PDF
            for page in reader.pages:
                # Extract text from the current page and write it to the output file
                f.write(page.extract_text())

In [49]:
# Using cv2
def extract_pdf_to_txt(src_dir, dest_dir):
    pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'

    all_files = os.listdir(src_dir)
    pdf_files = [f for f in all_files if f.lower().endswith('.pdf')]

    for pdf_file in pdf_files:
        pdf_path = os.path.join(src_dir, pdf_file)

        # Convert PDF pages to images
        images = convert_from_path(pdf_path)

        # Extract text from the first page
        first_page_np = np.array(images[0])
        first_page_gray = cv2.cvtColor(first_page_np, cv2.COLOR_BGR2GRAY)
        first_page_thresh = cv2.threshold(first_page_gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
        first_page_text = pytesseract.image_to_string(first_page_thresh, lang='fra', output_type=pytesseract.Output.STRING)

        date_match = re.search(r'\d{1,2} (JANVIER|FÉVRIER|MARS|AVRIL|MAI|JUIN|JUILLET|AOÛT|SEPTEMBRE|OCTOBRE|NOVEMBRE|DÉCEMBRE) \d{4}', first_page_text)

        if date_match:
            date_str = date_match.group()
            date_str = ' '.join(date_str.split())
            date = dateparser(date_str, languages=['fr'])
            formatted_date = date.strftime('%y-%m-%d')
            filename = f"{formatted_date}.txt"
        else:
            filename = f"{os.path.splitext(pdf_file)[0]}.txt"
        
        def preprocess_image(image_np):
            # Resize the image
            height, width = image_np.shape[:2]
            resized_image = cv2.resize(image_np, (width * 2, height * 2), interpolation=cv2.INTER_CUBIC)

            # Convert to grayscale
            gray = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)

            # Apply Gaussian blur
            blur = cv2.GaussianBlur(gray, (5, 5), 0)

            # Thresholding
            thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

            # Dilation and Erosion
            kernel = np.ones((1, 1), np.uint8)
            dilated = cv2.dilate(thresh, kernel, iterations=1)
            eroded = cv2.erode(dilated, kernel, iterations=1)

            return eroded

        txt_path = os.path.join(dest_dir, filename)

        with open(txt_path, "w", encoding='utf-8') as f:
            for image in images:
                image_np = np.array(image)
                preprocessed_image = preprocess_image(image_np)
                text = pytesseract.image_to_string(preprocessed_image, lang='fra', output_type=pytesseract.Output.STRING)
                f.write(text)

In [50]:
src_dir = r'C:\Users\Stephanie\Documents\GitHub\NLP_Parlement\pdf'
dest_dir = r'C:\Users\Stephanie\Documents\GitHub\NLP_Parlement\txt'
extract_pdf_to_txt(src_dir, dest_dir)

# Important information

In [2]:
parlementarians = pd.read_csv(r'C:\Users\Stephanie\Documents\GitHub\NLP_Parlement\2022.07.22_Parlementaires PFWB_0.csv', header=None)
column_names = ['Prénom', 'nom', 'sexe', 'résidence', 'naissance', 'date', 'parti', 'titre']
parlementarians.columns = column_names

In [3]:
known_speakers = parlementarians.apply(lambda row: {"name": row["Prénom"] + " " + row["nom"], "title": row["titre"], 'parti': row['parti']}, axis=1).to_list()

# Convert to JSON file

In [4]:
# Helper functions
# Define regular expressions and helper function to parse questions, speakers and statements
def find_starting_point(text, starting_sentence='La séance est ouverte'):
    starting_point = text.find(starting_sentence)
    if starting_point != -1:
        return starting_point + len(starting_sentence)
    return None

def clean_line_breaks(text):
    # Replace hyphenated line breaks with an empty string
    cleaned_text = re.sub(r'-\s+', '', text)
    # Clean the text from the \n characters 
    cleaned_text = cleaned_text.replace('\n', ' ')
    return cleaned_text


def find_questions(text):
    question_pattern = re.compile(r'(?<=\d\.\d Question)(.*?)(?=\d\.\d Question)', re.DOTALL)
    return question_pattern.findall(text)

def find_projets(text):
    projet_pattern = re.compile(r'\d+ Projet de décret(.*?)(?=\d+ Projet de décret|$)', re.DOTALL)
    return projet_pattern.findall(text)

def process_transcript(text, known_speakers):
    # Create a regex pattern for speaker names
    speaker_pattern = r'(?:Mme|M\.)\s+(?:' + '|'.join([re.escape(speaker["name"]) for speaker in known_speakers]) + r')\b(?:\s*\([^)]*\)|\s*[,.-]|-)'


    # Update the regex pattern to account for the space before the speaker's name
    speaker_pattern_with_space = r'\s*(?:' + '|'.join([re.escape(speaker["name"]) for speaker in known_speakers]) + r')\b'

    # Find the starting point of the actual conversation
    starting_point_pattern = re.compile(r'Question de .*? à .*?«.*?»', re.DOTALL)
    starting_point = starting_point_pattern.search(text)
    if starting_point:
        text = text[starting_point.end():]

    # Extract the theme
    theme_pattern = re.compile(r'«(.*?)»')
    theme_match = theme_pattern.search(text)
    if theme_match:
        theme = theme_match.group(1)
        # Update the starting point of the text after the theme
        text = text[theme_match.end():]
    else:
        theme = 'TBD'

    # Split the text into parts
    parts = re.split('(' + speaker_pattern + ')', text)
    # Initialize the result list
    result = []


    # Initialize the current speaker
    current_speaker = None

    # Iterate through the parts
    for part in parts:
        found_speaker = False
        for speaker in known_speakers:
            if speaker["name"] == part.strip():
                current_speaker = speaker
                found_speaker = True
                break

        if not found_speaker and current_speaker is not None:
            # Append the spoken text by the current speaker
            spoken_text = part.strip()

            # Remove text within brackets and following whitespace
            spoken_text = re.sub(r'\(.*?\)\s*', '', spoken_text)

            if spoken_text:
                result.append({"speaker": current_speaker["name"], "title": current_speaker["title"], "text": spoken_text})

    return result, theme

def create_json_file(questions_data, projets_data, filename):
    data = {
        "questions": questions_data,
        "projets": projets_data,
    }
    with open(filename, "w", encoding='utf-8') as outfile:
        json.dump(data, outfile, ensure_ascii=False, indent=2)

In [5]:
def process_file(file_path, known_speakers):
    with open(file_path, 'r', encoding='utf-8') as f:
        full_text = f.read()

    starting_point = find_starting_point(full_text)
    if starting_point is not None:
        transcript_text = full_text[starting_point:]
    else:
        print("Starting sentence not found. Analyzing the full text.")
        transcript_text = full_text

    transcript_text = transcript_text.replace('\n', ' ')
    transcript_text = clean_line_breaks(transcript_text)

    date = os.path.splitext(os.path.basename(file_path))[0]

    # Find and process questions
    questions_text = find_questions(transcript_text)
    questions_data = []
    for question_text in questions_text:
        result, theme = process_transcript(question_text, known_speakers)
        question_data = {
            "date": date,
            "type": 'question',
            "theme": theme,
            'thematic': 'TBD', 
            "text": result
        }
        questions_data.append(question_data)

    # Find and process projets
    projets_text = find_projets(transcript_text)
    projets_data = []
    for projet_text in projets_text:
        result, theme = process_transcript(projet_text, known_speakers)
        projet_data = {
            "date": date,
            "type": 'projet',
            "theme": theme,
            'thematic': 'TBD', 
            "text": result
        }
        projets_data.append(projet_data)

    create_json_file(questions_data, projets_data, f"{date}.json")

    return questions_data, projets_data

In [6]:
questions, projets = process_file(r'txt\23-04-12.txt', known_speakers)

In [7]:
questions[3]

{'date': '23-04-12',
 'type': 'question',
 'theme': 'Audit externe de la Ligue belge francophone d’athlétisme',
 'thematic': 'TBD',
 'text': [{'speaker': 'Mourad Sahli',
   'title': 'Député',
   'text': ". Le 14 mars dernier, en commission des Sports, Madame la Ministre, je vous ai interrogée sur les dysfonctionnements au sein de la Ligue belge francophone d’athlétisme et, en particulier, sur les difficultés entre certains administrateurs et le président de la LBFA. Vous m'avez alors répondu que, d’après le rapport d’audit, la LBFA entretenait une comptabilité correcte et qu'elle s’inscrivait dans un processus de bonne gouvernance. Je  trouvais votre réponse quelque peu succincte.  À mes yeux, deux points sont importants dans ce dossier: d’une part, le rapport provisoire reçu par votre administration aurait été envoyé à la LBFA pour avoir son avis avant l'élaboration du rapport final et, d’autre part, les lanceurs d'alerte n’auraient pas été auditionnés et n’auraient, par conséquent, p

In [8]:
def process_directory(directory_path, known_speakers):
    all_questions_data = []
    all_projets_data = []

    for file_name in os.listdir(directory_path):
        if file_name.endswith('.txt'):
            file_path = os.path.join(directory_path, file_name)
            questions_data, projets_data = process_file(file_path, known_speakers)
            all_questions_data.extend(questions_data)
            #all_projets_data.extend(projets_data)

    create_json_file(all_questions_data, all_projets_data, 'combined.json')

In [68]:
process_directory(r'C:\Users\Stephanie\Documents\GitHub\NLP_Parlement\txt', known_speakers)

# Thematic

In [15]:
import spacy
import classy_classification
from spacy.util import minibatch, compounding
from spacy.training.example import Example

In [16]:
labels = pd.read_csv('labels.csv')
labels['combined_text'] = labels['titre'] + ' ' + labels['text']

# Convert your data to the required format
def load_data(df):
    data = []
    for index, row in df.iterrows():
        label = row['label']
        text = row['combined_text']
        data.append((text, {"cats": {label: 1.0}}))
    return data

train_data = load_data(labels)

In [17]:
# Create the blank model and add the text categorizer to the pipeline
nlp = spacy.blank("fr")
textcat = nlp.add_pipe("textcat")

# Add labels to the text categorizer
for _, annotations in train_data:
    for label in annotations.get("cats"):
        textcat.add_label(label)

# Train the text categorizer
optimizer = nlp.begin_training()
batch_size = 8
epochs = 10

for epoch in range(epochs):
    losses = {}
    batches = minibatch(train_data, size=compounding(1.0, 4.0, 1.001))
    
    for batch in batches:
        texts, annotations = zip(*batch)
        examples = [Example.from_dict(nlp.make_doc(text), annots) for text, annots in zip(texts, annotations)]
        nlp.update(examples, sgd=optimizer, drop=0.2, losses=losses)
    print(f"Epoch: {epoch}, Loss: {losses['textcat']}")

# Test the trained model
test_text = "Réduction de la subvention accordée à bpost pour la distribution de la presse périodique"
doc = nlp(test_text)
print(doc.cats)

Epoch: 0, Loss: 3.4668735414743423
Epoch: 1, Loss: 3.5999881103634834
Epoch: 2, Loss: 3.475020319223404
Epoch: 3, Loss: 3.3965729102492332
Epoch: 4, Loss: 3.2497308775782585
Epoch: 5, Loss: 2.9026045463979244
Epoch: 6, Loss: 2.2346629481762648
Epoch: 7, Loss: 1.5175540847267257
Epoch: 8, Loss: 0.7579404590360355
Epoch: 9, Loss: 0.41725427449591734
{'Budget': 0.019625067710876465, 'Culture et média ': 0.3861103653907776, 'Enfance': 0.0010101201478391886, 'Enseignement supérieur et Promotion Sociale ': 0.014809043146669865, 'Interculturalité et Egalité des Chances ': 0.01695610210299492, 'Jeunesse et Sport ': 0.03689073771238327, 'Politique et International': 0.011693249456584454, 'Santé': 0.5129052996635437}


In [18]:
text = " Madame la Ministre, nous ne parlons pas souvent de littérature dans ce Parlement et pourtant, en tant que romanistes, nous avons ce centre d’intérêt en commun. Je ne vous interroge pas sur la qualité, bien réelle, des œuvres produites en Fédération Wallonie-Bruxelles, mais plutôt sur les conditions socio-économiques dans lesquelles les auteurs de notre Fédération  doivent travailler.  Une étude récente réalisée par l’ASBL Bela jette une lumière assez crue sur ces conditions. Elle révèle notamment que trois quarts des auteurs vivent avec un revenu inférieur à 1 000 euros par mois tirés de leur activité. Ce n’est pas telle’ D . . . . D D ment étonnant lorsqu'on sait que, sur un livre vendu une vingtaine d’euros, l’auteur n’en reçoit que 1,5 voire maximum 2 euros. Il faut déjà en vendre beaucoup pour pouvoir en vivre. [l y a certes quelques best-sellers, mais ils font figure d’exceptions et sont les arbres qui cachent la forêt.  Au-delà de ces situations socio-économiques, l’étude révèle également un manque de reconnaissance et de considération ressenti par les personnes interrogées. Plus grave encore, elle fait apparaître une situation de détresse psychologique dans 7 % des cas. Sans vouloir réactiver l’image mythique du poète maudit ou de l’écrivain au ban de la société, le constat qui doit être posé ici et maintenant est bel et  bien celui d’une extrême précarité. CRI N°12 Madame la Ministre, comment réagissez-vous vis-à-vis de ce constat”? Quelles réponses pouvez-vous apporter à cette situation? Trois demandes sont clairement exprimées par les auteurs: d’abord celle d’un refinancement, le financement actuel étant jugé insuffisant; ensuite, celle d’un accompagnement d'ordre psychologique, pour aider les auteurs et autrices à sortir de l’isolement qu'ils et elles ressentent; enfin, celle d’un accompagnement plus professionnel pour leur permettre entre autres de bénéficier de plus de visibilité.  Quelles sont les initiatives nouvelles en matière de promotion des lettres belges de langue française? Je rappelle que c’est l’une des missions de notre Communauté depuis sa création. Quelle est, d’après les informations vous revenant du terrain, votre appréciation de la mise en œuvre de la réforme du statut des artistes entrée en application? Pensez-vous qu’elle constitue une réponse suffisante à  cette situation d’extrême précarité de nos auteurs?"

In [19]:
doc = nlp(text)
print(doc.cats)

{'Budget': 0.0012807386228814721, 'Culture et média ': 0.9656847715377808, 'Enfance': 0.0018391121411696076, 'Enseignement supérieur et Promotion Sociale ': 0.0018180167535319924, 'Interculturalité et Egalité des Chances ': 0.0018495387630537152, 'Jeunesse et Sport ': 0.01825811341404915, 'Politique et International': 0.006519528105854988, 'Santé': 0.002750222571194172}


In [22]:
# predicting the thematic 
def process_json(json_data, nlp):
    questions = json_data["questions"]
    results = []

    for question in questions:
        theme = question["theme"]
        text_parts = question["text"]

        for text_part in text_parts:
            text = text_part["text"]
            words = text.split()
            if len(words) > 10:
                # Get the predicted categories and their scores
                doc = nlp(text)
                categories = doc.cats

                # Find the category with the highest score
                max_score = 0
                best_category = None
                for category, score in categories.items():
                    if score > max_score:
                        max_score = score
                        best_category = category
                
                # Update the 'thematic' field in the question
                question["thematic"] = best_category

                # Store the result
                '''results.append({"theme": theme, "text": text, "predicted_thematic": best_category})
                break
                '''

    return json_data

In [25]:
# Load the JSON data
with open("json\combined.json", "r", encoding='utf-8') as f:
    json_data = json.load(f)

# Process the JSON data
updated_json_data = process_json(json_data, nlp)

# Write the updated JSON data back to the file
with open("json\combined.json", "w", encoding='utf-8') as f:
    json.dump(updated_json_data, f, ensure_ascii=False, indent=4)

# Analysis

In [4]:
# How Many questions in my json file 
with open(r"C:\Users\Stephanie\Documents\GitHub\NLP_Parlement\json\combined.json", "r", encoding='utf-8') as file:
    data = json.load(file)

# Count the number of questions
question_count = len(data['questions'])

print("Number of questions:", question_count)

Number of questions: 119


In [5]:
# How many questions per date
# Count the number of questions per date
questions_per_date = {}
for question in data["questions"]:
    date = question["date"]
    if date in questions_per_date:
        questions_per_date[date] += 1
    else:
        questions_per_date[date] = 1

# Print the number of questions per date
for date, count in questions_per_date.items():
    print(f"Date: {date}, Number of questions: {count}")

Date: 20-05-13, Number of questions: 8
Date: 22-09-08, Number of questions: 8
Date: 22-09-14, Number of questions: 6
Date: 22-09-28, Number of questions: 7
Date: 22-10-12, Number of questions: 8
Date: 22-10-26, Number of questions: 8
Date: 22-11-16, Number of questions: 6
Date: 22-11-30, Number of questions: 8
Date: 22-12-14, Number of questions: 8
Date: 23-01-11, Number of questions: 8
Date: 23-01-25, Number of questions: 8
Date: 23-02-08, Number of questions: 7
Date: 23-03-01, Number of questions: 8
Date: 23-03-15, Number of questions: 8
Date: 23-03-29, Number of questions: 7
Date: 23-04-12, Number of questions: 6
