In [2]:
import pandas as pd 
import numpy as np 
from PyPDF2 import PdfReader
from datetime import datetime 
import dateparser
import re
import locale
import json
import os

# Extracting the text 

In [3]:
def extract_pdf_to_txt(src_dir, dest_dir):
    """Write the text of every PDF found in ``src_dir`` to a ``.txt`` file in ``dest_dir``.

    The output file is named after the first French date ("DD MONTH YYYY", month
    in upper case) found on the first page, formatted as ``YY-MM-DD.txt``. When no
    date is found, or the matched text cannot be parsed into a date, the PDF's
    own base name is used instead. An existing file with the same name is
    overwritten.

    Args:
        src_dir: Directory listed (non-recursively) for files whose name ends
            in ``.pdf``, case-insensitively.
        dest_dir: Directory receiving the text files; created if it is missing.
    """
    # Any run of whitespace may separate day, month and year: extracted PDF text
    # does not reliably keep a fixed number of spaces.
    date_pattern = re.compile(
        r'\d{1,2}\s+(?:JANVIER|FÉVRIER|MARS|AVRIL|MAI|JUIN|JUILLET|AOÛT|SEPTEMBRE|OCTOBRE|NOVEMBRE|DÉCEMBRE)\s+\d{4}'
    )

    os.makedirs(dest_dir, exist_ok=True)

    # Sorted so that the processing order does not depend on the file system.
    for pdf_file in sorted(os.listdir(src_dir)):
        if not pdf_file.lower().endswith('.pdf'):
            continue

        reader = PdfReader(os.path.join(src_dir, pdf_file))

        # Extract every page once; the first page is reused for the date lookup.
        page_texts = [page.extract_text() or '' for page in reader.pages]
        first_page_text = page_texts[0] if page_texts else ''

        # Default to the PDF's own base name; replaced below when a date is usable.
        filename = f"{os.path.splitext(pdf_file)[0]}.txt"

        date_match = date_pattern.search(first_page_text)
        if date_match:
            # Collapse the whitespace inside the match before parsing it.
            date_str = ' '.join(date_match.group().split())
            date = dateparser.parse(date_str, languages=['fr'])
            # dateparser returns None when it cannot interpret the string.
            if date is not None:
                filename = f"{date.strftime('%y-%m-%d')}.txt"

        txt_path = os.path.join(dest_dir, filename)

        # Write all pages, in order, as UTF-8 text.
        with open(txt_path, "w", encoding='utf-8') as f:
            for page_text in page_texts:
                f.write(page_text)

# Folder holding the source PDFs and folder receiving the extracted text files.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another machine.
src_dir = r'C:\Users\Stephanie\Documents\GitHub\NLP_Parlement\pdf'
dest_dir = r'C:\Users\Stephanie\Documents\GitHub\NLP_Parlement\txt'
# Convert every PDF in src_dir into a text file in dest_dir.
extract_pdf_to_txt(src_dir, dest_dir)

# Loading the list of parliamentarians (known speakers)

In [40]:
# Load the parliamentarians CSV. header=None tells pandas that no row of the file
# holds column labels, so the labels are assigned explicitly afterwards.
# NOTE(review): absolute, machine-specific path.
parlementarians = pd.read_csv(r'C:\Users\Stephanie\Documents\GitHub\NLP_Parlement\2022.07.22_Parlementaires PFWB_0.csv', header=None)
# Column labels, in file order; the assignment below fails if the count does not
# match the number of columns read.
column_names = ['Prénom', 'nom', 'sexe', 'résidence', 'naissance', 'date', 'parti', 'titre']
parlementarians.columns = column_names

In [42]:
def _speaker_record(row):
    """Build the speaker dictionary (name, title, party) for one table row."""
    full_name = row["Prénom"] + " " + row["nom"]
    return {"name": full_name, "title": row["titre"], 'parti': row['parti']}


# One dictionary per parliamentarian, in the row order of the table.
known_speakers = parlementarians.apply(_speaker_record, axis=1).to_list()


# Convert to JSON file

In [51]:
# Helper functions

# Define regular expressions and helper function to parse questions, speakers and statements
def find_starting_point(text, starting_sentence='La séance est ouverte'):
    """Return the index just past the first ``starting_sentence`` in ``text``.

    Returns ``None`` when the sentence does not occur in the text.
    """
    try:
        offset = text.index(starting_sentence)
    except ValueError:
        # str.index raises where str.find would have returned -1.
        return None
    return offset + len(starting_sentence)

# The transcript is trimmed to start after the table of contents inside
# process_file(), once per file. Nothing is trimmed at module level here: this
# cell only defines helpers, and no `full_text` exists on a fresh kernel.

def clean_line_breaks(text):
    """Re-join words hyphenated across a line break and flatten the text to one line.

    A hyphen is removed, together with the whitespace after it, only when it sits
    directly between two word characters ("parle-\\nment" -> "parlement"). A
    free-standing dash such as ". - " is left untouched. Every remaining newline
    is then replaced by a single space.

    Args:
        text: Raw transcript text.

    Returns:
        The cleaned text.
    """
    # The lookarounds keep dashes used as punctuation (preceded by a space, or not
    # followed by a word) from being deleted along with real word-break hyphens.
    cleaned_text = re.sub(r'(?<=\w)-\s+(?=\w)', '', text)
    # Flatten whatever line breaks are left.
    cleaned_text = cleaned_text.replace('\n', ' ')
    return cleaned_text


def find_questions(text):
    """Return the text following each "<n>.<m> Question" heading.

    Each returned string starts right after the word "Question" and runs up to
    the next such heading, or to the end of ``text`` for the last one. Section
    numbers may have several digits on either side of the dot ("1.10").

    Args:
        text: Transcript text to search.

    Returns:
        A list of strings, one per heading, in order of appearance; empty when
        no heading is present.
    """
    heading = r'\d+\.\d+ Question'
    # The heading is consumed rather than checked with a look-behind, because
    # Python look-behinds must be fixed-width and "\d+" is not. The "\Z"
    # alternative lets the last question run to the end of the text.
    question_pattern = re.compile(heading + r'(.*?)(?=' + heading + r'|\Z)', re.DOTALL)
    return question_pattern.findall(text)

def find_projets(text):
    """Return the text following each "<n> Projet de décret" heading.

    Each returned string runs from just after the heading up to the next such
    heading, or to the end of the text.
    """
    matches = re.finditer(
        r'\d+ Projet de décret(.*?)(?=\d+ Projet de décret|$)',
        text,
        flags=re.DOTALL,
    )
    return [match.group(1) for match in matches]

def process_transcript(text, known_speakers):
    """Split ``text`` into statements attributed to known speakers.

    The text is cut at every occurrence of a known speaker's name. Each non-empty
    piece of text is attributed to the speaker whose name most recently preceded
    it; text before the first name is discarded.

    Args:
        text: Transcript fragment to split.
        known_speakers: Iterable of dictionaries with at least the keys
            ``"name"`` and ``"title"``. When a name appears more than once, the
            first entry is used.

    Returns:
        A list of ``{"speaker", "title", "text"}`` dictionaries in order of
        appearance; empty when no speaker name occurs in the text.
    """
    # Index speakers by name once; setdefault keeps the first entry per name.
    speakers_by_name = {}
    for speaker in known_speakers:
        speakers_by_name.setdefault(speaker["name"], speaker)

    # Without names there is nothing to attribute text to.
    if not speakers_by_name:
        return []

    # Longest names first: regex alternation takes the first alternative that
    # matches, so "Jean Dupont" must not be tried before "Jean Dupont-Martin".
    names = sorted(speakers_by_name, key=len, reverse=True)
    speaker_pattern = r'\b(?:' + '|'.join(re.escape(name) for name in names) + r')\b'

    # The capturing group makes re.split keep the matched names in the output.
    parts = re.split('(' + speaker_pattern + ')', text)

    result = []
    current_speaker = None

    for part in parts:
        stripped = part.strip()
        speaker = speakers_by_name.get(stripped)
        if speaker is not None:
            # This part is a speaker name: following text belongs to them.
            current_speaker = speaker
            continue

        if current_speaker is not None and stripped:
            result.append({"speaker": current_speaker["name"], "title": current_speaker["title"], "text": stripped})

    return result

def create_json_file(questions_data, projets_data, filename):
    """Write the questions and projets to ``filename`` as indented UTF-8 JSON.

    The file holds one object with the keys ``"questions"`` and ``"projets"``;
    non-ASCII characters are written as-is rather than escaped.
    """
    payload = dict(questions=questions_data, projets=projets_data)
    with open(filename, mode="w", encoding='utf-8') as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False)

In [44]:
def process_file(file_path, known_speakers, questions_type="question",
                 projets_type="projet", theme=None):
    """Parse one transcript text file and write its questions and projets to JSON.

    The text is trimmed to start after the opening sentence (when found), cleaned
    of line breaks, split into question and projet sections, and each section is
    split into speaker statements. The result is written to ``<date>.json`` in
    the current working directory, where ``<date>`` is the base name of
    ``file_path`` without its extension.

    Args:
        file_path: Path of the UTF-8 transcript text file.
        known_speakers: Speaker dictionaries passed on to ``process_transcript``.
        questions_type: Value stored under ``"type"`` for every question.
        projets_type: Value stored under ``"type"`` for every projet.
        theme: Value stored under ``"theme"`` for every entry.

    Returns:
        A ``(questions_data, projets_data)`` tuple of lists of dictionaries with
        the keys ``"date"``, ``"type"``, ``"theme"`` and ``"text"``.
    """
    # NOTE(review): questions_type, projets_type and theme used to be read from
    # names that no cell defines, which raised NameError on a fresh kernel. The
    # defaults above are placeholders — confirm the intended values.
    with open(file_path, 'r', encoding='utf-8') as f:
        full_text = f.read()

    # Skip everything up to and including the opening sentence, when present.
    starting_point = find_starting_point(full_text)
    if starting_point is not None:
        transcript_text = full_text[starting_point:]
    else:
        print("Starting sentence not found. Analyzing the full text.")
        transcript_text = full_text

    # clean_line_breaks already turns newlines into spaces, so no separate
    # replacement is needed beforehand.
    transcript_text = clean_line_breaks(transcript_text)

    # The file's base name doubles as the date label of every entry.
    date = os.path.splitext(os.path.basename(file_path))[0]

    questions_data = [
        {
            "date": date,
            "type": questions_type,
            "theme": theme,
            "text": process_transcript(question_text, known_speakers),
        }
        for question_text in find_questions(transcript_text)
    ]

    projets_data = [
        {
            "date": date,
            "type": projets_type,
            "theme": theme,
            "text": process_transcript(projet_text, known_speakers),
        }
        for projet_text in find_projets(transcript_text)
    ]

    create_json_file(questions_data, projets_data, f"{date}.json")

    return questions_data, projets_data

In [45]:
# Process a single transcript and keep its parsed questions and projets.
# NOTE(review): absolute, machine-specific path.
questions, projets = process_file(r'C:\Users\Stephanie\Documents\GitHub\NLP_Parlement\txt\20-05-13.txt', known_speakers)

In [50]:
def process_directory(directory_path, known_speakers):
    """Process every ``.txt`` transcript in a directory and write ``combined.json``.

    Each file is handled by ``process_file``; the questions and projets of all
    files are then concatenated and written to ``combined.json`` in the current
    working directory.

    Args:
        directory_path: Directory listed (non-recursively) for ``.txt`` files.
        known_speakers: Speaker dictionaries passed on to ``process_file``.
    """
    all_questions_data = []
    all_projets_data = []

    # os.listdir returns names in arbitrary order; sorting keeps the order of the
    # combined output the same from one run or machine to the next.
    for file_name in sorted(os.listdir(directory_path)):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(directory_path, file_name)
        questions_data, projets_data = process_file(file_path, known_speakers)
        all_questions_data.extend(questions_data)
        all_projets_data.extend(projets_data)

    create_json_file(all_questions_data, all_projets_data, 'combined.json')

In [52]:
# Process every transcript in the text folder and write combined.json.
# NOTE(review): absolute, machine-specific path.
process_directory(r'C:\Users\Stephanie\Documents\GitHub\NLP_Parlement\txt', known_speakers)