In [1]:
import json
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import display, HTML
import sys
sys.path.insert(0, '../')
import utilities.functions as fct

In [13]:
# Locations of the training and development JSON files
train_path, dev_path = (
    '../SCOTUS/train.json',
    '../SCOTUS/dev.json',
)

In [14]:
# Load the dataset from its JSON file.
# NOTE(review): the file opened here is dev_path, yet the parsed content is
# stored in `train` and reused by the cells below - confirm which split is intended.
with open(dev_path, 'r', encoding="utf-8") as f:
    train = json.load(f)

In [4]:
def html_processing(html_content):
    """
    Extract the text of the main content <div> of an HTML page.

    The direct children of the first <div> whose class is
    "-display-inline-block text-left" are walked in order. The text of an
    <a> or <em> child is glued to the chunk that precedes it and to the
    element that follows it, so that inline links/emphasis do not split a
    sentence. Only the text is kept: no tag markup is emitted.

    The collected chunks are then cleaned: a chunk containing a tab character
    starts a new output entry (tabs removed, newlines turned into spaces);
    any other chunk is appended to the previous entry, separated by a space.

    Parameters:
    html_content (str): The HTML markup to process.

    Returns:
    tuple: ``(result, extract)`` where ``result`` is the list of cleaned and
    merged text entries, and ``extract`` is the list of the raw text of every
    child that was kept (children whose text is exactly a newline are skipped).
    Both lists are empty when the target <div> is not found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # The interesting content is assumed to live in the div with this class
    div_class = "-display-inline-block text-left"
    target_div = soup.find('div', class_=div_class)

    extract = []      # raw text of every kept child
    temp_result = []  # text chunks before the final clean-up pass

    if target_div is None:
        print(f"Aucun div trouvé avec la classe '{div_class}'.")
    else:
        to_merge = False  # True right after an <a>/<em>: the next element must be glued to it
        for element in target_div.children:
            # Skip bare line breaks between elements
            if element.getText() == '\n':
                continue

            element_text = element.get_text()
            extract.append(element_text)

            if element.name in ["a", "em"]:
                # Inline element: stored on its own for now, merged on the next iteration
                temp_result.append(f"{element_text}")
                to_merge = True
            elif to_merge:
                inline_text = temp_result.pop()
                # The "<em>" prefix can only match when the text itself starts with
                # that literal, since tags are no longer wrapped around the text.
                separator = "" if inline_text.startswith("<em>") else " "
                merged = inline_text + separator + element_text
                if temp_result:
                    # Usual case: glue "<inline> <following>" onto the preceding chunk
                    temp_result[-1] += separator + merged
                else:
                    # The inline element was the very first chunk: there is nothing
                    # before it to merge into (this used to raise IndexError).
                    temp_result.append(merged)
                to_merge = False
            else:
                # <blockquote> elements and plain text are both kept as their text
                temp_result.append(element_text)

    result = []  # final, cleaned entries

    for res in temp_result:
        if '\t' in res:
            # A tab marks the start of a new entry
            result.append(res.replace('\t', '').replace('\n', ' '))
        elif not result:
            # Very first entry: newlines are dropped rather than turned into spaces
            result.append(res.replace('\n', ''))
        else:
            # Continuation of the previous entry
            result[-1] += " " + res.replace('\n', ' ')
    return result, extract

The following cell should stay commented out (or unexecuted) so that we don't overwrite the txt files by accident.

In [None]:
# Export the processed text of every loaded document, one output file per document
for i, sample in enumerate(train):
    html_content = sample["raw_source"]  # raw HTML of the document

    text, _ = html_processing(html_content)  # only the cleaned entries are needed here

    name = f'../SCOTUS_data/text/train_{i}.txt'  # output file name

    # Write the cleaned entries, one per line
    with open(name, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in text)

In [15]:
# Build the table of reference summaries (one row per case) and write each
# full summary to its own text file.
# Rows are collected in a list and turned into a DataFrame once at the end,
# instead of growing the frame with pd.concat on every iteration.
target_records = []

for i, sample in enumerate(train):
    tmp = sample["raw_target"]
    # Full summary: the three sections joined by " \n"
    text = tmp["facts_of_the_case"] + " \n" + tmp["question"] + " \n" + tmp["conclusion"]

    # fct.cleanhtml is a project helper - presumably strips HTML markup; verify in utilities/functions
    facts_of_the_case = fct.cleanhtml(tmp['facts_of_the_case'])
    question = fct.cleanhtml(tmp['question'])
    conclusion = fct.cleanhtml(tmp['conclusion'])

    target_records.append({
        "facts_of_the_case": facts_of_the_case,
        "question": question,
        "conclusion": conclusion,
    })

    clean_text = fct.cleanhtml(text)

    name = f'../SCOTUS_data/summary/train_{i}.txt'  # output file name

    # Write the cleaned summary to its file
    with open(name, 'w', encoding='utf-8') as f:
        f.write(clean_text)

# Explicit columns keep the schema stable even when there are no records
paragraph_target_df = pd.DataFrame(
    target_records,
    columns=["facts_of_the_case", "question", "conclusion"],
)

paragraph_target_df.head()

Unnamed: 0,facts_of_the_case,question,conclusion
0,Acting on a tip from a confidential informant ...,Does the Fourth Amendment require police to ob...,"No. In a per curiam opinion, the Court held th..."
1,Robert Smith was convicted of first-degree mur...,Did the Court of Appeals err in interpreting A...,On the certification question (534 U.S. 157):I...
2,Jessie L. Jackson was sentenced to life in pri...,Did the federal appellate court properly deter...,The state court did not unreasonably apply the...
3,Citizens Bank (Citizens) and Alafabco Inc. (Al...,Is a debt-restructuring agreement between two ...,"In a per curiam opinion, the unanimous Court h..."
4,"In January 1999, a 14-year-old girl disappeare...",1. Does an arrest for which the police did not...,An arrest within the meaning of the Fourth Ame...


In [16]:
# Save the target table as a CSV file; index=False leaves the row index out of the file
paragraph_target_df.to_csv('../SCOTUS_data/paragraph_target_df_dev.csv', index=False)

In [None]:
def split_extract(extract_texts):
    """
    Flatten a collection of texts into a list of non-empty, stripped lines.

    Parameters:
    extract_texts (iterable of str): Texts that may span several lines.

    Returns:
    list: Every line of every text, in order, with surrounding whitespace
    removed; lines that are empty after stripping are left out.
    """
    return [
        stripped
        for extract_text in extract_texts
        for stripped in (piece.strip() for piece in extract_text.split('\n'))
        if stripped
    ]

Display the cleaned text highlighted in the HTML to check that everything is extracted (note that only one occurrence of each fragment is found).

In [None]:
# Run the first document through the extraction and flatten the raw extracts into lines
html_content = train[0]["raw_source"]
text, extract_texts = html_processing(html_content)
to_display = split_extract(extract_texts)

# The first lines are passed through untouched because the highlighting
# doesn't work well with the first html tags
skipped_lines = 240  # where the syllabus starts
# skipped_lines = 375  # where the opinions start

lines = html_content.splitlines()
head_lines, tail_lines = lines[:skipped_lines], lines[skipped_lines:]

beginning = "\n".join(head_lines)
rest = "\n".join(tail_lines)

# Only the part after the skipped lines is highlighted
highlighted_result = fct.highlight_html(rest, to_display)

display(HTML(beginning + highlighted_result))