In [1]:
import csv
import xml.etree.ElementTree as ET
import re

In [2]:
# Mount Google Drive at /content/drive so files stored there can be opened
# by ordinary filesystem paths (the input XML lives under this mount point).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Input and output locations used by the processing cell below.
# The XML source is read from the mounted Drive folder. The CSV path has no
# directory component, so the file is created in the current working directory.
xml_file_path = '/content/drive/MyDrive/all_case_data.xml'
csv_file_path = 'cleaned_cases_two.csv'

In [4]:
# Lowercase month name (full or common abbreviation) -> month number.
_MONTH_NUMBERS = {
    "january": 1, "jan": 1,
    "february": 2, "feb": 2,
    "march": 3, "mar": 3,
    "april": 4, "apr": 4,
    "may": 5,
    "june": 6, "jun": 6,
    "july": 7, "jul": 7,
    "august": 8, "aug": 8,
    "september": 9, "sep": 9, "sept": 9,
    "october": 10, "oct": 10,
    "november": 11, "nov": 11,
    "december": 12, "dec": 12,
}

# Ordinal dates such as "29th February 2024". The look-arounds stop the
# pattern from matching inside a longer run of digits (e.g. "123rd ...").
_ORDINAL_DATE_RE = re.compile(
    r'(?<!\d)(\d{1,2})(?:st|nd|rd|th)\s([A-Za-z]+)\s(\d{4})(?!\d)'
)


def _ordinal_date_to_iso(match):
    """Return the ISO form of a matched ordinal date, or the match unchanged.

    The original text is kept when the middle word is not a month name or
    the day is outside 1-31, so phrases like "1st Floor 2024" are not mangled.
    """
    day = int(match.group(1))
    month = _MONTH_NUMBERS.get(match.group(2).lower())
    if month is None or not 1 <= day <= 31:
        return match.group(0)
    return "{}-{:02d}-{:02d}".format(match.group(3), month, day)


def clean_text(text):
    """Clean a single text fragment extracted from the XML.

    Args:
        text: The fragment to clean, or None.

    Returns:
        "" when ``text`` is None or contains the boilerplate markers
        "cookies" or "find case law" (case-insensitive). Otherwise the
        stripped text with ordinal dates rewritten as ISO dates
        ("29th February 2024" -> "2024-02-29") and literal "<text />"
        remnants removed.
    """
    if text is None:
        return ""

    # Drop whole fragments that are site boilerplate rather than case content.
    lowered = text.lower()
    if "cookies" in lowered or "find case law" in lowered:
        return ""

    # Remove leading/trailing whitespace.
    text = text.strip()

    # Normalize dates. A callback is used instead of a template so the month
    # name can be converted to a zero-padded number.
    text = _ORDINAL_DATE_RE.sub(_ordinal_date_to_iso, text)

    # Remove redundant tags like <text />
    text = re.sub(r"<text />", "", text)
    return text

In [5]:
# Parse the XML *before* opening the output file. Opening with 'w' truncates
# the file immediately, so parsing first means a missing or malformed XML
# input cannot wipe out a CSV produced by an earlier successful run.
tree = ET.parse(xml_file_path)
root = tree.getroot()

# Write one CSV row per <case>. The single 'cleaned_text' column holds that
# case's cleaned <text> fragments joined with newlines.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)  # Use csv.writer for single column
    writer.writerow(['cleaned_text'])  # Write the header

    # Iterate through each case in the XML
    for case in root.findall(".//case"):
        # clean_text returns "" for fragments that should be dropped, so
        # only truthy results are kept.
        # NOTE(review): Element.text holds only the characters before an
        # element's first child; if <text> elements can contain nested
        # markup, the remainder is not captured here. Confirm against the data.
        cleaned_texts = []
        for text_element in case.findall(".//text"):
            cleaned = clean_text(text_element.text)
            if cleaned:
                cleaned_texts.append(cleaned)

        # Write the cleaned text as a single row
        writer.writerow(["\n".join(cleaned_texts)])

print(f"Processing complete. Cleaned data saved to '{csv_file_path}'.")

Processing complete. Cleaned data saved to 'cleaned_cases_two.csv'.


In [7]:
import pandas as pd

# Load the CSV through the shared csv_file_path variable rather than a second
# hard-coded copy of the file name, so the writer and reader cannot drift apart.
df = pd.read_csv(csv_file_path)

In [8]:
# Show the loaded DataFrame; as the cell's last expression it is rendered by the notebook.
df

Unnamed: 0,cleaned_text
0,Judgments and decisions from 2001 onwards\n[20...
1,Judgments and decisions from 2001 onwards\n[20...
2,Judgments and decisions from 2001 onwards\n[20...
3,Judgments and decisions from 2001 onwards\n[20...
4,Judgments and decisions from 2001 onwards\n[20...
...,...
95,Judgments and decisions from 2001 onwards\n[20...
96,Judgments and decisions from 2001 onwards\n[20...
97,Judgments and decisions from 2001 onwards\n[20...
98,Judgments and decisions from 2001 onwards\n[20...
