In [2]:
from bs4 import BeautifulSoup

# Load the HTML file
with open("D:\\ai-legal-document-analysis\\data\\raw\\Computer_Misuse_Act.html", "r", encoding="Windows-1252") as file:
    soup = BeautifulSoup(file, "html.parser")

In [3]:
import re

def extract_cases(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    cases = []
    
    # Find all <b> tags containing case titles
    for b_tag in soup.find_all('b'):
        case_title = b_tag.get_text().strip()
        
        # Find any links within this case title
        links = []
        for a_tag in b_tag.find_all('a'):
            if 'href' in a_tag.attrs:
                links.append(a_tag['href'])
        # Add cases even if they don't have links
        if not links:
            links.append('No links available')
        # Only add cases that have a title
        if case_title:
            cases.append({
                'title': case_title,
                'links': links
            })
    
    return cases

# Read and parse the HTML file
with open("D:\\ai-legal-document-analysis\\data\\raw\\Computer_Misuse_Act.html", 'r', encoding='Windows-1252') as f:
    html_content = f.read()

# Extract cases
cases = extract_cases(html_content)

# Display results
for case in cases:
    print(f"Title: {case['title']}")
    if case['links']:
        print("Links:", ', '.join(case['links']))
    print('-' * 80)

Title: F & C Alternative Investments (Holdings) Ltd v Barthelemy (No 2) (Barthelemy v F & C Alternative Investments (Holdings) Ltd) [2011] EWHC 1731 (Ch); [2012] Ch 613; [2012] 3 WLR 10; [2012] Bus LR 891, Ch D (Sales J)
Links: http://www.bailii.org/ew/cases/EWHC/Ch/2011/1731.html, http://www.bailii.org/ew/cases/EWHC/Ch/2011/1731.html, http://www.bailii.org/ew/cases/EWHC/Ch/2011/1731.html, http://www.bailii.org/ew/cases/EWHC/Ch/2011/2807.html
--------------------------------------------------------------------------------
Title: R v Bow Street Metropolitan Stipendiary Magistrate, Ex parte Government of the United States of America (R v Governor of Brixton Prison, Ex parte Allison, United States of America (Government of the), Ex parte) [1999] QB 847; [1998] 3 WLR 1156, DC
Links: No links available
--------------------------------------------------------------------------------
Title: F & C Alternative Investments (Holdings) Ltd v Barthelemy (No 3) (Barthelemy v F & C Alternative Invest

In [None]:
import csv
import os

def save_cases_to_csv(cases, csv_filename="cases.csv", missing_links_filename="cases_no_links.csv"):
    
    # Save cases with links
    with open(csv_filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Link"])
        
        for case in cases:
            for link in case["links"]:
                writer.writerow([case["title"], link])

    print(f"✅ Titles & links saved to {csv_filename}")

    # Save cases without links separately
    with open(missing_links_filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Title"])
        
        for case in cases:
            if case["links"][0] == "No links available":
                writer.writerow([case["title"]])

    print(f"✅ Cases without links logged in {missing_links_filename}")

save_cases_to_csv(cases)

print(f"Cases with links saved at: {os.path.abspath('cases.csv')}")
print(f"Cases without links saved at: {os.path.abspath('cases_no_links.csv')}")

✅ Titles & links saved to cases.csv
✅ Cases without links logged in cases_no_links.csv
Cases with links saved at: d:\ai-legal-document-analysis\notebooks\cases.csv
Cases without links saved at: d:\ai-legal-document-analysis\notebooks\cases_no_links.csv


In [2]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin


In [4]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin

def get_headers():
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    }

def is_document_link(href):
    """Check if the link is a document file"""
    document_extensions = ('.pdf', '.rtf', '.doc', '.docx')
    return any(ext in href.lower() for ext in document_extensions)

def get_file_extension(href):
    """Extract the correct file extension from the link"""
    for ext in ['.pdf', '.rtf', '.doc', '.docx']:
        if ext in href.lower():
            return ext
    return '.pdf'  # default to PDF if no extension found

def download_document_from_bailii(url, download_dir, session):
    try:
        # Get the HTML page first
        response = session.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Look for document download links
            doc_link = None
            for link in soup.find_all('a', href=True):
                href = link['href']
                if is_document_link(href) or 'download' in href.lower():
                    doc_link = urljoin(url, href)
                    break

            if doc_link:
                # Extract case name from URL for filename
                case_name = url.split('/')[-1].replace('.html', '')
                file_extension = get_file_extension(doc_link)
                file_path = os.path.join(download_dir, f"{case_name}{file_extension}")
                
                # Download the document
                doc_response = session.get(doc_link, stream=True)
                if doc_response.status_code == 200:
                    with open(file_path, 'wb') as doc_file:
                        for chunk in doc_response.iter_content(chunk_size=8192):
                            if chunk:
                                doc_file.write(chunk)
                    print(f"✅ Successfully downloaded {file_extension} for case: {case_name}")
                    return True
                else:
                    print(f"❌ Failed to download document from {doc_link}")
            else:
                print(f"❌ No document link found on page: {url}")
        else:
            print(f"❌ Failed to access page: {url}")
            
    except Exception as e:
        print(f"❌ Error processing {url}: {str(e)}")
    return False

def main():
    # Create a session for better performance and retry handling
    session = requests.Session()
    session.headers.update(get_headers())
    
    # Read the CSV file with case links
    csv_file_path = 'D:\\ai-legal-document-analysis\\data\\raw\\cases.csv'
    download_directory = 'D:\\ai-legal-document-analysis\\data\\raw\\Documents'
    
    # Create download directory if it doesn't exist
    if not os.path.exists(download_directory):
        os.makedirs(download_directory)
    
    # Read URLs from CSV
    df = pd.read_csv(csv_file_path)
    urls = df['Link'].tolist()
    
    # Track statistics
    total = len(urls)
    successful = 0
    failed = 0
    
    # Process each URL
    for i, url in enumerate(urls, 1):
        if url != 'No links available':
            print(f"\nProcessing {i}/{total}: {url}")
            if download_document_from_bailii(url, download_directory, session):
                successful += 1
            else:
                failed += 1
            time.sleep(2)  # Polite delay between requests
    
    # Print summary
    print("\n📊 Download Summary:")
    print(f"Total URLs processed: {total}")
    print(f"Successfully downloaded: {successful}")
    print(f"Failed downloads: {failed}")
    print(f"Documents saved in: {download_directory}")

if __name__ == "__main__":
    main()


Processing 1/480: http://www.bailii.org/ew/cases/EWHC/Ch/2011/1731.html
✅ Successfully downloaded .pdf for case: 1731

Processing 2/480: http://www.bailii.org/ew/cases/EWHC/Ch/2011/1731.html
✅ Successfully downloaded .pdf for case: 1731

Processing 3/480: http://www.bailii.org/ew/cases/EWHC/Ch/2011/1731.html
✅ Successfully downloaded .pdf for case: 1731

Processing 4/480: http://www.bailii.org/ew/cases/EWHC/Ch/2011/2807.html
✅ Successfully downloaded .rtf for case: 2807

Processing 6/480: http://www.bailii.org/ew/cases/EWHC/Ch/2011/2807.html
✅ Successfully downloaded .rtf for case: 2807

Processing 7/480: http://www.bailii.org/ew/cases/EWHC/Ch/2011/2807.html
✅ Successfully downloaded .rtf for case: 2807

Processing 9/480: http://www.bailii.org/ew/cases/EWHC/QB/2017/3113.html
✅ Successfully downloaded .pdf for case: 3113

Processing 10/480: http://www.bailii.org/ew/cases/EWCA/Civ/2018/2339.html
✅ Successfully downloaded .rtf for case: 2339

Processing 11/480: http://www.bailii.org/ew/c