In [None]:
from google.colab import drive
import os
import requests
from bs4 import BeautifulSoup
import zipfile

In [None]:
# Mount Google Drive at /content/drive (Colab-only); the paths used by later
# cells live under '/content/drive/My Drive/...'.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Extracting JSON files

In [None]:
import os
import requests
from bs4 import BeautifulSoup
import zipfile

# Root output directory on the mounted Drive; created up front so later
# cells can write beneath it without checking for its existence.
BASE_FOLDER = '/content/drive/My Drive/Virginia/Virginia_json_files'
os.makedirs(BASE_FOLDER, exist_ok=True)

# Define the dictionary of folder names and bulk data links.
# Keys are human-readable labels (in the visible code they are only used in
# progress messages); values are the listing pages that get scraped for
# .zip links. Every URL ends with "/".
# NOTE(review): the last key reads "Patt_Health" while its URL slug is
# "va-patt-heath" - looks like a typo in the label; confirm before renaming.
folders_and_links = {
    "Howison_1850-1851": "https://static.case.law/howison/",
    "Jeff_1730-1772": "https://static.case.law/jeff/",
    "Va_1779-2004": "https://static.case.law/va/",
    "Va_app_1985-2017": "https://static.case.law/va-app/",
    "Va_Ch_Dec_1788-1799": "https://static.case.law/va-ch-dec/",
    "Va_cir_1856-2016": "https://static.case.law/va-cir/",
    "Va_col_dec_1729-1741": "https://static.case.law/va-col-dec/",
    "Va_dec_1871-1900": "https://static.case.law/va-dec/",
    "Va_Patt_Health_1855-1857": "https://static.case.law/va-patt-heath/"
}

def get_zip_links(base_url, timeout=30):
    """
    Scrape the base URL to get all .zip links for a given bulk data page.

    Args:
        base_url: URL of the listing page to fetch and parse.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot hang the notebook indefinitely.

    Returns:
        A list of absolute URLs, in document order, one for every
        ``<a href=...>`` whose href ends with ".zip".

    Raises:
        requests.HTTPError: if the listing page answers with a 4xx/5xx
            status (previously the error page was parsed and an empty
            list was returned silently).
    """
    # Imported locally so this cell stays runnable on its own.
    from urllib.parse import urljoin

    response = requests.get(base_url, timeout=timeout)
    # Fail loudly instead of reporting "0 links" for an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    zip_links = []
    # Find all .zip links
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if href.endswith(".zip"):
            # urljoin returns absolute hrefs unchanged and resolves every
            # relative form ("1.zip", "/dir/1.zip", "../dir/1.zip") against
            # base_url, which plain string concatenation got wrong.
            zip_links.append(urljoin(base_url, href))
    return zip_links

def download_and_extract_json_single_folder(zip_link, target_folder, name_prefix="", timeout=60):
    """
    Download a .zip file, extract JSON files, and save them in a single folder.

    The archive is streamed to ``target_folder`` under the last path segment
    of ``zip_link``, every member whose name ends with ".json" is written
    into ``target_folder``, and the archive is deleted afterwards - also
    when the download or the extraction fails.

    Archive members are flattened to their basename, so two members with the
    same basename (inside one archive, or across calls that share
    ``target_folder``) overwrite each other unless ``name_prefix`` tells
    them apart.

    Args:
        zip_link: URL of the .zip archive.
        target_folder: Existing directory that receives the JSON files.
        name_prefix: Text prepended to every extracted file name. The
            default "" keeps the original naming (and its overwrite risk).
        timeout: Seconds passed to ``requests.get``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        zipfile.BadZipFile: if the downloaded payload is not a zip archive.
    """
    # Imported locally so this cell stays runnable on its own.
    import shutil

    # Download the .zip file
    zip_file_name = zip_link.split("/")[-1]
    zip_file_path = os.path.join(target_folder, zip_file_name)
    print(f"Downloading: {zip_link}")

    try:
        with requests.get(zip_link, stream=True, timeout=timeout) as r:
            # Check the status before writing anything, so an HTML error
            # body is never saved to disk as if it were an archive.
            r.raise_for_status()
            with open(zip_file_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"Downloaded: {zip_file_path}")

        # Extract only JSON files from the zip
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            for file in zip_ref.namelist():
                if file.endswith(".json"):
                    json_file_name = name_prefix + os.path.basename(file)
                    json_file_path = os.path.join(target_folder, json_file_name)

                    with zip_ref.open(file) as source, open(json_file_path, "wb") as target:
                        # Copy in chunks rather than loading the whole
                        # member into memory.
                        shutil.copyfileobj(source, target)
        print(f"Extracted JSON files into {target_folder}")
    finally:
        # Remove the .zip file whether or not extraction succeeded, so a
        # failed run does not leave a partial archive behind.
        if os.path.exists(zip_file_path):
            os.remove(zip_file_path)
            print(f"Removed zip file: {zip_file_path}")

# Target folder for all JSON files: every archive from every listing page is
# extracted into this one directory beneath BASE_FOLDER.
target_folder = os.path.join(BASE_FOLDER, "All_JSON_Files")
os.makedirs(target_folder, exist_ok=True)

# Process all folders and links: scrape each listing page for .zip links,
# then download and extract them one at a time. Nothing here catches
# exceptions, so the first failing request or archive stops the whole run.
for main_folder_name, base_url in folders_and_links.items():
    print(f"Processing: {main_folder_name} ({base_url})")
    zip_links = get_zip_links(base_url)
    print(f"Found {len(zip_links)} .zip links for {main_folder_name}")

    # Process each zip link
    for zip_link in zip_links:
        download_and_extract_json_single_folder(zip_link, target_folder)

print(f"All JSON files have been downloaded and extracted into {target_folder}")


Processing: Howison_1850-1851 (https://static.case.law/howison/)
Found 1 .zip links for Howison_1850-1851
Downloading: https://static.case.law/howison/1.zip
Downloaded: /content/drive/My Drive/Virginia/Virginia_json_files/All_JSON_Files/1.zip
Extracted JSON files into /content/drive/My Drive/Virginia/Virginia_json_files/All_JSON_Files
Removed zip file: /content/drive/My Drive/Virginia/Virginia_json_files/All_JSON_Files/1.zip
Processing: Jeff_1730-1772 (https://static.case.law/jeff/)
Found 1 .zip links for Jeff_1730-1772
Downloading: https://static.case.law/jeff/1.zip
Downloaded: /content/drive/My Drive/Virginia/Virginia_json_files/All_JSON_Files/1.zip
Extracted JSON files into /content/drive/My Drive/Virginia/Virginia_json_files/All_JSON_Files
Removed zip file: /content/drive/My Drive/Virginia/Virginia_json_files/All_JSON_Files/1.zip
Processing: Va_1779-2004 (https://static.case.law/va/)
Found 267 .zip links for Va_1779-2004
Downloading: https://static.case.law/va/1.zip
Downloaded: /co

# Removing extra metadata JSON files (VolumeMetadata.json, CasesMetadata.json)

In [None]:
# Exact file names to delete; compared against the bare file names yielded
# by os.walk, so entries must not contain directory components.
FILES_TO_REMOVE = ["VolumeMetadata.json", "CasesMetadata.json"]
def remove_files_from_folders(base_folder, files_to_remove):
    """
    Delete every file under ``base_folder`` whose name is listed in
    ``files_to_remove``.

    The directory tree is traversed with ``os.walk``; each matching file is
    removed and reported on stdout. A failure to remove one file is printed
    and does not stop the traversal. Returns ``None``.
    """
    for current_dir, _subdirs, filenames in os.walk(base_folder):
        for name in filenames:
            # Only exact name matches are candidates for deletion.
            if name not in files_to_remove:
                continue
            doomed_path = os.path.join(current_dir, name)
            try:
                os.remove(doomed_path)
                print(f"Removed: {doomed_path}")
            except Exception as e:
                print(f"Error removing {doomed_path}: {e}")

# Run the function over BASE_FOLDER. The walk is recursive, so it also
# covers the All_JSON_Files subfolder the extraction cell writes into.
remove_files_from_folders(BASE_FOLDER, FILES_TO_REMOVE)