In [1]:
import os
import urllib.request
from tqdm import tqdm
import lzma


# --- Config ---
DATA_DIR = "pile_of_law_data"
os.makedirs(DATA_DIR, exist_ok=True)

files_to_download = [
    "train.atticus_contracts.0.jsonl.xz",     # Real-world contracts
    "train.resource_contracts.jsonl.xz",      # Public resource agreements 
    "train.tos.jsonl.xz",                     # Corporate-style ToS 
    "train.cfpb_cc.jsonl.xz",                 # Consumer protection/credit agreements
    "train.uscode.jsonl.xz",                  # Federal statutes 
    "train.state_code.jsonl.xz",              # State-level laws 
    "train.cfr.jsonl.xz",                     # Federal regulations 
]

BASE_URL = "https://huggingface.co/datasets/pile-of-law/pile-of-law/resolve/main/data/"

for idx, filename in enumerate(files_to_download):
    url = BASE_URL + filename
    local_path = os.path.join(DATA_DIR, filename)

    with tqdm(total=1, position=0, desc=f"Downloading {filename}", leave=True, bar_format="{desc}") as progress:
        if os.path.exists(local_path):
            progress.set_description(f"Already exists: {filename}")
            continue

        try:
            urllib.request.urlretrieve(url, local_path)
            progress.set_description(f"Saved: {filename}")
        except Exception as e:
            progress.set_description(f"Error: {filename} ({e})")
        
        try:
            with lzma.open(local_path) as test_file:
                test_file.read(1)  
        except Exception:
            progress.set_description(f"Corrupt file: {filename}")
            os.remove(local_path)
            continue

Already exists: train.atticus_contracts.0.jsonl.xz: 
Already exists: train.resource_contracts.jsonl.xz: 
Already exists: train.tos.jsonl.xz: 
Already exists: train.cfpb_cc.jsonl.xz: 
Already exists: train.uscode.jsonl.xz: 
Already exists: train.state_code.jsonl.xz: 
Already exists: train.cfr.jsonl.xz: 


In [2]:
import os
import lzma
from tqdm import tqdm

DATA_DIR = "pile_of_law_data"
OUTPUT_DIR = "pile_of_law_decompressed"  

os.makedirs(OUTPUT_DIR, exist_ok=True)

for filename in os.listdir(DATA_DIR):
    if filename.endswith(".jsonl.xz"):
        input_path = os.path.join(DATA_DIR, filename)
        output_path = os.path.join(OUTPUT_DIR, filename.replace(".xz", ""))
        
        if os.path.exists(output_path):
            tqdm.write(f"Skipping {filename} - already decompressed")
            continue

        tqdm.write(f"Decompressing {filename} → {os.path.basename(output_path)}")

        try:
            with lzma.open(input_path, "rb") as compressed, open(output_path, "wb") as out_file:
                out_file.write(compressed.read())

            tqdm.write(f"Saved: {output_path}")

        except Exception as e:
            tqdm.write(f"Failed to decompress {filename}: {e}")

print(f"Decompressed files saved to '{OUTPUT_DIR}' directory.")

Skipping train.resource_contracts.jsonl.xz - already decompressed
Skipping train.state_code.jsonl.xz - already decompressed
Skipping train.atticus_contracts.0.jsonl.xz - already decompressed
Skipping train.cfpb_cc.jsonl.xz - already decompressed
Skipping train.tos.jsonl.xz - already decompressed
Skipping train.cfr.jsonl.xz - already decompressed
Skipping train.uscode.jsonl.xz - already decompressed
Decompressed files saved to 'pile_of_law_decompressed' directory.


In [3]:
import json
import os
import re
from pathlib import Path

OUTPUT_DIR = "pile_of_law_decompressed"
PARSED_DIR = "parsed_docs_batches"
os.makedirs(PARSED_DIR, exist_ok=True)

# Map raw filenames to clean source labels
SOURCE_NAME_MAP = {
    "train.atticus_contracts.0": "Atticus Contracts",
    "train.resource_contracts": "Resource Contracts",
    "train.tos": "Terms of Service",
    "train.cfpb_cc": "Consumer Credit Agreements",
    "train.uscode": "US Code",
    "train.state_code": "State Codes",
    "train.cfr": "Code of Federal Regulations"
}

def clean_chunk_text(text):
    if not isinstance(text, str):
        return ""
    text = text.replace('\x00', '')
    text = re.sub(r"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]", "", text)
    text = re.sub(r'-{5,}', '', text)             # remove visual dividers
    text = re.sub(r'\n\s*\n+', '\n\n', text)      # collapse extra blank lines
    text = re.sub(r'[ \t]+', ' ', text)           # normalize intra-line whitespace
    text = text.replace('\u00A0', ' ')            # non-breaking spaces
    return text.strip()

batch_size = 10000
doc_buffer = []
file_count = 0
doc_count = 0

for file in sorted(os.listdir(OUTPUT_DIR)):
    if file.endswith(".jsonl"):
        filepath = os.path.join(OUTPUT_DIR, file)
        raw_source = Path(file).stem
        source = SOURCE_NAME_MAP.get(raw_source, raw_source)

        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    doc = json.loads(line)

                    if "text" not in doc:
                        print(f"Warning: No 'text' field in document from {file}")
                    elif not isinstance(doc["text"], str):
                        print(f"Warning: 'text' field is not a string in document from {file}")

                    doc_buffer.append({
                        "source": source,
                        "text": clean_chunk_text(doc.get("text", ""))
                    })
                    doc_count += 1

                    if len(doc_buffer) >= batch_size:
                        file_count += 1
                        batch_path = os.path.join(PARSED_DIR, f"parsed_docs_{file_count:04d}.json")
                        with open(batch_path, 'w', encoding='utf-8') as out_f:
                            json.dump(doc_buffer, out_f)
                        print(f"Saved batch {file_count} with {len(doc_buffer)} docs to {batch_path}")
                        doc_buffer = []

                except json.JSONDecodeError:
                    print(f"Skipping malformed line in {file}")
                except Exception as e:
                    print(f"Error processing line in {file}: {str(e)}")

if doc_buffer:
    file_count += 1
    batch_path = os.path.join(PARSED_DIR, f"parsed_docs_{file_count:04d}.json")
    with open(batch_path, 'w', encoding='utf-8') as out_f:
        json.dump(doc_buffer, out_f)
    print(f"Saved final batch {file_count} with {len(doc_buffer)} docs to {batch_path}")

print(f"Total documents parsed: {doc_count}")
print(f"Total batch files written: {file_count}")

Saved batch 1 with 10000 docs to parsed_docs_batches/parsed_docs_0001.json
Saved batch 2 with 10000 docs to parsed_docs_batches/parsed_docs_0002.json
Saved batch 3 with 10000 docs to parsed_docs_batches/parsed_docs_0003.json
Saved batch 4 with 10000 docs to parsed_docs_batches/parsed_docs_0004.json
Saved batch 5 with 10000 docs to parsed_docs_batches/parsed_docs_0005.json
Saved batch 6 with 10000 docs to parsed_docs_batches/parsed_docs_0006.json
Saved batch 7 with 10000 docs to parsed_docs_batches/parsed_docs_0007.json
Saved batch 8 with 10000 docs to parsed_docs_batches/parsed_docs_0008.json
Saved batch 9 with 10000 docs to parsed_docs_batches/parsed_docs_0009.json
Saved batch 10 with 10000 docs to parsed_docs_batches/parsed_docs_0010.json
Saved batch 11 with 10000 docs to parsed_docs_batches/parsed_docs_0011.json
Saved batch 12 with 10000 docs to parsed_docs_batches/parsed_docs_0012.json
Saved final batch 13 with 2191 docs to parsed_docs_batches/parsed_docs_0013.json
Total documents 

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
import os

# --- Config ---
PARSED_DIR = "parsed_docs_batches"
CHUNKED_DIR = "chunked_docs_batches"
os.makedirs(CHUNKED_DIR, exist_ok=True)

MAX_CHUNKS_PER_FILE = 250_000
MAX_FILE_SIZE_MB = 150

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=200,  
    separators=["\n\n", "\n", " ", ""],
    length_function=len
)

batch_files = sorted([
    f for f in os.listdir(PARSED_DIR)
    if f.startswith("parsed_docs_") and f.endswith(".json")
])

chunk_file_idx = 1
chunk_count = 0
file_bytes = 0
total_chunks = 0
out_f = open(os.path.join(CHUNKED_DIR, f"chunked_docs_{chunk_file_idx:04d}.jsonl"), "w", encoding="utf-8")

for batch_file in batch_files:
    input_path = os.path.join(PARSED_DIR, batch_file)

    with open(input_path, 'r', encoding='utf-8') as f_in:
        parsed_docs = json.load(f_in)

        for doc_idx, doc in enumerate(parsed_docs):
            try:
                chunks = text_splitter.split_text(doc["text"])

                for chunk_index, chunk in enumerate(chunks):
                    chunk_obj = {
                        "source": doc["source"],
                        "chunk": chunk,
                        "doc_index": doc_idx,
                        "chunk_index": chunk_index
                    }
                    json_str = json.dumps(chunk_obj)
                    byte_len = len(json_str.encode("utf-8"))

                    if chunk_count >= MAX_CHUNKS_PER_FILE or file_bytes + byte_len > MAX_FILE_SIZE_MB * 1024 * 1024:
                        out_f.close()
                        chunk_file_idx += 1
                        out_f = open(os.path.join(CHUNKED_DIR, f"chunked_docs_{chunk_file_idx:04d}.jsonl"), "w", encoding="utf-8")
                        chunk_count = 0
                        file_bytes = 0

                    out_f.write(json_str + "\n")
                    chunk_count += 1
                    file_bytes += byte_len
                    total_chunks += 1

            except Exception as e:
                print(f"Error processing doc {doc_idx} in {batch_file}: {e}")

# Final flush
if not out_f.closed:
    out_f.close()

print(f"Chunking complete. Total chunks: {total_chunks}")
print(f"Output files written to '{CHUNKED_DIR}'")

Chunking complete. Total chunks: 16884492
Output files written to 'chunked_docs_batches'


In [5]:
# !pip install --upgrade snowflake-snowpark-python
from snowflake.snowpark import Session
import getpass

password = getpass.getpass("Enter Snowflake password: ")

connection_parameters = {
    "account": "SFEDU02-PDB57018",
    "user": "CAT",
    "password": password,
    "role": "TRAINING_ROLE",
    "warehouse": "DAMG7374",
    "database": "LAWS_CONTRACTS",
    "schema": "TEXT"
}

session = Session.builder.configs(connection_parameters).create()
print("Connected to Snowflake!")

Connected to Snowflake!


In [6]:
import json
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

CHUNKED_DIR = "chunked_docs_batches"
BATCH_FILES = sorted(f for f in os.listdir(CHUNKED_DIR) if f.endswith(".jsonl"))

def upload_to_snowflake(file):
    path = os.path.join(CHUNKED_DIR, file)
    try:
        with open(path, "r", encoding="utf-8") as f:
            records = [json.loads(line) for line in f if line.strip()]
        if not records:
            return f"Skipped empty: {file}"

        df = session.create_dataframe(records, schema=["SOURCE", "CHUNK", "DOC_INDEX", "CHUNK_INDEX"])
        df.write.mode("append").save_as_table("DOCS_CHUNKS_TABLE")
        return f"Uploaded {len(records)} chunks from {file}"
    except Exception as e:
        return f"Error uploading {file}: {e}"

# --- Run uploads in parallel ---
MAX_WORKERS = 8
print(f"Starting concurrent upload with {MAX_WORKERS} workers...")

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(upload_to_snowflake, file): file for file in BATCH_FILES}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Uploading to Snowflake"):
        print(future.result())

Starting concurrent upload with 8 workers...


Uploading to Snowflake:   1%|          | 1/96 [01:55<3:03:14, 115.73s/it]

Uploaded 180733 chunks from chunked_docs_0006.jsonl


Uploading to Snowflake:   2%|▏         | 2/96 [01:59<1:18:21, 50.02s/it] 

Uploaded 179965 chunks from chunked_docs_0003.jsonl


Uploading to Snowflake:   3%|▎         | 3/96 [02:03<44:26, 28.67s/it]  

Uploaded 178575 chunks from chunked_docs_0002.jsonl
Uploaded 179848 chunks from chunked_docs_0005.jsonl


Uploading to Snowflake:   5%|▌         | 5/96 [02:05<19:47, 13.05s/it]

Uploaded 179943 chunks from chunked_docs_0007.jsonl


Uploading to Snowflake:   6%|▋         | 6/96 [02:07<14:58,  9.98s/it]

Uploaded 180538 chunks from chunked_docs_0008.jsonl


Uploading to Snowflake:   7%|▋         | 7/96 [02:10<11:45,  7.93s/it]

Uploaded 180244 chunks from chunked_docs_0004.jsonl


Uploading to Snowflake:   8%|▊         | 8/96 [02:13<09:22,  6.39s/it]

Uploaded 180580 chunks from chunked_docs_0001.jsonl


Uploading to Snowflake:   9%|▉         | 9/96 [02:56<24:59, 17.23s/it]

Uploaded 180198 chunks from chunked_docs_0009.jsonl


Uploading to Snowflake:  10%|█         | 10/96 [03:54<42:09, 29.41s/it]

Uploaded 179362 chunks from chunked_docs_0010.jsonl


Uploading to Snowflake:  11%|█▏        | 11/96 [03:57<30:38, 21.63s/it]

Uploaded 180405 chunks from chunked_docs_0013.jsonl


Uploading to Snowflake:  12%|█▎        | 12/96 [04:04<24:03, 17.19s/it]

Uploaded 180395 chunks from chunked_docs_0012.jsonl


Uploading to Snowflake:  14%|█▎        | 13/96 [04:05<17:10, 12.42s/it]

Uploaded 180532 chunks from chunked_docs_0011.jsonl


Uploading to Snowflake:  15%|█▍        | 14/96 [04:08<12:49,  9.39s/it]

Uploaded 180480 chunks from chunked_docs_0014.jsonl


Uploading to Snowflake:  16%|█▌        | 15/96 [04:10<09:53,  7.33s/it]

Uploaded 179140 chunks from chunked_docs_0015.jsonl


Uploading to Snowflake:  17%|█▋        | 16/96 [04:11<07:16,  5.46s/it]

Uploaded 180327 chunks from chunked_docs_0017.jsonl


Uploading to Snowflake:  18%|█▊        | 17/96 [04:19<08:15,  6.27s/it]

Uploaded 178996 chunks from chunked_docs_0016.jsonl


Uploading to Snowflake:  19%|█▉        | 18/96 [04:38<13:05, 10.08s/it]

Uploaded 179859 chunks from chunked_docs_0018.jsonl


Uploading to Snowflake:  20%|█▉        | 19/96 [05:32<29:50, 23.25s/it]

Uploaded 179768 chunks from chunked_docs_0019.jsonl


Uploading to Snowflake:  21%|██        | 20/96 [06:04<32:46, 25.88s/it]

Uploaded 180283 chunks from chunked_docs_0022.jsonl


Uploading to Snowflake:  22%|██▏       | 21/96 [06:05<22:46, 18.22s/it]

Uploaded 180315 chunks from chunked_docs_0024.jsonl


Uploading to Snowflake:  23%|██▎       | 22/96 [06:05<15:53, 12.88s/it]

Uploaded 180354 chunks from chunked_docs_0020.jsonl


Uploading to Snowflake:  24%|██▍       | 23/96 [06:05<11:06,  9.13s/it]

Uploaded 180236 chunks from chunked_docs_0021.jsonl


Uploading to Snowflake:  25%|██▌       | 24/96 [06:07<08:11,  6.83s/it]

Uploaded 179768 chunks from chunked_docs_0023.jsonl


Uploading to Snowflake:  26%|██▌       | 25/96 [06:15<08:23,  7.09s/it]

Uploaded 179831 chunks from chunked_docs_0025.jsonl


Uploading to Snowflake:  27%|██▋       | 26/96 [06:20<07:38,  6.55s/it]

Uploaded 180063 chunks from chunked_docs_0026.jsonl


Uploading to Snowflake:  28%|██▊       | 27/96 [06:27<07:34,  6.59s/it]

Uploaded 180784 chunks from chunked_docs_0027.jsonl


Uploading to Snowflake:  29%|██▉       | 28/96 [07:55<35:07, 31.00s/it]

Uploaded 179851 chunks from chunked_docs_0028.jsonl


Uploading to Snowflake:  30%|███       | 29/96 [07:56<24:44, 22.16s/it]

Uploaded 180026 chunks from chunked_docs_0029.jsonl


Uploading to Snowflake:  31%|███▏      | 30/96 [08:03<19:12, 17.46s/it]

Uploaded 180063 chunks from chunked_docs_0033.jsonl


Uploading to Snowflake:  32%|███▏      | 31/96 [08:04<13:42, 12.65s/it]

Uploaded 179795 chunks from chunked_docs_0031.jsonl


Uploading to Snowflake:  33%|███▎      | 32/96 [08:11<11:45, 11.02s/it]

Uploaded 180365 chunks from chunked_docs_0030.jsonl


Uploading to Snowflake:  34%|███▍      | 33/96 [08:15<09:26,  8.99s/it]

Uploaded 180176 chunks from chunked_docs_0034.jsonl


Uploading to Snowflake:  35%|███▌      | 34/96 [08:16<06:36,  6.40s/it]

Uploaded 180188 chunks from chunked_docs_0032.jsonl


Uploading to Snowflake:  36%|███▋      | 35/96 [08:23<06:36,  6.50s/it]

Uploaded 180028 chunks from chunked_docs_0035.jsonl


Uploading to Snowflake:  38%|███▊      | 36/96 [08:40<09:52,  9.87s/it]

Uploaded 180106 chunks from chunked_docs_0036.jsonl


Uploading to Snowflake:  39%|███▊      | 37/96 [09:12<16:03, 16.33s/it]

Uploaded 180376 chunks from chunked_docs_0037.jsonl


Uploading to Snowflake:  40%|███▉      | 38/96 [09:52<22:46, 23.55s/it]

Uploaded 180085 chunks from chunked_docs_0039.jsonl


Uploading to Snowflake:  41%|████      | 39/96 [09:57<17:06, 18.00s/it]

Uploaded 180315 chunks from chunked_docs_0038.jsonl


Uploading to Snowflake:  42%|████▏     | 40/96 [10:11<15:34, 16.69s/it]

Uploaded 179757 chunks from chunked_docs_0040.jsonl


Uploading to Snowflake:  43%|████▎     | 41/96 [10:19<12:55, 14.10s/it]

Uploaded 180571 chunks from chunked_docs_0041.jsonl


Uploading to Snowflake:  44%|████▍     | 42/96 [10:19<08:56,  9.94s/it]

Uploaded 180065 chunks from chunked_docs_0042.jsonl


Uploading to Snowflake:  45%|████▍     | 43/96 [10:28<08:24,  9.52s/it]

Uploaded 180073 chunks from chunked_docs_0043.jsonl


Uploading to Snowflake:  46%|████▌     | 44/96 [10:31<06:31,  7.54s/it]

Uploaded 179208 chunks from chunked_docs_0044.jsonl


Uploading to Snowflake:  47%|████▋     | 45/96 [10:34<05:26,  6.40s/it]

Uploaded 179773 chunks from chunked_docs_0045.jsonl


Uploading to Snowflake:  48%|████▊     | 46/96 [10:41<05:25,  6.51s/it]

Uploaded 180628 chunks from chunked_docs_0046.jsonl


Uploading to Snowflake:  49%|████▉     | 47/96 [10:44<04:20,  5.32s/it]

Uploaded 179953 chunks from chunked_docs_0047.jsonl


Uploading to Snowflake:  50%|█████     | 48/96 [10:49<04:20,  5.43s/it]

Uploaded 180633 chunks from chunked_docs_0048.jsonl


Uploading to Snowflake:  51%|█████     | 49/96 [11:34<13:28, 17.21s/it]

Uploaded 157652 chunks from chunked_docs_0049.jsonl


Uploading to Snowflake:  52%|█████▏    | 50/96 [11:37<09:51, 12.85s/it]

Uploaded 184769 chunks from chunked_docs_0050.jsonl


Uploading to Snowflake:  53%|█████▎    | 51/96 [12:04<13:00, 17.35s/it]

Uploaded 185903 chunks from chunked_docs_0051.jsonl


Uploading to Snowflake:  54%|█████▍    | 52/96 [12:19<12:11, 16.62s/it]

Uploaded 147499 chunks from chunked_docs_0054.jsonl


Uploading to Snowflake:  55%|█████▌    | 53/96 [12:23<09:05, 12.69s/it]

Uploaded 174677 chunks from chunked_docs_0053.jsonl


Uploading to Snowflake:  56%|█████▋    | 54/96 [12:28<07:20, 10.49s/it]

Uploaded 185280 chunks from chunked_docs_0052.jsonl


Uploading to Snowflake:  57%|█████▋    | 55/96 [12:32<05:42,  8.35s/it]

Uploaded 170618 chunks from chunked_docs_0056.jsonl


Uploading to Snowflake:  58%|█████▊    | 56/96 [12:34<04:27,  6.68s/it]

Uploaded 167057 chunks from chunked_docs_0055.jsonl


Uploading to Snowflake:  59%|█████▉    | 57/96 [12:45<05:05,  7.84s/it]

Uploaded 169503 chunks from chunked_docs_0057.jsonl


Uploading to Snowflake:  60%|██████    | 58/96 [12:46<03:38,  5.74s/it]

Uploaded 171150 chunks from chunked_docs_0058.jsonl


Uploading to Snowflake:  61%|██████▏   | 59/96 [12:54<03:59,  6.47s/it]

Uploaded 170783 chunks from chunked_docs_0059.jsonl


Uploading to Snowflake:  62%|██████▎   | 60/96 [13:59<14:30, 24.17s/it]

Uploaded 171745 chunks from chunked_docs_0060.jsonl


Uploading to Snowflake:  64%|██████▎   | 61/96 [14:07<11:13, 19.24s/it]

Uploaded 175589 chunks from chunked_docs_0061.jsonl


Uploading to Snowflake:  65%|██████▍   | 62/96 [14:15<08:58, 15.83s/it]

Uploaded 167185 chunks from chunked_docs_0064.jsonl


Uploading to Snowflake:  66%|██████▌   | 63/96 [14:19<06:47, 12.35s/it]

Uploaded 172569 chunks from chunked_docs_0063.jsonl


Uploading to Snowflake:  67%|██████▋   | 64/96 [14:30<06:20, 11.88s/it]

Uploaded 175016 chunks from chunked_docs_0062.jsonl


Uploading to Snowflake:  68%|██████▊   | 65/96 [14:37<05:18, 10.29s/it]

Uploaded 163277 chunks from chunked_docs_0066.jsonl


Uploading to Snowflake:  69%|██████▉   | 66/96 [14:43<04:29,  8.97s/it]

Uploaded 176265 chunks from chunked_docs_0065.jsonl


Uploading to Snowflake:  70%|██████▉   | 67/96 [14:48<03:46,  7.80s/it]

Uploaded 177991 chunks from chunked_docs_0067.jsonl


Uploading to Snowflake:  71%|███████   | 68/96 [14:55<03:34,  7.65s/it]

Uploaded 168320 chunks from chunked_docs_0068.jsonl


Uploading to Snowflake:  72%|███████▏  | 69/96 [14:57<02:41,  5.99s/it]

Uploaded 162002 chunks from chunked_docs_0069.jsonl


Uploading to Snowflake:  73%|███████▎  | 70/96 [15:05<02:54,  6.71s/it]

Uploaded 164145 chunks from chunked_docs_0070.jsonl


Uploading to Snowflake:  74%|███████▍  | 71/96 [15:08<02:20,  5.60s/it]

Uploaded 172382 chunks from chunked_docs_0071.jsonl


Uploading to Snowflake:  75%|███████▌  | 72/96 [15:23<03:22,  8.43s/it]

Uploaded 174496 chunks from chunked_docs_0072.jsonl


Uploading to Snowflake:  76%|███████▌  | 73/96 [16:06<07:12, 18.81s/it]

Uploaded 170736 chunks from chunked_docs_0073.jsonl


Uploading to Snowflake:  77%|███████▋  | 74/96 [16:23<06:39, 18.16s/it]

Uploaded 161612 chunks from chunked_docs_0075.jsonl
Uploaded 167571 chunks from chunked_docs_0074.jsonl


Uploading to Snowflake:  79%|███████▉  | 76/96 [16:41<04:46, 14.31s/it]

Uploaded 176211 chunks from chunked_docs_0076.jsonl


Uploading to Snowflake:  80%|████████  | 77/96 [16:44<03:23, 10.71s/it]

Uploaded 175708 chunks from chunked_docs_0077.jsonl


Uploading to Snowflake:  81%|████████▏ | 78/96 [16:51<02:53,  9.63s/it]

Uploaded 174289 chunks from chunked_docs_0079.jsonl


Uploading to Snowflake:  82%|████████▏ | 79/96 [16:53<02:05,  7.38s/it]

Uploaded 178703 chunks from chunked_docs_0078.jsonl


Uploading to Snowflake:  83%|████████▎ | 80/96 [16:54<01:27,  5.49s/it]

Uploaded 171023 chunks from chunked_docs_0080.jsonl


Uploading to Snowflake:  84%|████████▍ | 81/96 [17:04<01:44,  6.94s/it]

Uploaded 179860 chunks from chunked_docs_0081.jsonl


Uploading to Snowflake:  85%|████████▌ | 82/96 [17:37<03:26, 14.78s/it]

Uploaded 178992 chunks from chunked_docs_0083.jsonl


Uploading to Snowflake:  86%|████████▋ | 83/96 [17:41<02:29, 11.54s/it]

Uploaded 171872 chunks from chunked_docs_0082.jsonl


Uploading to Snowflake:  88%|████████▊ | 84/96 [18:32<04:38, 23.25s/it]

Uploaded 182015 chunks from chunked_docs_0084.jsonl


Uploading to Snowflake:  89%|████████▊ | 85/96 [18:37<03:14, 17.72s/it]

Uploaded 171212 chunks from chunked_docs_0085.jsonl


Uploading to Snowflake:  90%|████████▉ | 86/96 [18:52<02:50, 17.06s/it]

Uploaded 176984 chunks from chunked_docs_0087.jsonl


Uploading to Snowflake:  91%|█████████ | 87/96 [18:59<02:06, 14.02s/it]

Uploaded 197168 chunks from chunked_docs_0086.jsonl


Uploading to Snowflake:  93%|█████████▎| 89/96 [19:07<00:59,  8.57s/it]

Uploaded 200698 chunks from chunked_docs_0088.jsonl
Uploaded 175528 chunks from chunked_docs_0089.jsonl


Uploading to Snowflake:  94%|█████████▍| 90/96 [19:16<00:52,  8.72s/it]

Uploaded 168970 chunks from chunked_docs_0091.jsonl


Uploading to Snowflake:  95%|█████████▍| 91/96 [19:17<00:32,  6.47s/it]

Uploaded 177995 chunks from chunked_docs_0090.jsonl


Uploading to Snowflake:  96%|█████████▌| 92/96 [19:24<00:25,  6.47s/it]

Uploaded 170932 chunks from chunked_docs_0092.jsonl


Uploading to Snowflake:  97%|█████████▋| 93/96 [19:26<00:15,  5.04s/it]

Uploaded 171063 chunks from chunked_docs_0093.jsonl


Uploading to Snowflake:  98%|█████████▊| 94/96 [19:28<00:08,  4.38s/it]

Uploaded 92523 chunks from chunked_docs_0096.jsonl


Uploading to Snowflake:  99%|█████████▉| 95/96 [19:37<00:05,  5.59s/it]

Uploaded 170785 chunks from chunked_docs_0095.jsonl


Uploading to Snowflake: 100%|██████████| 96/96 [19:38<00:00, 12.27s/it]

Uploaded 172612 chunks from chunked_docs_0094.jsonl



