In [2]:
import os
import zstandard as zstd
import hashlib
import codecs
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Settings for a single archive: compressed input, decompressed output target,
# and the SHA-256 digest the input is expected to have.
# NOTE(review): none of these names are referenced by the cells visible in this
# notebook — confirm they are still needed.
input_path = "gu_meta_part_1.jsonl.zst"
output_path = "gu_meta_part_1.txt"
expected_sha256 = "6f0bbaabd018fa5421c4f4fb4ef281359f602ecfd06c2b2bca606f48e48e354c"

In [13]:
def file_sha256(file_path, chunk_size=2**20):
    """Return the hexadecimal SHA-256 digest of the file at ``file_path``.

    The file is opened in binary mode and fed to the hash ``chunk_size`` bytes
    at a time, so the whole file is never held in memory at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes requested per read (default 1 MiB).

    Returns:
        The digest as a hex string, as produced by ``hexdigest()``.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        # iter(callable, sentinel) keeps calling read() until it returns b"" (EOF).
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

def decompress_jsonl_zst(input_path, output_path=None, chunk_size=2**20):
    """Stream-decompress a Zstandard file into a UTF-8 text file.

    The compressed input is read in chunks, decoded incrementally as UTF-8
    (so multi-byte characters split across chunk boundaries are handled), and
    written to ``output_path``. A confirmation line is printed on success.

    Args:
        input_path: Path of the ``.zst`` file to decompress.
        output_path: Destination path. When ``None``, the last extension of
            ``input_path`` is replaced with ``.txt``.
        chunk_size: Number of decompressed bytes requested per read
            (default 1 MiB).

    Raises:
        OSError: If reading the input or writing the output fails (for example
            when the disk is full). The partially written output is removed
            before the error propagates.
        UnicodeDecodeError: If the decompressed data is not valid UTF-8.
    """
    if output_path is None:
        # splitext drops only the final extension: "x.jsonl.zst" -> "x.jsonl.txt".
        output_path = os.path.splitext(input_path)[0] + ".txt"

    dctx = zstd.ZstdDecompressor()
    decoder = codecs.getincrementaldecoder("utf-8")()

    with open(input_path, "rb") as f_in:
        # The output is opened only after the input opened successfully, so a
        # missing input never touches (or deletes) an existing output file.
        # newline="" disables newline translation: without it, text mode on
        # Windows rewrites every "\n" as "\r\n" and the output no longer
        # matches the decompressed data.
        f_out = open(output_path, "w", encoding="utf-8", newline="")
        try:
            with f_out:
                with dctx.stream_reader(f_in) as reader:
                    while chunk := reader.read(chunk_size):
                        f_out.write(decoder.decode(chunk))
                # Flush the decoder; raises if the stream ended mid-character.
                f_out.write(decoder.decode(b"", final=True))
        except BaseException:
            # Do not leave a truncated file behind (disk full, corrupt input,
            # kernel interrupt): it would look like a finished result.
            try:
                os.remove(output_path)
            except OSError:
                pass
            raise

    print(f"✅ Decompressed {os.path.basename(input_path)} → {os.path.basename(output_path)}")

def batch_decompress(checksum_file, folder):
    """Verify and decompress every file listed in a SHA-256 manifest.

    Each manifest line is expected to hold ``<hex digest> <filename>``
    separated by whitespace. For every entry the file is looked up inside
    ``folder``, its SHA-256 is compared case-insensitively with the expected
    digest, and on a match it is passed to ``decompress_jsonl_zst``. Missing
    files and checksum mismatches are reported with ``print`` and skipped.
    Lines that do not split into exactly two fields are ignored.

    Args:
        checksum_file: Path of the UTF-8 manifest file.
        folder: Directory that contains the files named in the manifest.
    """
    with open(checksum_file, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(maxsplit=1)
            if len(parts) != 2:
                continue
            expected_hash, filename = parts
            file_path = os.path.join(folder, filename)

            # sha256sum marks binary-mode entries as "<hash> *<name>". Strip the
            # marker only when no file with the literal "*name" exists, so
            # entries that resolved before still resolve the same way.
            if (
                len(filename) > 1
                and filename.startswith("*")
                and not os.path.exists(file_path)
            ):
                filename = filename[1:]
                file_path = os.path.join(folder, filename)

            if not os.path.exists(file_path):
                print(f"❌ Missing file: {filename}")
                continue

            actual_hash = file_sha256(file_path)
            if actual_hash.lower() != expected_hash.lower():
                print(f"❌ Checksum mismatch for {filename}! Skipping.")
                continue

            print(f"✔ Checksum OK for {filename}")
            decompress_jsonl_zst(file_path)

# Example usage: check every file listed in the checksum manifest against its
# SHA-256 and decompress the ones that match.
# NOTE(review): both paths are absolute and specific to one Windows machine;
# adjust them before running this cell elsewhere.
batch_decompress(
    checksum_file=r"D:\collage\NLP\Assignment_1\2\checksum.sha256.txt",
    folder=r"D:\collage\NLP\Assignment_1\2"
)


✔ Checksum OK for gu_meta_part_1.jsonl.zst
✅ Decompressed gu_meta_part_1.jsonl.zst → gu_meta_part_1.jsonl.txt
✔ Checksum OK for gu_meta_part_3.jsonl.zst
✅ Decompressed gu_meta_part_3.jsonl.zst → gu_meta_part_3.jsonl.txt
✔ Checksum OK for gu_meta_part_2.jsonl.zst
✅ Decompressed gu_meta_part_2.jsonl.zst → gu_meta_part_2.jsonl.txt
✔ Checksum OK for gu_meta_part_4.jsonl.zst
✅ Decompressed gu_meta_part_4.jsonl.zst → gu_meta_part_4.jsonl.txt


In [14]:

import json

In [6]:

def file_sha256(file_path, chunk_size=2**20):
    """Return the hexadecimal SHA-256 digest of the file at ``file_path``.

    The file is opened in binary mode and fed to the hash ``chunk_size`` bytes
    at a time, so the whole file is never held in memory at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes requested per read (default 1 MiB).

    Returns:
        The digest as a hex string, as produced by ``hexdigest()``.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        # iter(callable, sentinel) keeps calling read() until it returns b"" (EOF).
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

def decompress_jsonl_zst(input_path, output_path=None, chunk_size=2**20):
    """Stream-decompress a Zstandard file into a UTF-8 text file.

    The compressed input is read in chunks, decoded incrementally as UTF-8
    (so multi-byte characters split across chunk boundaries are handled), and
    written to ``output_path``. A confirmation line is printed on success.

    Args:
        input_path: Path of the ``.zst`` file to decompress.
        output_path: Destination path. When ``None``, the last extension of
            ``input_path`` is replaced with ``.txt``.
        chunk_size: Number of decompressed bytes requested per read
            (default 1 MiB).

    Raises:
        OSError: If reading the input or writing the output fails (for example
            when the disk is full). The partially written output is removed
            before the error propagates.
        UnicodeDecodeError: If the decompressed data is not valid UTF-8.
    """
    if output_path is None:
        # splitext drops only the final extension: "x.jsonl.zst" -> "x.jsonl.txt".
        output_path = os.path.splitext(input_path)[0] + ".txt"

    dctx = zstd.ZstdDecompressor()
    decoder = codecs.getincrementaldecoder("utf-8")()

    with open(input_path, "rb") as f_in:
        # The output is opened only after the input opened successfully, so a
        # missing input never touches (or deletes) an existing output file.
        # newline="" disables newline translation: without it, text mode on
        # Windows rewrites every "\n" as "\r\n" and the output no longer
        # matches the decompressed data.
        f_out = open(output_path, "w", encoding="utf-8", newline="")
        try:
            with f_out:
                with dctx.stream_reader(f_in) as reader:
                    while chunk := reader.read(chunk_size):
                        f_out.write(decoder.decode(chunk))
                # Flush the decoder; raises if the stream ended mid-character.
                f_out.write(decoder.decode(b"", final=True))
        except BaseException:
            # Do not leave a truncated file behind (disk full, corrupt input,
            # kernel interrupt): it would look like a finished result.
            try:
                os.remove(output_path)
            except OSError:
                pass
            raise

    print(f"✅ Decompressed {os.path.basename(input_path)} → {os.path.basename(output_path)}")

def batch_decompress(checksum_file, folder):
    """Verify and decompress every file listed in a SHA-256 manifest.

    Each manifest line is expected to hold ``<hex digest> <filename>``
    separated by whitespace. For every entry the file is looked up inside
    ``folder``, its SHA-256 is compared case-insensitively with the expected
    digest, and on a match it is passed to ``decompress_jsonl_zst``. Missing
    files and checksum mismatches are reported with ``print`` and skipped.
    Lines that do not split into exactly two fields are ignored.

    Args:
        checksum_file: Path of the UTF-8 manifest file.
        folder: Directory that contains the files named in the manifest.
    """
    with open(checksum_file, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(maxsplit=1)
            if len(parts) != 2:
                continue
            expected_hash, filename = parts
            file_path = os.path.join(folder, filename)

            # sha256sum marks binary-mode entries as "<hash> *<name>". Strip the
            # marker only when no file with the literal "*name" exists, so
            # entries that resolved before still resolve the same way.
            if (
                len(filename) > 1
                and filename.startswith("*")
                and not os.path.exists(file_path)
            ):
                filename = filename[1:]
                file_path = os.path.join(folder, filename)

            if not os.path.exists(file_path):
                print(f"❌ Missing file: {filename}")
                continue

            actual_hash = file_sha256(file_path)
            if actual_hash.lower() != expected_hash.lower():
                print(f"❌ Checksum mismatch for {filename}! Skipping.")
                continue

            print(f"✔ Checksum OK for {filename}")
            decompress_jsonl_zst(file_path)

# Example usage: same batch run as before, but driven by a second manifest
# (checksum2.sha256.txt) in the same folder.
# NOTE(review): both paths are absolute and specific to one Windows machine;
# adjust them before running this cell elsewhere.
batch_decompress(
    checksum_file=r"D:\collage\NLP\Assignment_1\2\checksum2.sha256.txt",
    folder=r"D:\collage\NLP\Assignment_1\2"
)


❌ Checksum mismatch for gu_meta_part_3.jsonl.zst! Skipping.
✔ Checksum OK for gu_meta_part_4.jsonl.zst


OSError: [Errno 28] No space left on device