In [None]:
# Consolidation of all documents
import pickle
import os

def load_in_chunks(file_path, chunk_size=100000):
    """
    Lazily read a file of back-to-back pickled objects, yielding them in batches.

    The file is treated as a stream of consecutive ``pickle.dump`` records, which
    are unpickled one at a time until end-of-file. ``chunk_size`` bounds the
    number of records per batch, not their size in bytes: a file holding a
    single huge pickled object is still loaded in one piece.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the file.
    Only use this on files you produced yourself or otherwise trust.

    :param file_path: path of the pickle file to read
    :param chunk_size: maximum number of records per yielded list; must be >= 1
    :return: generator of non-empty lists; each holds ``chunk_size`` records
             except possibly the last one
    :raises ValueError: if ``chunk_size`` is smaller than 1 (raised when
                        iteration starts, because this is a generator)
    """
    # A non-positive size would make every batch empty and the generator would
    # silently yield nothing, which looks exactly like an empty input file.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    with open(file_path, 'rb') as f:
        reached_eof = False
        while not reached_eof:
            chunk = []
            for _ in range(chunk_size):
                try:
                    chunk.append(pickle.load(f))
                except EOFError:
                    # pickle signals a clean end of stream with EOFError.
                    reached_eof = True
                    break
            # The final batch may be partial; never hand out an empty one.
            if chunk:
                yield chunk

def merge_and_save(file_paths, output_file, chunk_size=100000):
    """
    Concatenate the dict records of several pickle files into one pickle stream.

    Every record read from the inputs that is a ``dict`` is re-pickled, in input
    order, onto the end of ``output_file``. The dicts are NOT combined into a
    single dictionary: the output is a sequence of independent pickle records
    and has to be read back with repeated ``pickle.load`` calls. Records that
    are not dicts are skipped without being written.

    One progress line is printed after each input file has been fully copied.
    Progress is measured in input-file bytes.

    :param file_paths: iterable of paths of the pickle files to concatenate
    :param output_file: path of the output pickle stream (overwritten if present)
    :param chunk_size: number of records to read from an input at a time
    """
    # Materialize once: the paths are walked twice (sizing, then copying), so a
    # one-shot iterator would otherwise be exhausted before the copy loop.
    file_paths = list(file_paths)
    file_sizes = [os.path.getsize(path) for path in file_paths]
    total_size = sum(file_sizes)
    processed_size = 0

    with open(output_file, 'wb') as out_f:
        for file_path, file_size in zip(file_paths, file_sizes):
            for chunk in load_in_chunks(file_path, chunk_size):
                for data in chunk:
                    if isinstance(data, dict):
                        pickle.dump(data, out_f)

            # Count each file exactly once. Adding its full size per chunk would
            # overshoot total_size whenever a file yields more than one chunk.
            processed_size += file_size
            # All-empty inputs give total_size == 0; report completion rather
            # than dividing by zero.
            progress = processed_size / total_size * 100 if total_size else 100.0
            print(f"Processed {processed_size} bytes of {total_size} bytes ({progress:.2f}%)")

In [3]:
# Inputs: the five per-shard embedding pickles, numbered 1 through 5.
file_paths = list(
    f'../pkl/m3_chunk_512/m3_chunk_512_embedding_{i}.pkl'
    for i in range(1, 6)
)
# Destination of the concatenated pickle stream.
output_file = './512_merged_chunk_embedding.pkl'

# Read up to 1024 * 1024 records from an input at a time.
merge_and_save(
    file_paths,
    output_file,
    chunk_size=1024 * 1024,
)

Processed 5756980926 bytes of 28784337326 bytes (20.00%)
Processed 11514972412 bytes of 28784337326 bytes (40.00%)
Processed 17272440874 bytes of 28784337326 bytes (60.01%)
Processed 23028652099 bytes of 28784337326 bytes (80.00%)
Processed 28784337326 bytes of 28784337326 bytes (100.00%)


In [1]:
import pickle
import numpy as np
# Read the first shard with a single pickle.load; the object is used below as a
# mapping from document-chunk id to its embedding.
# NOTE(review): pickle.load can run arbitrary code -- only open trusted files.
with open('../pkl/m3_chunk_512/m3_chunk_512_embedding_1.pkl', 'rb') as file:
    doc_embeddings_dict = pickle.load(file)

# Derive the parallel id list and embedding array once the file is closed;
# row i of doc_embeddings belongs to doc_ids[i].
doc_ids = [*doc_embeddings_dict.keys()]
doc_embeddings = np.array([doc_embeddings_dict[key] for key in doc_ids])

In [2]:
# Sanity-check the loaded shard: number of entries first.
print(len(doc_embeddings_dict))

# NOTE: despite its name, `first_value` holds the first *key* of the mapping;
# the second print shows the length of the entry stored under that key.
first_value = next(iter(doc_embeddings_dict))
print(f"First value: {first_value}")
print(len(doc_embeddings_dict[first_value]))

# Peek at the first five keys, in the mapping's iteration order.
keys_array = [*doc_embeddings_dict]
print(keys_array[:5])

2732821
First value: doc-en-9633_chunk_0
1024
['doc-en-9633_chunk_0', 'doc-en-9633_chunk_1', 'doc-en-9633_chunk_2', 'doc-en-9633_chunk_3', 'doc-en-9633_chunk_4']
