创建二进制文件
Первая создает бинарный файл (min 2Гб), состоящий из случайных 32-разрядных беззнаковых целых чисел (big endian). 

In [1]:
import os
import struct
import random

def create_binary_file(filename, size_in_gb=2):
    num_bytes = size_in_gb * (1024**3)  
    # 将GB转换为字节
    num_integers = num_bytes // 4  
    # 计算32位整数的数量

    with open(filename, 'wb') as f:
        for _ in range(num_integers):
            integer = random.randint(0, 0xFFFFFFFF)
            f.write(struct.pack('>I', integer))  
            # 以大端模式写入Big-Endian

create_binary_file('mydata')


读取文件并进行计算,
Вторая считает сумму этих чисел (с применением длинной арифметики), находит минимальное и максимальное число.

In [2]:
import os
import struct
import random

def read_and_calculate(filename):
    sum_numbers = 0
    min_number = float('inf')
    max_number = float('-inf')

    with open(filename, 'rb') as f:
        while True:
            bytes_read = f.read(4)
            if not bytes_read:
                break
            number = struct.unpack('>I', bytes_read)[0]
            sum_numbers += number
            if number < min_number:
                min_number = number
            if number > max_number:
                max_number = number
    return sum_numbers, min_number, max_number

result = read_and_calculate('mydata')
print("Sum:", result[0], "Min:", result[1], "Max:", result[2])


Sum: 1152966617501680374 Min: 32 Max: 4294967293


多线程和内存映射文件.
Многопоточная + memory-mapped files

In [3]:
import mmap
import threading

def process_chunk(mm, offset, size):
    local_sum = 0
    local_min = float('inf')
    local_max = float('-inf')
    for i in range(offset, offset + size, 4):
        number = struct.unpack('>I', mm[i:i+4])[0]
        local_sum += number
        if number < local_min:
            local_min = number
        if number > local_max:
            local_max = number
    return local_sum, local_min, local_max

def threaded_read_and_calculate(filename, num_threads=4):
    file_size = os.path.getsize(filename)
    chunk_size = file_size // num_threads

    with open(filename, 'r+b') as f:
        mm = mmap.mmap(f.fileno(), 0)
        threads = []
        results = []

        for i in range(num_threads):
            offset = i * chunk_size
            size = chunk_size if i < num_threads - 1 else file_size - offset
            thread = threading.Thread(target=lambda q, arg1, arg2, arg3: q.append(process_chunk(arg1, arg2, arg3)),
                                      args=(results, mm, offset, size))
            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()

        total_sum = sum(x[0] for x in results)
        total_min = min(x[1] for x in results)
        total_max = max(x[2] for x in results)
    return total_sum, total_min, total_max

result = threaded_read_and_calculate('mydata')
print("Sum:", result[0], "Min:", result[1], "Max:", result[2])


Sum: 1152966617501680374 Min: 32 Max: 4294967293


比较两者时间.
 Сравните время работы.

In [4]:
import time

start_time = time.time()
read_and_calculate('random_integers.bin')
print("顺序读取时间 Простое последовательное чтение:{:.2f} s".format(time.time() - start_time))

start_time = time.time()
threaded_read_and_calculate('random_integers.bin')
print("多线程读取时间 Многопоточная + memory-mapped files:{:.2f} s".format(time.time() - start_time))


顺序读取时间 Простое последовательное чтение:73.46 s
多线程读取时间 Многопоточная + memory-mapped files:95.09 s
