In [59]:
import mmap
from random import randint
from sys import getsizeof
import os
from datetime import datetime
import math
from tqdm import tqdm
import numpy as np
import time

In [36]:
getsizeof(2 ** 30), getsizeof(2 ** 40)

(32, 32)

In [49]:
def create_data(filename='numbers.txt', size=512*1024*1024):
    low = 0
    high = 2 ** 32 - 1
        
    with open(filename, 'wb') as f:
        for _ in tqdm(range(size)):
            f.write(np.random.randint(low, high, dtype= np.uint32).tobytes())
    print("Total count of numbers: ", size)

In [50]:
create_data()

100%|██████████| 536870912/536870912 [48:39<00:00, 183879.60it/s]  

Total count of numbers:  536870912





### Последовательное чтение

In [57]:
%%time
with open("numbers.txt", "rb") as f:
    buf = f.read()
    numbers = np.frombuffer(buf, dtype=np.dtype('uint32').newbyteorder('B'))
    s = 0
    mxN = float("-inf")
    mnN = float("inf")
    for n in tqdm(numbers):
        s += n
        mxN = max(n, mxN)
        mnN = min(n, mnN)
    print(s, mxN, mnN)

100%|██████████| 536870912/536870912 [08:59<00:00, 994879.85it/s] 

1152935057642425099 4294967292 35





### MMap и последовательное чтение

In [58]:
%%time
f_name = "numbers.txt"
with open(f_name, 'r+b') as f:
    with mmap.mmap(f.fileno(), length=0, offset=0, access=mmap.ACCESS_READ) as mm:
        buf = mm.read()
        numbers = np.frombuffer(buf, dtype=np.dtype('uint32').newbyteorder('B'))
        s = 0
        mxN = float("-inf")
        mnN = float("inf")
        for n in tqdm(numbers):
            s += n
            mxN = max(n, mxN)
            mnN = min(n, mnN)
        print(s, mxN, mnN)

100%|██████████| 536870912/536870912 [08:59<00:00, 995437.87it/s] 

1152935057642425099 4294967292 35
CPU times: user 8min 53s, sys: 4.69 s, total: 8min 57s
Wall time: 9min 9s





### Многопоточное

In [60]:
%%time
import concurrent.futures


n_workers = 8

    
def get_statistics(ls):
    ls = list(ls)
    return sum(ls), min(ls) if len(ls) else float("inf"), max(ls) if len(ls) else float("-inf")


with open('numbers.txt', 'r+b') as f:
    with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mm:
        delt = int(count / n_workers)
        buf = mm.read()
        numbers = np.frombuffer(buf, dtype=np.dtype('uint32').newbyteorder('B'))
        ls = (numbers[i:i + delt] for _ in range(0, count, delt))
        
        with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor:
            arr = executor.map(get_statistics, ls)
        
        ans = 0
        mxN = float("-inf")
        mnN = float("inf")
        for s, mn, mx in arr:
            ans += s
            mxN = max(mx, mxN)
            mnN = min(mn, mnN)
        print(ans, mxN, mnN)

NameError: name 'count' is not defined