In [1]:
import mmap
from random import randint
from sys import getsizeof
import os
from datetime import datetime
import math
from tqdm import tqdm
import numpy as np
import time
import threading
import concurrent.futures as ft
import multiprocessing

In [2]:
getsizeof(2 ** 32), getsizeof(2 ** 50)

(32, 32)

In [3]:
def create_data(filename='numbers.txt', size=17179869184):
    """
    Генерация чисел и запись в файл.
    filename - имя файла;
    size - минимальный размер файла.
    """
    length = 0
    count = 0
    with open(filename, 'w') as f:
        while length < size:
            st = f"{randint(2 ** 32, 2 ** 50)}\n"
            f.write(st)
            length += getsizeof(st)
            count += 1
            if count % 100000000 == 0:
                print(length / (8 * 1048576), count)
    print("Total count of numbers: ", count)

In [4]:
%%time
create_data()


775.0170210599899 100000000
1550.0345447063446 200000000
Total count of numbers:  264252199
CPU times: user 8min 57s, sys: 20.5 s, total: 9min 17s
Wall time: 9min 52s


### Последовательное чтение

In [5]:
%%time
with open("numbers.txt", "r") as f:
    line = " "
    s = 0
    mxN = float("-inf")
    mnN = float("inf")
    count = 0
    while line:
        count += 1
        line = f.readline()
        if not line.strip().isnumeric():
            break
        n = int(line.strip())
        s += n
        mxN = max(n, mxN)
        mnN = min(n, mnN)
        
        if count % 30_000_000 == 0:
            print(count)

print(s, mxN, mnN)

30000000
60000000
90000000
120000000
150000000
180000000
210000000
240000000
148754336297275047145047 1125899906312296 4296273272
CPU times: user 6min, sys: 6 s, total: 6min 6s
Wall time: 6min 10s


### MMap и последовательное чтение

In [6]:
%%time
f_name = "numbers.txt"
with open(f_name, 'r+b') as f:
    with mmap.mmap(f.fileno(), length=0, offset=0, access=mmap.ACCESS_READ) as mm:
        line = " "
        s = 0
        mxN = float("-inf")
        mnN = float("inf")
        count = 0
        while line:
            count += 1
            line = mm.readline()
            if not line.strip().decode().isnumeric():
                break
            n = int(line.strip().decode())
            s += n
            mxN = max(n, mxN)
            mnN = min(n, mnN)

            if count % 30_000_000 == 0:
                print(count)

print(s, mxN, mnN)             

30000000
60000000
90000000
120000000
150000000
180000000
210000000
240000000
148754336297275047145047 1125899906312296 4296273272
CPU times: user 6min 9s, sys: 3.42 s, total: 6min 12s
Wall time: 6min 15s


### Многопоточное

In [None]:
%%time
import concurrent.futures


n_workers = 8

def get_numbers(f, length):
    ls = []
    for i in range(length):
        line = f.readline()
        if line.strip():
            ls.append(int(line.strip()))
    return iter(ls)
    
def get_statistics(ls):
    ls = list(ls)
    return sum(ls), min(ls) if len(ls) else float("inf"), max(ls) if len(ls) else float("-inf")

with open('numbers.txt', 'r+b') as f:
    with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mm:
        line = " "
        delt = int(count / n_workers)
        ls = (get_numbers(mm, delt) for _ in range(0, count, delt))
        
        with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
            arr = executor.map(get_statistics, ls)
        
        ans = 0
        mxN = float("-inf")
        mnN = float("inf")
        for s, mn, mx in arr:
            ans += s
            mxN = max(mx, mxN)
            mnN = min(mn, mnN)
        print(ans, mxN, mnN)