In [14]:
# Helpers from the project-local `func` module; `timer` is used below as a
# context manager wrapped around each benchmark.
from func import timer, get_factors, get_sum_of_factors, create_file, get_data_from_file
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used
# below - consider narrowing the filter.
warnings.filterwarnings("ignore")

##### Генерируем файл из 5000 чисел типа int32

In [2]:
# Generate the benchmark input. NOTE(review): 5000 is presumably the number of
# values to write, and no path is passed - presumably the helper writes
# 'file.txt', which the later cells read; confirm against `func.create_file`.
create_file(5000)

##### Извлекаем и обрабатываем данные из файла

In [3]:
# Load the benchmark input once; `numbers` is reused by the sequential and
# multiprocessing runs below, where it is sliced and passed to len().
numbers = get_data_from_file('file.txt')

#### Подсчет последовательным алгоритмом

In [5]:
# Baseline run: a single call over the whole input. Printing the result is
# part of the timed region.
with timer():
    sequential_total = get_sum_of_factors(numbers)
    print(sequential_total)


20333
Time: 3.1591727000195533


#### С помощью multiprocessing

In [7]:
from multiprocessing import Pool

with timer():
    # Number of worker processes, and the maximum number of chunks the input
    # is split into. Defined once so the chunking and the pool size agree.
    n_proc = 4
    # Ceiling division, so no trailing elements are dropped when len(numbers)
    # is not a multiple of n_proc. max(1, ...) keeps the range() step valid
    # when the input is empty.
    part_size = max(1, -(-len(numbers) // n_proc))
    # Contiguous slices that together cover every element exactly once.
    chunks = [
        numbers[start:start + part_size]
        for start in range(0, len(numbers), part_size)
    ]
    # Size the pool from n_proc instead of defaulting to the CPU count.
    with Pool(processes=n_proc) as pool:
        # NOTE(review): adding up per-chunk results assumes that
        # get_sum_of_factors is additive over concatenated inputs - confirm
        # against its definition.
        results = pool.map(get_sum_of_factors, chunks)
    print(sum(results))

20333
Time: 1.7054818999022245


#### С помощью PySpark

In [10]:
import os
import sys

from pyspark import SparkContext
from pyspark.sql import SparkSession

# Point both PySpark interpreter variables (workers and driver) at the Python
# executable that is running this notebook.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

##### RDD

In [11]:
# NOTE: despite its name, `spark` is a SparkContext here, not a SparkSession.
spark = SparkContext(master='local[8]', appName='demo1')
with timer():
    # The timed region covers reading the file, parsing each line as an
    # integer, applying get_factors to each value and summing the results.
    lines = spark.textFile('file.txt')
    parsed = lines.map(int)
    # `rdd` ends up holding the summed value, not an RDD.
    rdd = parsed.map(get_factors).sum()

Time: 3.9471883999649435


##### PySpark pandas

In [15]:
import pyspark.pandas as ps

# The read_csv call is issued outside the timed block.
df = ps.read_csv('file.txt', names=['numbers'])
with timer():
    numbers_column = df['numbers']
    # The total is computed for timing only; it is neither printed nor stored.
    numbers_column.apply(get_factors).sum()
# Shut down the `spark` object created in an earlier cell.
spark.stop()

Time: 5.94504990009591


##### Spark SQL

In [34]:
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

# Rebind `spark` to a SparkSession using all local cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Pys')
    .getOrCreate()
)
df = spark.read.csv('file.txt', schema="numbers INT")
# Wrap get_factors as a Spark UDF that returns an integer column.
get_factorsUDF = udf(get_factors, IntegerType())
with timer():
    # Add the per-row UDF result as a column, aggregate it, and display the
    # single-row result; show() is what triggers the output table.
    with_factors = df.withColumn('fractors', get_factorsUDF(col('numbers')))
    factors_total = with_factors.select(F.sum(col('fractors')))
    factors_total.show()


+-------------+
|sum(fractors)|
+-------------+
|        20333|
+-------------+

Time: 3.9229267998598516


##### Обычный pandas (без Spark)

In [37]:
import pandas as pd

# Read the file into a single-column frame before the timed block.
df = pd.read_csv('file.txt', names=['numbers'])
with timer():
    # Apply get_factors to each value and sum; the total is computed for
    # timing only and is not printed.
    factors_per_number = df['numbers'].apply(get_factors)
    factors_per_number.sum()

Time: 3.3960144999437034
