In [23]:
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import StructField, FloatType, StructType
import pandas as pd
import numpy as np
from numpy import random
import time

StatementMeta(large, 11, 23, Finished, Available)

In [24]:
one_m = 1000000
num_records_m = 10
num_records = num_records_m * one_m

StatementMeta(large, 11, 24, Finished, Available)

In [25]:
def simple_numpy_calc(narr):
    return np.array([np.mean(x) for x in narr])

StatementMeta(large, 11, 25, Finished, Available)

## Show the logic over 5 records

In [26]:
pdf=pd.DataFrame([tuple(random.uniform(1,10,3)) for i in range(5)], columns=["x", "y","z"])
pdf


StatementMeta(large, 11, 26, Finished, Available)

Unnamed: 0,x,y,z
0,6.408064,7.358587,4.508358
1,4.405009,7.316064,7.3927
2,3.470467,5.045778,6.76963
3,2.108697,6.53613,5.291473
4,1.588373,4.657182,3.064074


In [27]:
arr = pdf[:100].to_numpy()
arr

StatementMeta(large, 11, 27, Finished, Available)

array([[6.4080642 , 7.35858678, 4.50835829],
       [4.40500907, 7.31606429, 7.39270004],
       [3.47046724, 5.04577795, 6.76962958],
       [2.1086974 , 6.53612977, 5.29147271],
       [1.58837302, 4.65718193, 3.06407448]])

In [28]:
mean_arr = simple_numpy_calc(arr)
mean_arr

StatementMeta(large, 11, 28, Finished, Available)

array([6.09166976, 6.3712578 , 5.09529159, 4.64543329, 3.10320981])

In [29]:
pdf["mean"]=mean_arr
pdf

StatementMeta(large, 11, 29, Finished, Available)

Unnamed: 0,x,y,z,mean
0,6.408064,7.358587,4.508358,6.09167
1,4.405009,7.316064,7.3927,6.371258
2,3.470467,5.045778,6.76963,5.095292
3,2.108697,6.53613,5.291473,4.645433
4,1.588373,4.657182,3.064074,3.10321


## Doing it in Python (Pandas/Numpy)

In [30]:
pdf=pd.DataFrame([tuple(random.uniform(1,10,3)) for i in range(num_records)], columns=["x", "y","z"])
print(f"{len(pdf)/one_m} M")
pdf[:3]


StatementMeta(large, 11, 30, Finished, Available)

10.0 M


Unnamed: 0,x,y,z
0,4.793687,3.704907,5.944925
1,9.207575,9.932247,8.183925
2,9.628982,1.771443,6.662008


In [31]:
start = time.time()
arr = pdf.to_numpy()
mean_arr = simple_numpy_calc(arr)
pdf["mean"]=mean_arr
print(f"Run in {time.time()-start} seconds")
pdf[:3]

StatementMeta(large, 11, 31, Finished, Available)

Run in 55.66557192802429 seconds


Unnamed: 0,x,y,z,mean
0,4.793687,3.704907,5.944925,4.814506
1,9.207575,9.932247,8.183925,9.107916
2,9.628982,1.771443,6.662008,6.020811


## Doing it at scale in Spark

Creating a Spark dataframe and repartition over 100 partitions

In [32]:
df=spark.createDataFrame(pdf)
assert df.count()==num_records
display(df.limit(3))


StatementMeta(large, 11, 32, Finished, Available)

SynapseWidget(Synapse.DataFrame, 3e0fedad-4dc8-4572-aeda-837641171df0)

In [33]:
df.rdd.getNumPartitions()

StatementMeta(large, 11, 33, Finished, Available)

8

Creating a function to apply with **mapInPandas**

https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.mapInPandas.html#pyspark-sql-dataframe-mapinpandas

In [34]:
def some_func(iterator):
    for p_df in iterator:
        arr = p_df.to_numpy()
        mean_arr = simple_numpy_calc(arr)
        p_df["mean"]=mean_arr
        yield p_df


StatementMeta(large, 11, 34, Finished, Available)

In [35]:
schema = StructType([StructField("mean", FloatType())])

StatementMeta(large, 11, 35, Finished, Available)

In [36]:
df2=df.mapInPandas(some_func, schema)
start = time.time()
display(df2)
print(f"Run in {time.time()-start} seconds")

StatementMeta(large, 11, 36, Finished, Available)

SynapseWidget(Synapse.DataFrame, f6a16ab3-02d1-4902-ab9d-85612730bdf5)

Run in 0.6651790142059326 seconds
