In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import numpy as np
import pandas as pd

In [2]:
sc.stop()

NameError: name 'sc' is not defined

In [3]:
conf = SparkConf().setAppName('spark_pd').setMaster('local')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [4]:
sc

# Apache Arrow in PySpark
Apache Arrow是内存中的列式数据格式，在Spark中使用它来在JVM和Python进程之间有效地传输数据。

In [52]:
# Enable Arrow-based columnar data transfers
# spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") 3.0
# 2.4.6 需要 pyarrow 0.14.1  以下 pip install pyarrow==0.14.1

In [5]:
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
pdf = pd.DataFrame(np.random.rand(1000, 3), columns=list('ABC'))
pdf

Unnamed: 0,A,B,C
0,0.315843,0.940104,0.490178
1,0.490224,0.071753,0.165527
2,0.013744,0.160253,0.058007
3,0.271961,0.878121,0.056256
4,0.130155,0.321431,0.060351
...,...,...,...
995,0.313890,0.132734,0.228223
996,0.465946,0.415298,0.762607
997,0.370015,0.895913,0.759118
998,0.830166,0.345172,0.423870


In [6]:
df = spark.createDataFrame(pdf)
df

DataFrame[A: double, B: double, C: double]

In [7]:
df.select('A').filter('B > 0.5').count()

500

In [8]:
# Convert the Spark DataFrame back to a Pandas DataFrame using Arrow
result_pdf = df.select("*").toPandas()

In [9]:
result_pdf

Unnamed: 0,A,B,C
0,0.315843,0.940104,0.490178
1,0.490224,0.071753,0.165527
2,0.013744,0.160253,0.058007
3,0.271961,0.878121,0.056256
4,0.130155,0.321431,0.060351
...,...,...,...
995,0.313890,0.132734,0.228223
996,0.465946,0.415298,0.762607
997,0.370015,0.895913,0.759118
998,0.830166,0.345172,0.423870


# Pandas UDFs

Pandas UDFs are user defined functions that are executed by Spark using Arrow to transfer data and Pandas to work with the data, which allows vectorized operations.

Before Spark 3.0, Pandas UDFs used to be defined with PandasUDFType. From Spark 3.0 with Python 3.6+, you can also use Python type hints. Using Python type hints are preferred and using PandasUDFType will be deprecated in the future release.

In [60]:
from pyspark.sql.functions import pandas_udf
# spark 3.0 版本
@pandas_udf("col1 string, col2 long")
def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame:
    s3['col2'] = s1 + s2.str.len()
    return s3

NotImplementedError: Invalid returnType with scalar Pandas UDFs: StructType(List(StructField(col1,StringType,true),StructField(col2,LongType,true))) is not supported

## Scalar方式


In [12]:
import pandas as pd

from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import LongType

# Declare the function and create the UDF
def multiply_func(a, b):
    return a * b

multiply = pandas_udf(multiply_func, returnType=LongType())

# The function for a pandas_udf should be able to execute with local Pandas data
x = pd.Series([1, 2, 3])
print(multiply_func(x, x))
# 0    1
# 1    4
# 2    9
# dtype: int64

# Create a Spark DataFrame, 'spark' is an existing SparkSession
df = spark.createDataFrame(pd.DataFrame(x, columns=["x"]))

# Execute function as a Spark vectorized UDF
df.select(multiply(col("x"), col("x"))).show()

0    1
1    4
2    9
dtype: int64
+-------------------+
|multiply_func(x, x)|
+-------------------+
|                  1|
|                  4|
|                  9|
+-------------------+



## Grouped Map

In [16]:
from pyspark.sql.functions import pandas_udf, PandasUDFType

df = spark.createDataFrame(
    [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
    ("id", "v"))

@pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP)
def subtract_mean(pdf):
    # pdf is a pandas.DataFrame
    v = pdf.v
    return pdf.assign(v=v - v.mean())

df.groupby("id").apply(subtract_mean).show()

+---+----+
| id|   v|
+---+----+
|  1|-0.5|
|  1| 0.5|
|  2|-3.0|
|  2|-1.0|
|  2| 4.0|
+---+----+



In [18]:
df.groupBy?

[0;31mSignature:[0m [0mdf[0m[0;34m.[0m[0mgroupBy[0m[0;34m([0m[0;34m*[0m[0mcols[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Groups the :class:`DataFrame` using the specified columns,
so we can run aggregation on them. See :class:`GroupedData`
for all the available aggregate functions.

:func:`groupby` is an alias for :func:`groupBy`.

:param cols: list of columns to group by.
    Each element should be a column name (string) or an expression (:class:`Column`).

>>> df.groupBy().avg().collect()
[Row(avg(age)=3.5)]
>>> sorted(df.groupBy('name').agg({'age': 'mean'}).collect())
[Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
>>> sorted(df.groupBy(df.name).avg().collect())
[Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
>>> sorted(df.groupBy(['name', df.age]).count().collect())
[Row(name='Alice', age=2, count=1), Row(name='Bob', age=5, count=1)]

.. versionadded:: 1.3
[0;31mFile:[0m      /usr/local/spark/python/pyspark/sql/