In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, LongType, DoubleType
from pyspark.sql.functions import mean, stddev, col
import pandas as pd

# Create (or reuse) the SparkSession shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("NormalizationExample")
    # Resolve file:// paths with Hadoop's local filesystem implementation.
    .config("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem")
    # Arrow-based Spark <-> pandas conversion. The older key
    # "spark.sql.execution.arrow.enabled" is deprecated since Spark 3.0;
    # this is its replacement.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

# Numeric readings, one row per (time, id). Only column names are supplied.
df1 = spark.createDataFrame(
    data=[
        (202407130921, 1, 1.0),
        (202407130921, 2, 2.0),
        (202407130922, 1, 3.0),
        (202407130922, 2, 4.0),
    ],
    schema=('time', 'id', 'v1'),
)

# String labels for the same (time, id) pairs.
df2 = spark.createDataFrame(
    data=[
        (202407130921, 1, 'x'),
        (202407130921, 2, 'y'),
        (202407130922, 1, 'x'),
        (202407130922, 2, 'y'),
    ],
    schema=('time', 'id', 'v2'),
)
def merge_ordered(l, r):
    """Merge one pair of pandas DataFrames with ``pd.merge_ordered``.

    Parameters
    ----------
    l : pandas.DataFrame
        Left-hand frame.
    r : pandas.DataFrame
        Right-hand frame.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.merge_ordered`` called with its default options
        (no explicit join keys or join type are passed here).
    """
    merged = pd.merge_ordered(left=l, right=r)
    return merged

# Group both frames by id, pair the groups up, and merge each pair in pandas.
left_groups = df1.groupby('id')
right_groups = df2.groupby('id')
new_df = left_groups.cogroup(right_groups).applyInPandas(
    merge_ordered,
    schema='time long, id int, v1 double, v2 string',
)


In [2]:
# Write the merged frame to CSV (replacing any previous output), then read it
# back with the header row to check the round trip.
new_df.write.mode('overwrite').option('header', True).csv('foo.csv')
spark.read.option('header', True).csv('foo.csv').show()

+------------+---+---+---+
|        time| id| v1| v2|
+------------+---+---+---+
|202407130921|  1|1.0|  x|
|202407130922|  1|3.0|  x|
|202407130921|  2|2.0|  y|
|202407130922|  2|4.0|  y|
+------------+---+---+---+



In [3]:
# Expose the merged frame to Spark SQL under the name TableA and count its rows.
new_df.createOrReplaceTempView('TableA')
row_count = spark.sql("select count(*) from TableA")
row_count.show()

+--------+
|count(1)|
+--------+
|       4|
+--------+



In [8]:
from pyspark.sql.functions import pandas_udf
import pandas as pd


# Declared as a Series-to-Series pandas UDF so the type hints below are true
# (a plain Python UDF would be called once per row with a scalar) and so the
# result column is a double rather than the default string type.
@pandas_udf("double")
def add_one(series: pd.Series) -> pd.Series:
    """Return the input values incremented by one.

    Parameters
    ----------
    series : pd.Series
        A batch of numeric column values.

    Returns
    -------
    pd.Series
        ``series + 1``, element-wise.
    """
    return series + 1


# NOTE: the SQL name "add_Two" does not match what the function does (it adds
# one); it is kept unchanged because other cells call it by this name.
spark.udf.register("add_Two", add_one)

<function __main__.add_one(series: pandas.core.series.Series) -> pandas.core.series.Series>

In [9]:
from pyspark.sql.functions import expr

# Boolean column: is the aggregate row count of new_df greater than zero?
has_rows = expr('count(*)') > 0
new_df.select(has_rows).show()

+--------------+
|(count(1) > 0)|
+--------------+
|          true|
+--------------+



In [10]:
# Call the UDF registered under the SQL name add_Two on column v1.
incremented = new_df.selectExpr('add_Two(v1)')
incremented.show()

+-----------+
|add_Two(v1)|
+-----------+
|        2.0|
|        4.0|
|        3.0|
|        5.0|
+-----------+

