In [47]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
columns = ['seqno', 'Name']
data = [(1, "VJ"), (2, "bnm"), (3, "mnb")]
df = spark.createDataFrame(data, columns)
df.show(truncate=True)

+-----+----+
|seqno|Name|
+-----+----+
|    1|  VJ|
|    2| bnm|
|    3| mnb|
+-----+----+



In [48]:
def to_upper_case(str):
    rel=""
    arr = str.split(" ")
    for x in arr:
        rel += x[0:1].upper()+x[1:]+" "
    return rel

from pyspark.sql.functions import col, udf
convert = udf(lambda z:to_upper_case(z))

df.select(col('seqno'), convert(col('Name')).alias('Changed Name')).show()

+-----+------------+
|seqno|Changed Name|
+-----+------------+
|    1|         VJ |
|    2|        Bnm |
|    3|        Mnb |
+-----+------------+



In [49]:
df.withColumn("changed_name", convert(col('Name'))).show()

+-----+----+------------+
|seqno|Name|changed_name|
+-----+----+------------+
|    1|  VJ|         VJ |
|    2| bnm|        Bnm |
|    3| mnb|        Mnb |
+-----+----+------------+



In [50]:
spark.udf.register("changed_to_upper", to_upper_case)
df.createOrReplaceTempView("new_name")
spark.sql('select seqno, changed_to_upper(Name) as Names from new_name').show()

+-----+-----+
|seqno|Names|
+-----+-----+
|    1|  VJ |
|    2| Bnm |
|    3| Mnb |
+-----+-----+



In [51]:
import sys
import os
import pyarrow
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType


# Set environment variables
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Create SparkSession with explicit configurations
spark = SparkSession.builder \
    .appName("ApplyInPandasExample") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true") \
    .config("spark.python.worker.reuse", "false") \
    .config("spark.pyspark.python", sys.executable) \
    .config("spark.pyspark.driver.python", sys.executable) \
    .getOrCreate()



# Create a simple DataFrame
data = [(1, "red", 10), (2, "blue", 20), (3, "red", 30), (4, "blue", 40)]
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("color", StringType(), True),
    StructField("value", IntegerType(), True)
])
df = spark.createDataFrame(data, schema)

# Define a simple function to apply
def subtract_mean(pdf):
    pdf['value_minus_mean'] = pdf['value'] - pdf['value'].mean()
    return pdf

# Define complete output schema
output_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("color", StringType(), True),
    StructField("value", IntegerType(), True),
    StructField("value_minus_mean", DoubleType(), True)
])

# Apply the function using applyInPandas
result = df.groupBy("color").applyInPandas(subtract_mean, schema=output_schema)

# Show the result
result.show()

# Stop the SparkSession
spark.stop()

+---+-----+-----+----------------+
| id|color|value|value_minus_mean|
+---+-----+-----+----------------+
|  2| blue|   20|           -10.0|
|  4| blue|   40|            10.0|
|  1|  red|   10|           -10.0|
|  3|  red|   30|            10.0|
+---+-----+-----+----------------+



In [52]:
spark = SparkSession.builder \
    .appName("ApplyInPandasExample") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true") \
    .config("spark.python.worker.reuse", "false") \
    .config("spark.pyspark.python", sys.executable) \
    .config("spark.pyspark.driver.python", sys.executable) \
    .getOrCreate()
df = spark.createDataFrame([
    ['red', 'banana', 1, 10], ['blue', 'banana', 2, 20], ['red', 'carrot', 3, 30],
    ['blue', 'grape', 4, 40], ['red', 'carrot', 5, 50], ['black', 'carrot', 6, 60],
    ['red', 'banana', 7, 70], ['red', 'grape', 8, 80]], schema=['color', 'fruit', 'v1', 'v2'])
df.show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



In [53]:
df.groupBy('color').avg().show()

+-----+-------+-------+
|color|avg(v1)|avg(v2)|
+-----+-------+-------+
|  red|    4.8|   48.0|
| blue|    3.0|   30.0|
|black|    6.0|   60.0|
+-----+-------+-------+



In [54]:
def plus_mean(pandas_df):
    return pandas_df.assign(v1 = pandas_df.v1-pandas_df.v1.mean())

df.groupBy('color').applyInPandas(plus_mean, schema=df.schema).show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  0| 60|
| blue|banana| -1| 20|
| blue| grape|  1| 40|
|  red|banana| -3| 10|
|  red|carrot| -1| 30|
|  red|carrot|  0| 50|
|  red|banana|  2| 70|
|  red| grape|  3| 80|
+-----+------+---+---+



In [61]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import pandas_udf

# import sys
# import os
# import pyarrow
# import pandas as pd
# from pyspark.sql import SparkSession
# from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType


# # Set environment variables
# os.environ['PYSPARK_PYTHON'] = sys.executable
# os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# # Create SparkSession with explicit configurations
# spark = SparkSession.builder \
#     .appName("ApplyInPandasExample") \
#     .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
#     .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true") \
#     .config("spark.python.worker.reuse", "false") \
#     .config("spark.pyspark.python", sys.executable) \
#     .config("spark.pyspark.driver.python", sys.executable) \
#     .getOrCreate()



# Create a DataFrame
df1 = spark.createDataFrame([
    Row(a=1, b=2.0),
    Row(a=2, b=3.0),
    Row(a=3, b=5.0),
    Row(a=4, b=8.0),
    Row(a=5, b=13.0)
])

# Define a pandas UDF for row-wise normalization
#@pandas_udf("a long, b double")
def normalize(iterator):
    for pdf in iterator:
        yield pdf.assign(b=(pdf.b - pdf.b.mean()) / pdf.b.std())

# Apply mapInPandas on DataFrame
df1.mapInPandas(normalize, schema=df1.schema).show()



+---+----+
|  a|   b|
+---+----+
|  1|NULL|
|  2|NULL|
|  3|NULL|
|  4|NULL|
|  5|NULL|
+---+----+

