In [83]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
columns = ['seqno', 'Name']
data = [(1, "VJ"), (2, "bnm"), (3, "mnb")]
df = spark.createDataFrame(data, columns)
df.show(truncate=True)

+-----+----+
|seqno|Name|
+-----+----+
|    1|  VJ|
|    2| bnm|
|    3| mnb|
+-----+----+



In [84]:
def to_upper_case(str):
    rel=""
    arr = str.split(" ")
    for x in arr:
        rel += x[0:1].upper()+x[1:]+" "
    return rel

from pyspark.sql.functions import col, udf
convert = udf(lambda z:to_upper_case(z))

df.select(col('seqno'), convert(col('Name')).alias('Changed Name')).show()

+-----+------------+
|seqno|Changed Name|
+-----+------------+
|    1|         VJ |
|    2|        Bnm |
|    3|        Mnb |
+-----+------------+



In [85]:
df.withColumn("changed_name", convert(col('Name'))).show()

+-----+----+------------+
|seqno|Name|changed_name|
+-----+----+------------+
|    1|  VJ|         VJ |
|    2| bnm|        Bnm |
|    3| mnb|        Mnb |
+-----+----+------------+



In [86]:
spark.udf.register("changed_to_upper", to_upper_case)
df.createOrReplaceTempView("new_name")
spark.sql('select seqno, changed_to_upper(Name) as Names from new_name').show()

+-----+-----+
|seqno|Names|
+-----+-----+
|    1|  VJ |
|    2| Bnm |
|    3| Mnb |
+-----+-----+



In [87]:
import sys
import os
import pyarrow
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType


# Set environment variables
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Create SparkSession with explicit configurations
spark = SparkSession.builder \
    .appName("ApplyInPandasExample") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true") \
    .config("spark.python.worker.reuse", "false") \
    .config("spark.pyspark.python", sys.executable) \
    .config("spark.pyspark.driver.python", sys.executable) \
    .getOrCreate()



# Create a simple DataFrame
data = [(1, "red", 10), (2, "blue", 20), (3, "red", 30), (4, "blue", 40)]
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("color", StringType(), True),
    StructField("value", IntegerType(), True)
])
df = spark.createDataFrame(data, schema)

# Define a simple function to apply
def subtract_mean(pdf):
    pdf['value_minus_mean'] = pdf['value'] - pdf['value'].mean()
    return pdf

# Define complete output schema
output_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("color", StringType(), True),
    StructField("value", IntegerType(), True),
    StructField("value_minus_mean", DoubleType(), True)
])

# Apply the function using applyInPandas
result = df.groupBy("color").applyInPandas(subtract_mean, schema=output_schema)

# Show the result
result.show()

# Stop the SparkSession
spark.stop()

+---+-----+-----+----------------+
| id|color|value|value_minus_mean|
+---+-----+-----+----------------+
|  2| blue|   20|           -10.0|
|  4| blue|   40|            10.0|
|  1|  red|   10|           -10.0|
|  3|  red|   30|            10.0|
+---+-----+-----+----------------+



In [88]:
spark = SparkSession.builder \
    .appName("ApplyInPandasExample") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true") \
    .config("spark.python.worker.reuse", "false") \
    .config("spark.pyspark.python", sys.executable) \
    .config("spark.pyspark.driver.python", sys.executable) \
    .getOrCreate()
df = spark.createDataFrame([
    ['red', 'banana', 1, 10], ['blue', 'banana', 2, 20], ['red', 'carrot', 3, 30],
    ['blue', 'grape', 4, 40], ['red', 'carrot', 5, 50], ['black', 'carrot', 6, 60],
    ['red', 'banana', 7, 70], ['red', 'grape', 8, 80]], schema=['color', 'fruit', 'v1', 'v2'])
df.show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



In [89]:
df.groupBy('color').avg().show()

+-----+-------+-------+
|color|avg(v1)|avg(v2)|
+-----+-------+-------+
|  red|    4.8|   48.0|
| blue|    3.0|   30.0|
|black|    6.0|   60.0|
+-----+-------+-------+



In [90]:
def plus_mean(pandas_df):
    return pandas_df.assign(v1 = pandas_df.v1-pandas_df.v1.mean())

df.groupBy('color').applyInPandas(plus_mean, schema=df.schema).show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  0| 60|
| blue|banana| -1| 20|
| blue| grape|  1| 40|
|  red|banana| -3| 10|
|  red|carrot| -1| 30|
|  red|carrot|  0| 50|
|  red|banana|  2| 70|
|  red| grape|  3| 80|
+-----+------+---+---+



In [92]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, LongType, DoubleType
import pandas as pd

# Create a SparkSession
spark = SparkSession.builder \
    .appName("NormalizationExample") \
    .getOrCreate()

# Create a DataFrame
df1 = spark.createDataFrame([
    Row(a=1, b=2.0),
    Row(a=2, b=3.0),
    Row(a=3, b=5.0),
    Row(a=4, b=8.0),
    Row(a=5, b=13.0)
])

# Define a schema for the output DataFrame
schema = StructType([
    StructField("a", LongType(), True),
    StructField("b", DoubleType(), True)
])

# Define a pandas UDF for row-wise normalization
def normalize(pdf):
    mean = pdf['b'].mean()
    std = pdf['b'].std()
    if std != 0:
        pdf['b'] = (pdf['b'] - mean) / std
    else:
        pdf['b'] = 0  # Handle the case where std is zero
    return pdf

# Apply applyInPandas on DataFrame
df_normalized = df1.groupby().applyInPandas(normalize, schema=schema)

# Show the DataFrame
print("Normalized DataFrame:")
df_normalized.show()

# Stop the SparkSession
spark.stop()


Normalized DataFrame:
+---+--------------------+
|  a|                   b|
+---+--------------------+
|  1| -0.9462724090245991|
|  2| -0.7209694544949327|
|  3|-0.27036354543559976|
|  4| 0.40554531815339956|
|  5|  1.5320600908017319|
+---+--------------------+



In [93]:
from pyspark.sql import SparkSession, Row

spark = SparkSession.builder.appName('celcius to farhenhite').getOrCreate()
c_f_df = spark.createDataFrame([(1,25.0),(2,30.0),(3,35.0)],schema='id bigint, celcius double')
c_f_df.show()


+---+-------+
| id|celcius|
+---+-------+
|  1|   25.0|
|  2|   30.0|
|  3|   35.0|
+---+-------+



In [94]:
from pyspark.sql.types import StructType, StructField, LongType
schema = StructType([
                     StructField('id', LongType(), True), 
                     StructField('celcius', LongType(), True), 
                     StructField('Farhenite', LongType(), True)
                    ])

def cel_far(series):
    # for val in series:
    #     series[F]
    series['Farhenite'] = series['celcius'] * (9/5) + 32
    return series

c_f_df.groupBy().applyInPandas(cel_far, schema=schema).show()

+---+-------+---------+
| id|celcius|Farhenite|
+---+-------+---------+
|  1|     25|       77|
|  2|     30|       86|
|  3|     35|       95|
+---+-------+---------+



In [95]:
df1 = spark.createDataFrame([(202407130921, 1, 1.0),
                             (202407130921, 2, 2.0),
                             (202407130922, 1, 3.0),
                             (202407130922, 2, 4.0)],('time', 'id', 'v1'))
df2 = spark.createDataFrame([(202407130921, 1, 'x'),
                             (202407130921, 2, 'y'),(202407130922, 1, 'x'),(202407130922, 2, 'y')],('time', 'id', 'v2'))
def merge_ordered(l, r):
    return pd.merge_ordered(l, r)

df1.groupby('id').cogroup(df2.groupby('id')).applyInPandas(merge_ordered, schema='time long, id int, v1 double, v2 string').show()

+------------+---+---+---+
|        time| id| v1| v2|
+------------+---+---+---+
|202407130921|  1|1.0|  x|
|202407130922|  1|3.0|  x|
|202407130921|  2|2.0|  y|
|202407130922|  2|4.0|  y|
+------------+---+---+---+



In [96]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
columns = ['seqno', 'Name']
data = [(1, "VJ"), (2, "bnm"), (3, "mnb")]
df = spark.createDataFrame(data, columns)
df.show(truncate=True)

+-----+----+
|seqno|Name|
+-----+----+
|    1|  VJ|
|    2| bnm|
|    3| mnb|
+-----+----+



In [97]:
def to_upper_case(str):
    rel=""
    arr = str.split(" ")
    for x in arr:
        rel += x[0:1].upper()+x[1:]+" "
    return rel

from pyspark.sql.functions import col, udf
convert = udf(lambda z:to_upper_case(z))

df.select(col('seqno'), convert(col('Name')).alias('Changed Name')).show()

+-----+------------+
|seqno|Changed Name|
+-----+------------+
|    1|         VJ |
|    2|        Bnm |
|    3|        Mnb |
+-----+------------+



In [98]:
df.withColumn("changed_name", convert(col('Name'))).show()

+-----+----+------------+
|seqno|Name|changed_name|
+-----+----+------------+
|    1|  VJ|         VJ |
|    2| bnm|        Bnm |
|    3| mnb|        Mnb |
+-----+----+------------+



In [99]:
spark.udf.register("changed_to_upper", to_upper_case)
df.createOrReplaceTempView("new_name")
spark.sql('select seqno, changed_to_upper(Name) as Names from new_name').show()

+-----+-----+
|seqno|Names|
+-----+-----+
|    1|  VJ |
|    2| Bnm |
|    3| Mnb |
+-----+-----+



In [100]:
import sys
import os
import pyarrow
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType


# Set environment variables
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Create SparkSession with explicit configurations
spark = SparkSession.builder \
    .appName("ApplyInPandasExample") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true") \
    .config("spark.python.worker.reuse", "false") \
    .config("spark.pyspark.python", sys.executable) \
    .config("spark.pyspark.driver.python", sys.executable) \
    .getOrCreate()



# Create a simple DataFrame
data = [(1, "red", 10), (2, "blue", 20), (3, "red", 30), (4, "blue", 40)]
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("color", StringType(), True),
    StructField("value", IntegerType(), True)
])
df = spark.createDataFrame(data, schema)

# Define a simple function to apply
def subtract_mean(pdf):
    pdf['value_minus_mean'] = pdf['value'] - pdf['value'].mean()
    return pdf

# Define complete output schema
output_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("color", StringType(), True),
    StructField("value", IntegerType(), True),
    StructField("value_minus_mean", DoubleType(), True)
])

# Apply the function using applyInPandas
result = df.groupBy("color").applyInPandas(subtract_mean, schema=output_schema)

# Show the result
result.show()

# Stop the SparkSession
spark.stop()

+---+-----+-----+----------------+
| id|color|value|value_minus_mean|
+---+-----+-----+----------------+
|  2| blue|   20|           -10.0|
|  4| blue|   40|            10.0|
|  1|  red|   10|           -10.0|
|  3|  red|   30|            10.0|
+---+-----+-----+----------------+



In [101]:
spark = SparkSession.builder \
    .appName("ApplyInPandasExample") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true") \
    .config("spark.python.worker.reuse", "false") \
    .config("spark.pyspark.python", sys.executable) \
    .config("spark.pyspark.driver.python", sys.executable) \
    .getOrCreate()
df = spark.createDataFrame([
    ['red', 'banana', 1, 10], ['blue', 'banana', 2, 20], ['red', 'carrot', 3, 30],
    ['blue', 'grape', 4, 40], ['red', 'carrot', 5, 50], ['black', 'carrot', 6, 60],
    ['red', 'banana', 7, 70], ['red', 'grape', 8, 80]], schema=['color', 'fruit', 'v1', 'v2'])
df.show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



In [102]:
df.groupBy('color').avg().show()

+-----+-------+-------+
|color|avg(v1)|avg(v2)|
+-----+-------+-------+
|  red|    4.8|   48.0|
| blue|    3.0|   30.0|
|black|    6.0|   60.0|
+-----+-------+-------+



In [103]:
def plus_mean(pandas_df):
    return pandas_df.assign(v1 = pandas_df.v1-pandas_df.v1.mean())

df.groupBy('color').applyInPandas(plus_mean, schema=df.schema).show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  0| 60|
| blue|banana| -1| 20|
| blue| grape|  1| 40|
|  red|banana| -3| 10|
|  red|carrot| -1| 30|
|  red|carrot|  0| 50|
|  red|banana|  2| 70|
|  red| grape|  3| 80|
+-----+------+---+---+



In [105]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, LongType, DoubleType
import pandas as pd

# Create a SparkSession
spark = SparkSession.builder \
    .appName("NormalizationExample") \
    .getOrCreate()

# Create a DataFrame
df1 = spark.createDataFrame([
    Row(a=1, b=2.0),
    Row(a=2, b=3.0),
    Row(a=3, b=5.0),
    Row(a=4, b=8.0),
    Row(a=5, b=13.0)
])

# Define a schema for the output DataFrame
schema = StructType([
    StructField("a", LongType(), True),
    StructField("b", DoubleType(), True)
])

# Define a pandas UDF for row-wise normalization
def normalize(pdf):
    mean = pdf['b'].mean()
    std = pdf['b'].std()
    if std != 0:
        pdf['b'] = (pdf['b'] - mean) / std
    else:
        pdf['b'] = 0  # Handle the case where std is zero
    return pdf

# Apply applyInPandas on DataFrame
df_normalized = df1.groupby().applyInPandas(normalize, schema=schema)

# Show the DataFrame
print("Normalized DataFrame:")
df_normalized.show()

# Stop the SparkSession
spark.stop()


Normalized DataFrame:
+---+--------------------+
|  a|                   b|
+---+--------------------+
|  1| -0.9462724090245991|
|  2| -0.7209694544949327|
|  3|-0.27036354543559976|
|  4| 0.40554531815339956|
|  5|  1.5320600908017319|
+---+--------------------+



In [106]:
from pyspark.sql import SparkSession, Row

spark = SparkSession.builder.appName('celcius to farhenhite').getOrCreate()
c_f_df = spark.createDataFrame([(1,25.0),(2,30.0),(3,35.0)],schema='id bigint, celcius double')
c_f_df.show()


+---+-------+
| id|celcius|
+---+-------+
|  1|   25.0|
|  2|   30.0|
|  3|   35.0|
+---+-------+



In [107]:
from pyspark.sql.types import StructType, StructField, LongType
schema = StructType([
                     StructField('id', LongType(), True), 
                     StructField('celcius', LongType(), True), 
                     StructField('Farhenite', LongType(), True)
                    ])

def cel_far(series):
    # for val in series:
    #     series[F]
    series['Farhenite'] = series['celcius'] * (9/5) + 32
    return series

c_f_df.groupBy().applyInPandas(cel_far, schema=schema).show()

+---+-------+---------+
| id|celcius|Farhenite|
+---+-------+---------+
|  1|     25|       77|
|  2|     30|       86|
|  3|     35|       95|
+---+-------+---------+



In [119]:
df1 = spark.createDataFrame([(202407130921, 1, 1.0),
                             (202407130921, 2, 2.0),
                             (202407130922, 1, 3.0),
                             (202407130922, 2, 4.0)],('time', 'id', 'v1'))
df2 = spark.createDataFrame([(202407130921, 1, 'x'),
                             (202407130921, 2, 'y'),(202407130922, 1, 'x'),(202407130922, 2, 'y')],('time', 'id', 'v2'))
def merge_ordered(l, r):
    return pd.merge_ordered(l, r)

new_df = df1.groupby('id').cogroup(df2.groupby('id')).applyInPandas(merge_ordered, schema='time long, id int, v1 double, v2 string')
new_df.show()

+------------+---+---+---+
|        time| id| v1| v2|
+------------+---+---+---+
|202407130921|  1|1.0|  x|
|202407130922|  1|3.0|  x|
|202407130921|  2|2.0|  y|
|202407130922|  2|4.0|  y|
+------------+---+---+---+

