In [1]:
# 1. Install Java
# `-qq` together with the redirect to /dev/null keeps apt-get's output out of the cell.
!apt-get install openjdk-11-jdk-headless -qq > /dev/null

# 2. Download Spark 3.5.0 with Hadoop 3
# `wget -q` fetches the archive quietly; `tar xf` then unpacks it into the current directory.
!wget -q https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz

# 3. Set environment variables
# NOTE(review): both paths are hard-coded. SPARK_HOME assumes the archive above was
# unpacked under /content (presumably the Colab working directory) — confirm before
# running this notebook anywhere else.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"

In [2]:
# 4. Install findspark
!pip install -q findspark
import findspark
findspark.init()

In [3]:
# 5. Start Spark session
from pyspark.sql import SparkSession

# Local-mode session named "Colab-PySpark"; getOrCreate() returns the
# already-running session if this cell is executed again.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Colab-PySpark")
    .getOrCreate()
)

In [4]:
# Bare expression: the notebook renders the SparkSession's repr as this cell's output.
spark

# Arrays in PySpark

Arrays are useful for storing a list of values within a single column.

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, lit, col,udf

In [6]:
# Build a single-column DataFrame holding three names
data = [("Alice",), ("Bob",), ("Charlie",)]
df1 = spark.createDataFrame(data, schema=["Name"])

# Attach the same three-element literal array to every row as "Scores"
score_literals = [lit(score) for score in (85, 90, 78)]
df1 = df1.withColumn("Scores", array(*score_literals))

# truncate=False prints the full array instead of an abbreviated cell
df1.show(truncate=False)

+-------+------------+
|Name   |Scores      |
+-------+------------+
|Alice  |[85, 90, 78]|
|Bob    |[85, 90, 78]|
|Charlie|[85, 90, 78]|
+-------+------------+



In [7]:
# Access the first element of each row's Scores array (arrays are 0-indexed)
first_score = col("Scores").getItem(0).alias("First_Score")
df1.select("Name", first_score).show()


+-------+-----------+
|   Name|First_Score|
+-------+-----------+
|  Alice|         85|
|    Bob|         85|
|Charlie|         85|
+-------+-----------+



# Maps in PySpark

Maps store key-value pairs, which is helpful for dictionary-like data.

In [8]:
from pyspark.sql.types import StructType, StructField, StringType, MapType

# Explicit schema: a nullable string column plus a nullable map column
# whose keys and values are both strings.
name_field = StructField("name", StringType(), nullable=True)
properties_field = StructField(
    "properties", MapType(StringType(), StringType()), nullable=True
)
schema = StructType([name_field, properties_field])


In [9]:

# Each row pairs a name with a dict; the dict is stored in the map column
# declared by `schema`.
alice_properties = {"city": "Austin", "age": "25"}
bob_properties = {"city": "Dallas", "age": "30"}
data = [("Alice", alice_properties), ("Bob", bob_properties)]

df = spark.createDataFrame(data, schema)
df.show(truncate=False)

+-----+---------------------------+
|name |properties                 |
+-----+---------------------------+
|Alice|{city -> Austin, age -> 25}|
|Bob  |{city -> Dallas, age -> 30}|
+-----+---------------------------+



# User-Defined Functions



*   Reuse and repeat common tasks.
*   Registered directly with Spark and can be shared.


*   PySpark UDFs for smaller datasets.
*   Pandas UDFs for large datasets.









In [10]:
#Defining and registering a UDF

#define a function
def to_uppercase(s):
    """Return ``s`` converted to upper case.

    Args:
        s: The string to convert, or ``None``.

    Returns:
        The upper-cased string, or ``None`` when ``s`` is ``None``.
    """
    # Test for None explicitly: a truthiness check would also turn an empty
    # string into None, silently replacing a real (empty) value with a null.
    if s is None:
        return None
    return s.upper()

In [13]:
# Wrap the plain Python function as a Spark UDF that returns a string column
to_uppercase_udf = udf(to_uppercase, returnType=StringType())

In [15]:
# apply udf to dataframe
# Reference the column by its declared name ("Name") so the lookup does not
# rely on Spark's case-insensitive column resolution.
df1 = df1.withColumn("name_upper", to_uppercase_udf(df1["Name"]))

In [16]:
# Print df1, which now carries the name_upper column added by the UDF.
df1.show()

+-------+------------+----------+
|   Name|      Scores|name_upper|
+-------+------------+----------+
|  Alice|[85, 90, 78]|     ALICE|
|    Bob|[85, 90, 78]|       BOB|
|Charlie|[85, 90, 78]|   CHARLIE|
+-------+------------+----------+



In [18]:
#Example-#Step 1:define a function
def add_prefix(name):
    """Build a greeting for ``name``.

    Args:
        name: The name to greet; may be ``None`` or empty.

    Returns:
        ``"Hello <name>"``, or ``None`` when ``name`` is ``None`` or empty.
    """
    if not name:
        return None
    return f"Hello {name}"

# Step 2: Register the function as a UDF that returns a string column
add_prefix_udf = udf(add_prefix, StringType())

# Step 3: Apply the UDF to a column
# Reference the column by its declared name ("Name") so the lookup does not
# rely on Spark's case-insensitive column resolution.
df1 = df1.withColumn("greeting", add_prefix_udf(df1["Name"]))

# Step 4: Show results
df1.show()

+-------+------------+----------+-------------+
|   Name|      Scores|name_upper|     greeting|
+-------+------------+----------+-------------+
|  Alice|[85, 90, 78]|     ALICE|  Hello Alice|
|    Bob|[85, 90, 78]|       BOB|    Hello Bob|
|Charlie|[85, 90, 78]|   CHARLIE|Hello Charlie|
+-------+------------+----------+-------------+



**Pandas UDF** (for large datasets)



*   Eliminates costly conversions of code and data.
*   Does not need to be registered with the SparkSession.




In [19]:
from pyspark.sql.functions import pandas_udf

In [21]:
@pandas_udf("float")
def fahrenheit_to_celsius_pandas(temp_f):
    """Convert Fahrenheit temperatures to Celsius.

    Args:
        temp_f: Fahrenheit values; the arithmetic is applied element-wise
            to whatever vectorized input the pandas UDF receives.

    Returns:
        The Celsius values, declared to Spark as a ``float`` column.
    """
    # Same evaluation order as (temp_f - 32) * 5.0 / 9.0
    scaled = (temp_f - 32) * 5.0
    return scaled / 9.0

In [22]:
# One-column DataFrame of sample Fahrenheit readings
data1 = [(reading,) for reading in (32.0, 68.0, 100.0, 212.0)]
df3 = spark.createDataFrame(data1, schema=["temp_f"])

In [23]:
# Print the Fahrenheit readings held in df3.
df3.show()

+------+
|temp_f|
+------+
|  32.0|
|  68.0|
| 100.0|
| 212.0|
+------+

