In [1]:
# 1. Install Java
!apt-get install openjdk-11-jdk-headless -qq > /dev/null

# 2. Download Spark 3.5.0 with Hadoop 3
!wget -q https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz

# 3. Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"

In [2]:
# 4. Install findspark
!pip install -q findspark
import findspark
findspark.init()

In [3]:
# 5. Start Spark session
from pyspark.sql import SparkSession

# Local master using every available core; getOrCreate() hands back an
# already-running session if one exists instead of building a second one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Colab-Spark")
    .getOrCreate()
)


In [4]:
# Echo the session object as the cell's last expression so the notebook
# renders its representation (a quick check that the session was created).
spark

**Lazy Evaluation & Action**

In [78]:
# URL of the Spark web UI, where jobs, stages and DAGs can be inspected.
# The host part is the runtime's internal hostname, so this address is only
# reachable from inside the machine running the kernel.
spark.sparkContext.uiWebUrl

'http://d895f6773331:4040'

In [80]:
!pip install -q pyngrok
from pyngrok import ngrok, conf

# Replace with your token
!ngrok config add-authtoken 33Ofpjn1uGGV5JZne7NM8uLDw0d_4pfoUSsGk6QRPEs9YaoUT

# Connect to Spark UI (4040)
spark_ui = ngrok.connect(4040)
print("Spark UI link:", spark_ui.public_url)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Spark UI link: https://arlo-unscorching-grumbly.ngrok-free.dev


In [8]:
# Explicit alias for the Spark SQL function namespace (F.col, F.max, ...).
from pyspark.sql import functions as F
# The wildcard imports are kept because cells in this notebook use bare names
# such as col() and StructType; note that they also bring in Spark functions
# like max() that replace Python builtins of the same name.
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [9]:
# Test DataFrame
# Each row is (name, age, city).
data = [
    ("Alice", 25, "New York"),
    ("Bob", 30, "San Francisco"),
    ("Charlie", 25, "Chicago"),
]

# Three nullable columns, assembled field by field.
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("city", StringType(), True)
)
df = spark.createDataFrame(data, schema=schema)

**Narrow Transformations**

In [10]:
# Keep only the New York rows; where() is an alias of filter().
df = df.where(col("city") == "New York")

In [11]:
# Project down to the single `city` column.
df = df.select(col("city"))

In [12]:
# show() is an action: it executes the filter and select defined in the
# previous cells and prints the resulting rows.
df.show()

+--------+
|    city|
+--------+
|New York|
+--------+



In [13]:
# Print the physical plan. Filter and Project share the same stage marker
# (*(1)) and the plan contains no Exchange node, i.e. these narrow
# transformations do not shuffle data.
df.explain()

== Physical Plan ==
*(1) Project [city#2]
+- *(1) Filter (isnotnull(city#2) AND (city#2 = New York))
   +- *(1) Scan ExistingRDD[name#0,age#1,city#2]




**Wide Transformations**

In [14]:
# Test DataFrame
# `df` was narrowed to a single filtered column in the narrow-transformation
# cells, so rebuild the full frame here. The rows and schema are exactly the
# `data` and `schema` defined for the first test DataFrame; reuse them instead
# of keeping a second copy that could drift out of sync.
df = spark.createDataFrame(data, schema=schema)

In [15]:
# Wide transformation: maximum age per city.
# The aggregate is referenced through the `F` alias so it is unambiguous that
# this is Spark's column function and not Python's builtin max(), which the
# wildcard import silently replaces. The output column is still `max(age)`.
df = df.groupBy("city").agg(F.max(F.col("age")))

In [16]:
# Action: executes the groupBy/agg defined above and prints one row per city.
df.show()

+-------------+--------+
|         city|max(age)|
+-------------+--------+
|     New York|      25|
|San Francisco|      30|
|      Chicago|      25|
+-------------+--------+



In [17]:
# Print the physical plan. The aggregation is split into a partial and a final
# HashAggregate with an Exchange hashpartitioning(city, 200) between them;
# that Exchange is the shuffle introduced by the wide transformation.
df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[city#14], functions=[max(age#13)])
   +- Exchange hashpartitioning(city#14, 200), ENSURE_REQUIREMENTS, [plan_id=85]
      +- HashAggregate(keys=[city#14], functions=[partial_max(age#13)])
         +- Project [age#13, city#14]
            +- Scan ExistingRDD[name#12,age#13,city#14]




**Repartition and Coalesce**

In [23]:
# Number of partitions of the RDD underlying `df`.
# NOTE(review): this cell's execution count (23) is later than the repartition
# (19) and coalesce (22) cells that follow it, so the saved output reflects the
# state after those cells ran, not the state at this point in the notebook.
df.rdd.getNumPartitions()

1

In [19]:
# Repartition: redistribute the rows across exactly 3 partitions.
df = df.repartition(numPartitions=3)

In [22]:
# Coalesce: reduce the partition count down to a single partition.
df = df.coalesce(numPartitions=1)

In [24]:
# Print the physical plan. repartition(3) shows up as an
# Exchange RoundRobinPartitioning(3) (a shuffle), whereas coalesce(1) shows up
# as a plain Coalesce node with no Exchange of its own.
df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   Coalesce 1
   +- ShuffleQueryStage 1
      +- Exchange RoundRobinPartitioning(3), REPARTITION_BY_NUM, [plan_id=231]
         +- *(2) HashAggregate(keys=[city#14], functions=[max(age#13)])
            +- AQEShuffleRead coalesced
               +- ShuffleQueryStage 0
                  +- Exchange hashpartitioning(city#14, 200), ENSURE_REQUIREMENTS, [plan_id=207]
                     +- *(1) HashAggregate(keys=[city#14], functions=[partial_max(age#13)])
                        +- *(1) Project [age#13, city#14]
                           +- *(1) Scan ExistingRDD[name#12,age#13,city#14]
+- == Initial Plan ==
   Coalesce 1
   +- Exchange RoundRobinPartitioning(3), REPARTITION_BY_NUM, [plan_id=194]
      +- HashAggregate(keys=[city#14], functions=[max(age#13)])
         +- Exchange hashpartitioning(city#14, 200), ENSURE_REQUIREMENTS, [plan_id=192]
            +- HashAggregate(keys=[city#14], functions=[partial_max(a