In [None]:
# Install Java
!apt-get install openjdk-11-jdk-headless -qq > /dev/null

# Download Spark 3.5.0 with Hadoop 3
!wget -q https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

# Extract the Spark tar file
!tar -xzf spark-3.5.0-bin-hadoop3.tgz

# Move to /opt for standard setup
!mv spark-3.5.0-bin-hadoop3 /opt/spark

# Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/opt/spark"

# Install findspark
!pip install -q findspark

In [None]:
# NOTE(review): findspark.init() is deliberately called before the pyspark
# import below; presumably it makes the SPARK_HOME install importable, so do
# not hoist the pyspark import above it.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or fetch) the session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("Test Spark Session")
    .getOrCreate()
)

# Smoke test: print a five-row frame with ids 0..4.
spark.range(5).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [1]:
from pyspark.sql import SparkSession

# NOTE(review): getOrCreate() presumably hands back the session created in the
# earlier cell if one is still alive, in which case the "Test_SQL" app name is
# not applied -- confirm whether a separate session was intended.
spark = (
    SparkSession.builder
    .appName("Test_SQL")
    .getOrCreate()
)

# Bare last expression so the notebook renders the session summary.
spark

In [2]:
# Load the gas price table: the first row holds the column names and column
# types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("gas_prices.csv")
)

In [3]:
# Expose the DataFrame to Spark SQL under the name used by the queries below.
df.createOrReplaceTempView(name="Gas_Prices_View")

In [4]:
# Query every column of the registered view and print the first rows.
result = spark.sql(
    "SELECT * FROM Gas_Prices_View"
)
result.show()

+----+---------+------+------+-------+-----+-----+------+-----------+----+----+
|Year|Australia|Canada|France|Germany|Italy|Japan|Mexico|South Korea|  UK| USA|
+----+---------+------+------+-------+-----+-----+------+-----------+----+----+
|1990|     NULL|  1.87|  3.63|   2.65| 4.59| 3.16|   1.0|       2.05|2.82|1.16|
|1991|     1.96|  1.92|  3.45|    2.9|  4.5| 3.46|   1.3|       2.49|3.01|1.14|
|1992|     1.89|  1.73|  3.56|   3.27| 4.53| 3.58|   1.5|       2.65|3.06|1.13|
|1993|     1.73|  1.57|  3.41|   3.07| 3.68| 4.16|  1.56|       2.88|2.84|1.11|
|1994|     1.84|  1.45|  3.59|   3.52|  3.7| 4.36|  1.48|       2.87|2.99|1.11|
|1995|     1.95|  1.53|  4.26|   3.96|  4.0| 4.43|  1.11|       2.94|3.21|1.15|
|1996|     2.12|  1.61|  4.41|   3.94| 4.39| 3.64|  1.25|       3.18|3.34|1.23|
|1997|     2.05|  1.62|   4.0|   3.53| 4.07| 3.26|  1.47|       3.34|3.83|1.23|
|1998|     1.63|  1.38|  3.87|   3.34| 3.84| 2.82|  1.49|       3.04|4.06|1.06|
|1999|     1.72|  1.52|  3.85|   3.42| 3

In [8]:
# Canada, UK and USA prices for the years from 2000 onward.
spark.sql(
    "SELECT Year, Canada, UK, USA FROM Gas_Prices_View WHERE Year >= 2000"
).show()

+----+------+----+----+
|Year|Canada|  UK| USA|
+----+------+----+----+
|2000|  1.86|4.58|1.51|
|2001|  1.72|4.13|1.46|
|2002|  1.69|4.16|1.36|
|2003|  1.99| 4.7|1.59|
|2004|  2.37|5.56|1.88|
|2005|  2.89|5.97| 2.3|
|2006|  3.26|6.36|2.59|
|2007|  3.59|7.13| 2.8|
|2008|  4.08|7.42|3.27|
+----+------+----+----+



In [None]:
# The original query opened a "(" that was never closed, so Spark could not
# parse it. The three price columns are now grouped explicitly into a single
# struct column named `prices`.
# NOTE(review): grouping the columns is a guess at the intent of the stray
# parenthesis -- confirm this is the query that was meant.
spark.sql("SELECT Year, struct(Canada, UK, USA) AS prices FROM Gas_Prices_View WHERE Year >= 2000").show()

In [12]:
# Disabled example: select Year alongside the UK price incremented by 1 using
# column expressions instead of SQL.
# NOTE(review): `col` is only imported in a later cell of this notebook, so
# re-enabling this line here would presumably fail on a fresh top-to-bottom
# run -- import `col` first or move the example below that import.
#df.select(col("Year"), col("UK") + 1).show()

In [13]:
# Write the table back out as CSV with a header row, replacing any output
# left by a previous run.
(
    df.write
    .mode("overwrite")
    .option("header", True)
    .csv("output/gas_price_csv")
)

In [29]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import col

# Keep only the rows that have an Australia price.
df_clean = df.dropna(how="any", subset=["Australia"])
def year_group(Year):
    """Label a year as "Modern" (2000 or later) or "Vintage" (before 2000).

    Args:
        Year: Numeric year, or None when the source value is missing.

    Returns:
        "Modern", "Vintage", or "Unknown" when Year is None.
    """
    # A missing year used to raise TypeError on the comparison below; report it
    # with the same "Unknown" label that price_group uses for missing values.
    if Year is None:
        return "Unknown"
    return "Modern" if Year >= 2000 else "Vintage"

def price_group(price):
    """Classify a price as "Cheap" (below 3) or "Expensive" (3 or more).

    Args:
        price: A number, a numeric string, or None when the value is missing.

    Returns:
        "Unknown" when price is None, "Invalid" when it cannot be read as a
        number, otherwise "Cheap" or "Expensive".
    """
    if price is None:
        return "Unknown"
    try:
        # The original wrapped the comparison itself in int(), i.e.
        # int(price < 3), so no conversion ever happened and numeric strings
        # fell through to "Invalid". Convert the value first, using float so
        # fractional prices are not truncated.
        is_cheap = float(price) < 3
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number"; anything else should
        # surface instead of being silently relabelled.
        return "Invalid"
    return "Cheap" if is_cheap else "Expensive"

# Wrap the plain Python classifiers as Spark UDFs that produce string columns.
year_udf = udf(year_group, returnType=StringType())
price_udf = udf(price_group, returnType=StringType())

# Show the cleaned table with the year label derived from the Year column.
(
    df_clean
    .withColumn("year_group", year_udf(col("Year")))
    .show()
)

# Show the cleaned table with the price label derived from the UK column.
(
    df_clean
    .withColumn("price_group", price_udf(col("UK")))
    .show()
)


+----+---------+------+------+-------+-----+-----+------+-----------+----+----+----------+
|Year|Australia|Canada|France|Germany|Italy|Japan|Mexico|South Korea|  UK| USA|year_group|
+----+---------+------+------+-------+-----+-----+------+-----------+----+----+----------+
|1991|     1.96|  1.92|  3.45|    2.9|  4.5| 3.46|   1.3|       2.49|3.01|1.14|   Vintage|
|1992|     1.89|  1.73|  3.56|   3.27| 4.53| 3.58|   1.5|       2.65|3.06|1.13|   Vintage|
|1993|     1.73|  1.57|  3.41|   3.07| 3.68| 4.16|  1.56|       2.88|2.84|1.11|   Vintage|
|1994|     1.84|  1.45|  3.59|   3.52|  3.7| 4.36|  1.48|       2.87|2.99|1.11|   Vintage|
|1995|     1.95|  1.53|  4.26|   3.96|  4.0| 4.43|  1.11|       2.94|3.21|1.15|   Vintage|
|1996|     2.12|  1.61|  4.41|   3.94| 4.39| 3.64|  1.25|       3.18|3.34|1.23|   Vintage|
|1997|     2.05|  1.62|   4.0|   3.53| 4.07| 3.26|  1.47|       3.34|3.83|1.23|   Vintage|
|1998|     1.63|  1.38|  3.87|   3.34| 3.84| 2.82|  1.49|       3.04|4.06|1.06|   Vintage|