In [None]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=81704dccf37261f313c21705a07a21fe6c8efe6c6ac513d6062563d0b78d3f70
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


Task 1: Vehicle Maintenance Data Ingestion

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name
from pyspark.sql.types import StructType, StructField, StringType, FloatType, DateType
import os

# Maven coordinates of the Delta Lake runtime. The Scala suffix (2.12) and the
# Delta release must match the installed Spark build; delta-spark 3.2.x is the
# line published for Spark 3.5.x.
DELTA_PACKAGE = "io.delta:delta-spark_2.12:3.2.0"

# Create a Spark session with Delta Lake enabled. Without the package, the SQL
# extension and the Delta catalog, format("delta") fails with
# DATA_SOURCE_NOT_FOUND and Delta SQL commands (VACUUM, DESCRIBE HISTORY) are
# not recognised.
# NOTE: getOrCreate() returns an already-running session unchanged, so restart
# the kernel if a session without these settings was created earlier.
spark = (
    SparkSession.builder
    .appName("VehicleMaintenanceIngestion")
    .config("spark.jars.packages", DELTA_PACKAGE)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Define the schema for the vehicle maintenance data
schema = StructType([
    StructField("VehicleID", StringType(), True),
    StructField("Date", DateType(), True),
    StructField("ServiceType", StringType(), True),
    StructField("ServiceCost", FloatType(), True),
    StructField("Mileage", FloatType(), True)
])

# Define paths
raw_data_path = "/content/sample_data/vehicle_maintenance.csv"
delta_table_path = "/content/sample_data/delta/vehicle_maintenance"

# Check if the raw data file exists
if os.path.exists(raw_data_path):
    try:
        # Read the CSV file into a DataFrame and record which file each row came from
        vehicle_df = spark.read.csv(raw_data_path, schema=schema, header=True).withColumn("file_name", input_file_name())

        # Write the DataFrame to a Delta table, replacing any previous contents
        vehicle_df.write.format("delta").mode("overwrite").save(delta_table_path)
        print("Data loaded and saved as Delta table.")
    except Exception as e:
        # Report the failure, then re-raise: the following cells read this table,
        # so continuing would only surface a less helpful error further down.
        print(f"Error: {e}")
        raise
else:
    print(f"File not found: {raw_data_path}")


Error: An error occurred while calling o67.save.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: delta. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:725)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:863)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:257)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:240)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.refl

Task 2: Data Cleaning

In [None]:
# Load the ingested records from the raw Delta table
raw_df = spark.read.format("delta").load(delta_table_path)

# Row-level validity conditions: both cost and mileage must be strictly positive
has_positive_cost = raw_df["ServiceCost"] > 0
has_positive_mileage = raw_df["Mileage"] > 0

# Keep valid rows only, then keep a single row per (VehicleID, Date) pair
cleaned_df = (
    raw_df
    .where(has_positive_cost & has_positive_mileage)
    .dropDuplicates(["VehicleID", "Date"])
)

# Persist the cleaned records to their own Delta table, replacing earlier runs
cleaned_delta_path = "/content/sample_data/delta/cleaned_vehicle_maintenance"
(
    cleaned_df.write
    .format("delta")
    .mode("overwrite")
    .save(cleaned_delta_path)
)
print("Cleaned data saved to Delta table.")


Py4JJavaError: An error occurred while calling o71.load.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: delta. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:725)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:208)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:186)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.ClassNotFoundException: delta.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:476)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:594)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:527)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)
	... 15 more


Task 3: Vehicle Maintenance Analysis

In [None]:
# Reload the cleaned records from their Delta table
cleaned_df = spark.read.format("delta").load(cleaned_delta_path)

# Total service cost per vehicle
cost_by_vehicle = cleaned_df.groupBy("VehicleID").sum("ServiceCost")
maintenance_costs = cost_by_vehicle.withColumnRenamed("sum(ServiceCost)", "TotalMaintenanceCost")
maintenance_costs.show()

# Records whose mileage is above the service threshold (30,000 miles)
threshold = 30000
vehicles_needing_service = (
    cleaned_df
    .where(cleaned_df["Mileage"] > threshold)
    .select("VehicleID", "Mileage")
)
vehicles_needing_service.show()

# Persist both result sets as Delta tables (costs first, then the mileage list),
# overwriting the output of any earlier run
analysis_outputs = {
    "/content/sample_data/delta/maintenance_costs": maintenance_costs,
    "/content/sample_data/delta/vehicles_needing_service": vehicles_needing_service,
}
for output_path, result_df in analysis_outputs.items():
    result_df.write.format("delta").mode("overwrite").save(output_path)


Task 4: Data Governance with Delta Lake

In [None]:
# Use VACUUM to clean up old data
spark.sql("VACUUM '/content/sample_data/delta/vehicle_maintenance' RETAIN 168 HOURS")

# Check the history of updates
history_df = spark.sql("DESCRIBE HISTORY delta.`/content/sample_data/delta/vehicle_maintenance`")
history_df.show()
