# Part 2 - Data Lake Implementation

- ## Use Schema fromJSON
- ## Read parquet files
- ## Print schema and dataframe

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, BooleanType

# Create (or reuse) the Spark session used by the rest of the notebook
spark = SparkSession.builder.appName("OSV Data Lake ").getOrCreate()

# Explicit schema for the OSV records, assembled with the StructType builder.
# Every field and array element is nullable (the default for `add` and `ArrayType`).
schema = (
    StructType()
    .add("id", StringType())
    .add("summary", StringType())
    .add("details", StringType())
    .add("aliases", ArrayType(StringType()))
    .add("modified", StringType())
    .add("published", StringType())
    .add(
        "database_specific",
        StructType()
        .add("nvd_published_at", StringType())  # deliberately read as a string
        .add("severity", StringType())
        .add("github_reviewed", BooleanType())
        .add("github_reviewed_at", StringType()),
    )
    .add(
        "affected",
        ArrayType(
            StructType()
            .add(
                "package",
                StructType()
                .add("name", StringType())
                .add("ecosystem", StringType())
                .add("purl", StringType()),
            )
            .add(
                "ranges",
                ArrayType(
                    StructType()
                    .add("type", StringType())
                    .add(
                        "events",
                        ArrayType(
                            StructType()
                            .add("introduced", StringType())
                            .add("fixed", StringType())
                        ),
                    )
                ),
            )
        ),
    )
    .add("schema_version", StringType())
    .add(
        "severity",
        ArrayType(
            StructType()
            .add("type", StringType())
            .add("score", StringType())
        ),
    )
)

# Read the Parquet files, applying the schema above instead of the one stored in the files
parquet_reader = spark.read.schema(schema)
df = parquet_reader.parquet("# Enter Path to Parquet Files")

# Inspect the resulting structure and preview the rows
df.printSchema()
df.show()


StatementMeta(, , -1, Finished, , Finished)

LIVY_JOB_TIMED_OUT: Livy session has failed. Session state: Dead. Error code: LIVY_JOB_TIMED_OUT. Job failed during run time with state=[dead]. Source: Unknown.

## Store parquet files in a delta table

In [None]:
from delta import DeltaTable

# Location of the Delta table (placeholder - fill in before running)
delta_path = "# Enter Delta Path Here"

# Write the Parquet-backed DataFrame out in Delta format, replacing existing data
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(delta_path)
)

# Read the delta table

In [None]:
# Load the Delta table back from delta_path and preview its rows
df_delta = (
    spark.read
    .format("delta")
    .load(delta_path)
)
df_delta.show()

## Generate ecosystem and year columns from the data available

In [29]:
from pyspark.sql.functions import col, expr

# Derive the two partitioning columns:
#   ecosystem - package ecosystem of the first entry in `affected`
#   year      - first four characters of the `published` string
df = (
    df
    .withColumn("ecosystem", expr("affected[0].package.ecosystem"))
    .withColumn("year", col("published").substr(1, 4))
)



StatementMeta(osv, 0, 30, Finished, Available, Finished)

## Partitioning by ecosystem and year

In [30]:
# Overwrite the Delta table, laid out by ecosystem and year.
# overwriteSchema is set because this write adds the two new columns.
(
    df.write
    .format("delta")
    .partitionBy("ecosystem", "year")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(delta_path)
)


StatementMeta(osv, 0, 31, Finished, Available, Finished)

# Time travel to see previous version

In [32]:
# Read the table as it was at Delta version 1 and preview it.
# NOTE(review): which commit "version 1" refers to depends on the table's
# history at delta_path - confirm it is the earlier version you intend to inspect.
df_time_travel = (
    spark.read
    .format("delta")
    .option("versionAsOf", 1)
    .load(delta_path)
)
df_time_travel.show()


StatementMeta(osv, 0, 33, Finished, Available, Finished)

+--------------+-------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------+--------------------+---------+----+
|            id|summary|             details|             aliases|            modified|           published|database_specific|            affected|schema_version|            severity|ecosystem|year|
+--------------+-------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------+--------------------+---------+----+
| PYSEC-2017-84|   null|An issue was disc...|[CVE-2017-16613, ...|2024-05-01T11:41:...|2017-11-21T13:29:00Z|             null|[{{swauth, PyPI, ...|         1.6.0|                null|     PyPI|2017|
| PYSEC-2017-36|   null|Directory travers...|[CVE-2017-14695, ...|2024-04-22T23:26:...|2017-10-24T17:29:00Z|             null|[{{salt, PyPI, pk...|         1.6.0|                null|     PyPI|2017|
|PYSE

## Using sort within partitions as an indexing strategy

In [37]:
from pyspark.sql.functions import col

# Shuffle rows so each ecosystem lands in the same Spark partition, then
# order the rows inside every partition by ecosystem and year.
df = (
    df
    .repartition("ecosystem")
    .sortWithinPartitions("ecosystem", "year")
)

# Overwrite the Delta table with the sorted data, keeping the ecosystem/year layout
(
    df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("ecosystem", "year")
    .save(delta_path)
)


StatementMeta(osv, 0, 38, Finished, Available, Finished)

#

## Enabling vacuum and retention policies

In [39]:
# Bind a DeltaTable handle to the table at delta_path. `deltaTable` is not
# defined anywhere earlier in the notebook, so the vacuum call needs it created here.
deltaTable = DeltaTable.forPath(spark, delta_path)

# vacuum() expects the retention threshold in HOURS, so the intended
# 75-day window has to be converted before it is passed in.
RETENTION_DAYS = 75
deltaTable.vacuum(RETENTION_DAYS * 24)  # Keep only the last 75 days of data


StatementMeta(, , -1, Finished, , Finished)

LIVY_JOB_TIMED_OUT: Livy session has failed. Session state: Dead. Error code: LIVY_JOB_TIMED_OUT. Job failed during run time with state=[dead]. Source: Unknown.