In [36]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()


In [37]:
df = spark.read.json('C:/Users/vrjav/Downloads/pyspark/learning1.ndjson')

In [38]:
df.cache()

DataFrame[age: bigint, name: string]

In [39]:
df.show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [40]:
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [41]:
df.select('name').show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [42]:
df.createOrReplaceTempView("details")
spark.sql('select * from details where name="Michael"').show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
+----+-------+



# Generic Load and Save Functions

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [2]:
path = "C:/spark/spark-3.5.1-bin-hadoop3/examples/src/main/resources/people.json"

df = spark.read.load(path, format='json')

In [3]:
df.show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [4]:
import os

path = "C:/spark/spark-3.5.1-bin-hadoop3/examples/src/main/resources"

df1 = spark.read.load(os.path.join(path, "people.json"), format = 'json')
df1.show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [5]:
df2 = spark.read.load(os.path.join(path, "users.parquet"))
df2.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          NULL|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



### Running SQL on files directly

In [18]:
df = spark.sql("select * from parquet.`C:/spark/spark-3.5.1-bin-hadoop3/examples/src/main/resources/users.parquet`")

In [19]:
df.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          NULL|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



In [6]:
df = spark.read.load(os.path.join(path,"users.orc"), format='orc')

In [7]:
df_orc = df.write \
    .format("orc") \
    .mode("overwrite") \
    .save("users_without_bloom.orc")


In [8]:
df_bloom = df.write \
    .format("orc") \
    .option("orc.bloom.filter.columns", "favorite_color") \
    .mode("overwrite") \
    .save("users_with_bloom.orc")


In [9]:
df_orc = spark.read.format("orc").load("users_without_bloom.orc")
df_bloom = spark.read.format("orc").load("users_with_bloom.orc")


In [10]:
from pyspark.sql.functions import col
import time

# Time query function
def time_query(df):
    start = time.time()
    df.filter(col("favorite_color") == "blue").count()
    return time.time() - start

# Without Bloom filter
time_no_bloom = time_query(df_orc)

# With Bloom filter
time_with_bloom = time_query(df_bloom)

# Print the results
print(f"Time without Bloom filter: {time_no_bloom:.3f} seconds")
print(f"Time with Bloom filter: {time_with_bloom:.3f} seconds")


Time without Bloom filter: 1.999 seconds
Time with Bloom filter: 0.462 seconds
