In [1]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import os
import sys

In [2]:
# Resolve the Delta Lake jars relative to the notebook's working directory.
current_dir = os.getcwd()
jar_folder = os.path.join(current_dir, "jars")

# Paths to the specific files
delta_core = os.path.join(jar_folder, "delta-core_2.12-2.4.0.jar")
delta_storage = os.path.join(jar_folder, "delta-storage-2.4.0.jar")

# Verify that BOTH jars exist. Both paths are handed to `spark.jars` when the
# session is built, so a missing delta-storage jar has to be caught here as
# well instead of surfacing later as an obscure JVM class-loading error.
missing_jars = [
    jar_path
    for jar_path in (delta_core, delta_storage)
    if not os.path.isfile(jar_path)
]
for missing_jar in missing_jars:
    print(f"❌ Error: Could not find {missing_jar}")
if missing_jars:
    # Abort the cell; everything below depends on these jars being present.
    sys.exit(1)

In [3]:
# Optional: only needed on Windows when Hadoop is not installed on the machine.
# `os` is already imported in the first cell, so it is not re-imported here.

# 1. The folder containing 'bin', and the 'bin' folder itself.
hadoop_home = "C:\\hadoop"
hadoop_bin = "C:\\hadoop\\bin"

# These are Windows paths (and the DLL lookup they serve is Windows-specific),
# so leave the environment untouched on every other platform.
if os.name == "nt":
    os.environ['HADOOP_HOME'] = hadoop_home

    # 2. Add the bin directory to PATH so Java can find the DLL.
    #    - Only PATH matters here: sys.path is Python's module search path and
    #      has no effect on native library lookup, so it is not modified.
    #    - The membership check keeps the cell idempotent: re-running it does
    #      not append the same entry to PATH again.
    #    - .get() avoids a KeyError if PATH happens to be unset.
    current_path = os.environ.get('PATH', '')
    if hadoop_bin not in current_path.split(os.pathsep):
        os.environ['PATH'] = (
            (current_path + os.pathsep + hadoop_bin) if current_path else hadoop_bin
        )

In [4]:

# Delta Lake settings: put the two jars resolved earlier on the classpath and
# register the Delta SQL extension and catalog implementation.
delta_settings = {
    "spark.jars": f"{delta_core},{delta_storage}",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# Apply each setting to the builder in turn.
session_builder = SparkSession.builder.appName('deltalakehouse')
for setting_name, setting_value in delta_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Run locally on all available cores; reuse an existing session if there is one.
spark = session_builder.master('local[*]').getOrCreate()
spark

### Read the Delta table

In [5]:
# The Delta table lives in the 'lakehouse' folder under the working directory.
lakehouse_path = os.path.join(os.getcwd(), 'lakehouse')

# Load it through the Delta reader; nothing is displayed in this cell.
delta_reader = spark.read.format('delta')
delta_df = delta_reader.load(lakehouse_path)

### Fraud transactions

In [7]:
# Keep only the rows flagged as fraud and preview the first four of them.
fraud_rows = delta_df.filter(col('is_fraud') == 'Yes')
fraud_rows.show(4)

+------------+--------------------+-------+--------------+--------+--------+------------+-------------------+-------------------+--------+---------------+--------------------+
|    merchant|      transaction_id|user_id|     user_name|  amount|currency|    location|          timestamp|         risk_level|is_fraud|         Reason|         ingest_time|
+------------+--------------------+-------+--------------+--------+--------+------------+-------------------+-------------------+--------+---------------+--------------------+
|Thompson PLC|38aaedc7-e97b-40c...|    270|Rodney Baldwin|17312.67|     USD| Chadchester|2025-12-29 23:47:53|               null|     Yes|    High Amount|2025-12-29 23:47:...|
|Olson-Romero|d28272d7-6f32-42a...|    987|   John Walker|12300.36|     USD|Mitchellbury|2025-12-29 23:47:54|               null|     Yes|    High Amount|2025-12-29 23:47:...|
|    Scam Hub|f0f5c65f-2ff8-448...|    147|    Amy Harris|  225.11|     USD| Sharonshire|2025-12-29 23:51:00|Under Inves

In [8]:
# Number of transactions per value of the is_fraud flag.
fraud_counts = delta_df.groupBy('is_fraud').count()
fraud_counts.show()

+--------+-----+
|is_fraud|count|
+--------+-----+
|      No|  213|
|     Yes|   48|
+--------+-----+



### Total Fraud amount percentage

In [17]:
# Share of the total transacted amount that belongs to fraud-flagged rows.
#
# Both sums are computed in a single aggregation (one scan of the table instead
# of two). `when(...)` without an `otherwise` yields NULL for non-fraud rows,
# and NULLs are ignored by the aggregate, so the first column is the sum of
# `amount` over fraud rows only.
# Note: `sum` here is pyspark.sql.functions.sum (brought in by the wildcard
# import at the top of the notebook), not Python's built-in sum.
amount_totals = delta_df.agg(
    sum(when(col('is_fraud') == 'Yes', col('amount'))).alias('total_fraud_amount'),
    sum(col('amount')).alias('total_amount'),
).collect()[0]

# The aggregate is NULL (None) when no row matches; treat "no fraud rows" as 0
# instead of failing on `None / total_amount`.
fraud_amount = amount_totals['total_fraud_amount'] or 0
total_amount = amount_totals['total_amount']

# An empty table (total is None) or a zero total has no meaningful ratio;
# report 0.00% rather than raising TypeError / ZeroDivisionError.
percent = (fraud_amount / total_amount * 100) if total_amount else 0.0

# Keep the formatted text in its own variable so `fraud_amount` stays numeric
# and the cell gives the same result when re-run.
fraud_percent_label = f"{percent:.2f}%"
print(f"Total fraud amount percent: {fraud_percent_label}")

Total fraud amount percent: 88.63%


In [19]:
# Stop the Spark session that was created earlier in this notebook.
spark.stop()