In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col

spark = SparkSession.builder \
    .appName("FraudAnalytics") \
    .enableHiveSupport() \
    .getOrCreate()


df_raw_sample = spark.read.csv("C:\\Users\\pradh\\Desktop\\archive (1)\\Synthetic_Financial_datasets_log.csv", header=True, inferSchema=True)
df_raw_sample.printSchema()
df_raw_sample.show(5, truncate=False)


root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|type    |amount  |nameOrig   |oldbalanceOrg|newbalanceOrig|nameDest   |oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|1   |PAYMENT |9839.64 |C1231006815|170136.0     |160296.36     |M1979787155|0.0           |0.0           |0      |0             |
|1   |PAY

In [3]:
from pyspark.sql.types import *

schema = StructType([
    StructField("step", IntegerType(), True),
    StructField("type", StringType(), True),
    StructField("amount", DoubleType(), True),
    StructField("nameOrig", StringType(), True),
    StructField("oldbalanceOrg", DoubleType(), True),
    StructField("newbalanceOrg", DoubleType(), True),
    StructField("nameDest", StringType(), True),
    StructField("oldbalanceDest", DoubleType(), True),
    StructField("newbalanceDest", DoubleType(), True),
    StructField("isFraud", IntegerType(), True),
    StructField("isFlaggedFraud", IntegerType(), True),
])

df = spark.read.csv("C:\\Users\\pradh\\Desktop\\archive (1)\\Synthetic_Financial_datasets_log.csv", header=True, schema=schema)


In [4]:
# Row count
total_rows = df.count(); print("rows:", total_rows)

# Distinct values & sanity of key columns
df.select("type").distinct().show(50)
df.groupBy("isFraud").count().orderBy("isFraud").show()
df.groupBy("isFlaggedFraud").count().orderBy("isFlaggedFraud").show()

# Null counts column 
from pyspark.sql.functions import sum as _sum
df.select([_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]).show()

# Numeric summaries
df.select("amount","oldbalanceOrg","newbalanceOrg","oldbalanceDest","newbalanceDest").summary("count","min","max","mean","stddev").show()


rows: 6362620
+--------+
|    type|
+--------+
|TRANSFER|
| CASH_IN|
|CASH_OUT|
| PAYMENT|
|   DEBIT|
+--------+

+-------+-------+
|isFraud|  count|
+-------+-------+
|      0|6354407|
|      1|   8213|
+-------+-------+

+--------------+-------+
|isFlaggedFraud|  count|
+--------------+-------+
|             0|6362604|
|             1|     16|
+--------------+-------+

+----+----+------+--------+-------------+-------------+--------+--------------+--------------+-------+--------------+
|step|type|amount|nameOrig|oldbalanceOrg|newbalanceOrg|nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+----+------+--------+-------------+-------------+--------+--------------+--------------+-------+--------------+
|   0|   0|     0|       0|            0|            0|       0|             0|             0|      0|             0|
+----+----+------+--------+-------------+-------------+--------+--------------+--------------+-------+--------------+

+-------+------------------+-------

In [5]:
from pyspark.sql.functions import when

# Negative checks
neg_amount = df.filter(col("amount") < 0).count()
neg_org = df.filter((col("oldbalanceOrg") < 0) | (col("newbalanceOrg") < 0)).count()
neg_dest = df.filter((col("oldbalanceDest") < 0) | (col("newbalanceDest") < 0)).count()
print("neg_amount:", neg_amount, "neg_org:", neg_org, "neg_dest:", neg_dest)

# Duplicates
dups = df.count() - df.dropDuplicates().count()
print("exact_duplicate_rows:", dups)

# Weird flags
df.groupBy("isFraud","isFlaggedFraud").count().orderBy("isFraud","isFlaggedFraud").show()


neg_amount: 0 neg_org: 0 neg_dest: 0
exact_duplicate_rows: 0
+-------+--------------+-------+
|isFraud|isFlaggedFraud|  count|
+-------+--------------+-------+
|      0|             0|6354407|
|      1|             0|   8197|
|      1|             1|     16|
+-------+--------------+-------+



In [6]:

df_clean = df.dropDuplicates()

print("After cleaning:", df_clean.count())


After cleaning: 6362620


In [7]:
# Fraud vs non-fraud 
df_clean.groupBy("isFraud").count().show()

# Amount summary
df_clean.select("amount").summary("count","min","max","mean","stddev").show()


+-------+-------+
|isFraud|  count|
+-------+-------+
|      1|   8213|
|      0|6354407|
+-------+-------+

+-------+------------------+
|summary|            amount|
+-------+------------------+
|  count|           6362620|
|    min|               0.0|
|    max|     9.244551664E7|
|   mean|179861.90354913048|
| stddev| 603858.2314629363|
+-------+------------------+



In [8]:

df_clean.createOrReplaceTempView("transactions_clean")                                             #HIVEQL type queries 


fraud_by_type = spark.sql("""
SELECT type,
       COUNT(*) as tx_count,             
       SUM(isFraud) as fraud_count,
       SUM(amount) as total_amount,
       SUM(CASE WHEN isFraud=1 THEN amount ELSE 0 END) as fraud_amount,
       ROUND(SUM(isFraud)/COUNT(*)*100, 4) as fraud_rate_pct
FROM transactions_clean
GROUP BY type
ORDER BY fraud_rate_pct DESC
""")
fraud_by_type.show()



+--------+--------+-----------+--------------------+-------------------+--------------+
|    type|tx_count|fraud_count|        total_amount|       fraud_amount|fraud_rate_pct|
+--------+--------+-----------+--------------------+-------------------+--------------+
|TRANSFER|  532909|       4097|4.852919872631704E11|    6.06721318401E9|        0.7688|
|CASH_OUT| 2237500|       4116|3.944129952244925E11|5.989202243829999E9|         0.184|
| CASH_IN| 1399284|          0|2.363673919124594...|                0.0|           0.0|
| PAYMENT| 2151495|          0|2.809337113836992...|                0.0|           0.0|
|   DEBIT|   41432|          0|2.2719922127999997E8|                0.0|           0.0|
+--------+--------+-----------+--------------------+-------------------+--------------+



In [9]:
fraud_by_step = spark.sql("""
SELECT step,
       COUNT(*) as tx_count,
       SUM(isFraud) as fraud_count,
       SUM(amount) as total_amount,
       SUM(CASE WHEN isFraud=1 THEN amount ELSE 0 END) as fraud_amount
FROM transactions_clean
GROUP BY step
ORDER BY step
""")
fraud_by_step.show(10)   # shows first 10 steps




+----+--------+-----------+--------------------+------------------+
|step|tx_count|fraud_count|        total_amount|      fraud_amount|
+----+--------+-----------+--------------------+------------------+
|   1|    2708|         16|2.8542918115000004E8|3740247.0100000002|
|   2|    1014|          8| 8.592160401999998E7|4186592.4800000004|
|   3|     552|          4|4.3293884419999994E7|          66832.74|
|   4|     565|         10| 7.291002857000001E7|      2.64002749E7|
|   5|     665|          6|       4.554808975E7|         381841.54|
|   6|    1660|         22|1.6431055121999997E8| 974869.6799999999|
|   7|    6837|         12|      8.3293081424E8|     1.241469406E7|
|   8|   21097|         12|3.4396024073500004E9|        1589040.41|
|   9|   37628|         19| 7.008379239430001E9|     1.147663022E7|
|  10|   35991|         11| 7.124214893709999E9|        6935977.72|
+----+--------+-----------+--------------------+------------------+
only showing top 10 rows


In [10]:
top_orig = spark.sql("""
SELECT nameOrig,
       COUNT(*) as tx_count,
       SUM(isFraud) as fraud_count,
       SUM(CASE WHEN isFraud=1 THEN amount ELSE 0 END) as fraud_amount
FROM transactions_clean
GROUP BY nameOrig
ORDER BY fraud_amount DESC
LIMIT 20
""")
top_orig.show()


+-----------+--------+-----------+------------+
|   nameOrig|tx_count|fraud_count|fraud_amount|
+-----------+--------+-----------+------------+
|C1619838170|       1|          1|       1.0E7|
|C1295280435|       1|          1|       1.0E7|
| C525906402|       1|          1|       1.0E7|
|C1677039996|       1|          1|       1.0E7|
|C1552522980|       1|          1|       1.0E7|
|C1853514800|       1|          1|       1.0E7|
| C523152614|       1|          1|       1.0E7|
| C819618584|       1|          1|       1.0E7|
|C1930318116|       1|          1|       1.0E7|
| C180127057|       1|          1|       1.0E7|
|C1531278091|       1|          1|       1.0E7|
|C1028530067|       1|          1|       1.0E7|
|C1057439889|       1|          1|       1.0E7|
|C1274141620|       1|          1|       1.0E7|
|C1577275521|       1|          1|       1.0E7|
|C1952386173|       1|          1|       1.0E7|
| C179802055|       1|          1|       1.0E7|
|C1041060645|       1|          1|      

In [34]:
top_dest = spark.sql("""
SELECT nameDest,
       COUNT(*) as tx_count,
       SUM(isFraud) as fraud_count,
       SUM(CASE WHEN isFraud=1 THEN amount ELSE 0 END) as fraud_amount
FROM transactions_clean
GROUP BY nameDest
ORDER BY fraud_amount DESC
LIMIT 5
""")
top_dest.show()


+-----------+--------+-----------+-------------+
|   nameDest|tx_count|fraud_count| fraud_amount|
+-----------+--------+-----------+-------------+
| C668046170|       5|          2|1.016008868E7|
| C380259496|      24|          1|        1.0E7|
|C1236804041|       1|          1|        1.0E7|
|C1877706055|       3|          1|        1.0E7|
| C574552283|      16|          1|        1.0E7|
+-----------+--------+-----------+-------------+



In [12]:
flag_vs_fraud = spark.sql("""
SELECT isFraud, isFlaggedFraud, COUNT(*) as cnt
FROM transactions_clean
GROUP BY isFraud, isFlaggedFraud
ORDER BY isFraud, isFlaggedFraud
""")
flag_vs_fraud.show()


+-------+--------------+-------+
|isFraud|isFlaggedFraud|    cnt|
+-------+--------------+-------+
|      0|             0|6354407|
|      1|             0|   8197|
|      1|             1|     16|
+-------+--------------+-------+



In [31]:
fraud_by_bucket = spark.sql("""
SELECT
  CASE 
    WHEN amount <= 1000 THEN '0-1K'
    WHEN amount > 1000 AND amount <= 10000 THEN '1K-10K'
    WHEN amount > 10000 AND amount <= 100000 THEN '10K-100K'
    WHEN amount > 100000 AND amount <= 1000000 THEN '100K-1M'
    ELSE '1M+'
  END AS amount_bucket,
  COUNT(*) AS tx_count,
  SUM(isFraud) AS fraud_count,
  SUM(amount) AS total_amount,
  SUM(CASE WHEN isFraud=1 THEN amount ELSE 0 END) AS fraud_amount,
  ROUND(SUM(isFraud)/COUNT(*)*100, 4) AS fraud_rate_percentage
FROM transactions_clean
GROUP BY
  CASE 
    WHEN amount <= 1000 THEN '0-1K'
    WHEN amount > 1000 AND amount <= 10000 THEN '1K-10K'
    WHEN amount > 10000 AND amount <= 100000 THEN '10K-100K'
    WHEN amount > 100000 AND amount <= 1000000 THEN '100K-1M'
    ELSE '1M+'
  END
ORDER BY fraud_rate_percentage DESC
""")

fraud_by_bucket.show()
fraud_by_bucket.toPandas().to_csv("fraud_by_bucket.csv", index=False)


+-------------+--------+-----------+--------------------+--------------------+---------------------+
|amount_bucket|tx_count|fraud_count|        total_amount|        fraud_amount|fraud_rate_percentage|
+-------------+--------+-----------+--------------------+--------------------+---------------------+
|          1M+|  130626|       2706|3.337635258004104E11|1.046072230895000...|               2.0716|
|      100K-1M| 2706696|       3800|7.151167026037612E11|1.5226730667999997E9|               0.1404|
|     10K-100K| 2239207|       1429|8.946644599596988E10| 7.181822573000002E7|               0.0638|
|         0-1K|  142646|         58| 7.156829399000004E7|  14039.180000000002|               0.0407|
|       1K-10K| 1143445|        220| 5.974702065639999E9|  1187787.1800000002|               0.0192|
+-------------+--------+-----------+--------------------+--------------------+---------------------+



In [28]:
df_clean.createOrReplaceTempView("transactions_clean")

fraud_balance_stats = spark.sql("""
SELECT 
    isFraud,
    COUNT(*) AS transaction_count,
    SUM(CASE WHEN ABS((oldbalanceOrg - amount) - newbalanceOrg) > 0.01 THEN 1 ELSE 0 END) AS mismatch_count,
    ROUND(SUM(CASE WHEN ABS((oldbalanceOrg - amount) - newbalanceOrg) > 0.01 THEN 1 ELSE 0 END) / COUNT(*) * 100, 4) AS mismatch_rate_percentage
FROM transactions_clean
GROUP BY isFraud
""")

fraud_balance_stats.show()
fraud_balance_stats.toPandas().to_csv("fraud_balance_stats.csv", index=False)



+-------+-----------------+--------------+------------------------+
|isFraud|transaction_count|mismatch_count|mismatch_rate_percentage|
+-------+-----------------+--------------+------------------------+
|      1|             8213|            45|                  0.5479|
|      0|          6354407|       5077646|                 79.9075|
+-------+-----------------+--------------+------------------------+



In [18]:
#fraud over time 

fraud_by_step = spark.sql("""
SELECT
    step,
    COUNT(*) AS transaction_count,
    SUM(isFraud) AS fraud_count,
    ROUND(SUM(isFraud) / COUNT(*) * 100, 4) AS fraud_rate_percentage
FROM transactions_clean
GROUP BY step
ORDER BY step
""")

fraud_by_step.show()
fraud_by_step.toPandas().to_csv("fraud_by_step_trend.csv", index=False)



+----+-----------------+-----------+---------------------+
|step|transaction_count|fraud_count|fraud_rate_percentage|
+----+-----------------+-----------+---------------------+
|   1|             2708|         16|               0.5908|
|   2|             1014|          8|                0.789|
|   3|              552|          4|               0.7246|
|   4|              565|         10|               1.7699|
|   5|              665|          6|               0.9023|
|   6|             1660|         22|               1.3253|
|   7|             6837|         12|               0.1755|
|   8|            21097|         12|               0.0569|
|   9|            37628|         19|               0.0505|
|  10|            35991|         11|               0.0306|
|  11|            37241|          7|               0.0188|
|  12|            36153|         14|               0.0387|
|  13|            37515|         14|               0.0373|
|  14|            41485|         12|               0.028

In [32]:
# Fraud transactions with same non-zero amount repeated multiple times
fraud_amount_repeats = spark.sql("""
SELECT 
    amount,
    COUNT(*) AS freq
FROM transactions_clean
WHERE isFraud = 1
  AND amount > 0
GROUP BY amount
HAVING COUNT(*) > 1
ORDER BY freq DESC, amount DESC
""")

fraud_amount_repeats.show(20, False)
fraud_amount_repeats.toPandas().to_csv("fraud_amount_repeats.csv", index=False)



+----------+----+
|amount    |freq|
+----------+----+
|1.0E7     |287 |
|1165187.89|4   |
|429257.45 |4   |
|9996886.64|2   |
|9977761.05|2   |
|9960382.4 |2   |
|9887819.06|2   |
|9811104.49|2   |
|9772559.35|2   |
|9749042.95|2   |
|9725837.08|2   |
|9639524.7 |2   |
|9639052.83|2   |
|9593838.63|2   |
|9468064.05|2   |
|9465988.82|2   |
|9453680.72|2   |
|9421856.87|2   |
|9345700.07|2   |
|9301213.46|2   |
+----------+----+
only showing top 20 rows


In [35]:
 # Top Fraud Destinations (Receivers)
fraud_destinations = spark.sql("""
SELECT 
    nameDest,
    COUNT(*) AS fraud_transaction_count,
    SUM(amount) AS fraud_total_amount
FROM transactions_clean
WHERE isFraud = 1
GROUP BY nameDest
ORDER BY fraud_total_amount DESC
LIMIT 10
""")

fraud_destinations.show()
fraud_destinations.toPandas().to_csv("fraud_destinations_by_amount.csv", index=False)




+-----------+-----------------------+------------------+
|   nameDest|fraud_transaction_count|fraud_total_amount|
+-----------+-----------------------+------------------+
| C668046170|                      2|     1.016008868E7|
|C1595458981|                      1|             1.0E7|
|C1622860679|                      1|             1.0E7|
| C103172881|                      1|             1.0E7|
|C1423246212|                      1|             1.0E7|
| C709815552|                      1|             1.0E7|
|C1806199534|                      1|             1.0E7|
|C1270029603|                      1|             1.0E7|
|C2065262017|                      1|             1.0E7|
|C1732619349|                      1|             1.0E7|
+-----------+-----------------------+------------------+



In [24]:
# Flagged vs Actual Fraud 
flag_vs_fraud = spark.sql("""
SELECT
    isFraud,
    isFlaggedFraud,
    COUNT(*) AS transactioon_count
FROM transactions_clean
GROUP BY isFraud, isFlaggedFraud
ORDER BY isFraud DESC, isFlaggedFraud DESC
""")

flag_vs_fraud.show()
flag_vs_fraud.toPandas().to_csv("flag_vs_fraud_analysis.csv", index=False)


+-------+--------------+------------------+
|isFraud|isFlaggedFraud|transactioon_count|
+-------+--------------+------------------+
|      1|             1|                16|
|      1|             0|              8197|
|      0|             0|           6354407|
+-------+--------------+------------------+

