In [2]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=d7a8a2180b5b314a67c11ad204850fec7c2c8a9007a4e66b36a39a46027852e1
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType
from pyspark.sql.functions import col, unix_timestamp, lag, avg, count, month, dayofmonth, hour, when

# Start a Spark session named "FraudDetection", or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("FraudDetection")
    .getOrCreate()
)

# Explicit schema for the user CSV: four nullable string columns
user_schema = (
    StructType()
    .add("user_id", StringType(), True)
    .add("user_name", StringType(), True)
    .add("location", StringType(), True)
    .add("age_group", StringType(), True)
)

# Explicit schema for the transaction CSV; every column is nullable.
# The amount is read as a float, the time as a timestamp, and the rest as strings.
transaction_schema = (
    StructType()
    .add("transaction_id", StringType(), True)
    .add("user_id", StringType(), True)
    .add("transaction_amount", FloatType(), True)
    .add("transaction_time", TimestampType(), True)
    .add("location", StringType(), True)
    .add("fraud_label", StringType(), True)
)

# Locations of the two CSV extracts
users_csv_path = "/content/sample_data/user_dim.csv"
transactions_csv_path = "/content/sample_data/transaction_data.csv"

# Read both files with the explicit schemas; header=True treats the first line
# of each file as column names instead of data
user_dim = spark.read.csv(users_csv_path, header=True, schema=user_schema)
transaction_fact = spark.read.csv(transactions_csv_path, header=True, schema=transaction_schema)

# Data cleaning for users: drop exact duplicate rows, then any row that has a null
user_dim = user_dim.dropDuplicates().na.drop()

# Data cleaning for transactions: same de-duplication and null removal,
# then keep only rows whose amount is zero or positive
transaction_fact = (
    transaction_fact
    .dropDuplicates()
    .na.drop()
    .filter(col("transaction_amount") >= 0)
)

# Feature engineering, step 1: split the transaction timestamp into
# day-of-month, month and hour columns
transaction_fact = (
    transaction_fact
    .withColumn("transaction_day", dayofmonth("transaction_time"))
    .withColumn("transaction_month", month("transaction_time"))
    .withColumn("transaction_hour", hour("transaction_time"))
)

# 2 & 3. Total transactions and average transaction amount per user.
# Both statistics share the same grouping key, so they are computed in a single
# aggregation and attached with a single join (one shuffle instead of two).
user_stats = transaction_fact.groupBy("user_id").agg(
    count("transaction_id").alias("total_transactions"),
    avg("transaction_amount").alias("avg_transaction_amount"),
)

# Keep the original per-statistic DataFrame names available for any later cell
# that refers to them; these are lazy projections and cost nothing until used.
total_transactions = user_stats.select("user_id", "total_transactions")
average_transaction = user_stats.select("user_id", "avg_transaction_amount")

# A left join keeps every transaction row; the per-user values are repeated on each of
# that user's rows, in the order total_transactions, avg_transaction_amount.
transaction_fact = transaction_fact.join(user_stats, "user_id", "left")

# 4. Flagging suspicious transactions
# Thresholds are named so they can be tuned in one place.
HIGH_AMOUNT_THRESHOLD = 10000   # a single transaction above this amount is flagged
HIGH_ACTIVITY_THRESHOLD = 5     # users with more transactions than this are flagged

# A row is flagged (1) when its own amount exceeds the amount threshold OR when the
# user's total transaction count exceeds the activity threshold; otherwise it is 0.
# Because total_transactions is a per-user value repeated on each row, the second
# rule flags every transaction of a high-activity user.
exceeds_amount = col("transaction_amount") > HIGH_AMOUNT_THRESHOLD
exceeds_activity = col("total_transactions") > HIGH_ACTIVITY_THRESHOLD
transaction_fact = transaction_fact.withColumn(
    "is_suspicious",
    when(exceeds_amount | exceeds_activity, 1).otherwise(0),
)

# Print up to 20 rows of the enriched transactions without truncating cell values.
# This shows all rows, not only those flagged as suspicious.
transaction_fact.show(n=20, truncate=False)


+-------+--------------+------------------+-------------------+-----------+-----------+---------------+-----------------+----------------+------------------+----------------------+-------------+
|user_id|transaction_id|transaction_amount|transaction_time   |location   |fraud_label|transaction_day|transaction_month|transaction_hour|total_transactions|avg_transaction_amount|is_suspicious|
+-------+--------------+------------------+-------------------+-----------+-----------+---------------+-----------------+----------------+------------------+----------------------+-------------+
|2      |103           |15000.0           |2024-08-28 11:00:00|Los Angeles|1          |28             |8                |11              |1                 |15000.0               |1            |
|1      |101           |5000.0            |2024-08-28 10:15:00|New York   |0          |28             |8                |10              |2                 |8500.0                |0            |
|4      |105           |7