In [None]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=96c381ea40d79616e4fe96e24243c34cd6ab3b72c3e9db111231dff02cca285d
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    avg,
    col,
    count,
    countDistinct,
    dayofmonth,
    hour,
    lag,
    month,
    stddev,
    unix_timestamp,
    when,
    window,
)
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType

# Obtain the notebook's Spark session (an already-running one is reused)
spark = (
    SparkSession.builder
    .appName("FraudDetection")
    .getOrCreate()
)

# Schema of the transaction fact CSV: (column name, Spark type) pairs, every column nullable
transaction_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("transaction_id", StringType()),
        ("user_id", StringType()),
        ("transaction_amount", FloatType()),
        ("transaction_time", TimestampType()),
        ("payment_method", StringType()),
        ("location", StringType()),
        ("fraud_label", StringType()),
    ]
])

# Schema of the user dimension CSV: all columns are nullable strings
user_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("user_id", "user_name", "user_location")
])

# Read transaction data from CSV as a static DataFrame.
# The explicit schema supplies the column names and types; header=True only
# tells the reader that the first line of the file is a header row.
transaction_df = spark.read.schema(transaction_schema).csv("/content/sample_data/transaction_fact.csv", header=True)

# Read user data from CSV as a static DataFrame.
# No rename is needed afterwards: user_schema already names the third column
# "user_location", so a withColumnRenamed("location", ...) would find no such
# column and silently do nothing.
user_dim = spark.read.schema(user_schema).csv("/content/sample_data/user_dim.csv", header=True)

# Data Cleaning: drop fully duplicated rows first, then apply per-column
# replacement values for nulls.
null_replacements = {
    "user_id": "unknown",
    "transaction_amount": 0.0,
    "transaction_time": "1970-01-01 00:00:00",
}
transaction_df = transaction_df.dropDuplicates().fillna(null_replacements)

# Feature Engineering: derive day-of-month, month and hour from the transaction timestamp
transaction_df = (
    transaction_df
    .withColumn("transaction_day", dayofmonth("transaction_time"))
    .withColumn("transaction_month", month("transaction_time"))
    .withColumn("transaction_hour", hour("transaction_time"))
)

# Attach user attributes to each transaction (inner join: transactions whose
# user_id has no match in user_dim are not carried forward)
joined_data = transaction_df.join(user_dim, on="user_id", how="inner")

# Anomalies: a transaction is kept when its amount exceeds 1000 or when it
# happened somewhere other than the user's registered location
is_large_amount = col("transaction_amount") > 1000
is_location_mismatch = col("location") != col("user_location")
anomalies = joined_data.where(is_large_amount | is_location_mismatch)

# Feature 1: per-user mean and standard deviation of the transaction amount
average_transaction = (
    transaction_df
    .groupBy("user_id")
    .agg(
        avg(col("transaction_amount")).alias("avg_transaction_amount"),
        stddev(col("transaction_amount")).alias("stddev_transaction_amount"),
    )
)

# Feature 2: number of transactions each user made in every 1-hour window
transaction_frequency = (
    transaction_df
    .groupBy("user_id", window(col("transaction_time"), "1 hour"))
    .agg(count(col("transaction_id")).alias("transaction_count"))
)

# Feature 3: Flag users who transact from more than one DISTINCT location
# within the same 30-minute window.
#
# Location must be aggregated, not used as a grouping key: grouping by
# (user, window, location) and counting rows flags repeated transactions at the
# SAME location, which is the opposite of a location change.
#
# NOTE: the windows are fixed, non-overlapping 30-minute buckets, so two
# transactions a few minutes apart that straddle a bucket boundary are not
# compared with each other.
suspicious_location_changes = (
    transaction_df
    .groupBy("user_id", window("transaction_time", "30 minutes"))
    .agg(
        # How many different locations the user appeared in during the window
        countDistinct("location").alias("distinct_location_count"),
        # Column name kept from the previous version; it now holds the number
        # of transactions in the window across all locations
        count("transaction_id").alias("location_transaction_count"),
    )
    .filter(col("distinct_location_count") > 1)
)

# Feature 4: lifetime number of transactions per user
user_transaction_count = (
    transaction_df
    .groupBy("user_id")
    .agg(count(col("transaction_id")).alias("total_transactions"))
)

# Distinct ids of users that appear in suspicious_location_changes. Keeping them
# as a DataFrame (instead of collect()-ing rows into a Python list for isin)
# avoids pulling data onto the driver and embedding every id in the query plan.
suspicious_users = (
    suspicious_location_changes
    .select(col("user_id").alias("suspicious_user_id"))
    .distinct()
)

# A transaction is labelled as fraud when ANY of these holds:
#   * its amount exceeds 1000
#   * its location differs from the user's registered location
#   * its user appears in suspicious_location_changes
is_flagged = (
    (col("transaction_amount") > 1000)
    | (col("location") != col("user_location"))
    | col("suspicious_user_id").isNotNull()
)

transaction_df = (
    joined_data
    # Left join keeps every transaction; suspicious_user_id is null for users
    # that were never flagged
    .join(suspicious_users, col("user_id") == col("suspicious_user_id"), "left")
    # fraud_label is declared as a string column, so the override is written as
    # the string "1" rather than an integer; unflagged rows keep their label
    .withColumn("fraud_label", when(is_flagged, "1").otherwise(col("fraud_label")))
    # The helper column was only needed to evaluate the condition
    .drop("suspicious_user_id")
)

# Output
transaction_df.show(truncate=False)


+-------+--------------+------------------+-------------------+--------------+-------------+-----------+---------------+-----------------+----------------+-------------+-------------+
|user_id|transaction_id|transaction_amount|transaction_time   |payment_method|location     |fraud_label|transaction_day|transaction_month|transaction_hour|user_name    |user_location|
+-------+--------------+------------------+-------------------+--------------+-------------+-----------+---------------+-----------------+----------------+-------------+-------------+
|7      |207           |700.0             |2024-08-21 13:50:00|Debit Card    |Seattle      |0          |21             |8                |13              |Grace Green  |Seattle      |
|3      |212           |1200.0            |2024-08-26 11:30:00|Credit Card   |Chicago      |1          |26             |8                |11              |Charlie Brown|Chicago      |
|4      |213           |75.0              |2024-08-27 14:50:00|Debit Card    |Ho