In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col

# Credit-card fraud detection pipeline:
#   load CSV -> encode label -> assemble features -> undersample the majority
#   class -> train a decision tree -> report area under the ROC curve.

# Initialize Spark Session
spark = SparkSession.builder.appName("CreditCardFraudDetection").getOrCreate()

# One seed shared by every random step so that a rerun reproduces the same
# undersample, the same train/test split and therefore the same AUC.
SEED = 42

# Load dataset
DATA_PATH = "dbfs:/FileStore/shared_uploads/vidhikarayate@gmail.com/creditcard.csv"
df = spark.read.csv(DATA_PATH, header=True, inferSchema=True)

# Display first few rows
df.show(5)
df.printSchema()

# Check class imbalance (Count fraud vs non-fraud cases)
df.groupBy("Class").count().show()

# Convert class label to "label" column (Required by Spark ML).
# StringIndexer's default ordering ("frequencyDesc") gives index 0.0 to the
# most frequent value, so fraud would only be 1.0 while it stays the minority
# class. Alphabetical ordering pins the mapping independently of class counts.
# NOTE(review): assumes "Class" holds 0 (non-fraud) / 1 (fraud), as the
# label filters below imply - confirm against the dataset.
indexer = StringIndexer(
    inputCol="Class",
    outputCol="label",
    stringOrderType="alphabetAsc",
)
df = indexer.fit(df).transform(df)

# Select features (excluding "Class" and the derived "label").
# The loop variable is deliberately not called `col`, so it cannot be confused
# with the imported pyspark.sql.functions.col used further down.
feature_columns = [name for name in df.columns if name not in ("Class", "label")]

# Assemble features into a single feature vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df).select("features", "label")

# Handle class imbalance using **Undersampling**
fraud_cases = df.filter(col("label") == 1.0)
non_fraud_cases = df.filter(col("label") == 0.0)

# Get the size of each class (each count triggers a Spark job, so do it once)
fraud_count = fraud_cases.count()
non_fraud_count = non_fraud_cases.count()

# Fail early with a clear message instead of a ZeroDivisionError below or an
# opaque training failure on an empty / single-class dataset.
if fraud_count == 0 or non_fraud_count == 0:
    raise ValueError(
        "Both classes must be present to build a balanced dataset "
        f"(fraud={fraud_count}, non-fraud={non_fraud_count})"
    )

# Undersample the majority class (non-fraud) to roughly match fraud cases.
# The fraction is capped at 1.0 because sampling without replacement cannot
# return more rows than exist. DataFrame.sample is approximate: the resulting
# row count is close to, not exactly, fraud_count.
sampling_fraction = min(1.0, fraud_count / non_fraud_count)
non_fraud_sample = non_fraud_cases.sample(
    withReplacement=False,
    fraction=sampling_fraction,
    seed=SEED,
)

# Combine fraud and undersampled non-fraud cases.
# Cached because it is reused by show(), the split, training and scoring;
# without the cache each action would re-read and re-sample the full CSV.
balanced_df = fraud_cases.union(non_fraud_sample).cache()
balanced_df.show(2)

# Split data into training and testing sets (80% train, 20% test)
train_data, test_data = balanced_df.randomSplit([0.8, 0.2], seed=SEED)

# Train Decision Tree model
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
model = dt.fit(train_data)

# Print the decision tree structure
print("Learned classification tree model:")
print(model.toDebugString)

# Make predictions on test data
predictions = model.transform(test_data)

# Display sample predictions
predictions.select("features", "label", "prediction").show(5)

# Evaluate using AUC (Better for fraud detection)
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)

# Print AUC Score
print(f"AUC Score: {auc:.2f}")

+----+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+
|Time|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|                V22|     