In [1]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!pip install -q pyspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["PATH"] += ":/usr/lib/jvm/java-11-openjdk-amd64/bin"

In [3]:
import pyspark
sc = pyspark.SparkContext(appName="FraudDetection")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/17 16:33:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("FraudDetection") \
    .getOrCreate()

spark.sparkContext.getConf().getAll()
spark

In [5]:
import numpy as np 
import pandas as pd 
import json
import matplotlib.pyplot as plt
import warnings
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum

warnings.filterwarnings("ignore")
data_path='/kaggle/input/transactions-fraud-datasets'

In [6]:
with open(f'{data_path}/train_fraud_labels.json', 'r') as file:
    train_fraud_labels = json.load(file)
target_dict = {int(k):v for k,v in train_fraud_labels['target'].items()}

# Convert target_dict to PySpark DataFrame
target_df = spark.createDataFrame(target_dict.items(), ["id", "target"])

In [7]:
target_df.count() 

25/04/17 16:35:32 WARN TaskSetManager: Stage 0 contains a task of very large size (26183 KiB). The maximum recommended task size is 1000 KiB.
                                                                                                    

8914963

In [8]:
# Read CSV as PySpark DataFrame
transactions_df = spark.read.csv(f'{data_path}/transactions_data.csv', header=True, inferSchema=True)
print(transactions_df.count())
transactions_df.select([_sum(col(c).isNull().cast("int")).alias(c) for c in transactions_df.columns]).show()

                                                                                                    

13305915




+---+----+---------+-------+------+--------+-----------+-------------+--------------+-------+---+--------+
| id|date|client_id|card_id|amount|use_chip|merchant_id|merchant_city|merchant_state|    zip|mcc|  errors|
+---+----+---------+-------+------+--------+-----------+-------------+--------------+-------+---+--------+
|  0|   0|        0|      0|     0|       0|          0|            0|       1563700|1652706|  0|13094522|
+---+----+---------+-------+------+--------+-----------+-------------+--------------+-------+---+--------+



                                                                                                    

In [9]:
transactions_df = transactions_df.fillna({'errors': 'No Error'})

# Drop unnecessary columns
transactions_df = transactions_df.drop('merchant_state', 'zip')

# Join transactions_df with target_df on 'id' column
transactions_df = transactions_df.join(target_df, on="id", how="inner")

In [10]:
print(transactions_df.count())
transactions_df.select([_sum(col(c).isNull().cast("int")).alias(c) for c in transactions_df.columns]).show()

25/04/17 16:36:50 WARN TaskSetManager: Stage 12 contains a task of very large size (26183 KiB). The maximum recommended task size is 1000 KiB.
                                                                                                    

8914963


25/04/17 16:37:32 WARN TaskSetManager: Stage 21 contains a task of very large size (26183 KiB). The maximum recommended task size is 1000 KiB.

+---+----+---------+-------+------+--------+-----------+-------------+---+------+------+
| id|date|client_id|card_id|amount|use_chip|merchant_id|merchant_city|mcc|errors|target|
+---+----+---------+-------+------+--------+-----------+-------------+---+------+------+
|  0|   0|        0|      0|     0|       0|          0|            0|  0|     0|     0|
+---+----+---------+-------+------+--------+-----------+-------------+---+------+------+



                                                                                                    

In [11]:
transactions_df.limit(100).toPandas()

25/04/17 16:38:19 WARN TaskSetManager: Stage 30 contains a task of very large size (26183 KiB). The maximum recommended task size is 1000 KiB.
                                                                                                    

Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,mcc,errors,target
0,7475341,2010-01-01 00:27:00,1797,1127,$43.33,Swipe Transaction,33326,Kahului,4121,No Error,No
1,7475347,2010-01-01 00:36:00,114,3398,$-64.00,Swipe Transaction,61195,North Hollywood,5541,No Error,No
2,7475378,2010-01-01 01:19:00,1575,2112,$17.14,Swipe Transaction,29232,Osprey,4121,No Error,No
3,7475398,2010-01-01 01:53:00,63,60,$30.69,Swipe Transaction,32175,Newark,7538,No Error,No
4,7475424,2010-01-01 02:15:00,1142,4674,$3.68,Swipe Transaction,59397,Cohasset,5812,No Error,No
...,...,...,...,...,...,...,...,...,...,...,...
95,7477442,2010-01-01 13:01:00,921,4646,$2.48,Swipe Transaction,88646,Scottsville,5812,No Error,No
96,7477479,2010-01-01 13:09:00,639,129,$16.82,Swipe Transaction,69034,Republic,5411,No Error,No
97,7477498,2010-01-01 13:13:00,89,2998,$6.98,Swipe Transaction,61195,San Miguel,5541,No Error,No
98,7477499,2010-01-01 13:13:00,90,1100,$73.81,Swipe Transaction,61195,Cocoa,5541,No Error,No


In [14]:
# Load CSV as Spark DataFrame
users_df = spark.read.csv(f"{data_path}/users_data.csv", header=True, inferSchema=True)

# Rename column 'id' to 'client_id'
users_df = users_df.withColumnRenamed("id", "client_id")

# Show DataFrame info
print("Number of rows:", users_df.count())
users_df.select([_sum(col(c).isNull().cast("int")).alias(c) for c in users_df.columns]).show()

Number of rows: 2000
+---------+-----------+--------------+----------+-----------+------+-------+--------+---------+-----------------+-------------+----------+------------+----------------+
|client_id|current_age|retirement_age|birth_year|birth_month|gender|address|latitude|longitude|per_capita_income|yearly_income|total_debt|credit_score|num_credit_cards|
+---------+-----------+--------------+----------+-----------+------+-------+--------+---------+-----------------+-------------+----------+------------+----------------+
|        0|          0|             0|         0|          0|     0|      0|       0|        0|                0|            0|         0|           0|               0|
+---------+-----------+--------------+----------+-----------+------+-------+--------+---------+-----------------+-------------+----------+------------+----------------+



In [15]:
users_df = users_df.join(transactions_df.select("client_id").distinct(), on="client_id", how="left_semi")
# Show results
print("Number of rows:", users_df.count())
users_df.printSchema()

25/04/17 16:38:48 WARN TaskSetManager: Stage 44 contains a task of very large size (26183 KiB). The maximum recommended task size is 1000 KiB.
25/04/17 16:38:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:38:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:38:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:38:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:38:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:38:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:39:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:39:00 WARN RowBasedKeyValu

Number of rows: 1219
root
 |-- client_id: integer (nullable = true)
 |-- current_age: integer (nullable = true)
 |-- retirement_age: integer (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- birth_month: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- address: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- per_capita_income: string (nullable = true)
 |-- yearly_income: string (nullable = true)
 |-- total_debt: string (nullable = true)
 |-- credit_score: integer (nullable = true)
 |-- num_credit_cards: integer (nullable = true)



In [16]:
# Join transactions and users on client_id
users_transactions_df = transactions_df.join(users_df, on='client_id', how='inner')


# Show schema and data types (like info in pandas)
users_transactions_df.printSchema()

# Optionally show row count
print("Number of rows:", users_transactions_df.count())

root
 |-- client_id: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- card_id: integer (nullable = true)
 |-- amount: string (nullable = true)
 |-- use_chip: string (nullable = true)
 |-- merchant_id: integer (nullable = true)
 |-- merchant_city: string (nullable = true)
 |-- mcc: integer (nullable = true)
 |-- errors: string (nullable = false)
 |-- target: string (nullable = true)
 |-- current_age: integer (nullable = true)
 |-- retirement_age: integer (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- birth_month: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- address: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- per_capita_income: string (nullable = true)
 |-- yearly_income: string (nullable = true)
 |-- total_debt: string (nullable = true)
 |-- credit_score: integer (nullable = true)
 |-- num_credit_cards: integer (nullable =

25/04/17 16:39:21 WARN TaskSetManager: Stage 58 contains a task of very large size (26183 KiB). The maximum recommended task size is 1000 KiB.
25/04/17 16:39:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:39:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:39:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:39:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:39:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:39:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:39:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:39:32 WARN RowBasedKeyValu

Number of rows: 8914963


                                                                                                    

In [17]:
# Load CSV as Spark DataFrame
cards_df = spark.read.csv(f"{data_path}/cards_data.csv", header=True, inferSchema=True)

# Rename column 'id' to 'card_id'
cards_df = cards_df.withColumnRenamed("id", "card_id")

# Show DataFrame info
print("Number of rows:", cards_df.count())
cards_df.select([_sum(col(c).isNull().cast("int")).alias(c) for c in cards_df.columns]).show()

Number of rows: 6146
+-------+---------+----------+---------+-----------+-------+---+--------+----------------+------------+--------------+---------------------+----------------+
|card_id|client_id|card_brand|card_type|card_number|expires|cvv|has_chip|num_cards_issued|credit_limit|acct_open_date|year_pin_last_changed|card_on_dark_web|
+-------+---------+----------+---------+-----------+-------+---+--------+----------------+------------+--------------+---------------------+----------------+
|      0|        0|         0|        0|          0|      0|  0|       0|               0|           0|             0|                    0|               0|
+-------+---------+----------+---------+-----------+-------+---+--------+----------------+------------+--------------+---------------------+----------------+



In [18]:
cards_df = cards_df.join(users_df.select("client_id").distinct(), on="client_id", how="left_semi")
# Show results
print("Number of rows:", cards_df.count())
users_df.printSchema()

25/04/17 16:40:05 WARN TaskSetManager: Stage 86 contains a task of very large size (26183 KiB). The maximum recommended task size is 1000 KiB.
25/04/17 16:40:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:40:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:40:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:40:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:40:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:40:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:40:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:40:16 WARN RowBasedKeyValu

Number of rows: 4514
root
 |-- client_id: integer (nullable = true)
 |-- current_age: integer (nullable = true)
 |-- retirement_age: integer (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- birth_month: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- address: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- per_capita_income: string (nullable = true)
 |-- yearly_income: string (nullable = true)
 |-- total_debt: string (nullable = true)
 |-- credit_score: integer (nullable = true)
 |-- num_credit_cards: integer (nullable = true)



In [20]:
output_path = "/kaggle/working"

# Save the Data to HDFS
users_df.write.csv(f"{output_path}/users.csv", header=True, mode="overwrite")
cards_df.write.csv(f"{output_path}/cards.csv", header=True, mode="overwrite")
transactions_df.write.csv(f"{output_path}/transactions.csv", header=True, mode="overwrite")

25/04/17 16:40:58 WARN TaskSetManager: Stage 101 contains a task of very large size (26183 KiB). The maximum recommended task size is 1000 KiB.
25/04/17 16:41:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:41:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:41:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:41:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:41:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:41:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:41:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/17 16:41:09 WARN RowBasedKeyVal

In [25]:
# Ensure saving to HDFS 
test_df = spark.read.csv(f"{output_path}/transactions.csv", header=True, inferSchema=True)
test_df.count()

                                                                                                    

8914963