In [1]:
import hsfs
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.types import * 

Starting Spark application


ID,Application ID,Kind,State,Spark UI,Driver log
11,application_1709657904861_0024,pyspark,idle,Link,Link


SparkSession available as 'spark'.


In [2]:
from fraud.features.transactions.transactions import transaction_abroad

# Reference copy of the imported function, shown commented out for readers (the import above is what actually runs):
# def transaction_abroad(issuer_country: pd.Series, transaction_country: pd.Series) -> pd.Series:
#     return (issuer_country != transaction_country).replace({True: 1, False: 0})

In [3]:
# Open an hsfs connection (no arguments are passed here) and fetch a
# feature store handle from it; later cells use the handle as `fs`.
connection = hsfs.connection()
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [4]:
# Raw transactions CSV.
#
# The schema is pinned explicitly instead of using `inferSchema`: inference
# makes Spark read the whole file an extra time just to guess types, and the
# resulting types can drift if the data changes. The columns and types below
# are exactly the ones inference produced for this file (see the
# `printSchema()` output in the next cell), so downstream cells are unaffected.
TRANSACTIONS_SCHEMA = (
    "tid STRING, datetime STRING, cc_num BIGINT, category STRING, "
    "amount DOUBLE, latitude DOUBLE, longitude DOUBLE, city STRING, "
    "country STRING, fraud_label INT"
)

transactions_df = (
    spark.read.format("csv")
    .option("header", "true")  # first line holds column names, not data
    .schema(TRANSACTIONS_SCHEMA)
    .load("hdfs:///Projects/live_coding/RawData/transactions.csv")
)

In [5]:
# Show the column names and types of the loaded CSV.
transactions_df.printSchema()

root
 |-- tid: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- cc_num: long (nullable = true)
 |-- category: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- fraud_label: integer (nullable = true)

In [6]:
# Number of rows in the raw transactions frame.
transactions_df.count()

419120

In [7]:
# Peek at a single row of the raw transactions.
transactions_df.show(1)

+--------------------+-------------------+----------------+-------------+------+--------+---------+------+-------+-----------+
|                 tid|           datetime|          cc_num|     category|amount|latitude|longitude|  city|country|fraud_label|
+--------------------+-------------------+----------------+-------------+------+--------+---------+------+-------+-----------+
|11df919988c134d97...|2022-01-01 00:00:24|4473593503484549|Health/Beauty| 62.95|42.30865|-83.48216|Canton|     US|          0|
+--------------------+-------------------+----------------+-------------+------+--------+---------+------+-------+-----------+
only showing top 1 row

In [8]:
# The profiles feature group (version 1) supplies the country of each
# credit card owner; only the card number and country are read.
profiles_fg = fs.get_feature_group(name="profiles", version=1)
profiles_df = (
    profiles_fg
    .select(["cc_num", "country"])
    .read()
)

In [9]:
# Attach the card owner's country to every transaction as `issuer_country`,
# matching rows on the card number (no `how` given, so Spark's default join
# type applies).
# NOTE(review): this cell rebinds `transactions_df` to its own output, so
# running it a second time joins the already-joined frame again.
transactions_df = transactions_df.join(
    profiles_df.select("cc_num", F.col("country").alias("issuer_country")),
    on="cc_num",
)

In [10]:
# Bare expression: displays the DataFrame repr (column names and types) after the join.
transactions_df

DataFrame[cc_num: bigint, tid: string, datetime: string, category: string, amount: double, latitude: double, longitude: double, city: string, country: string, fraud_label: int, issuer_country: string]

In [11]:
# Wrap the imported pandas function as a Spark pandas UDF whose result
# column is declared as an integer.
transaction_abroad_udf = F.pandas_udf(
    f=transaction_abroad,
    returnType=IntegerType(),
)



In [12]:
# Build the frame that is written to the feature group: the raw columns that
# are kept, plus two derived columns, projected in a single `select`.
df = transactions_df.select(
    "tid",
    # NOTE(review): event_time is the same literal timestamp for every row;
    # the per-transaction `datetime` column is not used here — confirm this
    # is intentional.
    F.to_timestamp(
        F.lit("2024-03-07 00:00:24"), format="yyyy-MM-dd HH:mm:ss"
    ).alias("event_time"),
    "fraud_label",
    "cc_num",
    "amount",
    "category",
    # Computed by the pandas UDF from the card's issuer country and the
    # country of the transaction.
    transaction_abroad_udf("issuer_country", "country").alias(
        "is_transaction_abroad"
    ),
)

In [13]:
# Peek at a single row of the prepared feature frame.
df.show(1)

+--------------------+-------------------+-----------+----------------+------+--------+---------------------+
|                 tid|         event_time|fraud_label|          cc_num|amount|category|is_transaction_abroad|
+--------------------+-------------------+-----------+----------------+------+--------+---------------------+
|6ebf5f57f64179693...|2024-03-07 00:00:24|          0|4115917870347217| 49.44| Grocery|                    0|
+--------------------+-------------------+-----------+----------------+------+--------+---------------------+
only showing top 1 row

In [14]:
# Feature group (version 1) that will hold the prepared transactions.
transactions = fs.get_or_create_feature_group(
    # Identity
    name="transactions",
    version=1,
    description="Credit card transactions with label",
    # Keys: one row per transaction id, timestamped by `event_time`
    primary_key=["tid"],
    event_time="event_time",
    # Declares the profiles feature group as a parent of this one
    parents=[profiles_fg],
    # Storage and statistics options
    online_enabled=True,
    statistics_config={"histograms": True, "correlations": True},
)

In [15]:
# Write the prepared frame into the feature group; the call's return value
# is what the notebook displays below.
transactions.insert(df)

(None, None)