In [0]:
%run ./Setup

In [0]:
# Session-level settings for the "singlestore" data source.
# `cluster` and `password` are not defined in this notebook; presumably they
# come from the ./Setup notebook run above -- TODO confirm.
singlestore_conf = {
    "ddlEndpoint": cluster,
    "user": "admin",
    "password": password,
    # "false" leaves the connector's pushdown switch in its enabled state.
    "disablePushdown": "false",
}
for conf_key, conf_value in singlestore_conf.items():
    spark.conf.set("spark.datasource.singlestore." + conf_key, conf_value)

In [0]:
%%time

df = (spark.read
      .format("singlestore")
      .load("fraud_detection.credit_card_tx"))

In [0]:
# Row count of the table as loaded, before any cleaning or sampling.
df.count()

In [0]:
# Preview the loaded rows. `display` is not defined in this notebook; it is
# presumably supplied by the hosting notebook environment -- TODO confirm.
display(df)

In [0]:
# Discard every row that has a null in any column, then report how many
# rows remain.
df = df.na.drop()
df.count()

In [0]:
# Partition the cleaned rows by label: Class == 1 goes to `is_fraud`,
# Class == 0 goes to `no_fraud`.
# NOTE(review): `is_fraud` is rebound to a label-building callable further
# down, so this DataFrame is only reachable until that cell runs.
is_fraud = df.where("Class == 1")
no_fraud = df.where("Class == 0")

In [0]:
# Down-sample the Class == 0 rows to about 1% (without replacement, fixed seed)
# so they no longer swamp the Class == 1 rows.
# NOTE(review): this rebinds `no_fraud` in place, so re-running the cell
# samples the already-sampled frame again.
no_fraud = no_fraud.sample(withReplacement=False, fraction=0.01, seed=123)

In [0]:
# Stack the down-sampled Class == 0 rows on top of all Class == 1 rows, order
# the result by the "Time" column, and report the size of the rebalanced set.
# From here on `df` refers to this rebalanced frame, not the full table.
df_concat = no_fraud.union(is_fraud)
df = df_concat.orderBy("Time")
df.count()

In [0]:
# Preview the rebalanced, time-ordered frame built in the previous cell.
display(df)

In [0]:
# Split with 70/30 weights using a fixed seed so the partition is repeatable
# for the same input.
train, test = df.randomSplit(weights=[0.7, 0.3], seed=123)

# Materialise both sizes (train first, then test) before printing them.
train_rows = train.count()
test_rows = test.count()
print("train =", train_rows, " test =", test_rows)

In [0]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf

def is_fraud(class_col):
    """Build the double-typed training label from a class Column.

    Evaluates `class_col > 0` inside Spark and casts the boolean result to
    double, giving 1.0 for positive values and 0.0 otherwise. This replaces a
    row-at-a-time Python UDF with a native Column expression, so no rows are
    shipped to a Python worker for the comparison.

    Args:
        class_col: Spark Column holding the raw class value (e.g. `train.Class`).

    Returns:
        A Column of doubles. A null input yields a null label rather than
        raising, as a Python-level `None > 0` comparison would.
    """
    return (class_col > 0).cast("double")


# NOTE(review): this rebinds `is_fraud`, which earlier cells use for the
# DataFrame of Class == 1 rows; re-running the union cell after this one
# will fail because `is_fraud` is no longer a DataFrame.
train = train.withColumn("is_fraud", is_fraud(train.Class))

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# Feature selection: every column of `train` except the time column and the
# two label columns.
non_feature_cols = ("Time", "Class", "is_fraud")
feature_cols = [name for name in train.columns if name not in non_feature_cols]

# Pack the selected columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Logistic regression trained against the "is_fraud" label and writing its
# output to "prediction". The features column is not set here, so the
# estimator's default is used (expected to match "features" -- TODO confirm).
lr = LogisticRegression(
    maxIter=100000,
    labelCol="is_fraud",
    predictionCol="prediction",
)

# Fit the assembler and the classifier together as one pipeline on the
# training split.
pipeline = Pipeline(stages=[assembler, lr])
model = pipeline.fit(train)

In [0]:
# Run the fitted pipeline over the held-out split; the result keeps the test
# rows and adds the pipeline's output columns (including "prediction").
predicted = model.transform(test)

In [0]:
# Preview the scored test rows.
display(predicted)

In [0]:
# Attach the true label to the scored rows using the same `is_fraud` mapping
# that was applied to the training split.
predicted = predicted.withColumn("is_fraud", is_fraud(predicted["Class"]))

# Cross-tabulate true label against predicted label (a confusion matrix) and
# print it.
confusion = predicted.stat.crosstab("is_fraud", "prediction")
confusion.show()

Benefits of the SingleStore Connector:
- Implemented as a native Spark SQL plugin.
- Accelerates ingest from Spark via compression.
- Supports data loading and extraction from database tables and Spark DataFrames.
- Integrates with the Catalyst query optimiser and supports robust SQL Pushdown.
- Accelerates ML workloads.