<img src = "https://github.com/VeryFatBoy/notebooks/blob/main/common/images/img_github_singlestore-jupyter_featured_2.png?raw=true">

<div id="singlestore-header" style="display: flex; background-color: rgba(235, 249, 245, 0.25); padding: 5px;">
    <div id="icon-image" style="width: 90px; height: 90px;">
        <img width="100%" height="100%" src="https://raw.githubusercontent.com/singlestore-labs/spaces-notebooks/master/common/images/header-icons/browser.png" />
    </div>
    <div id="text" style="padding: 5px; margin-left: 10px;">
        <div id="badge" style="display: inline-block; background-color: rgba(0, 0, 0, 0.15); border-radius: 4px; padding: 4px 8px; align-items: center; margin-top: 6px; margin-bottom: -2px; font-size: 80%">SingleStore Notebooks</div>
        <h1 style="font-weight: 500; margin: 8px 0 0 4px;">How to use SingleStore with Spark ML for Fraud Detection</h1>
    </div>
</div>

In [6]:
# Empty pip's local cache before the installs that follow.
!pip cache purge --quiet

[0m

In [7]:
# Install OpenJDK 8 from the conda-forge channel (-y answers prompts automatically).
# NOTE(review): presumably this provides the JVM that Spark needs — confirm.
!conda install -y --quiet -c conda-forge openjdk=8

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [8]:
# Pin PySpark to the Spark release the connector JAR downloaded below is built
# for (singlestore-spark-connector_2.12-4.1.8-spark-3.5.0). An unpinned install
# can pull a newer Spark line that no longer matches that JAR.
!pip install pyspark==3.5.0 --quiet

In [9]:
import os
import pandas as pd
import requests

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from singlestoredb import notebook as nb
from singlestoredb.management import get_secret

In [10]:
# Make sure the local "jars" folder exists; no error if it is already there.
os.makedirs(name = "jars", exist_ok = True)

In [11]:
def download_jar(url, destination, timeout = 60):
    """Download the file at `url` and write it to `destination`.

    Args:
        url: HTTP(S) address of the file to fetch.
        destination: Local path the response body is written to, in binary mode.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout = timeout)
    # Fail loudly rather than saving an error page under a .jar file name.
    response.raise_for_status()
    with open(destination, "wb") as f:
        f.write(response.content)

# (download URL, local path) pairs, one per JAR handed to the Spark session.
jar_urls = [
    (
        "https://repo1.maven.org/maven2/com/singlestore/singlestore-jdbc-client/1.2.4/singlestore-jdbc-client-1.2.4.jar",
        "jars/singlestore-jdbc-client-1.2.4.jar",
    ),
    (
        "https://repo1.maven.org/maven2/com/singlestore/singlestore-spark-connector_2.12/4.1.8-spark-3.5.0/singlestore-spark-connector_2.12-4.1.8-spark-3.5.0.jar",
        "jars/singlestore-spark-connector_2.12-4.1.8-spark-3.5.0.jar",
    ),
    (
        "https://repo1.maven.org/maven2/org/apache/commons/commons-dbcp2/2.12.0/commons-dbcp2-2.12.0.jar",
        "jars/commons-dbcp2-2.12.0.jar",
    ),
    (
        "https://repo1.maven.org/maven2/org/apache/commons/commons-pool2/2.12.0/commons-pool2-2.12.0.jar",
        "jars/commons-pool2-2.12.0.jar",
    ),
    (
        "https://repo1.maven.org/maven2/io/spray/spray-json_3/1.3.6/spray-json_3-1.3.6.jar",
        "jars/spray-json_3-1.3.6.jar",
    ),
]

# Fetch every JAR in list order.
for url, destination in jar_urls:
    download_jar(url = url, destination = destination)

print("JAR files downloaded successfully")

JAR files downloaded successfully


In [12]:
# Comma-separated list of the local JAR paths for Spark to load.
spark_jars = ",".join(jar_path for _, jar_path in jar_urls)

# Start the Spark session (or reuse one that already exists) with those JARs.
spark = (
    SparkSession.builder
    .config("spark.jars", spark_jars)
    .appName("Spark Fraud Detection")
    .getOrCreate()
)

# Restrict Spark's log output to errors.
spark.sparkContext.setLogLevel("ERROR")

24/10/13 12:18:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


<div class="alert alert-block alert-warning">
    <b class="fa fa-solid fa-exclamation-circle"></b>
    <div>
        <p><b>Action Required</b></p>
        <p>Select the database from the drop-down menu at the top of this notebook. Doing so updates the <b>connection_url</b> variable, which SQLAlchemy uses to connect to the selected database.</p>
    </div>
</div>

In [13]:
# Import only the name that is used; a wildcard import would pull every public
# SQLAlchemy name into the notebook namespace.
from sqlalchemy import create_engine

# `connection_url` is not defined in this notebook's code; the notice in the
# preceding cell says it is set when a database is selected from the drop-down.
db_connection = create_engine(connection_url)
# Parsed URL object; its host and port are read in the next cell.
url = db_connection.url

In [14]:
# Database password, fetched from the secret named "password".
password = get_secret("password")

# Endpoint in "host:port" form, taken from the parsed connection URL.
host, port = url.host, url.port
cluster = ":".join([host, str(port)])

In [15]:
# SingleStore connector settings, applied to the running session in this order.
singlestore_options = {
    "spark.datasource.singlestore.ddlEndpoint": cluster,
    "spark.datasource.singlestore.user": "admin",
    "spark.datasource.singlestore.password": password,
    "spark.datasource.singlestore.disablePushdown": "false",
}

for option_name, option_value in singlestore_options.items():
    spark.conf.set(option_name, option_value)

In [16]:
# Read the credit_card_tx table of the fraud_detection database via the
# "singlestore" data source.
singlestore_reader = spark.read.format("singlestore")
df = singlestore_reader.load("fraud_detection.credit_card_tx")

In [17]:
# Number of rows in the loaded DataFrame.
df.count()

                                                                                

284807

In [18]:
# Drop rows that contain null values, then re-count to see how many remain.
df = df.na.drop()
df.count()

                                                                                

284807

In [19]:
# Separate the rows by label: Class 1 goes to is_fraud, Class 0 to no_fraud.
is_fraud = df.where("Class == 1")
no_fraud = df.where("Class == 0")

In [20]:
# Keep roughly 1% of the Class 0 rows, sampled without replacement with a fixed seed.
# NOTE: the result is assigned back to `no_fraud`, so re-running this cell
# samples the already-reduced set again.
no_fraud = no_fraud.sample(withReplacement = False, fraction = 0.01, seed = 123)

In [21]:
# Recombine the sampled Class 0 rows with all Class 1 rows, order by Time,
# and show the new row count.
df_concat = no_fraud.union(is_fraud)
df = df_concat.orderBy("Time")
df.count()

                                                                                

3274

In [22]:
# Preview the first five rows as a pandas DataFrame.
df.limit(5).toPandas()

                                                                                

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,313.0,-1.038356,0.756122,0.719942,0.988402,-0.23729,-0.378862,0.413668,0.436787,-0.448178,...,0.18955,0.477872,0.298492,0.029368,-0.30042,-0.2941,-0.009905,0.038861,68.0,0
1,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276,0.0,1
2,427.0,-0.856567,0.842156,1.716677,-0.016178,1.054486,-0.440338,1.463574,-0.378757,-0.728544,...,0.049032,0.297741,-0.456502,0.197724,0.610523,-0.420531,-0.458258,-0.381306,12.98,0
3,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.0,1
4,553.0,1.189482,0.237247,0.24498,1.081824,-0.165736,-0.515929,0.090389,-0.047797,0.107288,...,0.048803,0.168335,-0.064208,0.06346,0.583806,-0.296477,0.023572,0.016121,10.08,0


In [23]:
# Cache the DataFrame, then split it 70/30 into train and test sets with a
# fixed seed.
cached_df = df.cache()
train, test = cached_df.randomSplit(weights = [0.7, 0.3], seed = 123)

print("train =", train.count(), " test =", test.count())



train = 2276  test = 998


                                                                                

In [24]:
def is_fraud(fraud):
    """Return a double Column: 1.0 where `fraud` > 0, otherwise 0.0.

    Uses a native Column comparison and cast instead of a Python UDF, so the
    label is computed by Spark itself rather than by a Python lambda called
    per row.

    Args:
        fraud: A Column, or a column name given as a string.

    Returns:
        A Column of DoubleType. A null input yields a null output.
    """
    if isinstance(fraud, str):
        # Accept a column name as well, as a UDF call would.
        from pyspark.sql.functions import col
        fraud = col(fraud)
    return (fraud > 0).cast(DoubleType())

# NOTE: this rebinds `is_fraud`, which an earlier cell used for the DataFrame
# of Class 1 rows; re-running that earlier union cell after this one will fail.
train = train.withColumn("is_fraud", is_fraud(train.Class))

In [25]:
# Model inputs: every column except Time and the two label columns.
excluded_cols = ["Time", "Class", "is_fraud"]
feature_cols = [name for name in train.columns if name not in excluded_cols]

# Pack the input columns into a single "features" vector column.
assembler = VectorAssembler(inputCols = feature_cols, outputCol = "features")

# Logistic regression on the is_fraud label.
lr = LogisticRegression(
    maxIter = 100000,
    labelCol = "is_fraud",
    predictionCol = "prediction"
)

# Fit both stages on the training set.
pipeline = Pipeline(stages = [assembler, lr])
model = pipeline.fit(train)

                                                                                

In [26]:
# Run the fitted pipeline over the test set.
predicted = model.transform(test)

In [27]:
# Preview the first five prediction rows as a pandas DataFrame.
predicted.limit(5).toPandas()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V25,V26,V27,V28,Amount,Class,features,rawPrediction,probability,prediction
0,427.0,-0.856567,0.842156,1.716677,-0.016178,1.054486,-0.440338,1.463574,-0.378757,-0.728544,...,0.610523,-0.420531,-0.458258,-0.381306,12.98,0,"[-0.856567135357546, 0.842155881785836, 1.7166...","[2.640307465008142, -2.640307465008142]","[0.9334110774094962, 0.06658892259050375]",0.0
1,619.0,1.121028,-0.626311,0.105123,0.634477,1.219799,4.700368,-1.488651,1.287611,1.498948,...,0.700225,-0.264955,0.102378,0.033509,24.9,0,"[1.12102817333862, -0.626310530655786, 0.10512...","[5.2697359893960325, -5.2697359893960325]","[0.994881366489263, 0.005118633510737047]",0.0
2,704.0,-1.246853,0.577702,1.790394,0.621528,0.847138,-0.630923,0.271991,-0.278619,-0.305083,...,-0.187873,0.249143,-0.381316,-0.148349,1.29,0,"[-1.24685277185499, 0.577701703273181, 1.79039...","[4.133631313458445, -4.133631313458445]","[0.9842281544857183, 0.01577184551428168]",0.0
3,1268.0,-2.581234,0.668009,-0.727711,-1.970741,-0.334986,-1.194353,-1.277241,-1.856832,0.378789,...,-0.228015,-0.125798,0.283612,-0.022305,1.0,0,"[-2.58123387829892, 0.668009392175487, -0.7277...","[7.1160099797869085, -7.1160099797869085]","[0.9991886586160917, 0.0008113413839082595]",0.0
4,1298.0,-4.22088,-5.414581,2.598473,-0.517059,3.675051,-2.11317,-3.999805,0.978199,0.222774,...,0.847367,-0.081219,0.134231,0.208687,52.52,0,"[-4.22088012284511, -5.4145811480233, 2.598472...","[6.281385340958, -6.281385340958]","[0.998132686291225, 0.0018673137087750202]",0.0


In [28]:
# Add the is_fraud label to the predictions, then tabulate label vs. prediction.
predicted = predicted.withColumn("is_fraud", is_fraud(predicted["Class"]))
predicted.stat.crosstab("is_fraud", "prediction").show()



+-------------------+---+---+
|is_fraud_prediction|0.0|1.0|
+-------------------+---+---+
|                1.0| 14|125|
|                0.0|856|  3|
+-------------------+---+---+



                                                                                

In [31]:
# Stop the Spark session now that the analysis is finished.
spark.stop()