In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
spark = SparkSession.builder.appName("ML lib").getOrCreate()
spark

In [None]:
import os, sys
from os.path import isfile, join

loc = os.path.abspath("")
data_path = join(loc, 'ignoreFiles','Synthetic Financial Datasets.csv')
data_path

In [5]:
# Dataset https://www.kaggle.com/ntnu-testimon/paysim1/data
df = spark.read.csv(data_path, inferSchema=True, header= True).sample(withReplacement=True, fraction=0.1, seed=0)
df.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)



In [6]:
df.limit(10).show()

+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT|  3099.97| C249177573|      20771.0|      17671.03|M2096539129|           0.0|           0.0|      0|             0|
|   1| PAYMENT|  4098.78|C1026483832|     503264.0|     499165.22|M1635378213|           0.0|           0.0|      0|             0|
|   1| PAYMENT|  1157.86|C1237762639|      21156.0|      19998.14|M1877062907|           0.0|           0.0|      0|             0|
|   1|TRANSFER| 215310.3|C1670993182|        705.0|           0.0|C1100439041|       22425.0|           0.0|      0|             0|
|   1| PAYMENT|  6444.64|C1262609629|      12019.0|       5574.36| M58718031

In [7]:
df = df.select('type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'isFraud') # the df using, 'isFraud' is the label
df.limit(10).show()

+--------+---------+-------------+--------------+-------+
|    type|   amount|oldbalanceOrg|newbalanceOrig|isFraud|
+--------+---------+-------------+--------------+-------+
| PAYMENT|  3099.97|      20771.0|      17671.03|      0|
| PAYMENT|  4098.78|     503264.0|     499165.22|      0|
| PAYMENT|  1157.86|      21156.0|      19998.14|      0|
|TRANSFER| 215310.3|        705.0|           0.0|      0|
| PAYMENT|  6444.64|      12019.0|       5574.36|      0|
| PAYMENT|  2998.04|      12030.0|       9031.96|      0|
|TRANSFER|358831.92|          0.0|           0.0|      0|
| PAYMENT|   661.43|      14078.0|      13416.57|      0|
|TRANSFER|679502.24|        290.0|           0.0|      0|
| PAYMENT|   8550.9|      40060.0|       31509.1|      0|
+--------+---------+-------------+--------------+-------+



In [8]:
# Train/Test split
train, test = df.randomSplit([0.7,0.3], seed = 7)

In [9]:
print(train.count()) # cost 1.3 min on my local machine for 6 million data tuples

445270


In [10]:
train.dtypes

[('type', 'string'),
 ('amount', 'double'),
 ('oldbalanceOrg', 'double'),
 ('newbalanceOrig', 'double'),
 ('isFraud', 'int')]

In [11]:
# category features waiting to encoding
col_cat = [x[0] for x in train.dtypes if x[1] == 'string']
col_cat

['type']

In [12]:
# numeric features
col_num = [x[0] for x in train.dtypes if ((x[1] == 'double') & (x[0] != 'isFraud'))]
col_num

['amount', 'oldbalanceOrg', 'newbalanceOrig']

In [13]:
# OneHot Encoding
# train data in a nutshell
train.groupBy(F.col('type')).agg(F.count(F.col('type')), 
                                 F.avg(F.col('oldbalanceOrg')),
                                 F.avg(F.col('newbalanceOrig'))
                                 ).show()

+--------+-----------+------------------+-------------------+
|    type|count(type)|avg(oldbalanceOrg)|avg(newbalanceOrig)|
+--------+-----------+------------------+-------------------+
|TRANSFER|      37221| 55395.47344402353|  10964.96704333575|
| CASH_IN|      97944|3624019.4883186365| 3793578.2936142087|
|CASH_OUT|     156412| 46732.76179890292|  17636.13890845972|
| PAYMENT|     150753|  69400.7541342461|  62998.74784448732|
|   DEBIT|       2940|  65146.0245782313|  61939.99417346938|
+--------+-----------+------------------+-------------------+



In [14]:
# MLib import

from pyspark.ml.feature import OneHotEncoder, StringIndexer

In [15]:
help(StringIndexer)

Help on class StringIndexer in module pyspark.ml.feature:

class StringIndexer(pyspark.ml.wrapper.JavaEstimator, _StringIndexerParams, pyspark.ml.util.JavaMLReadable, pyspark.ml.util.JavaMLWritable)
 |  StringIndexer(inputCol=None, outputCol=None, inputCols=None, outputCols=None, handleInvalid='error', stringOrderType='frequencyDesc')
 |  
 |  A label indexer that maps a string column of labels to an ML column of label indices.
 |  If the input column is numeric, we cast it to string and index the string values.
 |  The indices are in [0, numLabels). By default, this is ordered by label frequencies
 |  so the most frequent label gets index 0. The ordering behavior is controlled by
 |  setting :py:attr:`stringOrderType`. Its default value is 'frequencyDesc'.
 |  
 |  >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed",
 |  ...     stringOrderType="frequencyDesc")
 |  >>> stringIndexer.setHandleInvalid("error")
 |  StringIndexer...
 |  >>> model = stringIndexer.fit(st

In [16]:
# a general way to encode(multi features making at once)
string_indexer = [
    StringIndexer(inputCol=x, outputCol=x+'_StringIndexer', handleInvalid= "skip")
    for x in col_cat
] # comperhension expression

In [17]:
on_hot_encoder = [
    OneHotEncoder(
        inputCols=[f"{x}_StringIndexer" for x in col_cat],
        outputCols=[f"{x}_OneHotEncoder" for x in col_cat]
    )
]
on_hot_encoder

[OneHotEncoder_318ab543bb7f]

In [18]:
# vector assembler -- spark ML only accept vector inputs -- which mean need another data convert
from pyspark.ml.feature import VectorAssembler

In [19]:
assembler_input = [x for x in col_num]
assembler_input += [f"{x}_OneHotEncoder" for x in col_cat]
assembler_input

['amount', 'oldbalanceOrg', 'newbalanceOrig', 'type_OneHotEncoder']

In [20]:
vector_assembler = VectorAssembler(
    inputCols=assembler_input, outputCol="VectorAssembly_features"
)
vector_assembler

VectorAssembler_e60cd0add28f

In [21]:
# more OneHotEncoder

df_example = spark.createDataFrame([
    (0.0, 'x'),
    (1.0, 'y'),
    (2.0, 'z'),
    (3.0, 'o'),
    (4.0, 'p'),
    (2.0, 'z')
], ["categoryIndex", "value"])

encoder = OneHotEncoder(inputCols=["categoryIndex"],
                        outputCols=["categoryVec"])
model = encoder.fit(df_example)
encoded = model.transform(df_example)
encoded.show() # to interpret refer to RDD data type "Local vector"

+-------------+-----+-------------+
|categoryIndex|value|  categoryVec|
+-------------+-----+-------------+
|          0.0|    x|(4,[0],[1.0])|
|          1.0|    y|(4,[1],[1.0])|
|          2.0|    z|(4,[2],[1.0])|
|          3.0|    o|(4,[3],[1.0])|
|          4.0|    p|    (4,[],[])|
|          2.0|    z|(4,[2],[1.0])|
+-------------+-----+-------------+



In [22]:
# spark ML pipeline -- similar to sklearn
# good practice to organize stages in this way
stages = []
stages += string_indexer
stages += on_hot_encoder
stages += [vector_assembler]

In [23]:
stages

[StringIndexer_6ff6fb2ba74e,
 OneHotEncoder_318ab543bb7f,
 VectorAssembler_e60cd0add28f]

In [24]:
from pyspark.ml import Pipeline

In [25]:
pipeline = Pipeline().setStages(stages)
model = pipeline.fit(train) # fit train data to generate a transform model -- string_indexer, one_hot_encoder, vector_assembler all are preparing data

In [26]:
pp_df = model.transform(test) # transform test data

In [27]:
pp_df.show(truncate = False)

+-------+------+-------------+--------------+-------+------------------+------------------+-----------------------+
|   type|amount|oldbalanceOrg|newbalanceOrig|isFraud|type_StringIndexer|type_OneHotEncoder|VectorAssembly_features|
+-------+------+-------------+--------------+-------+------------------+------------------+-----------------------+
|CASH_IN| 53.28|   6317325.17|    6317378.45|      0|               2.0|     (4,[2],[1.0])|   [53.28,6317325.17...|
|CASH_IN| 83.13|        120.0|        203.13|      0|               2.0|     (4,[2],[1.0])|   [83.13,120.0,203....|
|CASH_IN|103.95|     79317.56|      79421.51|      0|               2.0|     (4,[2],[1.0])|   [103.95,79317.56,...|
|CASH_IN|176.14|1.134638683E7| 1.134656297E7|      0|               2.0|     (4,[2],[1.0])|   [176.14,1.1346386...|
|CASH_IN|226.37|    6095573.3|    6095799.67|      0|               2.0|     (4,[2],[1.0])|   [226.37,6095573.3...|
|CASH_IN|251.31|   4997835.18|    4998086.48|      0|               2.0|

In [29]:
# Logistic Regression
from pyspark.ml.classification import LogisticRegression

In [30]:
data = pp_df.select(
    F.col("VectorAssembly_features").alias("features"),
    F.col("isFraud").alias("label")
)

In [32]:
data.show(5, truncate = False)

+----------------------------------------------------+-----+
|features                                            |label|
+----------------------------------------------------+-----+
|[53.28,6317325.17,6317378.45,0.0,0.0,1.0,0.0]       |0    |
|[83.13,120.0,203.13,0.0,0.0,1.0,0.0]                |0    |
|[103.95,79317.56,79421.51,0.0,0.0,1.0,0.0]          |0    |
|[176.14,1.134638683E7,1.134656297E7,0.0,0.0,1.0,0.0]|0    |
|[226.37,6095573.3,6095799.67,0.0,0.0,1.0,0.0]       |0    |
+----------------------------------------------------+-----+
only showing top 5 rows



In [33]:
%%time
model = LogisticRegression().fit(data)

Wall time: 8.75 s


In [38]:
model.summary.areaUnderROC ## how ??? taking into model's original inputs?? No wonder the score is this high!

0.976321534386822