In [1]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.linalg import Vectors
from pyspark.ml.pipeline import Pipeline
from pyspark.mllib.classification import StreamingLogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import udf
from pyspark.sql.session import SparkSession
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.types import Row, StringType

In [2]:
def convert_value_to_row(df: DataFrame):
    """Convert rows of comma-separated text into a one-column ``features`` DataFrame.

    The first column of every row is split on ``','``; the last field is
    dropped and the remaining fields are parsed as floats into a dense vector.

    NOTE(review): assumes the comma-separated text is stored in the first
    column of ``df`` -- confirm against the caller.

    :param df: DataFrame whose first column holds comma-separated numeric text.
    :return: DataFrame with a single vector column named ``features``.
    """
    def _to_feature_row(row):
        # df.rdd yields Row objects, which have no split(); read the string field itself.
        fields = row[0].split(',')[:-1]
        # Wrap the vector in a 1-tuple so toDF can infer a one-column schema.
        return (Vectors.dense([float(field) for field in fields]),)

    return df.rdd.map(_to_feature_row).toDF(['features'])

In [3]:
# Get the running Spark session for this application, or create one if none exists.
spark = (
    SparkSession.builder
    .appName("CustomerProfiling")
    .getOrCreate()
)

In [4]:
# Load the invoice history; the first line holds the column names and
# column types are inferred from the data.
historic_data = spark.read.csv(
    'invoice_history.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Print summary statistics (count, mean, stddev, ...) for every column of the invoices.
historic_data.summary().show()

+-------+------------------+--------------------+--------------------+--------+--------------------+--------------------+--------------------+----------+------------+----------+
|summary|               _c0|          INVOICE_ID|            PARTY_ID|CURRENCY|  TAXEXCLUSIVEAMOUNT|  TAXINCLUSIVEAMOUNT|       PAYABLEAMOUNT| ISSUEDATE|TAXPOINTDATE|   DUEDATE|
+-------+------------------+--------------------+--------------------+--------+--------------------+--------------------+--------------------+----------+------------+----------+
|  count|            636914|              636914|              636914|  636914|              636914|              636914|              636914|    636914|      636914|    636914|
|   mean|          318456.5| 5.205687464214855E8| 3.531682868110498E8|    null|   583090.5946509555|   697110.8891747192|   697110.8896912725|      null|        null|      null|
| stddev|183861.37901283128|1.0706134881682546E8|1.0502332473460259E8|    null|1.1069859421498958E7|1.33930708

## Create INV_PERIOD feature

In [64]:
# Disabled column drops (INV_PERIOD and the three raw date strings).
# Presumably a manual reset used while iterating -- confirm before re-enabling:
# the cells below read DUEDATE and ISSUEDATE, so dropping them would break those cells.
# historic_data= historic_data.drop("INV_PERIOD")
# historic_data= historic_data.drop("DUEDATE")
# historic_data= historic_data.drop("TAXPOINTDATE")
# historic_data= historic_data.drop("ISSUEDATE")

In [43]:
# Inspect the column names and types of historic_data.
historic_data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- INVOICE_ID: integer (nullable = true)
 |-- PARTY_ID: integer (nullable = true)
 |-- CURRENCY: string (nullable = true)
 |-- TAXEXCLUSIVEAMOUNT: double (nullable = true)
 |-- TAXINCLUSIVEAMOUNT: double (nullable = true)
 |-- PAYABLEAMOUNT: double (nullable = true)
 |-- ISSUEDATE: string (nullable = true)
 |-- TAXPOINTDATE: string (nullable = true)
 |-- DUEDATE: string (nullable = true)



In [44]:
import pyspark.sql.functions as F

# Parse the DUEDATE strings (pattern yyyy-MM-dd) into a timestamp column, DUEDATE_dt.
dt1 = F.to_timestamp("DUEDATE", 'yyyy-MM-dd')
historic_data = historic_data.withColumn("DUEDATE_dt", dt1)

historic_data.printSchema()


root
 |-- _c0: integer (nullable = true)
 |-- INVOICE_ID: integer (nullable = true)
 |-- PARTY_ID: integer (nullable = true)
 |-- CURRENCY: string (nullable = true)
 |-- TAXEXCLUSIVEAMOUNT: double (nullable = true)
 |-- TAXINCLUSIVEAMOUNT: double (nullable = true)
 |-- PAYABLEAMOUNT: double (nullable = true)
 |-- ISSUEDATE: string (nullable = true)
 |-- TAXPOINTDATE: string (nullable = true)
 |-- DUEDATE: string (nullable = true)
 |-- DUEDATE_dt: timestamp (nullable = true)



In [55]:
# Parse the ISSUEDATE strings (pattern yyyy-MM-dd) into a timestamp column, ISSUEDATE_dt.
dt2 = F.to_timestamp("ISSUEDATE", 'yyyy-MM-dd')
historic_data = historic_data.withColumn("ISSUEDATE_dt", dt2)

historic_data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- INVOICE_ID: integer (nullable = true)
 |-- PARTY_ID: integer (nullable = true)
 |-- CURRENCY: string (nullable = true)
 |-- TAXEXCLUSIVEAMOUNT: double (nullable = true)
 |-- TAXINCLUSIVEAMOUNT: double (nullable = true)
 |-- PAYABLEAMOUNT: double (nullable = true)
 |-- ISSUEDATE: string (nullable = true)
 |-- TAXPOINTDATE: string (nullable = true)
 |-- DUEDATE: string (nullable = true)
 |-- DUEDATE_dt: timestamp (nullable = true)
 |-- ISSUEDATE_dt: timestamp (nullable = true)



In [66]:
# Invoice period = whole calendar days from ISSUEDATE_dt to DUEDATE_dt
# (negative when the due date is earlier than the issue date).
# datediff compares calendar dates, so a daylight-saving change between the two
# dates cannot produce a fractional day, which dividing an epoch-seconds
# difference by 24*3600 can.
timeDiff = F.datediff(F.col('DUEDATE_dt'), F.col('ISSUEDATE_dt'))

# Stored as double so INV_PERIOD keeps the type shown by the aggregation cells.
historic_data = historic_data.withColumn("INV_PERIOD", timeDiff.cast("double"))
historic_data.show()

+---+----------+---------+--------+------------------+------------------+-------------+----------+------------+----------+-------------------+-------------------+----------+
|_c0|INVOICE_ID| PARTY_ID|CURRENCY|TAXEXCLUSIVEAMOUNT|TAXINCLUSIVEAMOUNT|PAYABLEAMOUNT| ISSUEDATE|TAXPOINTDATE|   DUEDATE|         DUEDATE_dt|       ISSUEDATE_dt|INV_PERIOD|
+---+----------+---------+--------+------------------+------------------+-------------+----------+------------+----------+-------------------+-------------------+----------+
|  0| 214341229|233824028|     HUF|          838701.0|         1065150.0|    1065150.0|2017-05-01|  2017-05-01|2017-05-08|2017-05-08 00:00:00|2017-05-01 00:00:00|       7.0|
|  1| 214464014|233824108|     HUF|          175000.0|          222250.0|     222250.0|2017-05-05|  2017-05-31|2017-06-05|2017-06-05 00:00:00|2017-05-05 00:00:00|      31.0|
|  2| 215035971|233824067|     EUR|             510.0|             647.7|        647.7|2017-05-07|  2017-05-07|2017-05-15|2017-05-

In [67]:
# Preview the derived invoice-period column on its own.
historic_data.select("INV_PERIOD").show()

+----------+
|INV_PERIOD|
+----------+
|       7.0|
|      31.0|
|       8.0|
|       8.0|
|       8.0|
|       8.0|
|       8.0|
|       8.0|
|       8.0|
|       8.0|
|       8.0|
|      -7.0|
|      -5.0|
|      30.0|
|      30.0|
|      -2.0|
|      15.0|
|      15.0|
|      15.0|
|       8.0|
+----------+
only showing top 20 rows



## One-Hot Encoder for CURRENCY

In [68]:
# Number of invoices per currency, rarest currency first.
customers_1 = (
    historic_data
    .groupBy('CURRENCY')
    .count()
    .orderBy('count')
)
customers_1.show()

+--------+------+
|CURRENCY| count|
+--------+------+
|     CNY|     3|
|     RON|     4|
|     PLN|     6|
|     SEK|     9|
|     CAD|    12|
|     CHF|    35|
|     GBP|   171|
|     USD|   959|
|     EUR| 26453|
|     HUF|609262|
+--------+------+



In [71]:
# Disabled StringIndexer step that maps CURRENCY to a numeric CURRENCY_INDEX.
# The table printed below was produced while these lines were active; as long as
# they stay commented out, this cell does not define `indexed`.
# from pyspark.ml.feature import StringIndexer

# indexer = StringIndexer(inputCol="CURRENCY", outputCol="CURRENCY_INDEX")
# indexed = indexer.fit(historic_data).transform(historic_data)
# indexed.show()

+---+----------+---------+--------+------------------+------------------+-------------+----------+------------+----------+-------------------+-------------------+----------+--------------+
|_c0|INVOICE_ID| PARTY_ID|CURRENCY|TAXEXCLUSIVEAMOUNT|TAXINCLUSIVEAMOUNT|PAYABLEAMOUNT| ISSUEDATE|TAXPOINTDATE|   DUEDATE|         DUEDATE_dt|       ISSUEDATE_dt|INV_PERIOD|CURRENCY_INDEX|
+---+----------+---------+--------+------------------+------------------+-------------+----------+------------+----------+-------------------+-------------------+----------+--------------+
|  0| 214341229|233824028|     HUF|          838701.0|         1065150.0|    1065150.0|2017-05-01|  2017-05-01|2017-05-08|2017-05-08 00:00:00|2017-05-01 00:00:00|       7.0|           0.0|
|  1| 214464014|233824108|     HUF|          175000.0|          222250.0|     222250.0|2017-05-05|  2017-05-31|2017-06-05|2017-06-05 00:00:00|2017-05-05 00:00:00|      31.0|           0.0|
|  2| 215035971|233824067|     EUR|             510.0| 

In [80]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# A later cell reads `encoded`, so this cell must actually run on a fresh kernel.
# The currency index is built here as well, so the cell does not rely on an
# `indexed` frame left over from an earlier interactive run.
indexed = StringIndexer(inputCol="CURRENCY", outputCol="CURRENCY_INDEX") \
    .fit(historic_data) \
    .transform(historic_data)

# dropLast=False keeps one vector slot per currency instead of encoding the
# last category as an all-zero vector.
encoder = OneHotEncoder(inputCols=["CURRENCY_INDEX"],
                        outputCols=["CURRENCY_ENCODED"], dropLast=False)
model = encoder.fit(indexed)
encoded = model.transform(indexed)
encoded.show()

+---+----------+---------+--------+------------------+------------------+-------------+----------+------------+----------+-------------------+-------------------+----------+--------------+----------------+
|_c0|INVOICE_ID| PARTY_ID|CURRENCY|TAXEXCLUSIVEAMOUNT|TAXINCLUSIVEAMOUNT|PAYABLEAMOUNT| ISSUEDATE|TAXPOINTDATE|   DUEDATE|         DUEDATE_dt|       ISSUEDATE_dt|INV_PERIOD|CURRENCY_INDEX|CURRENCY_ENCODED|
+---+----------+---------+--------+------------------+------------------+-------------+----------+------------+----------+-------------------+-------------------+----------+--------------+----------------+
|  0| 214341229|233824028|     HUF|          838701.0|         1065150.0|    1065150.0|2017-05-01|  2017-05-01|2017-05-08|2017-05-08 00:00:00|2017-05-01 00:00:00|       7.0|           0.0|  (10,[0],[1.0])|
|  1| 214464014|233824108|     HUF|          175000.0|          222250.0|     222250.0|2017-05-05|  2017-05-31|2017-06-05|2017-06-05 00:00:00|2017-05-05 00:00:00|      31.0|   

In [81]:
# Spot-check: show the one-hot vector assigned to every SEK invoice.
encoded.filter(F.col("CURRENCY") == "SEK").select("CURRENCY_ENCODED").show()


+----------------+
|CURRENCY_ENCODED|
+----------------+
|  (10,[6],[1.0])|
|  (10,[6],[1.0])|
|  (10,[6],[1.0])|
|  (10,[6],[1.0])|
|  (10,[6],[1.0])|
|  (10,[6],[1.0])|
|  (10,[6],[1.0])|
|  (10,[6],[1.0])|
|  (10,[6],[1.0])|
+----------------+



In [82]:
# Per-customer currency usage. CountVectorizer turns each customer's list of
# invoice currencies into a vector of counts (invoices per currency); this is a
# count vector per customer, not a 0/1 one-hot vector per invoice.

from pyspark.ml.feature import CountVectorizer

# Group the invoices in historic_data directly: only PARTY_ID and CURRENCY are
# needed, so this cell does not require the `encoded` frame to exist.
grouped_curr = historic_data.groupBy('PARTY_ID') \
   .agg(F.collect_list('CURRENCY').alias('curr'))

cv = CountVectorizer(inputCol='curr', outputCol='CURRENCY_VEC')

transformed_df = cv.fit(grouped_curr).transform(grouped_curr)
transformed_df.show()

+---------+--------------------+--------------------+
| PARTY_ID|                curr|        CURRENCY_VEC|
+---------+--------------------+--------------------+
|240595845|[HUF, HUF, HUF, H...|     (10,[0],[37.0])|
|250610671|[HUF, HUF, HUF, H...|     (10,[0],[34.0])|
|251574624|[HUF, HUF, HUF, H...|   (10,[0],[1377.0])|
|251660351|[HUF, HUF, HUF, H...|     (10,[0],[54.0])|
|319454627|               [HUF]|      (10,[0],[1.0])|
|356354761|[HUF, HUF, HUF, H...|     (10,[0],[38.0])|
|356882768|[HUF, HUF, HUF, H...|      (10,[0],[6.0])|
|362100367|[HUF, HUF, HUF, H...|    (10,[0],[211.0])|
|364296518|[HUF, HUF, HUF, H...|     (10,[0],[44.0])|
|378336934|[HUF, HUF, HUF, H...|     (10,[0],[37.0])|
|390888051|[HUF, HUF, HUF, H...|     (10,[0],[90.0])|
|399979245|[HUF, HUF, HUF, H...|(10,[0,1],[1234.0...|
|406155593|     [HUF, HUF, HUF]|      (10,[0],[3.0])|
|414669076|[HUF, HUF, HUF, H...|    (10,[0],[109.0])|
|420642369|[HUF, HUF, HUF, H...|     (10,[0],[92.0])|
|445708791|[EUR, EUR, EUR, E

In [83]:
# Row count of the per-customer currency table (it is grouped by PARTY_ID).
transformed_df.count()

4974

## TAXINCLUSIVE AMOUNT aggregated features

In [86]:
# Total tax-inclusive amount per customer.
sum_TIA = historic_data.groupBy('PARTY_ID').agg(F.sum('TAXINCLUSIVEAMOUNT'))
sum_TIA.show()

+---------+-----------------------+
| PARTY_ID|sum(TAXINCLUSIVEAMOUNT)|
+---------+-----------------------+
|251574624|           5.67594283E8|
|356354761|              2140176.0|
|399979245|           2.86142703E8|
|250610671|             3.240494E7|
|378336934|              4353500.0|
|364296518|            1.0304351E7|
|240595845|            1.6279933E7|
|420642369|             2.095044E7|
|362100367|            1.6300243E7|
|251660351|            8.0383367E7|
|390888051|              3117094.0|
|414669076|            9.5956361E7|
|319454627|                 7000.0|
|406155593|               106400.0|
|445708791|              849773.69|
|542035460|              2332500.0|
|628051557|              1367580.0|
|356882768|              2235200.0|
|570481682|                 3840.0|
|584876018|                 6976.0|
+---------+-----------------------+
only showing top 20 rows



In [88]:
# Largest tax-inclusive amount per customer.
max_TIA = historic_data.groupBy('PARTY_ID').agg(F.max('TAXINCLUSIVEAMOUNT'))
max_TIA.show()

+---------+-----------------------+
| PARTY_ID|max(TAXINCLUSIVEAMOUNT)|
+---------+-----------------------+
|251574624|             1.898198E7|
|356354761|               889924.0|
|399979245|                 1.27E7|
|250610671|              1686560.0|
|378336934|               275000.0|
|364296518|              1382630.0|
|240595845|              1544880.0|
|420642369|              2540000.0|
|362100367|               480000.0|
|251660351|              5080000.0|
|390888051|               658117.0|
|414669076|            1.6224345E7|
|319454627|                 7000.0|
|406155593|                62000.0|
|445708791|               217602.0|
|542035460|               702500.0|
|628051557|               190490.0|
|356882768|               457200.0|
|570481682|                 3840.0|
|584876018|                 6976.0|
+---------+-----------------------+
only showing top 20 rows



In [94]:
# Smallest tax-inclusive amount per customer (can be negative, as the output shows).
min_TIA = historic_data.groupBy('PARTY_ID').agg(F.min('TAXINCLUSIVEAMOUNT'))
min_TIA.show()

+---------+-----------------------+
| PARTY_ID|min(TAXINCLUSIVEAMOUNT)|
+---------+-----------------------+
|251574624|             -6538946.0|
|356354761|              -889924.0|
|399979245|             -7500000.0|
|250610671|             -1681480.0|
|378336934|                38925.0|
|364296518|              -267400.0|
|240595845|                18500.0|
|420642369|              -520000.0|
|362100367|              -260000.0|
|251660351|             -3841000.0|
|390888051|               -39563.0|
|414669076|             -4208463.0|
|319454627|                 7000.0|
|406155593|                11600.0|
|445708791|              -157651.2|
|542035460|                30000.0|
|628051557|               -37000.0|
|356882768|               304800.0|
|570481682|                 3840.0|
|584876018|                 6976.0|
+---------+-----------------------+
only showing top 20 rows



In [90]:
# Average tax-inclusive amount per customer.
avg_TIA = historic_data.groupBy('PARTY_ID').agg(F.avg('TAXINCLUSIVEAMOUNT'))
avg_TIA.show()

+---------+-----------------------+
| PARTY_ID|avg(TAXINCLUSIVEAMOUNT)|
+---------+-----------------------+
|251574624|      412196.2839506173|
|356354761|      56320.42105263158|
|399979245|     231507.04126213593|
|250610671|      953086.4705882353|
|378336934|     117662.16216216216|
|364296518|     234189.79545454544|
|240595845|      439998.1891891892|
|420642369|      227722.1739130435|
|362100367|        77252.336492891|
|251660351|     1488580.8703703703|
|390888051|      34634.37777777778|
|414669076|      880333.5871559633|
|319454627|                 7000.0|
|406155593|     35466.666666666664|
|445708791|      65367.20692307692|
|542035460|     212045.45454545456|
|628051557|                56982.5|
|356882768|      372533.3333333333|
|570481682|                 3840.0|
|584876018|                 6976.0|
+---------+-----------------------+
only showing top 20 rows



In [96]:
# Start the combined tax-inclusive table from the per-customer sum and average.
# Building it from sum_TIA / avg_TIA (instead of dropping a column from an
# already existing TIA_df) lets this cell run on a fresh kernel, where TIA_df
# has not been defined yet.
TIA_df = sum_TIA.join(avg_TIA, on=["PARTY_ID"], how="inner")
TIA_df.printSchema()

root
 |-- PARTY_ID: integer (nullable = true)
 |-- sum(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- avg(TAXINCLUSIVEAMOUNT): double (nullable = true)



In [97]:
# Combine the four per-customer tax-inclusive aggregates into one table.
# Starting again from sum_TIA keeps the cell re-runnable: joining min/max onto
# an existing TIA_df a second time would append duplicate columns.
TIA_df = sum_TIA.join(avg_TIA, on=["PARTY_ID"], how="inner")
TIA_df = TIA_df.join(min_TIA, on=["PARTY_ID"], how="inner")
TIA_df = TIA_df.join(max_TIA, on=["PARTY_ID"], how="inner")
TIA_df.show()

+---------+-----------------------+-----------------------+-----------------------+-----------------------+
| PARTY_ID|sum(TAXINCLUSIVEAMOUNT)|avg(TAXINCLUSIVEAMOUNT)|min(TAXINCLUSIVEAMOUNT)|max(TAXINCLUSIVEAMOUNT)|
+---------+-----------------------+-----------------------+-----------------------+-----------------------+
|251574624|           5.67594283E8|      412196.2839506173|             -6538946.0|             1.898198E7|
|356354761|              2140176.0|      56320.42105263158|              -889924.0|               889924.0|
|399979245|           2.86142703E8|     231507.04126213593|             -7500000.0|                 1.27E7|
|250610671|             3.240494E7|      953086.4705882353|             -1681480.0|              1686560.0|
|378336934|              4353500.0|     117662.16216216216|                38925.0|               275000.0|
|364296518|            1.0304351E7|     234189.79545454544|              -267400.0|              1382630.0|
|240595845|            1.627

In [98]:
# Row count of the combined tax-inclusive table, for comparison with the other per-customer tables.
TIA_df.count()

4974

## TAXEXCLUSIVE AMOUNT aggregated features

In [100]:
# Per-customer sum, minimum, maximum and average of the tax-exclusive amount.
sum_TEA = historic_data.groupBy('PARTY_ID').agg(F.sum('TAXEXCLUSIVEAMOUNT'))
min_TEA = historic_data.groupBy('PARTY_ID').agg(F.min('TAXEXCLUSIVEAMOUNT'))
max_TEA = historic_data.groupBy('PARTY_ID').agg(F.max('TAXEXCLUSIVEAMOUNT'))
avg_TEA = historic_data.groupBy('PARTY_ID').agg(F.avg('TAXEXCLUSIVEAMOUNT'))

In [101]:
# Combine the four tax-exclusive aggregates into one table keyed by PARTY_ID.
TEA_df = (
    sum_TEA
    .join(avg_TEA, on=["PARTY_ID"], how="inner")
    .join(min_TEA, on=["PARTY_ID"], how="inner")
    .join(max_TEA, on=["PARTY_ID"], how="inner")
)
TEA_df.show()

+---------+-----------------------+-----------------------+-----------------------+-----------------------+
| PARTY_ID|sum(TAXEXCLUSIVEAMOUNT)|avg(TAXEXCLUSIVEAMOUNT)|min(TAXEXCLUSIVEAMOUNT)|max(TAXEXCLUSIVEAMOUNT)|
+---------+-----------------------+-----------------------+-----------------------+-----------------------+
|251574624|           4.71201881E8|      342194.5395787945|             -5148776.0|            1.4946441E7|
|356354761|              1685178.0|      44346.78947368421|              -700728.0|               700728.0|
|399979245|           2.26351793E8|     183132.51860841425|             -5905512.0|                  1.0E7|
|250610671|               2.5622E7|      753588.2352941176|             -1324000.0|              1328000.0|
|378336934|              4353500.0|     117662.16216216216|                38925.0|               275000.0|
|364296518|              8113660.0|     184401.36363636365|              -210551.0|              1088685.0|
|240595845|            1.627

## Create INV_COUNT feature

In [110]:
# Number of invoices per customer, smallest counts first.
invCount = (
    historic_data
    .groupBy('PARTY_ID')
    .count()
    .orderBy('count')
)
invCount.show()


+---------+-----+
| PARTY_ID|count|
+---------+-----+
|327135843|    1|
|328809756|    1|
|542617759|    1|
|469375481|    1|
|334628148|    1|
|626648173|    1|
|327135895|    1|
|500318631|    1|
|538413920|    1|
|654066866|    1|
|368786765|    1|
|384598997|    1|
|349698943|    1|
|588538184|    1|
|240656510|    1|
|357294275|    1|
|570481682|    1|
|319454627|    1|
|584876018|    1|
|684845708|    1|
+---------+-----+
only showing top 20 rows



## INV_PERIOD aggregated features

In [111]:
# Per-customer minimum, maximum and average of INV_PERIOD.
min_period = historic_data.groupBy('PARTY_ID').agg(F.min('INV_PERIOD'))
max_period = historic_data.groupBy('PARTY_ID').agg(F.max('INV_PERIOD'))
avg_period = historic_data.groupBy('PARTY_ID').agg(F.avg('INV_PERIOD'))

min_period.show()

+---------+---------------+
| PARTY_ID|min(INV_PERIOD)|
+---------+---------------+
|251574624|         -392.0|
|356354761|            0.0|
|399979245|         -128.0|
|250610671|            0.0|
|378336934|            3.0|
|364296518|            0.0|
|240595845|            0.0|
|420642369|            0.0|
|362100367|          -18.0|
|251660351|          -13.0|
|390888051|          -29.0|
|414669076|          -15.0|
|319454627|            0.0|
|406155593|            0.0|
|445708791|           -6.0|
|542035460|            0.0|
|628051557|           -8.0|
|356882768|            0.0|
|570481682|            0.0|
|584876018|           15.0|
+---------+---------------+
only showing top 20 rows



In [113]:
# Combine the three INV_PERIOD aggregates into one table keyed by PARTY_ID.
period_df = (
    min_period
    .join(max_period, on=["PARTY_ID"], how="inner")
    .join(avg_period, on=["PARTY_ID"], how="inner")
)
period_df.show()

+---------+---------------+---------------+------------------+
| PARTY_ID|min(INV_PERIOD)|max(INV_PERIOD)|   avg(INV_PERIOD)|
+---------+---------------+---------------+------------------+
|251574624|         -392.0|           30.0|1.9317356572258533|
|356354761|            0.0|           21.0|13.605263157894736|
|399979245|         -128.0|           45.0| 4.252427184466019|
|250610671|            0.0|            8.0| 6.088235294117647|
|378336934|            3.0|           24.0| 8.324324324324325|
|364296518|            0.0|           11.0| 5.681818181818182|
|240595845|            0.0|            8.0| 6.918918918918919|
|420642369|            0.0|           30.0|11.684782608695652|
|362100367|          -18.0|           30.0|14.175355450236967|
|251660351|          -13.0|           30.0| 6.833333333333333|
|390888051|          -29.0|           31.0| 2.188888888888889|
|414669076|          -15.0|           30.0|11.495412844036696|
|319454627|            0.0|            0.0|            

In [114]:
# Row count of the INV_PERIOD aggregate table, for comparison with the other per-customer tables.
period_df.count()

4974

## Merging all transformations in one table

In [118]:
# Start the per-customer feature table: currency vectors plus tax-inclusive aggregates.
final_df = transformed_df.join(
    TIA_df,
    on=["PARTY_ID"],
    how="inner",
)
final_df.printSchema()

root
 |-- PARTY_ID: integer (nullable = true)
 |-- curr: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- CURRENCY_VEC: vector (nullable = true)
 |-- sum(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- avg(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- min(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- max(TAXINCLUSIVEAMOUNT): double (nullable = true)



In [119]:
# Add the tax-exclusive aggregates to the feature table.
# Run once per rebuild of final_df: joining again would repeat these columns.
final_df = final_df.join(
    TEA_df,
    on=["PARTY_ID"],
    how="inner",
)
final_df.printSchema()

root
 |-- PARTY_ID: integer (nullable = true)
 |-- curr: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- CURRENCY_VEC: vector (nullable = true)
 |-- sum(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- avg(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- min(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- max(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- sum(TAXEXCLUSIVEAMOUNT): double (nullable = true)
 |-- avg(TAXEXCLUSIVEAMOUNT): double (nullable = true)
 |-- min(TAXEXCLUSIVEAMOUNT): double (nullable = true)
 |-- max(TAXEXCLUSIVEAMOUNT): double (nullable = true)



In [120]:
# Add the per-customer invoice count (column `count`) to the feature table.
# Run once per rebuild of final_df: joining again would repeat the column.
final_df = final_df.join(
    invCount,
    on=["PARTY_ID"],
    how="inner",
)
final_df.printSchema()

root
 |-- PARTY_ID: integer (nullable = true)
 |-- curr: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- CURRENCY_VEC: vector (nullable = true)
 |-- sum(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- avg(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- min(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- max(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- sum(TAXEXCLUSIVEAMOUNT): double (nullable = true)
 |-- avg(TAXEXCLUSIVEAMOUNT): double (nullable = true)
 |-- min(TAXEXCLUSIVEAMOUNT): double (nullable = true)
 |-- max(TAXEXCLUSIVEAMOUNT): double (nullable = true)
 |-- count: long (nullable = false)



In [121]:
# Add the INV_PERIOD aggregates to the feature table.
# Run once per rebuild of final_df: joining again would repeat these columns.
final_df = final_df.join(
    period_df,
    on=["PARTY_ID"],
    how="inner",
)
final_df.printSchema()

root
 |-- PARTY_ID: integer (nullable = true)
 |-- curr: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- CURRENCY_VEC: vector (nullable = true)
 |-- sum(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- avg(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- min(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- max(TAXINCLUSIVEAMOUNT): double (nullable = true)
 |-- sum(TAXEXCLUSIVEAMOUNT): double (nullable = true)
 |-- avg(TAXEXCLUSIVEAMOUNT): double (nullable = true)
 |-- min(TAXEXCLUSIVEAMOUNT): double (nullable = true)
 |-- max(TAXEXCLUSIVEAMOUNT): double (nullable = true)
 |-- count: long (nullable = false)
 |-- min(INV_PERIOD): double (nullable = true)
 |-- max(INV_PERIOD): double (nullable = true)
 |-- avg(INV_PERIOD): double (nullable = true)



In [122]:
# Preview the merged per-customer feature table.
final_df.show()

+---------+--------------------+--------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----+---------------+---------------+------------------+
| PARTY_ID|                curr|        CURRENCY_VEC|sum(TAXINCLUSIVEAMOUNT)|avg(TAXINCLUSIVEAMOUNT)|min(TAXINCLUSIVEAMOUNT)|max(TAXINCLUSIVEAMOUNT)|sum(TAXEXCLUSIVEAMOUNT)|avg(TAXEXCLUSIVEAMOUNT)|min(TAXEXCLUSIVEAMOUNT)|max(TAXEXCLUSIVEAMOUNT)|count|min(INV_PERIOD)|max(INV_PERIOD)|   avg(INV_PERIOD)|
+---------+--------------------+--------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----+---------------+---------------+------------------+
|240595845|[HUF, HUF, HUF, H...|     (10,[0],[37.0])|            1.6279933E7|      439998.1

In [125]:
# Export the merged feature table. The two Spark-native writers are left disabled;
# the active line collects the whole table to the driver as a pandas DataFrame
# and writes it to a local file.
# NOTE(review): pandas writes its row index as an extra unnamed first column by default.
# final_df.write.csv('pyspark_invoice_df.csv')
# final_df.repartition(1).write.csv("pyspark_invoice_df.csv", sep='|')
final_df.toPandas().to_csv('mycsv.csv')

In [22]:
from pyspark.ml.feature import VectorAssembler

# Pack the two amount columns into one `features` vector per invoice.
vecAssembler = VectorAssembler(
    inputCols=["TAXEXCLUSIVEAMOUNT", "TAXINCLUSIVEAMOUNT"],
    outputCol="features",
)
new_df = vecAssembler.transform(historic_data)
new_df.show()

+---+----------+---------+------------------+------------------+-------------+--------------------+
|_c0|INVOICE_ID| PARTY_ID|TAXEXCLUSIVEAMOUNT|TAXINCLUSIVEAMOUNT|PAYABLEAMOUNT|            features|
+---+----------+---------+------------------+------------------+-------------+--------------------+
|  0| 214341229|233824028|          838701.0|         1065150.0|    1065150.0|[838701.0,1065150.0]|
|  1| 214464014|233824108|          175000.0|          222250.0|     222250.0| [175000.0,222250.0]|
|  2| 215035971|233824067|             510.0|             647.7|        647.7|       [510.0,647.7]|
|  3| 215035983|233824067|             130.0|             165.1|        165.1|       [130.0,165.1]|
|  4| 215035988|233824067|           40581.0|           51538.0|      51538.0|   [40581.0,51538.0]|
|  5| 215035998|233824067|            1040.0|            1320.8|       1320.8|     [1040.0,1320.8]|
|  6| 215035999|233824067|           31000.0|           39370.0|      39370.0|   [31000.0,39370.0]|


In [23]:
from pyspark.ml.clustering import KMeans

# Fit a two-cluster k-means model, with a fixed seed, on the assembled feature vectors.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(new_df.select('features'))


In [24]:
# Attach each invoice's cluster id (column `prediction`) and preview the result.
transformed = model.transform(new_df)
transformed.show()


+---+----------+---------+------------------+------------------+-------------+--------------------+----------+
|_c0|INVOICE_ID| PARTY_ID|TAXEXCLUSIVEAMOUNT|TAXINCLUSIVEAMOUNT|PAYABLEAMOUNT|            features|prediction|
+---+----------+---------+------------------+------------------+-------------+--------------------+----------+
|  0| 214341229|233824028|          838701.0|         1065150.0|    1065150.0|[838701.0,1065150.0]|         0|
|  1| 214464014|233824108|          175000.0|          222250.0|     222250.0| [175000.0,222250.0]|         0|
|  2| 215035971|233824067|             510.0|             647.7|        647.7|       [510.0,647.7]|         0|
|  3| 215035983|233824067|             130.0|             165.1|        165.1|       [130.0,165.1]|         0|
|  4| 215035988|233824067|           40581.0|           51538.0|      51538.0|   [40581.0,51538.0]|         0|
|  5| 215035998|233824067|            1040.0|            1320.8|       1320.8|     [1040.0,1320.8]|         0|
|

In [26]:
# Number of invoices assigned to each cluster, smallest cluster first.
pred_count = (
    transformed
    .groupBy('prediction')
    .count()
    .orderBy('count')
)


In [28]:
# Display the cluster sizes.
pred_count.show()

+----------+------+
|prediction| count|
+----------+------+
|         1|     9|
|         0|636905|
+----------+------+

