In [1]:
pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/f0/26/198fc8c0b98580f617cb03cb298c6056587b8f0447e20fa40c5b634ced77/pyspark-3.0.1.tar.gz (204.2MB)
[K     |████████████████████████████████| 204.2MB 76kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 46.8MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612242 sha256=a5d75c5c1b08301aa4345ee4c01b15f463e1a32fb55b8ea56d4b339216cc06ac
  Stored in directory: /root/.cache/pip/wheels/5e/bd/07/031766ca628adec8435bb40f0bd83bb676ce65ff4007f8e73f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


In [2]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.linalg import Vectors
from pyspark.ml.pipeline import Pipeline
from pyspark.mllib.classification import StreamingLogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import udf
from pyspark.sql.session import SparkSession
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.types import Row, StringType

In [3]:
# Colab-only step: mount Google Drive at /content/drive so the CSV read
# later in the notebook (under /content/drive/MyDrive) is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Obtain the Spark session used by every cell below (an already-running
# session is reused instead of starting a second one).
spark = (
    SparkSession.builder
    .appName("CustomerProfiling")
    .getOrCreate()
)

In [5]:
# Load the static customer data from the mounted Drive. The first row is
# treated as the header and column types are inferred from the values.
historic_data = (
    spark.read.csv(
        '/content/drive/MyDrive/static_data_v1.csv',
        header=True,
        inferSchema=True,
    )
)

In [6]:
# Print per-column summary statistics (count, mean, stddev, min, ...) as a
# quick sanity check on the loaded data.
historic_data.summary().show()

+-------+------------------+--------------------+------------------+------------------+------------------+------------------+-----------------+
|summary|             index|            PARTY_ID|          LOB_CODE|              SIZE|           PACKAGE|            REGION|           COUNTY|
+-------+------------------+--------------------+------------------+------------------+------------------+------------------+-----------------+
|  count|             25978|               25978|             25978|             25978|             25978|             25978|            25978|
|   mean|13043.932057895143|4.3104829911390406E8| 55174.12579875279|14.875587035183617| 3.926283778581877| 3.975209792901686|9.635730233274309|
| stddev| 7516.558197906407|1.0519980499034752E8|16325.228882396343| 93.37616344223775|2.4452100085336257|2.7999336981645677|4.725322433670222|
|    min|                 0|           233823834|             10114|                 1|                 0|                 1|           

In [7]:
# Show the schema produced by inferSchema; every column came out as integer.
historic_data.printSchema()

root
 |-- index: integer (nullable = true)
 |-- PARTY_ID: integer (nullable = true)
 |-- LOB_CODE: integer (nullable = true)
 |-- SIZE: integer (nullable = true)
 |-- PACKAGE: integer (nullable = true)
 |-- REGION: integer (nullable = true)
 |-- COUNTY: integer (nullable = true)



In [8]:
from pyspark.ml.feature import VectorAssembler

# Pack the two location columns into the single vector column ("features")
# that the clustering step consumes.
vecAssembler = VectorAssembler(
    inputCols=["REGION", "COUNTY"],
    outputCol="features",
)

# All original columns are kept; "features" is appended on the right.
new_df = vecAssembler.transform(historic_data)
new_df.show()

+-----+---------+--------+----+-------+------+------+----------+
|index| PARTY_ID|LOB_CODE|SIZE|PACKAGE|REGION|COUNTY|  features|
+-----+---------+--------+----+-------+------+------+----------+
|    0|233823894|   49414|   9|      3|     1|    12|[1.0,12.0]|
|    1|233823896|   55174|4999|      3|     1|    12|[1.0,12.0]|
|    2|233823898|   70224|   1|      3|     1|    12|[1.0,12.0]|
|    3|233823900|   81104|   4|      3|     2|    12|[2.0,12.0]|
|    4|233823901|   46514|   9|      3|     1|    12|[1.0,12.0]|
|    5|233823903|   94994|   1|      3|     2|    11|[2.0,11.0]|
|    6|233823905|   55174|   1|      2|     6|     2| [6.0,2.0]|
|    7|233823907|   52214|  49|     10|     3|    11|[3.0,11.0]|
|    8|233831941|   55174|   1|      2|     2|    12|[2.0,12.0]|
|    9|233831943|   47914|   1|     10|     6|     2| [6.0,2.0]|
|   10|233831946|   47304|  19|     10|     9|     6| [9.0,6.0]|
|   11|233831947|   55174|   1|     10|     1|    12|[1.0,12.0]|
|   12|233831949|   55174

In [9]:
from pyspark.ml.clustering import KMeans

# Two-cluster k-means; the fixed seed keeps the initialisation repeatable.
kmeans = KMeans(
    k=2,
    seed=1,
)

# Only the assembled vector column is handed to the estimator.
features_only = new_df.select('features')
model = kmeans.fit(features_only)


In [10]:
# Assign every row to a cluster; the fitted model appends a "prediction"
# column holding the cluster id.
transformed = model.transform(dataset=new_df)
transformed.show()


+-----+---------+--------+----+-------+------+------+----------+----------+
|index| PARTY_ID|LOB_CODE|SIZE|PACKAGE|REGION|COUNTY|  features|prediction|
+-----+---------+--------+----+-------+------+------+----------+----------+
|    0|233823894|   49414|   9|      3|     1|    12|[1.0,12.0]|         1|
|    1|233823896|   55174|4999|      3|     1|    12|[1.0,12.0]|         1|
|    2|233823898|   70224|   1|      3|     1|    12|[1.0,12.0]|         1|
|    3|233823900|   81104|   4|      3|     2|    12|[2.0,12.0]|         1|
|    4|233823901|   46514|   9|      3|     1|    12|[1.0,12.0]|         1|
|    5|233823903|   94994|   1|      3|     2|    11|[2.0,11.0]|         1|
|    6|233823905|   55174|   1|      2|     6|     2| [6.0,2.0]|         0|
|    7|233823907|   52214|  49|     10|     3|    11|[3.0,11.0]|         1|
|    8|233831941|   55174|   1|      2|     2|    12|[2.0,12.0]|         1|
|    9|233831943|   47914|   1|     10|     6|     2| [6.0,2.0]|         0|
|   10|23383

In [11]:
# Cluster sizes: number of rows per predicted cluster, smallest first.
pred_count = (
    transformed
    .groupBy('prediction')
    .count()
    .orderBy('count')
)


In [12]:
# Display how many rows landed in each cluster.
pred_count.show()

+----------+-----+
|prediction|count|
+----------+-----+
|         0| 8373|
|         1|17605|
+----------+-----+



In [16]:
# Export the loaded historic data to a local CSV by way of pandas.
# index=False: the frame already carries its own `index` column, so writing the
# pandas RangeIndex as well would only add a redundant unnamed first column.
# NOTE(review): this cell is In [16] while the previous one is In [12] — confirm
# the notebook still passes Restart Kernel -> Run All.
# NOTE(review): this writes `historic_data`, not the clustered `transformed`
# frame — confirm that is the intended export.
historic_data.toPandas().to_csv('/content/pyspark_static_data_v1.csv', index=False)