In [1]:
from pyspark.ml import Transformer
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, FloatType, IntegerType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Word2Vec, Tokenizer, RegexTokenizer
import math

In [2]:
from pyspark.sql import SparkSession

team = 16
nworkers = 3
cores = 1
warehouse = "project/hive/warehouse"

spark = SparkSession.builder\
        .appName("{} - spark ML".format(team))\
        .master("yarn")\
        .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")\
        .config("spark.sql.warehouse.dir", warehouse)\
        .config("spark.sql.avro.compression.codec", "snappy")\
        .config('spark.executor.instances', nworkers)\
        .config("spark.executor.cores", cores)\
        .config("spark.executor.cpus", cores)\
        .config("spark.executor.memory", "4g")\
        .enableHiveSupport()\
        .getOrCreate()
        # 
spark

In [3]:
sc = spark.sparkContext # !!!!!!
sc.addPyFile('Net.py')

In [4]:
spark.sql("USE team16_projectdb").show()
spark.sql("SELECT * FROM ecom_part_buck").show()

++
||
++
++

+--------+-------------------+----------+-------------------+--------------------+----------+-------+---------+--------------------+-----------+
|event_id|         event_time|product_id|        category_id|       category_code|     brand|  price|  user_id|        user_session|event_types|
+--------+-------------------+----------+-------------------+--------------------+----------+-------+---------+--------------------+-----------+
|42441406|2019-10-31 22:58:52|   4804409|2053013554658804075|electronics.audio...|     elari|  61.65|512831449|528d70c6-44ef-44e...|       cart|
|42433956|2019-10-31 22:08:14|   5100570|2053013553341792533|  electronics.clocks|     apple|  447.6|520814382|06ec3176-a5c8-427...|       cart|
|42412263|2019-10-31 20:37:34|   1005116|2053013555631882655|electronics.smart...|     apple|1013.86|515926715|f5453671-cfd2-4f7...|       cart|
|42412081|2019-10-31 20:37:05|   1004888|2053013555631882655|electronics.smart...|   samsung| 224.46|562130094|1bf38b

In [4]:
data = spark.read.format("avro").table('team16_projectdb.ecom_part_buck')
data.printSchema()

root
 |-- event_id: integer (nullable = true)
 |-- event_time: timestamp (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: float (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- event_types: string (nullable = true)



In [6]:
class CyclicTransformer(Transformer):
    def __init__(self, input_col):
        super(CyclicTransformer, self).__init__()
        self.input_col = input_col

    def _transform(self, df):
        extract_year = F.udf(lambda x: x.year)
        extract_month = F.udf(lambda x: x.month)
        extract_day = F.udf(lambda x: x.day)
        extract_hour = F.udf(lambda x: x.hour)
        extract_minute = F.udf(lambda x: x.minute)
        extract_second = F.udf(lambda x: x.second)

        return df.withColumn('year', extract_year(self.input_col))\
                 .withColumn('month', extract_month(self.input_col))\
                 .withColumn('day', extract_day(self.input_col))\
                 .withColumn('hour', extract_hour(self.input_col))\
                 .withColumn('minute', extract_minute(self.input_col))\
                 .withColumn('second', extract_second(self.input_col))\
                 .withColumn('month_sin', F.sin(F.col('month') * 2 * math.pi / 12))\
                 .withColumn('month_cos', F.cos(F.col('month') * 2 * math.pi / 12))\
                 .withColumn('day_sin', F.sin(F.col('day') * 2 * math.pi / 31))\
                 .withColumn('day_cos', F.cos(F.col('day') * 2 * math.pi / 31))\
                 .withColumn('hour_sin', F.sin(F.col('hour') * 2 * math.pi / 24))\
                 .withColumn('hour_cos', F.cos(F.col('hour') * 2 * math.pi / 24))\
                 .withColumn('minute_sin', F.sin(F.col('minute') * 2 * math.pi / 60))\
                 .withColumn('minute_cos', F.cos(F.col('minute') * 2 * math.pi / 60))\
                 .withColumn('second_sin', F.sin(F.col('second') * 2 * math.pi / 60))\
                 .withColumn('second_cos', F.cos(F.col('second') * 2 * math.pi / 60))\
                 .drop('month').drop('day')\
                 .drop('hour').drop('minute').drop('second')

In [7]:
cyclic_trans = CyclicTransformer('event_time')
data = cyclic_trans.transform(data)
data.show(1)

+--------+-------------------+----------+-------------------+--------------------+-----+-----+---------+--------------------+-----------+----+-------------------+------------------+--------------------+-------+-------------------+------------------+--------------------+------------------+------------------+------------------+
|event_id|         event_time|product_id|        category_id|       category_code|brand|price|  user_id|        user_session|event_types|year|          month_sin|         month_cos|             day_sin|day_cos|           hour_sin|          hour_cos|          minute_sin|        minute_cos|        second_sin|        second_cos|
+--------+-------------------+----------+-------------------+--------------------+-----+-----+---------+--------------------+-----------+----+-------------------+------------------+--------------------+-------+-------------------+------------------+--------------------+------------------+------------------+------------------+
|42441406|2019-1

In [8]:
data = data.na.drop(subset=data.columns)
data = data.filter(data.brand != '')
data = data.filter(data.category_code != '')

data.groupBy("category_code").count().withColumnRenamed("count", "num").orderBy(F.col("num").desc()).show(20)

In [9]:
event_type_to_rating = F.udf(lambda x: 1 if x == 'purchase' else 0 if x == 'cart' else -1, IntegerType())
data = data.withColumn('rating', event_type_to_rating('event_types')).drop('event_types')
data.printSchema()

root
 |-- event_id: integer (nullable = true)
 |-- event_time: timestamp (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: float (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month_sin: double (nullable = true)
 |-- month_cos: double (nullable = true)
 |-- day_sin: double (nullable = true)
 |-- day_cos: double (nullable = true)
 |-- hour_sin: double (nullable = true)
 |-- hour_cos: double (nullable = true)
 |-- minute_sin: double (nullable = true)
 |-- minute_cos: double (nullable = true)
 |-- second_sin: double (nullable = true)
 |-- second_cos: double (nullable = true)
 |-- rating: integer (nullable = true)



In [10]:
# brand preprocessing: take only popular brands (> 10000 interactions)

brand_counts = data.groupBy("brand").count()
rare_brands = brand_counts.filter(F.col("count") < 10000).select("brand").rdd.flatMap(lambda x: x).collect()
data = data.withColumn("brand", F.when(F.col("brand").isin(rare_brands), "other").otherwise(F.col("brand")))

In [11]:
# one hot encoding of brand

# indexer = StringIndexer(inputCol='brand', outputCol="brand_indexed")
# oh_encoder = OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol="brand_encoded")

# brand_pipeline = Pipeline(stages=[indexer, oh_encoder]).fit(data)
# data = brand_pipeline.transform(data)

In [12]:
from pyspark.sql.types import IntegerType, FloatType

# Content-based RecSys

In [13]:
user_features = ['user_id']
item_features = ['product_id', 'category_code', 'brand', 'price']
session_features = ['year', 'month_sin', 'month_cos', 'day_sin', 'day_cos', 'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'second_sin', 'second_cos']
target = 'rating'

# Filtering

In [14]:
# Leave users only with more than 5 events(purchase, cart, view), so that we can split to train/test for each user by event_time with 0.2 ratio
user_interaction_counts = data.groupBy('user_id').count()
active_users = user_interaction_counts.filter(F.col('count') > 5)
data_filtered = data.join(active_users, 'user_id', 'inner')
data_filtered.select(user_features + item_features).show(10)

+---------+----------+--------------------+-------+-------+
|  user_id|product_id|       category_code|  brand|  price|
+---------+----------+--------------------+-------+-------+
|337535108|  14100411|electronics.audio...|  other| 148.88|
|337535108|   1307285|  computers.notebook|     hp|1407.76|
|337535108|   1307285|  computers.notebook|     hp|1407.76|
|337535108|   1004529|electronics.smart...|samsung| 396.15|
|337535108|   1004529|electronics.smart...|samsung| 396.15|
|337535108|   1307472|  computers.notebook|  other|1363.19|
|404666934|   1306651|  computers.notebook|   acer| 488.82|
|404666934|   1307144|  computers.notebook|     hp|  599.5|
|404666934|   1307435|  computers.notebook| lenovo| 450.44|
|404666934|   1307293|  computers.notebook|  other| 868.93|
+---------+----------+--------------------+-------+-------+
only showing top 10 rows



# Scaling Numerical features

In [15]:
data_filtered = data_filtered.withColumn('year', F.col('year').cast('int'))

In [16]:
# MinMax Scaling numerical columns

from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler
from pyspark.sql.functions import udf

columns_to_scale = ['year', 'price']

assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_scale]
scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_scale]
pipeline = Pipeline(stages=assemblers + scalers)
scalerModel = pipeline.fit(data_filtered)
scaled_data = scalerModel.transform(data_filtered)

scaled_data = scaled_data.drop(*['year', 'price', 'year_vec', 'price_vec'])

udf_extract_double = udf(lambda vector: vector.tolist()[0], FloatType())
scaled_data = scaled_data.withColumn("year", udf_extract_double("year_scaled")).withColumn("price", udf_extract_double("price_scaled"))
scaled_data = scaled_data.drop(*['year_scaled','price_scaled','count', 'category_id'])

In [17]:
scaled_data.select(user_features + item_features).show(1)

+---------+----------+--------------------+-------+----------+
|  user_id|product_id|       category_code|  brand|     price|
+---------+----------+--------------------+-------+----------+
|337535108|   1004529|electronics.smart...|samsung|0.15361089|
+---------+----------+--------------------+-------+----------+
only showing top 1 row



In [18]:
scaled_data.select(session_features).show(1)

+----+-------------------+------------------+-------------------+------------------+-------------------+-------------------+------------------+------------------+------------------+------------------+
|year|          month_sin|         month_cos|            day_sin|           day_cos|           hour_sin|           hour_cos|        minute_sin|        minute_cos|        second_sin|        second_cos|
+----+-------------------+------------------+-------------------+------------------+-------------------+-------------------+------------------+------------------+------------------+------------------+
| 0.5|-0.8660254037844386|0.5000000000000001|0.20129852008866006|0.9795299412524945|-0.4999999999999997|-0.8660254037844388|0.3090169943749474|0.9510565162951535|0.6691306063588582|0.7431448254773942|
+----+-------------------+------------------+-------------------+------------------+-------------------+-------------------+------------------+------------------+------------------+---------------

# Categorical features

### User_id

In [19]:
unique_user_ids = scaled_data.select('user_id').distinct().rdd.flatMap(lambda x: x).collect()
user_id_mapping = {_id: idx for idx, _id in enumerate(unique_user_ids)}

In [20]:
user_id_mapper = F.udf(lambda x: user_id_mapping[x], IntegerType())
mapped_data = scaled_data.withColumn('user_id', user_id_mapper('user_id'))

### Product_id

In [21]:
unique_product_ids = mapped_data.select('product_id').distinct().rdd.flatMap(lambda x: x).collect()
product_id_mapping = {_id: idx for idx, _id in enumerate(unique_product_ids)}

In [22]:
product_id_mapper = F.udf(lambda x: product_id_mapping[x], IntegerType())
mapped_data = mapped_data.withColumn('product_id', product_id_mapper('product_id'))

### Category_code

In [23]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Word2Vec, RegexTokenizer

tokenizer = RegexTokenizer(inputCol='category_code', outputCol='tokenized_category', pattern="\.")
word2Vec = Word2Vec(vectorSize=16, seed=42, minCount=1, inputCol='tokenized_category', outputCol='category_embedding')
embedding_pipeline = Pipeline(stages=[tokenizer, word2Vec]).fit(mapped_data)
mapped_data = embedding_pipeline.transform(mapped_data)

In [24]:
mapped_data = mapped_data.drop('category_code').withColumnRenamed("category_embedding", "category_code")

### Brand

In [25]:
unique_brand_ids = mapped_data.select('brand').distinct().rdd.flatMap(lambda x: x).collect()
brand_id_mapping = {_id: idx for idx, _id in enumerate(unique_brand_ids)}

In [26]:
brand_id_mapper = F.udf(lambda x: brand_id_mapping[x], IntegerType())
mapped_data = mapped_data.withColumn('brand', brand_id_mapper('brand'))

In [27]:
mapped_data.select(user_features + item_features).show(1)

+-------+----------+--------------------+-----+----------+
|user_id|product_id|       category_code|brand|     price|
+-------+----------+--------------------+-----+----------+
|      0|     27733|[0.26988664269447...|   16|0.15361089|
+-------+----------+--------------------+-----+----------+
only showing top 1 row



# Train/Test split

In [28]:
# Train/Test split
from pyspark.sql.window import Window

window_spec = Window.partitionBy('user_id').orderBy('event_time')

df_with_row_number = mapped_data.withColumn('row_number', F.row_number().over(window_spec))

user_count_window = Window.partitionBy('user_id')
total_user_count = F.count('user_id').over(user_count_window)

# Calculate the 80% threshold for each user group
train_test_ratio = 0.8
split_threshold = (total_user_count * train_test_ratio).cast('int')

# Assign a 'train' or 'test' label based on the row number and the split threshold
df_labeled = df_with_row_number.withColumn('split', F.when(F.col('row_number') <= split_threshold, 'train').otherwise('test'))

# Split the DataFrame into train and test sets based on the label
train_df = df_labeled.filter(F.col('split') == 'train').drop('row_number', 'split')
test_df = df_labeled.filter(F.col('split') == 'test').drop('row_number', 'split')

In [29]:
train_cnt, test_cnt = train_df.count(), test_df.count()

assert train_df.select('user_id').distinct().count() == test_df.select('user_id').distinct().count()
assert train_test_ratio - 0.05 <= (train_cnt / (train_cnt + test_cnt)) <= train_test_ratio + 0.05

In [30]:
N_users = mapped_data.select('user_id').distinct().count()
N_products = mapped_data.select('product_id').distinct().count()
N_brands = mapped_data.select('brand').distinct().count()

In [31]:
train_df.show(1)

+-------+--------+-------------------+----------+-----+--------------------+-------------------+------------------+------------------+------------------+--------+--------------------+------------------+------------------+-------------------+------------------+------+----+---------+--------------------+--------------------+
|user_id|event_id|         event_time|product_id|brand|        user_session|          month_sin|         month_cos|           day_sin|           day_cos|hour_sin|            hour_cos|        minute_sin|        minute_cos|         second_sin|        second_cos|rating|year|    price|  tokenized_category|       category_code|
+-------+--------+-------------------+----------+-----+--------------------+-------------------+------------------+------------------+------------------+--------+--------------------+------------------+------------------+-------------------+------------------+------+----+---------+--------------------+--------------------+
|      1| 3426810|2019-10

In [32]:
user_features + item_features + session_features

['user_id',
 'product_id',
 'category_code',
 'brand',
 'price',
 'year',
 'month_sin',
 'month_cos',
 'day_sin',
 'day_cos',
 'hour_sin',
 'hour_cos',
 'minute_sin',
 'minute_cos',
 'second_sin',
 'second_cos']

In [33]:
vector_assembler = VectorAssembler(inputCols=user_features + item_features + session_features, outputCol='features')

train_df = vector_assembler.transform(train_df)
test_df = vector_assembler.transform(test_df)
# train_df = train_df.select(['features', target])
# test_df = test_df.select(['features', target])

In [34]:
train_df.select('features').take(1)

[Row(features=DenseVector([1.0, 16494.0, -1.3485, -0.5481, 0.6711, -0.0213, 0.4279, 0.6109, 0.5394, -0.5986, -0.4995, -1.1732, 0.0095, 0.0903, 0.4572, -1.5935, 0.7368, -0.1227, 26.0, 0.2326, 0.5, -0.866, 0.5, 0.5713, 0.8208, -1.0, -0.0, 0.866, 0.5, 0.1045, 0.9945]))]

In [35]:
train_df.take(1)

[Row(user_id=1, event_id=3426810, event_time=datetime.datetime(2019, 10, 3, 18, 10, 1), product_id=16494, brand=26, user_session='d95b3c6e-2490-4924-a851-a22f3cf78ec7', month_sin=-0.8660254037844386, month_cos=0.5000000000000001, day_sin=0.5712682150947923, day_cos=0.8207634412072763, hour_sin=-1.0, hour_cos=-1.8369701987210297e-16, minute_sin=0.8660254037844386, minute_cos=0.5000000000000001, second_sin=0.10452846326765346, second_cos=0.9945218953682733, rating=-1, year=0.5, price=0.23263730108737946, tokenized_category=['computers', 'notebook'], category_code=DenseVector([-1.3485, -0.5481, 0.6711, -0.0213, 0.4279, 0.6109, 0.5394, -0.5986, -0.4995, -1.1732, 0.0095, 0.0903, 0.4572, -1.5935, 0.7368, -0.1227]), features=DenseVector([1.0, 16494.0, -1.3485, -0.5481, 0.6711, -0.0213, 0.4279, 0.6109, 0.5394, -0.5986, -0.4995, -1.1732, 0.0095, 0.0903, 0.4572, -1.5935, 0.7368, -0.1227, 26.0, 0.2326, 0.5, -0.866, 0.5, 0.5713, 0.8208, -1.0, -0.0, 0.866, 0.5, 0.1045, 0.9945]))]

In [9]:
import os
from pyspark.ml.linalg import Vectors, VectorUDT
import pyspark.sql.functions as F

array_to_vector = F.udf(lambda x: Vectors.dense(x), VectorUDT())

def load_data(split, spark, path='project/data', feature='features'):
    df = spark.read.json(os.path.join(path, split))
    df = df.withColumn(feature, F.col(feature).getField("values"))\
           .withColumn(feature, array_to_vector(F.col(feature)))
    print(f'Loaded {os.path.join(path, split)}')
    return df

In [20]:
train_df = load_data(path='project/data', spark=spark, split='train', feature='category_code')
test_df = load_data(path='project/data', spark=spark, split='test', feature='category_code')

Loaded project/data/train
Loaded project/data/test


# Modeling

In [36]:
# to .sh scripts
# pip install torch
# pip install sparktorch

In [11]:
import torch
from Net import Content_based_filtering
# sample = list(train_df.take(1)[0]['features'])
# model = Content_based_filtering(n_brands=N_brands, n_items=N_products, n_users=N_users)
# model(torch.tensor(sample, dtype=torch.float32).unsqueeze(0))

In [15]:
target = 'rating'

In [13]:
from sparktorch import serialize_torch_obj, SparkTorch
import torch
import torch.nn as nn

model = Content_based_filtering(
    n_brands=34,
    n_items=39699,
    n_users=97917,
    brand_dim=32,
    dim=16,
)
print('created model')
torch_obj = serialize_torch_obj(
    model=model,
    criterion=nn.L1Loss(),
    optimizer=torch.optim.Adam,
    lr=0.0001,
)
print('created torch obj')
spark_model = SparkTorch(
    inputCol='features',
    labelCol=target,
    predictionCol='prediction',
    torchObj=torch_obj,
    iters=1,
    miniBatch=16,
    verbose=1,
)

In [16]:
spark_model = SparkTorch(
    inputCol='features',
    labelCol=target,
    predictionCol='predictions',
    torchObj=torch_obj,
    iters=1,
    miniBatch=16,
    verbose=1,
)

In [17]:
# train_df = train_df.repartition(cores)
# test_df = test_df.repartition(cores)

In [21]:
# %%time
p = Pipeline(stages=[spark_model]).fit(train_df)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.scheduler.BarrierJobRunWithDynamicAllocationException: [SPARK-24942]: Barrier execution mode does not support dynamic resource allocation for now. You can disable dynamic resource allocation by setting Spark conf "spark.dynamicAllocation.enabled" to "false".
	at org.apache.spark.scheduler.DAGScheduler.checkBarrierStageWithDynamicAllocation(DAGScheduler.scala:500)
	at org.apache.spark.scheduler.DAGScheduler.createResultStage(DAGScheduler.scala:588)
	at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:1192)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2588)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2580)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2569)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:938)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2224)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2245)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2264)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2289)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)


In [44]:
spark_model.write().overwrite().save('../models/test_model')

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 50438)
Traceback (most recent call last):
  File "/usr/lib64/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib64/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib64/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib64/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/lib/python3.6/site-packages/pyspark/accumulators.py", line 262, in handle
    poll(accum_updates)
  File "/usr/local/lib/python3.6/site-packages/pyspark/accumulators.py", line 235, in poll
    if func():
  File "/usr/local/lib/python3.6/site-packages/pyspark/accumulators.py", line 239, in accum_updates
    num_updates = read_int(self.rfile)
  File 

Py4JError: java does not exist in the JVM

In [None]:
from sparktorch import PysparkPipelineWrapper
from pyspark.ml.pipeline import PipelineModel

trained_model = PysparkPipelineWrapper.unwrap(PipelineModel.load('../models/test_model'))

In [None]:
predictions = trained_model.transform(test_df).persist()

# Eval

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

In [None]:
spark.stop()