In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
spark = SparkSession.builder.appName('Ex4').getOrCreate()

In [4]:
data = spark.read.json('./Beauty_5.json')

In [5]:
data.columns

['asin',
 'helpful',
 'overall',
 'reviewText',
 'reviewTime',
 'reviewerID',
 'reviewerName',
 'summary',
 'unixReviewTime']

In [6]:
data_sub = data.select('asin','overall','reviewerID')

In [7]:
from pyspark.sql.functions import col,when,count,isnan,isnull,udf

In [10]:
data_sub.select([count(when(col(c).isNull(),c)).alias(c) for c in data_sub.columns]).toPandas()

Unnamed: 0,asin,overall,reviewerID
0,0,0,0


In [11]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [12]:
from pyspark.ml.feature import StandardScaler,StringIndexer
from pyspark.ml import Pipeline

In [14]:
index_1 = StringIndexer(inputCol='asin',outputCol='asin_idx')
index_2 = StringIndexer(inputCol='reviewerID',outputCol='reviewerID_idx')

In [15]:
pipeline = Pipeline(stages=(index_1,index_2))
data_indexed = pipeline.fit(data_sub).transform(data_sub)

In [16]:
(train,test) = data_indexed.randomSplit([0.8,0.2])

In [17]:
als = ALS(maxIter=5,regParam=0.09,rank=25,userCol='reviewerID_idx',itemCol='asin_idx',coldStartStrategy='drop',nonnegative=True,ratingCol='overall')

In [18]:
model = als.fit(train)

In [19]:
prediction = model.transform(test)
prediction.show()

+----------+-------+--------------+--------+--------------+----------+
|      asin|overall|    reviewerID|asin_idx|reviewerID_idx|prediction|
+----------+-------+--------------+--------+--------------+----------+
|B005TI7NQW|    3.0|A2E7RX6AFUDQEX|   148.0|         961.0|  3.686547|
|B005TI7NQW|    5.0| ACR4HKUT808U1|   148.0|         159.0|  3.954616|
|B005TI7NQW|    3.0| A3U1WPDQLP9CQ|   148.0|         995.0| 3.4416752|
|B005TI7NQW|    5.0| ACJT8MUC0LRF0|   148.0|         190.0| 4.5023875|
|B005TI7NQW|    4.0| AYB4ELCS5AM8P|   148.0|         225.0|  3.642216|
|B005TI7NQW|    5.0|A23GFTVIETX7DS|   148.0|         455.0|  4.003447|
|B005TI7NQW|    5.0|A1P2XYD265YE21|   148.0|          43.0|  4.786423|
|B005TI7NQW|    5.0|A33PVCHCQ2BTN0|   148.0|         179.0|  4.341035|
|B005TI7NQW|    5.0|A1SB9BNNGKNX2Z|   148.0|        6882.0|  4.542335|
|B005TI7NQW|    5.0|A3JT29L4YFEIMJ|   148.0|         505.0|  4.763384|
|B005TI7NQW|    5.0|A2BZ16RKE13PKV|   148.0|         890.0| 4.2951736|
|B005T

In [20]:
evaluator = RegressionEvaluator(metricName='rmse',labelCol='overall')

In [21]:
rmse = evaluator.evaluate(prediction)
rmse

1.346670922945394

### Recommend for all user

In [23]:
user_recommand = model.recommendForAllUsers(3)

In [24]:
for user in user_recommand.head(5):
  print(user)

Row(reviewerID_idx=1580, recommendations=[Row(asin_idx=7091, rating=5.682399272918701), Row(asin_idx=6059, rating=5.533816337585449), Row(asin_idx=4744, rating=5.524819374084473)])
Row(reviewerID_idx=4900, recommendations=[Row(asin_idx=8272, rating=6.774626731872559), Row(asin_idx=10417, rating=6.682068824768066), Row(asin_idx=3809, rating=6.563076496124268)])
Row(reviewerID_idx=5300, recommendations=[Row(asin_idx=10417, rating=7.152647972106934), Row(asin_idx=10281, rating=7.02388858795166), Row(asin_idx=3695, rating=6.900285243988037)])
Row(reviewerID_idx=6620, recommendations=[Row(asin_idx=10417, rating=7.158432960510254), Row(asin_idx=7480, rating=6.946445941925049), Row(asin_idx=8272, rating=6.931892395019531)])
Row(reviewerID_idx=7240, recommendations=[Row(asin_idx=4508, rating=5.90614652633667), Row(asin_idx=10409, rating=5.7474365234375), Row(asin_idx=3425, rating=5.6135759353637695)])


In [25]:
import pandas as pd
recs=model.recommendForAllUsers(10).toPandas()
nrecs=recs.recommendations.apply(pd.Series).merge(recs, right_index = True, left_index = True).drop(["recommendations"], axis = 1).melt(id_vars = ['reviewerID_idx'], value_name = "recommendation").drop("variable", axis = 1).dropna() 
nrecs=nrecs.sort_values('reviewerID_idx')
nrecs=pd.concat([nrecs['recommendation'].apply(pd.Series),                  
                 nrecs['reviewerID_idx']], axis = 1)
nrecs.columns = ['ProductID_index','Rating','UserID_index']
md=data_indexed.select(['reviewerID', 'reviewerID_idx','asin', 'asin_idx'])
md=md.toPandas()
dict1 =dict(zip(md['reviewerID_idx'],md['reviewerID']))
dict2=dict(zip(md['asin_idx'],md['asin']))
nrecs['reviewerID']=nrecs['UserID_index'].map(dict1)
nrecs['asin']=nrecs['ProductID_index'].map(dict2)
nrecs=nrecs.sort_values('reviewerID')
nrecs.reset_index(drop=True, inplace=True)
new=nrecs[['reviewerID','asin','Rating']]
new['recommendations'] = list(zip(new.asin, new.Rating))
res=new[['reviewerID','recommendations']]  
res_new=res['recommendations'].groupby([res.reviewerID]).apply(list).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['recommendations'] = list(zip(new.asin, new.Rating))


In [26]:
res_new

Unnamed: 0,reviewerID,recommendations
0,A00414041RD0BXM6WK0GX,"[(B00IC8JBIE, 5.359609127044678), (B00GYJWL7G,..."
1,A00473363TJ8YSZ3YAGG9,"[(B00IC9AG5A, 4.454488277435303), (B00IC8JBIE,..."
2,A00700212KB3K0MVESPIY,"[(B002X0WLEI, 6.599361419677734), (B00GYJWL7G,..."
3,A0078719IR14X3NNUG0F,"[(B002X0WLEI, 8.044244766235352), (B007L5P7YQ,..."
4,A01198201H0E3GHV2Z17I,"[(B000P8559S, 6.395101547241211), (B003GXC0T2,..."
...,...,...
22356,AZZNK89PXD006,"[(B000NIZZYK, 4.761874198913574), (B002C3UMLK,..."
22357,AZZQXL8VDCFTV,"[(B000P8559S, 6.816717147827148), (B007L9DRXK,..."
22358,AZZT1ERHBSNQ8,"[(B005LZT9GU, 6.380086898803711), (B004RLBCZ0,..."
22359,AZZU6NXB8YJN9,"[(B00GYJWL7G, 5.412257194519043), (B001CB2OQO,..."
