In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
spark = SparkSession.builder.appName('Ex3').getOrCreate()

In [5]:
data = spark.read.json('./Musical_Instruments_5.json')

In [7]:
data.columns

['asin',
 'helpful',
 'overall',
 'reviewText',
 'reviewTime',
 'reviewerID',
 'reviewerName',
 'summary',
 'unixReviewTime']

In [8]:
data_sub = data.select(['asin','overall','reviewerID'])

In [9]:
from pyspark.sql.functions import col,udf,isnan,isnull,when,count,col

In [10]:
data_sub.show(5)

+----------+-------+--------------+
|      asin|overall|    reviewerID|
+----------+-------+--------------+
|1384719342|    5.0|A2IBPI20UZIR0U|
|1384719342|    5.0|A14VAT5EAX3D9S|
|1384719342|    5.0|A195EZSQDW3E21|
|1384719342|    5.0|A2C00NNG1ZQQG2|
|1384719342|    5.0| A94QU4C90B1AX|
+----------+-------+--------------+
only showing top 5 rows



In [11]:
data_sub.select([count(when(col(c).isNull(),c)).alias(c) for c in data_sub.columns]).toPandas()

Unnamed: 0,asin,overall,reviewerID
0,0,0,0


In [13]:
user = data_sub.select('reviewerID').distinct().count()
products = data_sub.select('asin').distinct().count()
numerator = data_sub.count()

In [14]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [15]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

In [16]:
indexer = StringIndexer(inputCol='asin',outputCol='asin_idx')
indexer1 = StringIndexer(inputCol='reviewerID',outputCol='reviewerID_idx')

In [17]:
inde_model = Pipeline(stages=[indexer,indexer1])

In [18]:
data_indexed = inde_model.fit(data_sub).transform(data_sub)

In [20]:
(train,test) = data_indexed.randomSplit([0.8,0.2])

In [23]:
als = ALS(maxIter=5,regParam=0.09,rank=25,userCol='reviewerID_idx',itemCol='asin_idx',coldStartStrategy='drop',nonnegative=True,ratingCol='overall')

In [24]:
model = als.fit(train)

In [26]:
prediction = model.transform(test)
prediction.show()

+----------+-------+--------------+--------+--------------+----------+
|      asin|overall|    reviewerID|asin_idx|reviewerID_idx|prediction|
+----------+-------+--------------+--------+--------------+----------+
|B000MVYOZY|    4.0| AU3GYRAKBUAEU|   148.0|        1403.0| 3.2735934|
|B000MVYOZY|    5.0|A34LQ791ACZ0JC|   148.0|          67.0| 4.3631215|
|B000MVYOZY|    5.0|A2U7DXDNYBBLQJ|   148.0|        1117.0| 3.9738126|
|B0002E5266|    4.0|A2W8OGJDV7TCMT|   471.0|         720.0| 3.9046159|
|B0002E5266|    4.0|A319B090A2POEB|   471.0|          25.0| 4.1536956|
|B0002E5266|    4.0|A3CUYX2VXZM9KO|   471.0|         499.0|  4.974415|
|B000A2HDXA|    1.0|A3USVXMH3QNRLG|   243.0|         519.0| 2.6312935|
|B0002H0JZC|    5.0| AF1I90O6U8VCQ|   392.0|        1338.0| 3.1138525|
|B004FRHKA2|    5.0|A26U7I13QO5E0C|   540.0|         300.0| 3.7281342|
|B000T517Y4|    2.0|A2BB6IKKR4G40J|   623.0|         675.0| 2.7055118|
|B0002MJTZ8|    5.0| ATSVUBE2115N1|   737.0|         258.0| 4.2507157|
|B0050

In [27]:
evaluator = RegressionEvaluator(metricName='rmse',labelCol='overall')

In [29]:
rmse = evaluator.evaluate(prediction)
rmse

1.1664439930137867

### Recommend for all user

In [30]:
user_recommand = model.recommendForAllUsers(3)

In [31]:
for user in user_recommand.head(5):
  print(user)

Row(reviewerID_idx=471, recommendations=[Row(asin_idx=513, rating=5.862487316131592), Row(asin_idx=829, rating=5.696627616882324), Row(asin_idx=743, rating=5.684125900268555)])
Row(reviewerID_idx=1342, recommendations=[Row(asin_idx=832, rating=6.013309478759766), Row(asin_idx=327, rating=5.85429048538208), Row(asin_idx=263, rating=5.566719055175781)])
Row(reviewerID_idx=463, recommendations=[Row(asin_idx=832, rating=5.187957286834717), Row(asin_idx=743, rating=5.067473888397217), Row(asin_idx=424, rating=5.015849590301514)])
Row(reviewerID_idx=833, recommendations=[Row(asin_idx=775, rating=4.758092880249023), Row(asin_idx=534, rating=4.68939208984375), Row(asin_idx=832, rating=4.632532119750977)])
Row(reviewerID_idx=496, recommendations=[Row(asin_idx=525, rating=5.344379425048828), Row(asin_idx=832, rating=5.236875057220459), Row(asin_idx=459, rating=5.234774112701416)])


In [32]:
import pandas as pd
recs=model.recommendForAllUsers(10).toPandas()
nrecs=recs.recommendations.apply(pd.Series).merge(recs, right_index = True, left_index = True).drop(["recommendations"], axis = 1).melt(id_vars = ['reviewerID_idx'], value_name = "recommendation").drop("variable", axis = 1).dropna() 
nrecs=nrecs.sort_values('reviewerID_idx')
nrecs=pd.concat([nrecs['recommendation'].apply(pd.Series),                  
                 nrecs['reviewerID_idx']], axis = 1)
nrecs.columns = ['ProductID_index','Rating','UserID_index']
md=data_indexed.select(['reviewerID', 'reviewerID_idx','asin', 'asin_idx'])
md=md.toPandas()
dict1 =dict(zip(md['reviewerID_idx'],md['reviewerID']))
dict2=dict(zip(md['asin_idx'],md['asin']))
nrecs['reviewerID']=nrecs['UserID_index'].map(dict1)
nrecs['asin']=nrecs['ProductID_index'].map(dict2)
nrecs=nrecs.sort_values('reviewerID')
nrecs.reset_index(drop=True, inplace=True)
new=nrecs[['reviewerID','asin','Rating']]
new['recommendations'] = list(zip(new.asin, new.Rating))
res=new[['reviewerID','recommendations']]  
res_new=res['recommendations'].groupby([res.reviewerID]).apply(list).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['recommendations'] = list(zip(new.asin, new.Rating))


In [33]:
res_new

Unnamed: 0,reviewerID,recommendations
0,A00625243BI8W1SSZNLMD,"[(B0007WPCKE, 6.264864444732666), (B001C9R5P6,..."
1,A10044ECXDUVKS,"[(B00BHJNC4E, 5.555619239807129), (B00923G9Q0,..."
2,A102MU6ZC9H1N6,"[(B001J1JRN0, 5.685401916503906), (B00BHJNC4E,..."
3,A109JTUZXO61UY,"[(B003AYC1SM, 6.937452793121338), (B000RY68PA,..."
4,A109ME7C09HM2M,"[(B005M0TKL8, 6.085399627685547), (B000RYE5Y6,..."
...,...,...
1424,AZJPNK73JF3XP,"[(B0007WPCKE, 5.481085300445557), (B0002BACB4,..."
1425,AZMHABTPXVLG3,"[(B0002IAJ56, 4.021576404571533), (B003AYC1SM,..."
1426,AZMIKIG4BB6BZ,"[(B0002E4Z8M, 5.5941057205200195), (B0002E5518..."
1427,AZPDO6FLSMLFP,"[(B000RYE5Y6, 5.220370292663574), (B0002GXRF2,..."
