In [1]:
from pyspark.mllib.recommendation import Rating,ALS,MatrixFactorizationModel
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics

### load rating file

In [2]:
# Input ratings file; the sample row printed further down has the form
# 'movie_id,user_id,rating,date'.
rating_path = 'data/mv_all_simple.txt'
# Target location handed to model.save() in the last cell of the notebook.
recom_path = 'data/model_ALS'

In [3]:
# Read the ratings file as an RDD of raw text lines and mark it for caching
# in one chained call; the bare name on the last line keeps the RDD repr as
# the cell output.
rating = sc.textFile(rating_path).persist()
rating

data/mv_all_simple.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [4]:
# Peek at the first raw text line to confirm the file layout.
rating.first()

'1,1488844,3,2005-09-06'

- convert string to tuple(movie_id,user_id,rating)
- convert tuple to Rating(user_id,product_id,rating)

In [5]:
def _line_to_rating(line):
    """Build a Rating(user, product, rating) from one 'movie_id,user_id,rating,date' line.

    The fields are handed to Rating exactly as split, i.e. still as strings.
    NOTE(review): the driver-side repr of these records shows numeric values,
    while a later cell compares `r.user` against a string -- confirm at which
    point the fields get coerced before relying on their type.
    """
    fields = line.split(',')
    # The file stores the movie id first; Rating expects the user first.
    return Rating(fields[1], fields[0], fields[2])


rating = rating.map(_line_to_rating)

In [6]:
# Check the first parsed record (now a Rating instead of a text line).
rating.first()

Rating(user=1488844, product=1, rating=3.0)

- keep 80% as training data, keep 20% as testing data

In [7]:
# Split the ratings 80/20 into training and test RDDs.
# A fixed seed keeps the split -- and therefore the trained model and the
# RMSE reported later -- stable across Restart & Run All.
SPLIT_SEED = 42
train_d, test_d = rating.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
# Both splits are reused by later cells, so mark them for caching.
train_d.persist()
test_d.persist()

PythonRDD[5] at RDD at PythonRDD.scala:48

- training ALS model

In [8]:
# ALS hyper-parameters.
rank = 10        # number of latent factors
num_iter = 5     # number of ALS iterations
lamb = 0.01      # regularization parameter
als_seed = 42    # fixed seed so the factor initialisation is reproducible

# Keyword arguments make it explicit which positional slot each value fills.
model = ALS.train(train_d, rank, iterations=num_iter, lambda_=lamb, seed=als_seed)

In [9]:
# Drop the cached training split; the model above has already been trained from it.
train_d.unpersist()

PythonRDD[4] at RDD at PythonRDD.scala:48

- recommend 10 products for user 30878 and print the result

In [10]:
# Ask the trained model for 10 recommended products for user 30878 and
# display the returned list of Rating objects.
target_user = 30878
n_recommendations = 10
recom = model.recommendProducts(target_user, n_recommendations)
recom

[Rating(user=30878, product=13478, rating=4.70016127837168),
 Rating(user=30878, product=13504, rating=4.676962329036305),
 Rating(user=30878, product=3180, rating=4.6586516913845415),
 Rating(user=30878, product=12398, rating=4.652359061677519),
 Rating(user=30878, product=1109, rating=4.64977404864549),
 Rating(user=30878, product=15104, rating=4.604248501783921),
 Rating(user=30878, product=7230, rating=4.601154843993213),
 Rating(user=30878, product=14961, rating=4.596233575632045),
 Rating(user=30878, product=7057, rating=4.584538103596762),
 Rating(user=30878, product=14807, rating=4.574259793488954)]

- filter the ratings that user 30878 actually gave

In [11]:
# Keep only the ratings given by user 30878.
# The user field is coerced to int before comparing: the records are built
# from string fields, yet their collected repr shows an int, so comparing
# against the string '30878' matches only while the field is still a raw
# string and silently selects nothing otherwise. int() works for both forms.
actual = rating.filter(lambda r: int(r.user) == 30878)
# Reused by the following cells (count, save, 5-star filter).
actual.persist()

PythonRDD[127] at RDD at PythonRDD.scala:48

In [12]:
# Report how many ratings this user has, then dump them as text.
# NOTE(review): Spark normally refuses to write into an output directory
# that already exists -- confirm/remove it before re-running this cell.
n_actual = actual.count()
print(n_actual)
actual.saveAsTextFile('data/actual_recom_30878')

1242


- write the product IDs of the predicted top 10 to a file

In [17]:
# Product ids of the recommendations, as strings, in the order returned by the model.
top10 = [str(rec.product) for rec in recom]
print(top10)

# One product id per line.
with open('data/predict_10_30878', 'w') as out_file:
    out_file.writelines("{}\n".format(product_id) for product_id in top10)

['13478', '13504', '3180', '12398', '1109', '15104', '7230', '14961', '7057', '14807']


- write the actual 5-star ratings to a file

In [16]:
# Keep only the 5-star ratings of the selected user and write them out.
# The rating field is coerced to float before comparing: the records are
# built from string fields, and a raw '5' never equals 5.0, so the bare
# comparison only works when the field has already been turned into a
# float. float() makes the filter independent of that.
rating5 = actual.filter(lambda r: float(r.rating) == 5.0)
rating5.saveAsTextFile('data/actual_5_30878')

In [18]:
# NOTE(review): `rating` was rebound to the mapped Rating RDD after the raw
# text RDD was persisted, so this call targets the mapped RDD; the persisted
# text RDD is no longer reachable under this name -- confirm that is intended.
rating.unpersist()

PythonRDD[139] at RDD at PythonRDD.scala:48

- Evaluate the ALS model on the held-out test data using RMSE

In [19]:
# Evaluate the model on the held-out split (test_d) with RMSE.
# User/product ids and ratings are coerced explicitly: the records are built
# from string fields, and the join below only lines up with the model's
# predictions when both sides use the same numeric key types.

# (user, product) pairs to score.
testData = test_d.map(lambda p: (int(p.user), int(p.product)))
# ((user, product), predicted_rating)
predictions = model.predictAll(testData).map(lambda r: ((r.user, r.product), r.rating))
# ((user, product), actual_rating)
ratingsTuple = test_d.map(lambda r: ((int(r.user), int(r.product)), float(r.rating)))
# Joining on the key yields (key, (predicted, actual)); keep only the value pair.
scoreAndLabels = predictions.join(ratingsTuple).map(lambda tup: tup[1])
metrics = RegressionMetrics(scoreAndLabels)
print("RMSE = %s" % metrics.rootMeanSquaredError)

RMSE = 0.8586164668742967


In [20]:
# Evaluation is done: drop the cached test split and the cached per-user ratings.
test_d.unpersist()
actual.unpersist()

PythonRDD[127] at RDD at PythonRDD.scala:48

### Task 5: predict ratings for the qualifying data

In [21]:
# Load the qualifying file as raw text lines and show the first one to
# confirm its layout (three comma-separated fields in the recorded output).
qualify_path = 'data/qualifying_simple.txt'
qualify = sc.textFile(qualify_path)
qualify.first()

'1,1046323,2005-12-19'

In [22]:
def _line_to_user_product(line):
    """Turn one qualifying line into the pair (second field, first field).

    Presumably the line is 'movie_id,user_id,date', which makes the result a
    (user, product) pair -- verify against the data description.
    """
    parts = line.split(',')
    return (parts[1], parts[0])


# Score every pair in the qualifying set and write the predictions as text.
qualify = qualify.map(_line_to_user_product)
preds = model.predictAll(qualify)
preds.saveAsTextFile('data/qualify_result')

In [23]:
# Show the first lines of the first partition file written by the previous cell.
!head data/qualify_result/part-00000

Rating(user=1393975, product=6736, rating=3.836511083206633)
Rating(user=1393975, product=3153, rating=4.096165177880911)
Rating(user=1393975, product=14187, rating=4.001249894022584)
Rating(user=1393975, product=4315, rating=3.6795937875739226)
Rating(user=1393975, product=2578, rating=3.4375543206061536)
Rating(user=1393975, product=6029, rating=4.314836631290705)
Rating(user=118733, product=8292, rating=3.312352961071772)
Rating(user=118733, product=9886, rating=3.738430557677331)
Rating(user=118733, product=16879, rating=4.126168543528493)
Rating(user=118733, product=10561, rating=3.4594395359271655)


In [46]:
# Save the trained model under recom_path.
# NOTE(review): the recorded run of this cell ended in an
# IllegalArgumentException while instantiating HiveSessionState, so no model
# may have been written -- looks environmental; confirm the Spark/Hive
# session setup before relying on the saved model.
model.save(sc,recom_path)

IllegalArgumentException: "Error while instantiating 'org.apache.spark.sql.hive.HiveSessionState':"