In [1]:
import shutil

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [2]:
# Reuse the running SparkContext if one exists, otherwise create one.
# A bare SparkContext() raises "Cannot run multiple SparkContexts at once"
# when this cell is executed a second time in the same kernel.
sc = SparkContext.getOrCreate()

In [3]:
# Load the customer purchase-count data; it serves as the ratings RDD.
# Every line containing the text 'user_id' is discarded
# (presumably this is the CSV header row -- confirm against the file).
rawRatings = (
    sc.textFile('./prefer_raw.csv')
    .filter(lambda line: 'user_id' not in line)
)
rawRatings.take(30)

['11123,160475,7,Vitamin Water Zero Squeezed Lemonade',
 '29974,135004,3,Gluten Free Cinnamon French Toast Sticks',
 '48398,82943,7,Maca Cold Brew Coffee Super Herb Elixir',
 '49273,157805,2,Light and Lean Quinoa Black Beans with Butternut Squash and Chard',
 '26790,150317,15,Organic AppleApple',
 '28204,128089,4,Organic Fuji Apple',
 '13176,181571,3,Bag of Organic Bananas',
 '32655,92678,17,Organic Large Grade AA Brown Eggs',
 '3896,88698,62,Organic Honey Sweet Whole Wheat Bread',
 '27086,8031,2,Half & Half',
 '33342,193965,5,Organic Red Grape Tomato Package',
 '22035,131089,5,Organic Whole String Cheese',
 "29810,869,1,Torched Marshmellow S'more Gelato",
 '8277,108520,3,Apple Honeycrisp Organic',
 '30142,194400,9,Vegetable Dumpling',
 '13984,106483,6,Organic Mint',
 '40229,47306,2,Organic Whole Wheat Penne Rigate',
 '8580,46859,4,VitaminWater Zero™ XXX Acai Blueberry Pomegranate',
 '7139,37498,1,Sooo Good Light Roast Coffee',
 '21479,158552,7,Organic Turkish Figs',
 '28427,73538,4,Vi

In [4]:
# Tidy the format: keep only the first three comma-separated fields of each
# line, dropping the trailing free-text column.
# NOTE: this rebinds rawRatings from itself, so re-running this cell alone
# would try to split lists instead of strings; re-run the loading cell first.
rawRatings = rawRatings.map(lambda line: line.split(',', 3)[:3])
rawRatings.take(30)

[['11123', '160475', '7'],
 ['29974', '135004', '3'],
 ['48398', '82943', '7'],
 ['49273', '157805', '2'],
 ['26790', '150317', '15'],
 ['28204', '128089', '4'],
 ['13176', '181571', '3'],
 ['32655', '92678', '17'],
 ['3896', '88698', '62'],
 ['27086', '8031', '2'],
 ['33342', '193965', '5'],
 ['22035', '131089', '5'],
 ['29810', '869', '1'],
 ['8277', '108520', '3'],
 ['30142', '194400', '9'],
 ['13984', '106483', '6'],
 ['40229', '47306', '2'],
 ['8580', '46859', '4'],
 ['7139', '37498', '1'],
 ['21479', '158552', '7'],
 ['28427', '73538', '4'],
 ['31215', '203821', '2'],
 ['3913', '19670', '2'],
 ['11187', '182832', '5'],
 ['7700', '54276', '4'],
 ['33704', '4747', '1'],
 ['11512', '199054', '4'],
 ['31717', '23251', '2'],
 ['28515', '47562', '4'],
 ['28934', '200557', '2']]

In [5]:
# 匯入評分分析工具

import pyspark.mllib.recommendation as rd

In [27]:
# Convert each [col0, col1, col2] record into an MLlib Rating.
# Column 1 is used as the user id, column 0 as the product id, and column 2
# (converted to float) as the rating value.
# NOTE(review): the CSV header is not visible here -- confirm that column 0
# really is the product id and column 1 the user id.
ratings = rawRatings.map(
    lambda fields: rd.Rating(
        user=int(fields[1]),
        product=int(fields[0]),
        rating=float(fields[2]),
    )
)
ratings.take(20)

[Rating(user=160475, product=11123, rating=7.0),
 Rating(user=135004, product=29974, rating=3.0),
 Rating(user=82943, product=48398, rating=7.0),
 Rating(user=157805, product=49273, rating=2.0),
 Rating(user=150317, product=26790, rating=15.0),
 Rating(user=128089, product=28204, rating=4.0),
 Rating(user=181571, product=13176, rating=3.0),
 Rating(user=92678, product=32655, rating=17.0),
 Rating(user=88698, product=3896, rating=62.0),
 Rating(user=8031, product=27086, rating=2.0),
 Rating(user=193965, product=33342, rating=5.0),
 Rating(user=131089, product=22035, rating=5.0),
 Rating(user=869, product=29810, rating=1.0),
 Rating(user=108520, product=8277, rating=3.0),
 Rating(user=194400, product=30142, rating=9.0),
 Rating(user=106483, product=13984, rating=6.0),
 Rating(user=47306, product=40229, rating=2.0),
 Rating(user=46859, product=8580, rating=4.0),
 Rating(user=37498, product=7139, rating=1.0),
 Rating(user=158552, product=21479, rating=7.0)]

In [29]:
# 開始建立模型

from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [35]:
# ALS hyper-parameters
# rank: number of latent dimensions used by the matrix factorization. It
#       affects the quality of the factorization; larger values may cost more
#       run time and memory. Usually tuned, typically somewhere in 10-200.
# iterations: maximum number of iterations when solving the factorization with
#       alternating least squares. It rarely needs to be large; 5-20 is usual.
# lambda: regularization coefficient of the FunkSVD-style factorization. It
#       controls how closely the model fits the data and improves
#       generalization; larger values mean a stronger penalty.
# alpha: only used with implicit feedback (trainImplicit). It sets the
#       confidence threshold for implicit feedback; the larger it is, the more
#       strongly a user is assumed to be unrelated to items they did not rate.

MODEL_PATH = "./spark_ALS_explicit_model"
ALS_SEED = 42  # fixed seed so that retraining is repeatable between runs

# Implicit-feedback alternative, kept for reference:
# model = ALS.trainImplicit(ratings=ratings, rank=20, iterations=5, lambda_=0.02, alpha=0.01)
model = ALS.train(ratings=ratings, rank=20, iterations=5, lambda_=0.02, seed=ALS_SEED)

# Save the model.
# model.save() raises if the target directory already exists, which made this
# cell fail on every run after the first. Remove any previous copy first.
# NOTE(review): shutil only touches the local filesystem; this assumes
# MODEL_PATH resolves to a local directory (local-mode Spark) -- confirm.
shutil.rmtree(MODEL_PATH, ignore_errors=True)
model.save(sc, MODEL_PATH)

# Load the model back from disk.
model = MatrixFactorizationModel.load(sc, MODEL_PATH)

In [36]:
# Predict the 10 products that customer 99753 is expected to like most.
model.recommendProducts(user=99753, num=10)

[Rating(user=99753, product=48095, rating=183.70266695694423),
 Rating(user=99753, product=40576, rating=164.29677068978563),
 Rating(user=99753, product=17020, rating=161.87533503443484),
 Rating(user=99753, product=39782, rating=160.08180214491708),
 Rating(user=99753, product=39834, rating=128.1120609368091),
 Rating(user=99753, product=28335, rating=120.916233301852),
 Rating(user=99753, product=33692, rating=116.64747865424951),
 Rating(user=99753, product=48776, rating=115.71496891275969),
 Rating(user=99753, product=35776, rating=114.74972244677147),
 Rating(user=99753, product=42871, rating=111.46885955098162)]

In [37]:
# Predict the 20 customers expected to buy item 27845 the most.
model.recommendUsers(product=27845, num=20)

[Rating(user=204061, product=27845, rating=143.5716523250718),
 Rating(user=5588, product=27845, rating=136.7501103799552),
 Rating(user=193164, product=27845, rating=121.72224726838226),
 Rating(user=124991, product=27845, rating=121.26780098635496),
 Rating(user=100698, product=27845, rating=114.30059258117413),
 Rating(user=55989, product=27845, rating=112.43260294966208),
 Rating(user=28556, product=27845, rating=110.26587495786107),
 Rating(user=23832, product=27845, rating=107.56856123693942),
 Rating(user=119002, product=27845, rating=103.71278495987036),
 Rating(user=171919, product=27845, rating=99.3887401334534),
 Rating(user=99753, product=27845, rating=96.8860953053294),
 Rating(user=91606, product=27845, rating=96.71165789695475),
 Rating(user=126633, product=27845, rating=95.40045157776042),
 Rating(user=52489, product=27845, rating=93.91561871358702),
 Rating(user=199229, product=27845, rating=91.55252985523938),
 Rating(user=97865, product=27845, rating=89.3093395616469

In [38]:
# Predict how many times customer 99753 would purchase product 38689.
model.predict(user=99753, product=38689)

95.91708381512103

In [39]:
# For every customer, predict the 3 products with the highest purchase count;
# only the first 10 customers of the result are displayed.
top_products_per_user = model.recommendProductsForUsers(num=3)
top_products_per_user.take(10)

[(18624,
  (Rating(user=18624, product=2090, rating=54.529223016135745),
   Rating(user=18624, product=40576, rating=46.586033565344955),
   Rating(user=18624, product=34246, rating=45.3464736963488))),
 (152288,
  (Rating(user=152288, product=40576, rating=21.65587684513895),
   Rating(user=152288, product=33692, rating=15.44271590436704),
   Rating(user=152288, product=14366, rating=14.567617019134182))),
 (23776,
  (Rating(user=23776, product=17020, rating=21.201474398782818),
   Rating(user=23776, product=25593, rating=18.40920468521647),
   Rating(user=23776, product=12885, rating=18.20396289442082))),
 (155312,
  (Rating(user=155312, product=40576, rating=15.47711855379748),
   Rating(user=155312, product=34246, rating=11.62621248984673),
   Rating(user=155312, product=15783, rating=10.314815960164243))),
 (58592,
  (Rating(user=58592, product=48776, rating=19.542927032730997),
   Rating(user=58592, product=12844, rating=18.913622459836702),
   Rating(user=58592, product=13128, r

In [20]:
# For every product, predict the 3 customers with the highest purchase volume;
# only the first 10 products of the result are displayed.
top_users_per_product = model.recommendUsersForProducts(num=3)
top_users_per_product.take(10)

[(27456,
  (Rating(user=82414, product=27456, rating=36.607387481766324),
   Rating(user=16397, product=27456, rating=29.92879255109831),
   Rating(user=103593, product=27456, rating=22.96549073368934))),
 (18624,
  (Rating(user=16397, product=18624, rating=51.74954878287376),
   Rating(user=82414, product=18624, rating=47.91279346577359),
   Rating(user=81678, product=18624, rating=39.127227741522546))),
 (17984,
  (Rating(user=81678, product=17984, rating=128.5156773855486),
   Rating(user=140753, product=17984, rating=115.81656200222838),
   Rating(user=154246, product=17984, rating=114.92028222416036))),
 (3456,
  (Rating(user=82414, product=3456, rating=176.56098821112468),
   Rating(user=5588, product=3456, rating=119.53461943028145),
   Rating(user=118685, product=3456, rating=94.55353238242157))),
 (22720,
  (Rating(user=100935, product=22720, rating=45.85980378227007),
   Rating(user=5588, product=22720, rating=39.44746951848547),
   Rating(user=125378, product=22720, rating=3

In [None]:
# Model evaluation