In [1]:
#注意事项:
#当运行本Notebook的程序后，如果要关闭Notebook，请选择菜单: File > Close and Halt 才能确实停止当前正在运行的程序，并且释放资源
#如果没有使用以上方法，只关闭此分页，程序仍在运行，未释放资源，当您打开并运行其他的Notebook，可能会发生错误

# 12.7	如何进行数据准备?

In [36]:
# Stop a SparkContext left over from an earlier run so a new one can be created.
# On a fresh kernel `sc` has not been defined yet, so guard the call instead of
# letting the cell fail with a NameError.
if "sc" in globals():
    sc.stop()

In [4]:
from pyspark import SparkContext, SparkConf

# Spark configuration: application name 'recommender', master 'local[4]'.
conf = SparkConf().setAppName('recommender').setMaster('local[4]')
# getOrCreate() returns the already-running SparkContext when there is one, so
# re-running this cell does not fail the way a second SparkContext(conf=conf) would.
# NOTE: if a context already exists, it is returned as-is and `conf` is not applied.
sc = SparkContext.getOrCreate(conf=conf)

In [5]:
# Display the master URL of the active SparkContext.
sc.master

'local[4]'

In [6]:
# Base location of the example files, chosen from the master the context runs on.
if sc.master.startswith("local"):
    # Local mode: read from the local file system.
    Path = "file:/mnt/data1/workspace/data_analysis_mining/Python+Spark2.0+Hadoop机器学习与大数据实战/pythonsparkexample/PythonProject/"
else:
    # Cluster mode (Hadoop YARN or Spark standalone): read from HDFS.
    # The data files must first be uploaded to this HDFS directory, as described in the book.
    Path = "hdfs://master:9000/user/hduser/"

In [7]:
# Read data/u.data (under Path) as an RDD of text lines, then display the number of lines.
rawUserData = sc.textFile(Path+"data/u.data")
rawUserData.count()

100000

In [6]:
# Display the first raw line; the repr shows that the fields are tab-separated.
rawUserData.first()

'196\t242\t3\t881250949'

In [7]:
print(rawUserData.first())  # user id, item id, rating, timestamp

196	242	3	881250949


In [8]:
# Display the first five raw lines as a list of strings.
rawUserData.take(5)

['196\t242\t3\t881250949',
 '186\t302\t3\t891717742',
 '22\t377\t1\t878887116',
 '244\t51\t2\t880606923',
 '166\t346\t1\t886397596']

In [9]:
# Print the first five raw lines, one per output line.
first_five = rawUserData.take(5)
for record in first_five:
    print(record)

196	242	3	881250949
186	302	3	891717742
22	377	1	878887116
244	51	2	880606923
166	346	1	886397596


In [10]:
from pyspark.mllib.recommendation import Rating

In [8]:
# Keep only the first three tab-separated fields of each line: user id, item id, rating.
rawRatings = rawUserData.map(
    lambda record: record.split("\t")[0:3]
)
rawRatings.take(5)

[['196', '242', '3'],
 ['186', '302', '3'],
 ['22', '377', '1'],
 ['244', '51', '2'],
 ['166', '346', '1']]

In [9]:
# Convert each [user, item, rating] list into a 3-tuple (values stay strings).
ratingsRDD = rawRatings.map(
    lambda fields: (fields[0], fields[1], fields[2])
)
ratingsRDD.take(5)

[('196', '242', '3'),
 ('186', '302', '3'),
 ('22', '377', '1'),
 ('244', '51', '2'),
 ('166', '346', '1')]

In [13]:
# Total number of rating records.
numRatings = ratingsRDD.count()
numRatings

100000

In [10]:
# Number of distinct users (first element of each rating tuple).
numUsers = (
    ratingsRDD
    .map(lambda rating: rating[0])
    .distinct()
    .count()
)
numUsers

943

In [11]:
# Number of distinct movies (second element of each rating tuple).
numMovies = (
    ratingsRDD
    .map(lambda rating: rating[1])
    .distinct()
    .count()
)
numMovies

1682

In [16]:
# Mark ratingsRDD to be persisted so that later actions can reuse it once it has been computed.
ratingsRDD.persist()

PythonRDD[20] at RDD at PythonRDD.scala:53

# 12.8	如何训练模型? 

In [12]:
from pyspark.mllib.recommendation import ALS

In [13]:
# Train an ALS model on explicit ratings. ALS factorizes the (m, n) rating matrix
# into an (m, rank) matrix and a (rank, n) matrix.
#   rank=10        number of latent factors
#   iterations=10  number of ALS iterations
#   lambda_=0.01   regularization coefficient
#   seed=42        fixed so that re-running the notebook reproduces the same model
# ratingsRDD holds the same (user, item, rating) records as rawRatings but has been
# persisted, so it is the RDD to train on.
model = ALS.train(ratingsRDD, rank=10, iterations=10, lambda_=0.01, seed=42)
print(model)

<pyspark.mllib.recommendation.MatrixFactorizationModel object at 0x7f5250045610>


# 12.9	如何使用模型进行推荐?

In [19]:
# Recommend movies to a user; arguments are (user id, number of recommendations).
model.recommendProducts(100, 5)

[Rating(user=100, product=1279, rating=5.945706776911413),
 Rating(user=100, product=958, rating=5.724989170205736),
 Rating(user=100, product=1450, rating=5.594900100070466),
 Rating(user=100, product=1463, rating=5.45629156772284),
 Rating(user=100, product=1166, rating=5.258614526350782)]

In [20]:
# Predicted rating of product 1141 for user 100.
model.predict(100, 1141)

4.038890249846785

In [21]:
# Recommend users for a product: the 5 users with the highest predicted rating for product 200.
model.recommendUsers(product=200,num=5)

[Rating(user=762, product=200, rating=7.635004059570843),
 Rating(user=362, product=200, rating=7.592178145980742),
 Rating(user=811, product=200, rating=7.2542754802393805),
 Rating(user=157, product=200, rating=5.769093787952176),
 Rating(user=240, product=200, rating=5.537591662677301)]

In [14]:
# Predict a rating for every (user, item) pair that appears in rawRatings.
# NOTE(review): the following cell repeats this exact assignment, so this cell looks redundant.
predict_rdd = model.predictAll(rawRatings.map(lambda r: (r[0], r[1])))

In [33]:
# Predict a rating for every (user, item) pair that appears in the data.
user_product_pairs = rawRatings.map(lambda fields: (fields[0], fields[1]))
predict_rdd = model.predictAll(user_product_pairs)

# Key both RDDs by (user, product) and join them, giving
# ((user, product), (predicted rating, actual rating)) records.
predicted_by_pair = predict_rdd.map(lambda pred: ((pred[0], pred[1]), pred[2]))
actual_by_pair = rawRatings.map(
    lambda fields: ((int(fields[0]), int(fields[1])), float(fields[2]))
)
predict_true = predicted_by_pair.join(actual_by_pair)

In [34]:
# Display one joined record: ((user, product), (predicted rating, actual rating)).
predict_true.take(1)

[((470, 1084), (2.642686647279174, 3.0))]

# 12.10	显示推荐的电影的名称

In [23]:
# Read data/u.item (under Path) as an RDD of text lines, then display the number of lines.
itemRDD = sc.textFile(Path+"data/u.item")
itemRDD.count()

1682

In [24]:
# Build a {movie id: title} dict from u.item; each line is '|'-separated and
# starts with the movie id followed by the title.
# The id is parsed as int (not float) so the keys have the same type as the
# integer ids used for the lookups below and carried in Rating.product.
movieTitle = (
    itemRDD
    .map(lambda line: line.split("|"))
    .map(lambda fields: (int(fields[0]), fields[1]))
    .collectAsMap()
)
len(movieTitle)

1682

In [32]:
# Look up the title of movie id 5 (dict.get returns None if the id is missing).
movieTitle.get(5)

'Copycat (1995)'

In [33]:
# Print the titles of movie ids 1 through 5.
for i in range(1,6): 
    print(f"{i}: {movieTitle[i]}")

1: Toy Story (1995)
2: GoldenEye (1995)
3: Four Rooms (1995)
4: Get Shorty (1995)
5: Copycat (1995)


In [34]:
# Look up the title of movie id 5 by subscript (raises KeyError if the id is missing).
movieTitle[5]

'Copycat (1995)'

In [35]:
# Recommend 5 movies to user 100 and print each one with its title instead of its id.
recommendP= model.recommendProducts(100,5) 
for p in recommendP:
    # p is indexed as (user, product, rating); the product id is mapped to a title via movieTitle.
    print(f"对用户{p[0]} 推荐电影:{movieTitle[p[1]]}, 推荐评分: {p[2]}") 

对用户100 推荐电影:To Live (Huozhe) (1994), 推荐评分: 5.300027435511032
对用户100 推荐电影:Doom Generation, The (1995), 推荐评分: 5.151686198370768
对用户100 推荐电影:War Room, The (1993), 推荐评分: 5.025557551269055
对用户100 推荐电影:Stalker (1979), 推荐评分: 4.990324791516028
对用户100 推荐电影:Fearless (1993), 推荐评分: 4.924419605396701
