In [1]:
from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS

In [2]:
# 读取文件
sc = SparkContext('local', 'MovieRec')
RawUserData = temp = sc.textFile('../../RS6/L5/MovieLens/ratings_small_without_header.csv')
print(RawUserData.count())
print(RawUserData.first())

100004
1,31,2.5,1260759144


In [3]:
# 处理数据
RawRatings = RawUserData.map(lambda line: line.split(',')[:3])
training_RDD = RawRatings.map(lambda x: (x[0], x[1], x[2]))

In [4]:
# 训练模型
rank = 3
model = ALS.train(training_RDD, rank, seed=5, iterations=10, lambda_=0.1)

In [5]:
print(model.recommendProducts(1,5))

[Rating(user=1, product=4404, rating=4.580639117086305), Rating(user=1, product=2275, rating=3.9592639878557634), Rating(user=1, product=67504, rating=3.795973435325095), Rating(user=1, product=83318, rating=3.795973435325095), Rating(user=1, product=83359, rating=3.795973435325095)]


In [6]:
# pyspark.ml库

In [1]:
from pyspark import SparkContext
from pyspark.ml.recommendation import ALS
from pyspark.sql import SQLContext
import pandas as pd

In [2]:
m
sc = SparkContext()
sql_sc = SQLContext(sc)
df_ratings = pd.read_csv('../../RS6/L5/MovieLens/ratings_small.csv')
pyspark_df_ratings = sql_sc.createDataFrame(df_ratings).drop('Timestamp')

In [3]:
als = ALS(rank=3, maxIter=10, regParam=0.1, userCol='userId', itemCol='movieId', ratingCol='rating')
model = als.fit(pyspark_df_ratings)
recommendations = model.recommendForAllUsers(5)

In [4]:
print(recommendations.where(recommendations.userId==1).collect())

[Row(userId=1, recommendations=[Row(movieId=91104, rating=4.507745265960693), Row(movieId=3915, rating=3.9106578826904297), Row(movieId=3414, rating=3.888821601867676), Row(movieId=4271, rating=3.8818955421447754), Row(movieId=26258, rating=3.8577077388763428)])]


In [5]:
# Surprise

In [16]:
from surprise import Dataset, Reader, BaselineOnly, KNNBasic, accuracy, NormalPredictor
from surprise.model_selection import KFold

In [7]:
# 读取数据
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('../../RS6/L5/MovieLens/ratings.csv', reader=reader)
# train_set = data.build_full_trainset()

In [20]:
# 选择模型并设置模型参数
# ALS优化
bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
# SGD优化
# bsl_options = {'method': 'sgd', 'n_epochs': 5, 'learning_rate': 0.005}

algo = BaselineOnly(bsl_options=bsl_options)

# algo = BaselineOnly()
# algo = NormalPredictor()

In [21]:
# KFold K折交叉验证迭代器，K=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # 训练并预测
    algo.fit(trainset)
    predictions = algo.test(testset)
    # 计算RMSE
    accuracy.rmse(predictions, verbose=True)

Estimating biases using als...
RMSE: 0.8634
Estimating biases using als...
RMSE: 0.8636
Estimating biases using als...
RMSE: 0.8644


In [19]:
uid = str(196)
iid = str(302)
# 输出uid对iid的预测结果
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 1.26   {'was_impossible': False}


In [1]:
# Slope One

In [1]:
from surprise import Dataset, Reader, SlopeOne

In [4]:
reader = Reader(line_format='user item rating timestamp', sep=',',skip_lines=1)
data = Dataset.load_from_file('../../RS6/L5/MovieLens/ratings.csv', reader=reader)
train_set = data.build_full_trainset()

In [5]:
algo = SlopeOne()
algo.fit(train_set)

<surprise.prediction_algorithms.slope_one.SlopeOne at 0x1254a3990>

In [6]:
uid = str(196)
iid = str(302)
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 4.32   {'was_impossible': False}


In [7]:
# Pearson Baseline

In [175]:
from surprise import Dataset, Reader, KNNBaseline
import pandas as pd

In [176]:
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('../../RS6/L5/MovieLens/ratings.csv', reader=reader)
train_set = data.build_full_trainset()

In [179]:
def read_item_names():
    data = pd.read_csv('../../RS6/L5/MovieLens/movies.csv', encoding='ISO-8859-1')
    rid_to_name = {}
    name_to_rid = {}
    for i in range(len(data['movieId'])):
        rid_to_name[data['movieId'][i]] = data['title'][i]
        name_to_rid[data['title'][i]] = data['movieId'][i]
    return rid_to_name, name_to_rid

def read_item_names1():
    file_name = '../../RS6/L5/MovieLens/movies1.csv'
    rid_to_name = {}
    name_to_rid = {}
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            line_id = line.split(',')[0]
            line_name = line.split(',')[1]
            rid_to_name[line_id] = line_name
            name_to_rid[line_name] = line_id
    rid_to_name['1'] = rid_to_name.pop('\ufeff1')
    return rid_to_name, name_to_rid

In [177]:
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(train_set)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x10fc9d550>

In [165]:
# 获得电影名称信息数据
rid_to_name, name_to_rid = read_item_names()
# 获得Toy Story电影的电影id
toy_story_raw_id = name_to_rid['Toy Story (1995)']
print(toy_story_raw_id)

1


In [185]:
# 通过Toy Story电影的电影id获取该电影的推荐内部id
toy_story_inner_id = algo.trainset.to_inner_iid(str(toy_story_raw_id))
print(toy_story_inner_id)
algo.predict('196', toy_story_inner_id, verbose=True)

227
user: 196        item: 227        r_ui = None   est = 3.85   {'was_impossible': False}


Prediction(uid='196', iid=227, r_ui=None, est=3.847457766692354, details={'was_impossible': False})

In [190]:
# 获取toy story对应的内部id 并由此取得其对应的k个近邻 k个近邻对应的也是内部id
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id,k = 10)

# 近邻内部id转换为对应的名字
toy_story_neiggbors = (algo.trainset.to_raw_iid(inner_id) for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid] for rid in toy_story_neighbors)

In [191]:
for movie in toy_story_neighbors:
    print(movie)

Month by the Lake, A (1995)
Adventures of Pinocchio, The (1996)
Honey Moon (Honigmond) (1996)
On Golden Pond (1981)
Judge Dredd (1995)
Airplane II: The Sequel (1982)
Love Is All There Is (1996)
Fish Called Wanda, A (1988)
Party Girl (1995)
Highlander III: The Sorcerer (a.k.a. Highlander: The Final Dimension) (1994)
