In [3]:
# !pip install findspark pyspark

In [1]:
# !pip install gensim

In [2]:
# !pip install pandas

In [3]:
# !pip install jupyter_contrib_nbextensions

In [4]:
# !pip install tqdm

reference: https://www.youtube.com/watch?v=qlY8m2Fk2D4&ab_channel=%E8%9A%82%E8%9A%81%E5%AD%A6Python 

title:【推荐系统 python】 21 Python 使用 PySpark 训练 item2vec 实现电影相关推荐

GitHub: https://github.com/peiss/ant-learn-recsys, https://github.com/peiss/ant-learn-recsys/blob/master/04.%20Python%E8%AE%AD%E7%BB%83item2vec%E5%AE%9E%E7%8E%B0%E7%94%B5%E5%BD%B1%E7%9B%B8%E5%85%B3%E6%8E%A8%E8%8D%90.ipynb 

### IO: 

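* Input: rating data (`ratings.csv` with the columns `userId`, `movieId`, `rating`, `timestamp`). 
* Output: `preprocessedData/item2vecEmb_xmk.csv`, which holds the embedding of each movie id, and `preprocessedData/userEmb_xmk.csv`, which holds the embedding of each user id. 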


In [1]:
import pandas as pd
import gensim, tqdm

### Load the rating file: 

In [2]:
# Root of the project's resource folder.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
file_path = r"D:/MachineLearningPractice/SparrowRecSys/NewCode/JavaPart/src/main/resources/"
# `file_path` already ends with "/", so the relative part must not start with
# one; otherwise the result contains a doubled separator ("resources//webroot").
rawSampleDataPath = file_path + "webroot/sampledata/ratings.csv"

# Load the ratings (columns: userId, movieId, rating, timestamp) into pandas.
rawSampleData = pd.read_csv(rawSampleDataPath)
rawSampleData.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
5,1,112,3.5,1094785740
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826


### Build each user's movie-watching sequence (ratings >= 3.5, ordered by timestamp): 

In [6]:
# Keep only the ratings of at least 3.5, order each user's ratings by time,
# and collapse every user's movie ids into one space-separated string so that
# each user becomes a single "sentence" of movie ids.
df_group = (
    rawSampleData.loc[rawSampleData["rating"] >= 3.5]
    .sort_values(["userId", "timestamp"], ascending=True)
    .groupby(["userId"])["movieId"]
    .apply(lambda movie_ids: " ".join(str(movie_id) for movie_id in movie_ids))
    .reset_index()
)
df_group.head()

Unnamed: 0,userId,movieId
0,1,924 919 337 151 112 50 541 593 29 293 47 296 3...
1,2,62 110 589 70 908 480 266 3 260 541 924
2,3,589 858 904 919 260 318 924 953 50 32 541 457 ...
3,4,10 356 454 480 589 377 586 350 368 370 594 520...
4,5,62 141 736 780 671 832 150 590 380 457 480 595...


In [6]:
# df_group.to_csv("uid_movieids.csv", index=False)

### Build a spark environment: 

In [4]:
# Call findspark.init() first so that the pyspark import below can be resolved.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (getOrCreate returns an existing
# session when one is already running).
spark = (
    SparkSession.builder
    .appName("PySpark Item2vec")
    .getOrCreate()
)

# Handle to the underlying SparkContext.
sc = spark.sparkContext

In [12]:
# df = spark.read.csv("preprocessedData/uid_movieids.csv", header=True)
# df.show(5)

### Create a spark dataframe from a pandas dataframe: 

In [7]:
# Turn the pandas frame of per-user movie sequences into a Spark DataFrame.
df = spark.createDataFrame(df_group)
# Preview the first five rows.
df.show(5)

+------+--------------------+
|userId|             movieId|
+------+--------------------+
|     1|924 919 337 151 1...|
|     2|62 110 589 70 908...|
|     3|589 858 904 919 2...|
|     4|10 356 454 480 58...|
|     5|62 141 736 780 67...|
+------+--------------------+
only showing top 5 rows



### Change the spark dataframe: 

In [8]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Convert the space-separated movie-id string into a list (array) column,
# which is stored in the new column `movie_ids`.
df = df.withColumn('movie_ids', F.split(df.movieId, " "))

In [9]:
# Preview the first five rows, now including the split `movie_ids` column.
df.show(5)

+------+--------------------+--------------------+
|userId|             movieId|           movie_ids|
+------+--------------------+--------------------+
|     1|924 919 337 151 1...|[924, 919, 337, 1...|
|     2|62 110 589 70 908...|[62, 110, 589, 70...|
|     3|589 858 904 919 2...|[589, 858, 904, 9...|
|     4|10 356 454 480 58...|[10, 356, 454, 48...|
|     5|62 141 736 780 67...|[62, 141, 736, 78...|
+------+--------------------+--------------------+
only showing top 5 rows



### Train the word2vec: 

In [10]:
from pyspark.ml.feature import Word2Vec

# Item2vec is Word2Vec applied to the users' movie sequences: the entries of
# `movie_ids` are the tokens, and each row (one user) is one sequence.
# The learned vectors have 10 dimensions; the context window is 5.
word2vec = Word2Vec(
    vectorSize=10,
    windowSize=5,
    inputCol="movie_ids",
    outputCol="movie_2vec",
)

# Fit the model on the per-user sequences.
model = word2vec.fit(df)

In [11]:
# Ask the model for the 20 movie ids most similar to movie id "158".
synonyms = model.findSynonyms("158", 20)

In [12]:
# Check what findSynonyms returned (the output below shows a Spark DataFrame).
type(synonyms)

pyspark.sql.dataframe.DataFrame

In [13]:
# Display the first five rows of the (word, similarity) table.
synonyms.show(5)

+----+------------------+
|word|        similarity|
+----+------------------+
| 256|0.9595181941986084|
| 186|0.9306996464729309|
|  48|0.9181866645812988|
| 168|0.9145238399505615|
| 355|0.9065435528755188|
+----+------------------+
only showing top 5 rows



In [49]:
# counter = 0
# for row in model.getVectors().rdd.collect():
#     print(row["word"],  " ".join([str(_) for _ in row["vector"]])    )
# #     print(movie_id)
#     counter += 1
#     if counter >= 5:
#         break

In [50]:
# type(model.getVectors())

### Save the movie vectors: 

In [48]:
# Write one line per movie in the form "<movieId>:<v1> <v2> ... <vN>".
import os

item_emb_path = "preprocessedData/item2vecEmb_xmk.csv"
# Create the output folder if it is missing; on a fresh checkout open() would
# otherwise raise FileNotFoundError.
os.makedirs(os.path.dirname(item_emb_path), exist_ok=True)

# The encoding is given explicitly so the file does not depend on the
# platform's default encoding.
with open(item_emb_path, "w", encoding="utf-8") as f:
    # getVectors() yields rows with a "word" (movie id) and a "vector" field.
    for row in tqdm.tqdm(model.getVectors().rdd.collect()):
        movie_id = row["word"]
        vectors = " ".join([str(value) for value in row["vector"]])
        f.write("{}:{}\n".format(movie_id, vectors))

100%|█████████████████████████████████████████████████████████████████████████████| 881/881 [00:00<00:00, 31546.62it/s]


# Embedding the users: 

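What is the basic logic of the user embedding? As the experiment at the end of this notebook confirms, a user's embedding is the element-wise sum of the embeddings of the movies that user has rated (restricted to movies that have an embedding). 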

In [5]:
# Load the full pandas ratings table into a Spark DataFrame.
ratingSamples = spark.createDataFrame(rawSampleData)
# Preview the first five rows.
ratingSamples.show(5)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      2|   3.5|1112486027|
|     1|     29|   3.5|1112484676|
|     1|     32|   3.5|1112484819|
|     1|     47|   3.5|1112484727|
|     1|     50|   3.5|1112484580|
+------+-------+------+----------+
only showing top 5 rows



In [16]:
# Preview the learned vectors: one row per movie id ("word") with its "vector".
model.getVectors().show(5)

+----+--------------------+
|word|              vector|
+----+--------------------+
| 710|[0.41551432013511...|
| 205|[0.47179889678955...|
|  45|[-0.2113690823316...|
| 515|[-0.2060780823230...|
| 574|[0.34873422980308...|
+----+--------------------+
only showing top 5 rows



In [27]:
# Collect (movieId, embedding) pairs on the driver.
# Every component is cast to a built-in Python float: left as numpy.float64,
# the values cause problems later on (per the original author's note).
Vectors_list = [
    (vec_row["word"], [float(component) for component in vec_row["vector"]])
    for vec_row in tqdm.tqdm(model.getVectors().rdd.collect())
]

100%|█████████████████████████████████████████████████████████████████████████████| 881/881 [00:00<00:00, 97588.32it/s]


In [31]:
# Vectors_list[2:5]

In [29]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
# spark.createDataFrame(samples.tolist(), FloatType()).toDF("x")

In [30]:
# Schema of the (movieId, embedding) pairs: a non-nullable string id and a
# non-nullable array of doubles.
movie_id_field = StructField('movieId', StringType(), False)
embedding_field = StructField('emb', ArrayType(DoubleType()), False)
fields = [movie_id_field, embedding_field]
schema = StructType(fields)

# Build the Spark DataFrame of movie embeddings from the collected pairs.
Vectors_df = spark.createDataFrame(data=Vectors_list, schema=schema)

In [32]:
# Attach each movie's embedding to every rating row; ratings of movies without
# a learned vector are dropped by the inner join.
# Any existing "emb" column is dropped first so that this cell is idempotent:
# `ratingSamples` is overwritten here, so re-running the cell would otherwise
# add a second "emb" column and make later `select('emb')` calls ambiguous.
# DataFrame.drop is a no-op when the column is absent.
ratingSamples = ratingSamples.drop('emb').join(Vectors_df, on='movieId', how='inner')

In [39]:
# Compute one vector per user: key every joined rating row by userId and add
# the movie embeddings together component by component.
results = (
    ratingSamples
    .select('userId', 'emb')
    .rdd
    .map(lambda rec: (rec[0], rec[1]))
    .reduceByKey(lambda acc, nxt: [acc[idx] + nxt[idx] for idx in range(len(acc))])
    .collect()
)

In [66]:
# Write one line per user in the form "<userId>:<v1> <v2> ... <vN>".
import os

user_emb_path = "preprocessedData/userEmb_xmk.csv"
# Create the output folder if it is missing; on a fresh checkout open() would
# otherwise raise FileNotFoundError.
os.makedirs(os.path.dirname(user_emb_path), exist_ok=True)

# The encoding is given explicitly so the file does not depend on the
# platform's default encoding.
with open(user_emb_path, "w", encoding="utf-8") as f:
    for row in results:
        userId = row[0]
        vectors = " ".join([str(emb) for emb in row[1]])
        # Print user 1's vector so it can be compared with the manual
        # re-computation at the end of the notebook.
        if userId == 1:
            print(userId, vectors)
        f.write("{}:{}\n".format(userId, vectors))

1 0.6455604750663042 3.6822450892068446 -1.0383881330490112 1.9448952544480562 8.581681437790394 -0.15152354445308447 -1.0901359003037214 -4.314250817522407 3.0553889609873295 -1.4465340673923492


(后面的代码可以不用跑了, 只是做实验的.)

In [41]:
# Check the type of the joined frame (the output below shows a Spark DataFrame).
type(ratingSamples)

pyspark.sql.dataframe.DataFrame

In [55]:
# Collect every joined rating row of user 1 to the driver for a manual check.
# NOTE(review): userId is compared with the string "1" although the column
# displays numeric values; this presumably relies on Spark's implicit cast — confirm.
samples_1 = ratingSamples.where(col("userId") == "1").rdd.collect()

In [62]:
# Re-compute user 1's embedding by hand: start from a zero vector and add the
# embedding of every collected row, component by component.
emb_dim = len(samples_1[0]["emb"])
target = [0] * emb_dim
for sample in samples_1:
    movie_emb = sample["emb"]
    target = [running + movie_emb[pos] for pos, running in enumerate(target)]
print(target)

[0.6455604750663042, 3.6822450892068446, -1.0383881330490112, 1.9448952544480562, 8.581681437790394, -0.15152354445308447, -1.0901359003037214, -4.314250817522407, 3.0553889609873295, -1.4465340673923492]


看到了吗, 最后计算的到的userEmb, 和我自己做实验写的代码(将userId为1的用户看过的, 有embedding的电影, 的embedding, 加起来.)计算出来的embedding, 是一样的. 证明**原作者计算的所谓用户embedding就是用户看过电影的embedding的和.**