### 知识：

1. word2vec：输入(doc, words)，得到word embedding
2. item2vec：输入（userid, itemids），得到item embedding

说明：

1. 使用标题/内容的分词embedding作推荐，属于内容相似推荐
2. 使用行为列表作embedding作推荐，属于行为相关推荐，效果比内容相似推荐更好

延伸：

1. 把word embedding进行加和、平均，就得到了document embedding；
2. 把item embedding进行加和、平均，就得到了user embedding；

#### 数据整理

In [1]:
import pandas as pd 
import numpy as np

In [2]:
df = pd.read_csv("./dataset/datas/ml-latest-small/ratings.csv")

In [3]:
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [4]:
df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [5]:
# 取评分大于均值的数据
df_new = df[df["rating"] >= df["rating"].mean()]

In [6]:
df_new

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100830,610,166528,4.0,1493879365
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047


In [7]:
# 按照用户id聚合，得到userid， movieids
df_group = df_new.groupby("userId")["movieId"].apply(lambda x : " ".join([str(m) for m in x])).reset_index()

In [8]:
df_group

Unnamed: 0,userId,movieId
0,1,1 3 6 47 50 101 110 151 157 163 216 231 235 26...
1,2,333 1704 3578 6874 46970 48516 58559 60756 681...
2,3,849 1587 2288 2851 3024 3703 4518 5181 5746 57...
3,4,106 125 162 176 215 232 260 265 319 342 345 34...
4,5,1 21 34 36 50 58 110 232 247 261 290 296 367 4...
...,...,...
604,606,17 18 29 32 46 50 68 70 73 80 82 147 154 156 1...
605,607,1 36 86 110 150 165 188 241 292 318 366 377 38...
606,608,10 16 47 50 110 170 172 293 296 318 333 353 38...
607,609,10 253 296 318 356 457 590 731 1150 1161


In [9]:
df_group.to_csv("./df_userid_movieids.csv",index=False)

#### pyspark.ml  word2vec

In [10]:
import findspark
findspark.init()

In [11]:
from pyspark.sql import SparkSession
spark = SparkSession \
        .builder \
        .appName("pyspark item2vec") \
        .getOrCreate()

In [12]:
sc = spark.sparkContext

In [13]:
from pyspark.ml.feature import Word2Vec
import  pyspark.sql.functions as F

In [14]:
df_uid_movieids = spark.read.csv("./df_userid_movieids.csv",header=True)

In [15]:
df_uid_movieids.show(5)

+------+--------------------+
|userId|             movieId|
+------+--------------------+
|     1|1 3 6 47 50 101 1...|
|     2|333 1704 3578 687...|
|     3|849 1587 2288 285...|
|     4|106 125 162 176 2...|
|     5|1 21 34 36 50 58 ...|
+------+--------------------+
only showing top 5 rows



In [16]:
df_uid_movieids = df_uid_movieids.withColumn("movie_ids", F.split(df_uid_movieids.movieId," "))

In [17]:
df_uid_movieids.show(5)

+------+--------------------+--------------------+
|userId|             movieId|           movie_ids|
+------+--------------------+--------------------+
|     1|1 3 6 47 50 101 1...|[1, 3, 6, 47, 50,...|
|     2|333 1704 3578 687...|[333, 1704, 3578,...|
|     3|849 1587 2288 285...|[849, 1587, 2288,...|
|     4|106 125 162 176 2...|[106, 125, 162, 1...|
|     5|1 21 34 36 50 58 ...|[1, 21, 34, 36, 5...|
+------+--------------------+--------------------+
only showing top 5 rows



In [18]:
word2v = Word2Vec(
    minCount=0,
    inputCol="movie_ids",
    outputCol="movie_vec"
)

In [19]:
model = word2v.fit(df_uid_movieids)

In [20]:
model.getVectors().show(3,truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [21]:
df_movie_embedding = model.getVectors().select("word","vector") \
        .toPandas() 

In [22]:
df_movie_embedding

Unnamed: 0,word,vector
0,26985,"[0.0050064753741025925, -0.002028297632932663,..."
1,5451,"[0.0031055069994181395, -0.0035000210627913475..."
2,4018,"[0.002354368567466736, -0.031213808804750443, ..."
3,4056,"[0.0064166150987148285, -0.025528382509946823,..."
4,32584,"[-0.002056463621556759, 0.002898153616115451, ..."
...,...,...
6293,104419,"[0.0028946588281542063, -0.004049575887620449,..."
6294,133867,"[0.0029085518326610327, -0.007749718148261309,..."
6295,3439,"[0.0034692941699177027, -0.0001765647029969841..."
6296,2141,"[0.01832941360771656, -0.018213754519820213, -..."


##### 对于给定电影算出最相似的10个电影

In [23]:
df_movie = pd.read_csv("./dataset/datas/ml-latest-small/movies.csv")

In [24]:
df_movie

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [25]:
df_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [26]:
df_movie["movieId"] =df_movie["movieId"].astype(str)

In [27]:
df_merge = pd.merge(left=df_movie_embedding,right=df_movie, left_on="word",right_on="movieId")

In [28]:
df_merge.columns

Index(['word', 'vector', 'movieId', 'title', 'genres'], dtype='object')

In [29]:
df_merge = df_merge[['movieId', 'title', 'genres', 'vector']]

In [30]:
df_merge

Unnamed: 0,movieId,title,genres,vector
0,26985,Nirvana (1997),Action|Sci-Fi,"[0.0050064753741025925, -0.002028297632932663,..."
1,5451,Pumpkin (2002),Comedy|Drama|Romance,"[0.0031055069994181395, -0.0035000210627913475..."
2,4018,What Women Want (2000),Comedy|Romance,"[0.002354368567466736, -0.031213808804750443, ..."
3,4056,"Pledge, The (2001)",Crime|Drama|Mystery|Thriller,"[0.0064166150987148285, -0.025528382509946823,..."
4,32584,"Ballad of Jack and Rose, The (2005)",Drama,"[-0.002056463621556759, 0.002898153616115451, ..."
...,...,...,...,...
6293,104419,Justice League: Crisis on Two Earths (2010),Action|Animation|Sci-Fi,"[0.0028946588281542063, -0.004049575887620449,..."
6294,133867,Barely Lethal (2015),Action|Adventure|Comedy,"[0.0029085518326610327, -0.007749718148261309,..."
6295,3439,Teenage Mutant Ninja Turtles II: The Secret of...,Action|Children|Fantasy,"[0.0034692941699177027, -0.0001765647029969841..."
6296,2141,"American Tail, An (1986)",Adventure|Animation|Children|Comedy,"[0.01832941360771656, -0.018213754519820213, -..."


In [31]:
# 选定一个电影 104419
movie_id= "104419"

In [32]:
df_merge[df_merge["movieId"]== movie_id]

Unnamed: 0,movieId,title,genres,vector
6293,104419,Justice League: Crisis on Two Earths (2010),Action|Animation|Sci-Fi,"[0.0028946588281542063, -0.004049575887620449,..."


In [33]:
df_merge[df_merge["movieId"]== movie_id]["vector"].iloc[0]

DenseVector([0.0029, -0.004, -0.0045, 0.0018, 0.0004, 0.0007, 0.0027, 0.0069, 0.0033, 0.0039, -0.0094, -0.0004, 0.0106, -0.002, -0.0011, 0.0074, -0.0094, -0.0005, 0.002, -0.0056, 0.0045, 0.0087, 0.0099, 0.001, -0.0011, -0.0, -0.0035, 0.0114, 0.0075, 0.0032, -0.001, 0.0033, 0.0035, -0.0094, -0.0011, 0.0053, -0.0089, -0.0013, -0.0056, -0.0036, 0.0017, -0.0101, -0.0127, 0.0005, 0.0106, 0.0046, 0.0073, -0.0122, -0.0207, -0.0026, 0.0057, -0.0074, 0.0017, -0.0131, 0.0046, 0.0088, 0.0062, 0.0011, 0.005, -0.0052, -0.0028, 0.0039, 0.0023, -0.0043, 0.0001, 0.0022, 0.0021, 0.0111, 0.0102, 0.0049, -0.0073, 0.0054, -0.003, 0.0112, -0.0003, -0.0005, -0.0015, 0.0038, 0.0104, -0.0027, -0.0114, -0.0089, 0.0079, -0.0095, 0.004, -0.0077, -0.0006, 0.0098, 0.0005, 0.0048, -0.0059, -0.0049, 0.001, 0.002, -0.0135, 0.0035, -0.0035, -0.0016, -0.0022, 0.0041])

In [34]:
target_movie_emb= df_merge[df_merge["movieId"]== movie_id]["vector"].iloc[0]

In [35]:
# 余弦相似度， 与target——movieid 104419的相似度
from scipy.spatial import distance
df_merge["sim_value"] = df_merge["vector"].map(lambda x : 1- distance.cosine(target_movie_emb,x))

In [36]:
df_merge

Unnamed: 0,movieId,title,genres,vector,sim_value
0,26985,Nirvana (1997),Action|Sci-Fi,"[0.0050064753741025925, -0.002028297632932663,...",0.681731
1,5451,Pumpkin (2002),Comedy|Drama|Romance,"[0.0031055069994181395, -0.0035000210627913475...",0.654829
2,4018,What Women Want (2000),Comedy|Romance,"[0.002354368567466736, -0.031213808804750443, ...",0.715280
3,4056,"Pledge, The (2001)",Crime|Drama|Mystery|Thriller,"[0.0064166150987148285, -0.025528382509946823,...",0.821703
4,32584,"Ballad of Jack and Rose, The (2005)",Drama,"[-0.002056463621556759, 0.002898153616115451, ...",0.327558
...,...,...,...,...,...
6293,104419,Justice League: Crisis on Two Earths (2010),Action|Animation|Sci-Fi,"[0.0028946588281542063, -0.004049575887620449,...",1.000000
6294,133867,Barely Lethal (2015),Action|Adventure|Comedy,"[0.0029085518326610327, -0.007749718148261309,...",0.755136
6295,3439,Teenage Mutant Ninja Turtles II: The Secret of...,Action|Children|Fantasy,"[0.0034692941699177027, -0.0001765647029969841...",0.086787
6296,2141,"American Tail, An (1986)",Adventure|Animation|Children|Comedy,"[0.01832941360771656, -0.018213754519820213, -...",0.426517


In [37]:
# 按相似度降序排列，查询前10条
df_merge.sort_values("sim_value",ascending=False)[['movieId', 'title', 'genres','sim_value']].head(10)

Unnamed: 0,movieId,title,genres,sim_value
6293,104419,Justice League: Crisis on Two Earths (2010),Action|Animation|Sci-Fi,1.0
5423,47629,The Queen (2006),Drama,0.89854
4348,43921,Running Scared (2006),Action|Crime|Thriller,0.895997
4589,79592,"Other Guys, The (2010)",Action|Comedy,0.894612
3198,37733,"History of Violence, A (2005)",Action|Crime|Drama|Thriller,0.893884
3191,78469,"A-Team, The (2010)",Action|Comedy|Thriller,0.890925
5013,37729,Corpse Bride (2005),Animation|Comedy|Fantasy|Musical|Romance,0.890434
6083,7451,Mean Girls (2004),Comedy,0.889519
6159,7247,Chitty Chitty Bang Bang (1968),Adventure|Children|Comedy|Fantasy|Musical,0.885103
4400,26578,"Sacrifice, The (Offret - Sacraficatio) (1986)",Drama,0.884508
