### 知识：

1. word2vec：输入(doc, words)，得到word embedding
2. item2vec：输入(userid, itemids)，得到item embedding

说明：

1. 使用标题/内容的分词embedding作推荐，属于内容相似推荐
2. 使用用户行为列表训练embedding作推荐，属于行为相关推荐，效果通常比内容相似推荐更好

延伸：

1. 把word embedding进行加和、平均，就得到了document embedding；
2. 把item embedding进行加和、平均，就得到了user embedding；

In [1]:
import pandas as pd

import numpy as np

In [2]:
# Load the ml-latest-small ratings table (columns: userId, movieId, rating, timestamp).
# The path is relative to the notebook's working directory.
df = pd.read_csv("./dataset/datas/ml-latest-small/ratings.csv")

In [3]:
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [4]:
# Number of distinct movieId values in the ratings table.
df["movieId"].nunique()

9724

In [5]:
# Summary statistics per column; the rating mean shown here is the cutoff
# used by the filter in the next cell.
df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [6]:
# Keep only the ratings that are at or above the global mean rating,
# i.e. the interactions treated as positive feedback.
df_new = df.loc[df["rating"].ge(df["rating"].mean())]

In [7]:
df_new

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100830,610,166528,4.0,1493879365
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047


In [8]:
# Number of distinct movies that remain after the rating filter.
df_new["movieId"].nunique()

6298

In [9]:
# Sanity check: group by (userId, movieId), then count distinct movieId values.
# The result equals df_new["movieId"].nunique() from the previous cell.
df_new.groupby(["userId","movieId"]).count().reset_index()["movieId"].nunique()

6298

In [10]:
# Aggregate to one row per user: userId plus a space-separated string of
# that user's movieIds (in the row order of df_new).
(
    df_new["movieId"]
    .astype(str)
    .groupby(df_new["userId"])
    .agg(" ".join)
    .reset_index()
)

Unnamed: 0,userId,movieId
0,1,1 3 6 47 50 101 110 151 157 163 216 231 235 26...
1,2,333 1704 3578 6874 46970 48516 58559 60756 681...
2,3,849 1587 2288 2851 3024 3703 4518 5181 5746 57...
3,4,106 125 162 176 215 232 260 265 319 342 345 34...
4,5,1 21 34 36 50 58 110 232 247 261 290 296 367 4...
...,...,...
604,606,17 18 29 32 46 50 68 70 73 80 82 147 154 156 1...
605,607,1 36 86 110 150 165 188 241 292 318 366 377 38...
606,608,10 16 47 50 110 170 172 293 296 318 333 353 38...
607,609,10 253 296 318 356 457 590 731 1150 1161


In [11]:
# One row per user: the user's retained movieIds joined into a single
# space-separated string (column "movieId"), with userId restored as a column.
movie_id_text = df_new["movieId"].astype(str)
df_group = movie_id_text.groupby(df_new["userId"]).agg(" ".join).reset_index()

In [12]:
df_group

Unnamed: 0,userId,movieId
0,1,1 3 6 47 50 101 110 151 157 163 216 231 235 26...
1,2,333 1704 3578 6874 46970 48516 58559 60756 681...
2,3,849 1587 2288 2851 3024 3703 4518 5181 5746 57...
3,4,106 125 162 176 215 232 260 265 319 342 345 34...
4,5,1 21 34 36 50 58 110 232 247 261 290 296 367 4...
...,...,...
604,606,17 18 29 32 46 50 68 70 73 80 82 147 154 156 1...
605,607,1 36 86 110 150 165 188 241 292 318 366 377 38...
606,608,10 16 47 50 110 170 172 293 296 318 333 353 38...
607,609,10 253 296 318 356 457 590 731 1150 1161


#### gensim word2vec

In [13]:
from gensim.models import Word2Vec

In [14]:
# Split each user's space-separated id string into a list of id strings;
# these lists are the "sentences" fed to Word2Vec.
df_group["movieIds"] = df_group["movieId"].str.split()

In [15]:
df_group

Unnamed: 0,userId,movieId,movieIds
0,1,1 3 6 47 50 101 110 151 157 163 216 231 235 26...,"[1, 3, 6, 47, 50, 101, 110, 151, 157, 163, 216..."
1,2,333 1704 3578 6874 46970 48516 58559 60756 681...,"[333, 1704, 3578, 6874, 46970, 48516, 58559, 6..."
2,3,849 1587 2288 2851 3024 3703 4518 5181 5746 57...,"[849, 1587, 2288, 2851, 3024, 3703, 4518, 5181..."
3,4,106 125 162 176 215 232 260 265 319 342 345 34...,"[106, 125, 162, 176, 215, 232, 260, 265, 319, ..."
4,5,1 21 34 36 50 58 110 232 247 261 290 296 367 4...,"[1, 21, 34, 36, 50, 58, 110, 232, 247, 261, 29..."
...,...,...,...
604,606,17 18 29 32 46 50 68 70 73 80 82 147 154 156 1...,"[17, 18, 29, 32, 46, 50, 68, 70, 73, 80, 82, 1..."
605,607,1 36 86 110 150 165 188 241 292 318 366 377 38...,"[1, 36, 86, 110, 150, 165, 188, 241, 292, 318,..."
606,608,10 16 47 50 110 170 172 293 296 318 333 353 38...,"[10, 16, 47, 50, 110, 170, 172, 293, 296, 318,..."
607,609,10 253 296 318 356 457 590 731 1150 1161,"[10, 253, 296, 318, 356, 457, 590, 731, 1150, ..."


In [16]:
# Train item2vec: every user's movie list is one "sentence" and every movieId
# string is one "word".
#
# Hyperparameters are written out instead of relying on implicit defaults
# (the values below are gensim 4.x's documented defaults, so the model
# configuration is unchanged). The one deliberate change is workers=1: gensim
# documents that a deterministic run needs a fixed seed AND a single worker
# thread, otherwise thread scheduling reorders updates between runs.
# (Across interpreter launches PYTHONHASHSEED must also be fixed.)
#
# NOTE(review): min_count=5 drops movies that appear in fewer than 5 user
# lists, so only part of the catalogue gets an embedding. With epochs=5 on a
# few hundred sentences the vectors may be under-trained — consider raising
# epochs/window if all similarities come out close to 1.
item2v = Word2Vec(
    sentences=df_group["movieIds"].tolist(),
    vector_size=100,
    window=5,
    min_count=5,
    epochs=5,
    seed=1,
    workers=1,
)

In [17]:
item2v

<gensim.models.word2vec.Word2Vec at 0x27659c6ac50>

In [18]:
# Embedding vector for movieId "1". Keys are strings because the ids were
# produced by splitting text.
item2v.wv["1"]

array([-0.16623561,  0.27933717,  0.07687131,  0.13285579,  0.03647139,
       -0.46026987,  0.11752003,  0.6490673 , -0.2971947 , -0.18031321,
       -0.292175  , -0.51075727, -0.18156888,  0.08701058,  0.01877802,
       -0.21312405, -0.02887055, -0.41128737,  0.0017676 , -0.5730419 ,
        0.2088514 ,  0.00775372, -0.04281485, -0.1657715 , -0.09870292,
       -0.06201299, -0.3266865 , -0.2238664 , -0.30766332,  0.08473128,
        0.37532556, -0.0352444 , -0.06370638, -0.10111361, -0.00218593,
        0.3797776 ,  0.13942455, -0.420457  , -0.09851398, -0.69940054,
        0.0170705 , -0.35387117, -0.12703943,  0.0852075 ,  0.14709526,
       -0.13471566, -0.23176463,  0.10401887,  0.0579191 ,  0.17762893,
        0.04686071, -0.2596056 , -0.15402012,  0.14331216, -0.36906713,
        0.20137104,  0.19342275, -0.0059259 , -0.3174036 , -0.02088949,
        0.11239772,  0.17215957, -0.16819343,  0.07659032, -0.3862844 ,
        0.19998485,  0.14212057,  0.11511867, -0.28917304,  0.36

In [19]:
type(item2v.wv["1"])

numpy.ndarray

In [20]:
item2v.wv["1"].shape

(100,)

In [21]:
# Vocabulary mapping: movieId string -> integer index of its vector.
# This displays the whole vocabulary, which is long.
item2v.wv.key_to_index

{'318': 0,
 '356': 1,
 '296': 2,
 '593': 3,
 '2571': 4,
 '260': 5,
 '2959': 6,
 '527': 7,
 '1196': 8,
 '110': 9,
 '1198': 10,
 '50': 11,
 '858': 12,
 '2858': 13,
 '1210': 14,
 '589': 15,
 '1': 16,
 '4993': 17,
 '480': 18,
 '2028': 19,
 '7153': 20,
 '47': 21,
 '457': 22,
 '608': 23,
 '5952': 24,
 '150': 25,
 '2762': 26,
 '4226': 27,
 '1270': 28,
 '32': 29,
 '3578': 30,
 '364': 31,
 '4306': 32,
 '588': 33,
 '58559': 34,
 '1214': 35,
 '1197': 36,
 '1193': 37,
 '79132': 38,
 '1221': 39,
 '1089': 40,
 '7361': 41,
 '1291': 42,
 '2329': 43,
 '1704': 44,
 '1213': 45,
 '590': 46,
 '1136': 47,
 '1036': 48,
 '1265': 49,
 '4973': 50,
 '6377': 51,
 '1240': 52,
 '293': 53,
 '48516': 54,
 '4995': 55,
 '3147': 56,
 '6539': 57,
 '6874': 58,
 '541': 59,
 '4886': 60,
 '1206': 61,
 '1200': 62,
 '780': 63,
 '1258': 64,
 '1208': 65,
 '4963': 66,
 '912': 67,
 '595': 68,
 '8961': 69,
 '1222': 70,
 '380': 71,
 '750': 72,
 '1682': 73,
 '648': 74,
 '1073': 75,
 '4878': 76,
 '377': 77,
 '1097': 78,
 '1580': 79,
 

In [22]:
# Inverse mapping: list position -> movieId string (same order as key_to_index).
item2v.wv.index_to_key

['318',
 '356',
 '296',
 '593',
 '2571',
 '260',
 '2959',
 '527',
 '1196',
 '110',
 '1198',
 '50',
 '858',
 '2858',
 '1210',
 '589',
 '1',
 '4993',
 '480',
 '2028',
 '7153',
 '47',
 '457',
 '608',
 '5952',
 '150',
 '2762',
 '4226',
 '1270',
 '32',
 '3578',
 '364',
 '4306',
 '588',
 '58559',
 '1214',
 '1197',
 '1193',
 '79132',
 '1221',
 '1089',
 '7361',
 '1291',
 '2329',
 '1704',
 '1213',
 '590',
 '1136',
 '1036',
 '1265',
 '4973',
 '6377',
 '1240',
 '293',
 '48516',
 '4995',
 '3147',
 '6539',
 '6874',
 '541',
 '4886',
 '1206',
 '1200',
 '780',
 '1258',
 '1208',
 '4963',
 '912',
 '595',
 '8961',
 '1222',
 '380',
 '750',
 '1682',
 '648',
 '1073',
 '4878',
 '377',
 '1097',
 '1580',
 '111',
 '924',
 '34',
 '5418',
 '5989',
 '33794',
 '60069',
 '7438',
 '2502',
 '778',
 '1617',
 '4011',
 '3793',
 '2716',
 '68954',
 '1732',
 '2997',
 '6',
 '5618',
 '1527',
 '3996',
 '1968',
 '68157',
 '223',
 '3949',
 '44191',
 '592',
 '2324',
 '733',
 '904',
 '165',
 '5445',
 '2918',
 '1259',
 '1721',
 '23

In [23]:
# Interactive API exploration of KeyedVectors; prints a long help text and
# can be dropped from a finished notebook.
help(item2v.wv)

Help on KeyedVectors in module gensim.models.keyedvectors object:

class KeyedVectors(gensim.utils.SaveLoad)
 |  Serialize/deserialize objects from disk, by equipping them with the `save()` / `load()` methods.
 |  
 |  --------
 |  This uses pickle internally (among other techniques), so objects must not contain unpicklable attributes
 |  such as lambda functions etc.
 |  
 |  Method resolution order:
 |      KeyedVectors
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __contains__(self, key)
 |  
 |  __getitem__(self, key_or_keys)
 |      Get vector representation of `key_or_keys`.
 |      
 |      Parameters
 |      ----------
 |      key_or_keys : {str, list of str, int, list of int}
 |          Requested key or list-of-keys.
 |      
 |      Returns
 |      -------
 |      numpy.ndarray
 |          Vector representation for `key_or_keys` (1D if `key_or_keys` is single key, otherwise - 2D).
 |  
 |  __init__(self, vector_size, count=0, d

In [24]:
# Number of rows in the normalized vector matrix, i.e. how many movieIds
# made it into the vocabulary.
len(item2v.wv.get_normed_vectors())

1956

In [25]:
# Export the embeddings in word2vec format to "item2v.txt"
# (relative path, so it is written to the current working directory).
item2v.wv.save_word2vec_format("item2v.txt")

In [26]:
len(item2v.wv.index_to_key)

1956

In [27]:
item2v.wv

<gensim.models.keyedvectors.KeyedVectors at 0x27659c6acc0>

In [28]:
# Interactive API exploration only; unit_normalize_all itself is not called.
help(item2v.wv.unit_normalize_all)

Help on method unit_normalize_all in module gensim.models.keyedvectors:

unit_normalize_all() method of gensim.models.keyedvectors.KeyedVectors instance
    Destructively scale all vectors to unit-length.
    
    You cannot sensibly continue training after such a step.



In [29]:
# Map every vocabulary key (movieId string) to its embedding vector.
temp = {key: item2v.wv[key] for key in item2v.wv.index_to_key}

In [30]:
temp

{'318': array([-2.60985821e-01,  3.97552788e-01,  1.59960762e-01,  3.49834114e-01,
         1.14019044e-01, -7.58800566e-01,  2.52932996e-01,  9.75248456e-01,
        -4.21704113e-01, -3.28733742e-01, -4.61125761e-01, -7.60758281e-01,
        -3.37395817e-01,  9.96729061e-02,  2.47774348e-01, -2.42963821e-01,
         3.57817449e-02, -5.25798440e-01,  1.68532029e-01, -8.26534390e-01,
         2.36786932e-01,  1.24271557e-01, -2.67691705e-02, -3.54674518e-01,
        -1.20053187e-01, -6.16437793e-02, -5.21998763e-01, -8.36883187e-02,
        -4.67460126e-01,  1.38188437e-01,  6.23411298e-01, -2.16533005e-01,
        -7.25903288e-02, -1.88991353e-01, -8.48379508e-02,  5.98323762e-01,
         3.59088629e-01, -5.90995491e-01, -1.70040682e-01, -8.73986721e-01,
         5.12765720e-02, -5.10087907e-01, -1.68231025e-01,  1.61718398e-01,
         2.62945890e-01, -5.17029501e-02, -2.67585546e-01,  8.71705785e-02,
         1.86658232e-04,  2.62656152e-01,  4.03812788e-02, -3.42598319e-01,
     

In [31]:
# Tabular view of the embeddings: one row per movieId (the index),
# one column per embedding dimension.
pd.DataFrame.from_dict(temp,orient="index")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
318,-0.260986,0.397553,0.159961,0.349834,0.114019,-0.758801,0.252933,0.975248,-0.421704,-0.328734,...,0.362420,0.220012,-0.182297,0.177420,0.781304,0.342519,0.081883,-0.462397,0.285026,-0.006304
356,-0.292787,0.378320,0.152198,0.372961,0.156777,-0.769726,0.234118,0.960330,-0.398565,-0.328241,...,0.366839,0.169871,-0.194308,0.152331,0.738061,0.344238,0.103146,-0.460681,0.275536,0.012118
296,-0.263721,0.387719,0.152423,0.378977,0.158270,-0.746364,0.224727,0.993158,-0.450316,-0.335882,...,0.345735,0.247031,-0.181710,0.183994,0.784877,0.336173,0.078342,-0.475059,0.311938,-0.013186
593,-0.350238,0.414212,0.146770,0.134235,0.163511,-0.736748,0.277760,0.974438,-0.362014,-0.264107,...,0.389371,0.089065,-0.192828,0.122695,0.817593,0.523954,0.193565,-0.425784,0.350681,-0.041455
2571,-0.457623,0.599330,-0.090084,-0.100383,-0.136350,-0.898079,0.333388,0.980174,-0.327240,-0.191363,...,0.500874,0.138903,-0.164614,0.086055,0.896714,0.421210,0.327467,-0.595426,0.252633,0.028030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7983,-0.049875,0.071785,-0.001305,-0.016025,-0.003352,-0.118678,0.026309,0.114335,-0.047511,-0.023374,...,0.054643,0.019705,-0.024812,0.019819,0.100877,0.057890,0.031780,-0.066579,0.025277,-0.005680
3018,-0.038766,0.063002,0.005063,-0.012149,-0.016286,-0.102192,0.042878,0.115286,-0.043832,-0.026418,...,0.047093,0.026311,-0.015990,0.015302,0.085814,0.057099,0.025543,-0.062985,0.024117,0.004557
72733,-0.026085,0.063432,-0.001511,-0.007001,-0.016497,-0.084814,0.031342,0.089767,-0.025385,-0.019967,...,0.051297,0.013522,-0.023023,0.016681,0.076148,0.046648,0.020416,-0.056725,0.025174,0.001621
1797,-0.042420,0.048730,-0.004828,-0.013935,-0.015172,-0.080543,0.025071,0.091453,-0.026000,-0.026243,...,0.038998,0.015141,-0.021404,0.010489,0.076779,0.045086,0.032522,-0.054135,0.016038,-0.012094


In [32]:
# Start a movie table from the vocabulary: one row per movieId string,
# in vocabulary order.
df_user_movie = pd.DataFrame({"movie_id": item2v.wv.index_to_key})

In [33]:
df_user_movie

Unnamed: 0,movie_id
0,318
1,356
2,296
3,593
4,2571
...,...
1951,7983
1952,3018
1953,72733
1954,1797


In [34]:
# Attach each movie's embedding vector (a numpy array) next to its id.
df_user_movie["movie_vec"] = df_user_movie["movie_id"].map(item2v.wv.get_vector)

In [35]:
df_user_movie

Unnamed: 0,movie_id,movie_vec
0,318,"[-0.26098582, 0.3975528, 0.15996076, 0.3498341..."
1,356,"[-0.29278684, 0.37832025, 0.15219828, 0.372960..."
2,296,"[-0.26372126, 0.387719, 0.152423, 0.37897727, ..."
3,593,"[-0.35023752, 0.4142115, 0.14676973, 0.1342354..."
4,2571,"[-0.4576228, 0.59932977, -0.09008434, -0.10038..."
...,...,...
1951,7983,"[-0.04987536, 0.071785115, -0.0013049456, -0.0..."
1952,3018,"[-0.038766336, 0.063001536, 0.00506285, -0.012..."
1953,72733,"[-0.026085202, 0.06343186, -0.0015110146, -0.0..."
1954,1797,"[-0.042420436, 0.04873002, -0.0048282538, -0.0..."


In [36]:
# Check the column dtypes: both are object (movie_id holds strings,
# movie_vec holds numpy arrays).
df_user_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1956 entries, 0 to 1955
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movie_id   1956 non-null   object
 1   movie_vec  1956 non-null   object
dtypes: object(2)
memory usage: 30.7+ KB


In [37]:
# Display-only: the result is not assigned, so df_user_movie is unchanged.
# movie_id already holds strings taken from the vocabulary.
df_user_movie["movie_id"].astype(str)

0         318
1         356
2         296
3         593
4        2571
        ...  
1951     7983
1952     3018
1953    72733
1954     1797
1955    52287
Name: movie_id, Length: 1956, dtype: object

In [38]:
# Look up the row for movieId "318"; the comparison uses a string because
# movie_id holds strings.
df_user_movie[df_user_movie["movie_id"]=="318"]

Unnamed: 0,movie_id,movie_vec
0,318,"[-0.26098582, 0.3975528, 0.15996076, 0.3498341..."


#### 电影数据集

In [39]:
# Load the ml-latest-small movie metadata (columns: movieId, title, genres).
# The path is relative to the notebook's working directory.
df_movie = pd.read_csv("./dataset/datas/ml-latest-small/movies.csv")

In [40]:
df_movie

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [41]:
# Check dtypes: movieId is read as int64 here, unlike the string movie_id
# keys that come from the embedding vocabulary.
df_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [42]:
# Cast the join key to str so it matches the string movie_id keys of
# df_user_movie; merging an int64 key against string keys would not match.
df_movie["movieId"] = df_movie["movieId"].astype(str)

In [43]:
# Join embeddings with movie metadata (default inner join) on the string
# movie id; both key columns (movie_id and movieId) are kept in the result.
df_merge = df_user_movie.merge(
    df_movie,
    left_on="movie_id",
    right_on="movieId",
)

In [44]:
df_merge

Unnamed: 0,movie_id,movie_vec,movieId,title,genres
0,318,"[-0.26098582, 0.3975528, 0.15996076, 0.3498341...",318,"Shawshank Redemption, The (1994)",Crime|Drama
1,356,"[-0.29278684, 0.37832025, 0.15219828, 0.372960...",356,Forrest Gump (1994),Comedy|Drama|Romance|War
2,296,"[-0.26372126, 0.387719, 0.152423, 0.37897727, ...",296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,593,"[-0.35023752, 0.4142115, 0.14676973, 0.1342354...",593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
4,2571,"[-0.4576228, 0.59932977, -0.09008434, -0.10038...",2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
...,...,...,...,...,...
1951,7983,"[-0.04987536, 0.071785115, -0.0013049456, -0.0...",7983,Broadway Danny Rose (1984),Comedy
1952,3018,"[-0.038766336, 0.063001536, 0.00506285, -0.012...",3018,Re-Animator (1985),Comedy|Horror|Sci-Fi
1953,72733,"[-0.026085202, 0.06343186, -0.0015110146, -0.0...",72733,Invictus (2009),Drama
1954,1797,"[-0.042420436, 0.04873002, -0.0048282538, -0.0...",1797,Everest (1998),Documentary|IMAX


In [45]:
# Query movie used by the lookup cells below (a string, like the merge keys).
movie_id = '318'

In [46]:
# Embedding of the query movie: select its movie_vec cell and take the
# first (only) match.
df_merge.loc[df_merge["movieId"] == movie_id, "movie_vec"].iloc[0]

array([-2.60985821e-01,  3.97552788e-01,  1.59960762e-01,  3.49834114e-01,
        1.14019044e-01, -7.58800566e-01,  2.52932996e-01,  9.75248456e-01,
       -4.21704113e-01, -3.28733742e-01, -4.61125761e-01, -7.60758281e-01,
       -3.37395817e-01,  9.96729061e-02,  2.47774348e-01, -2.42963821e-01,
        3.57817449e-02, -5.25798440e-01,  1.68532029e-01, -8.26534390e-01,
        2.36786932e-01,  1.24271557e-01, -2.67691705e-02, -3.54674518e-01,
       -1.20053187e-01, -6.16437793e-02, -5.21998763e-01, -8.36883187e-02,
       -4.67460126e-01,  1.38188437e-01,  6.23411298e-01, -2.16533005e-01,
       -7.25903288e-02, -1.88991353e-01, -8.48379508e-02,  5.98323762e-01,
        3.59088629e-01, -5.90995491e-01, -1.70040682e-01, -8.73986721e-01,
        5.12765720e-02, -5.10087907e-01, -1.68231025e-01,  1.61718398e-01,
        2.62945890e-01, -5.17029501e-02, -2.67585546e-01,  8.71705785e-02,
        1.86658232e-04,  2.62656152e-01,  4.03812788e-02, -3.42598319e-01,
       -2.76455134e-01,  

In [47]:
# Store the query movie's embedding vector (first matching row).
is_query_movie = df_merge["movieId"] == movie_id
movie_embed = df_merge.loc[is_query_movie, "movie_vec"].iloc[0]

In [48]:
movie_embed

array([-2.60985821e-01,  3.97552788e-01,  1.59960762e-01,  3.49834114e-01,
        1.14019044e-01, -7.58800566e-01,  2.52932996e-01,  9.75248456e-01,
       -4.21704113e-01, -3.28733742e-01, -4.61125761e-01, -7.60758281e-01,
       -3.37395817e-01,  9.96729061e-02,  2.47774348e-01, -2.42963821e-01,
        3.57817449e-02, -5.25798440e-01,  1.68532029e-01, -8.26534390e-01,
        2.36786932e-01,  1.24271557e-01, -2.67691705e-02, -3.54674518e-01,
       -1.20053187e-01, -6.16437793e-02, -5.21998763e-01, -8.36883187e-02,
       -4.67460126e-01,  1.38188437e-01,  6.23411298e-01, -2.16533005e-01,
       -7.25903288e-02, -1.88991353e-01, -8.48379508e-02,  5.98323762e-01,
        3.59088629e-01, -5.90995491e-01, -1.70040682e-01, -8.73986721e-01,
        5.12765720e-02, -5.10087907e-01, -1.68231025e-01,  1.61718398e-01,
        2.62945890e-01, -5.17029501e-02, -2.67585546e-01,  8.71705785e-02,
        1.86658232e-04,  2.62656152e-01,  4.03812788e-02, -3.42598319e-01,
       -2.76455134e-01,  

In [49]:
# Metadata and vector for movieId "1", one of the two movies compared below.
df_merge[df_merge["movieId"] == "1"]

Unnamed: 0,movie_id,movie_vec,movieId,title,genres
16,1,"[-0.16623561, 0.27933717, 0.07687131, 0.132855...",1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [50]:
# Metadata and vector for movieId "2", the second movie in the comparison.
df_merge[df_merge["movieId"] == "2"]

Unnamed: 0,movie_id,movie_vec,movieId,title,genres
164,2,"[-0.13055372, 0.21916679, 0.0254016, 0.0636036...",2,Jumanji (1995),Adventure|Children|Fantasy


In [51]:
# Cosine similarity between the embeddings of movieId "1" and movieId "2".
item2v.wv.similarity("1","2")

0.996106

In [52]:
# Interactive API exploration: distance() is documented as 1 - similarity().
help(item2v.wv.distance)

Help on method distance in module gensim.models.keyedvectors:

distance(w1, w2) method of gensim.models.keyedvectors.KeyedVectors instance
    Compute cosine distance between two keys.
    Calculate 1 - :meth:`~gensim.models.keyedvectors.KeyedVectors.similarity`.
    
    Parameters
    ----------
    w1 : str
        Input key.
    w2 : str
        Input key.
    
    Returns
    -------
    float
        Distance between `w1` and `w2`.



In [53]:
# Cosine similarity of every movie to the query movie held in `movie_id`.
# similarity() is called directly: distance() is documented as
# 1 - similarity(), so computing 1 - distance() would only add a floating-point
# round trip. The query id comes from `movie_id` rather than a second
# hard-coded literal, so changing the query in one place updates this column.
df_merge["sim_value"] = df_merge["movie_id"].map(
    lambda other_id: item2v.wv.similarity(movie_id, other_id)
)

In [54]:
# Rank all movies by similarity to the query movie, most similar first.
# The query movie itself is at the top with similarity 1.
df_merge.sort_values("sim_value",ascending=False)

Unnamed: 0,movie_id,movie_vec,movieId,title,genres,sim_value
0,318,"[-0.26098582, 0.3975528, 0.15996076, 0.3498341...",318,"Shawshank Redemption, The (1994)",Crime|Drama,1.000000
53,293,"[-0.22644289, 0.31886825, 0.12365857, 0.286758...",293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,0.998640
2,296,"[-0.26372126, 0.387719, 0.152423, 0.37897727, ...",296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,0.998498
5,260,"[-0.24782567, 0.37247637, 0.14847778, 0.320460...",260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,0.997911
1,356,"[-0.29278684, 0.37832025, 0.15219828, 0.372960...",356,Forrest Gump (1994),Comedy|Drama|Romance|War,0.997003
...,...,...,...,...,...,...
36,1197,"[-0.09267239, 0.6764026, 0.08897273, -0.405202...",1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,0.757913
14,1210,"[-0.04332845, 0.69028693, 0.060294375, -0.3882...",1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,0.752987
45,1213,"[-0.023133853, 0.6564915, 0.044946924, -0.3676...",1213,Goodfellas (1990),Crime|Drama,0.746128
10,1198,"[-0.04698544, 0.65205395, 0.121058024, -0.4372...",1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,0.733670


### pyspark.ml word2vec

In [55]:
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352
