### 知识：

1. word2vec：输入(doc, words)，得到word embedding
2. item2vec：输入(userid, itemids)，得到item embedding

说明：

1. 使用标题/内容的分词embedding作推荐，属于内容相似推荐
2. 使用行为列表训练embedding作推荐，属于行为相关推荐，效果通常比内容相似推荐更好

延伸：

1. 把word embedding进行加和、平均，就得到了document embedding；
2. 把item embedding进行加和、平均，就得到了user embedding；

In [1]:
import pandas as pd

import numpy as np

In [2]:
# Load the MovieLens "ml-latest-small" ratings table
# (columns shown in the output: userId, movieId, rating, timestamp).
df = pd.read_csv(filepath_or_buffer="./dataset/datas/ml-latest-small/ratings.csv")

In [3]:
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [55]:
df["movieId"].nunique()

9724

In [4]:
df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [5]:
# Keep only the ratings that are at or above the overall mean rating
# (about 3.50 according to the describe() output); these act as "liked" items.
df_new = df.loc[df["rating"].ge(df["rating"].mean())]

In [6]:
df_new

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100830,610,166528,4.0,1493879365
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047


In [56]:
df_new["movieId"].nunique()

6298

In [62]:
# Sanity check: collapse to one row per (userId, movieId) pair, then count the
# distinct movieId values among those pairs. The recorded result (6298) equals
# the direct df_new["movieId"].nunique() count.
df_new.groupby(["userId","movieId"]).count().reset_index()["movieId"].nunique()

6298

In [7]:
# Aggregate per user: one row per userId, with that user's movieIds joined into
# a single space-separated string. Preview only; the result is not stored here.
df_new.groupby("userId")["movieId"].apply(lambda ids: " ".join(map(str, ids))).reset_index()

Unnamed: 0,userId,movieId
0,1,1 3 6 47 50 101 110 151 157 163 216 231 235 26...
1,2,333 1704 3578 6874 46970 48516 58559 60756 681...
2,3,849 1587 2288 2851 3024 3703 4518 5181 5746 57...
3,4,106 125 162 176 215 232 260 265 319 342 345 34...
4,5,1 21 34 36 50 58 110 232 247 261 290 296 367 4...
...,...,...
604,606,17 18 29 32 46 50 68 70 73 80 82 147 154 156 1...
605,607,1 36 86 110 150 165 188 241 292 318 366 377 38...
606,608,10 16 47 50 110 170 172 293 296 318 333 353 38...
607,609,10 253 296 318 356 457 590 731 1150 1161


In [8]:
# Build the per-user "sentence" table: one row per userId whose movieId column
# holds all of that user's retained movie ids as a space-separated string.
# NOTE(review): ids are joined in the row order of df_new; in the displayed
# output that looks like ascending movieId rather than rating time - confirm
# this is the intended sequence order for the embedding model.
df_group = (
    df_new.groupby("userId")["movieId"]
    .apply(lambda ids: " ".join(map(str, ids)))
    .reset_index()
)

In [9]:
df_group

Unnamed: 0,userId,movieId
0,1,1 3 6 47 50 101 110 151 157 163 216 231 235 26...
1,2,333 1704 3578 6874 46970 48516 58559 60756 681...
2,3,849 1587 2288 2851 3024 3703 4518 5181 5746 57...
3,4,106 125 162 176 215 232 260 265 319 342 345 34...
4,5,1 21 34 36 50 58 110 232 247 261 290 296 367 4...
...,...,...
604,606,17 18 29 32 46 50 68 70 73 80 82 147 154 156 1...
605,607,1 36 86 110 150 165 188 241 292 318 366 377 38...
606,608,10 16 47 50 110 170 172 293 296 318 333 353 38...
607,609,10 253 296 318 356 457 590 731 1150 1161


#### gensim word2vec

In [10]:
from gensim.models import Word2Vec

In [11]:
# Tokenise each user's space-separated id string into a list of id strings;
# this list-per-row column is what gets fed to Word2Vec as the corpus.
df_group["movieIds"] = df_group["movieId"].str.split()

In [12]:
df_group

Unnamed: 0,userId,movieId,movieIds
0,1,1 3 6 47 50 101 110 151 157 163 216 231 235 26...,"[1, 3, 6, 47, 50, 101, 110, 151, 157, 163, 216..."
1,2,333 1704 3578 6874 46970 48516 58559 60756 681...,"[333, 1704, 3578, 6874, 46970, 48516, 58559, 6..."
2,3,849 1587 2288 2851 3024 3703 4518 5181 5746 57...,"[849, 1587, 2288, 2851, 3024, 3703, 4518, 5181..."
3,4,106 125 162 176 215 232 260 265 319 342 345 34...,"[106, 125, 162, 176, 215, 232, 260, 265, 319, ..."
4,5,1 21 34 36 50 58 110 232 247 261 290 296 367 4...,"[1, 21, 34, 36, 50, 58, 110, 232, 247, 261, 29..."
...,...,...,...
604,606,17 18 29 32 46 50 68 70 73 80 82 147 154 156 1...,"[17, 18, 29, 32, 46, 50, 68, 70, 73, 80, 82, 1..."
605,607,1 36 86 110 150 165 188 241 292 318 366 377 38...,"[1, 36, 86, 110, 150, 165, 188, 241, 292, 318,..."
606,608,10 16 47 50 110 170 172 293 296 318 333 353 38...,"[10, 16, 47, 50, 110, 170, 172, 293, 296, 318,..."
607,609,10 253 296 318 356 457 590 731 1150 1161,"[10, 253, 296, 318, 356, 457, 590, 731, 1150, ..."


In [13]:
# Train item embeddings: each user's movie-id list is treated as one sentence and
# each movie id as one token. All hyper-parameters are left at library defaults.
# The outputs further down show 100-dimensional vectors and a vocabulary of 1956
# ids (out of 6298 distinct ids in df_new) - presumably the default minimum
# frequency cut-off drops the rarer movies; verify against the gensim docs.
# NOTE(review): no seed / worker count is pinned here, so the exact vector values
# may differ between runs - confirm whether reproducibility matters.
item2v = Word2Vec(sentences=df_group["movieIds"])

In [14]:
item2v

<gensim.models.word2vec.Word2Vec at 0x1ed6754fda0>

In [15]:
item2v.wv["1"]

array([-0.15190507,  0.2586639 ,  0.06611691,  0.17004797,  0.06953306,
       -0.45716375,  0.096256  ,  0.64996797, -0.3255526 , -0.19567816,
       -0.29271615, -0.478204  , -0.17887942,  0.13341768,  0.04070657,
       -0.21169496, -0.00904123, -0.3906236 , -0.01187412, -0.5733833 ,
        0.19068348, -0.00510782, -0.06955276, -0.19758013, -0.10984576,
       -0.08012579, -0.317151  , -0.17239542, -0.2982334 ,  0.09005812,
        0.36776802, -0.05198264, -0.08752274, -0.11241957, -0.01932125,
        0.38079387,  0.1823208 , -0.40112254, -0.09999765, -0.65784186,
        0.00653287, -0.35071254, -0.11184344,  0.0939784 ,  0.14873372,
       -0.10611552, -0.1942084 ,  0.11721998,  0.06718684,  0.16287607,
        0.02969293, -0.2709642 , -0.15929604,  0.16926478, -0.40014803,
        0.18671153,  0.16887298, -0.01429624, -0.3171072 , -0.01916993,
        0.11127859,  0.15459995, -0.14626126,  0.05873318, -0.34545687,
        0.201513  ,  0.16042817,  0.13065578, -0.31269646,  0.38

In [16]:
type(item2v.wv["1"])

numpy.ndarray

In [17]:
item2v.wv["1"].shape

(100,)

In [18]:
item2v.wv.key_to_index

{'318': 0,
 '356': 1,
 '296': 2,
 '593': 3,
 '2571': 4,
 '260': 5,
 '2959': 6,
 '527': 7,
 '1196': 8,
 '110': 9,
 '1198': 10,
 '50': 11,
 '858': 12,
 '2858': 13,
 '1210': 14,
 '589': 15,
 '1': 16,
 '4993': 17,
 '480': 18,
 '2028': 19,
 '7153': 20,
 '47': 21,
 '457': 22,
 '608': 23,
 '5952': 24,
 '150': 25,
 '2762': 26,
 '4226': 27,
 '1270': 28,
 '32': 29,
 '3578': 30,
 '364': 31,
 '4306': 32,
 '588': 33,
 '58559': 34,
 '1214': 35,
 '1197': 36,
 '1193': 37,
 '79132': 38,
 '1221': 39,
 '1089': 40,
 '7361': 41,
 '1291': 42,
 '2329': 43,
 '1704': 44,
 '1213': 45,
 '590': 46,
 '1136': 47,
 '1036': 48,
 '1265': 49,
 '4973': 50,
 '6377': 51,
 '1240': 52,
 '293': 53,
 '48516': 54,
 '4995': 55,
 '3147': 56,
 '6539': 57,
 '6874': 58,
 '541': 59,
 '4886': 60,
 '1206': 61,
 '1200': 62,
 '780': 63,
 '1258': 64,
 '1208': 65,
 '4963': 66,
 '912': 67,
 '595': 68,
 '8961': 69,
 '1222': 70,
 '380': 71,
 '750': 72,
 '1682': 73,
 '648': 74,
 '1073': 75,
 '4878': 76,
 '377': 77,
 '1097': 78,
 '1580': 79,
 

In [19]:
item2v.wv.index_to_key

['318',
 '356',
 '296',
 '593',
 '2571',
 '260',
 '2959',
 '527',
 '1196',
 '110',
 '1198',
 '50',
 '858',
 '2858',
 '1210',
 '589',
 '1',
 '4993',
 '480',
 '2028',
 '7153',
 '47',
 '457',
 '608',
 '5952',
 '150',
 '2762',
 '4226',
 '1270',
 '32',
 '3578',
 '364',
 '4306',
 '588',
 '58559',
 '1214',
 '1197',
 '1193',
 '79132',
 '1221',
 '1089',
 '7361',
 '1291',
 '2329',
 '1704',
 '1213',
 '590',
 '1136',
 '1036',
 '1265',
 '4973',
 '6377',
 '1240',
 '293',
 '48516',
 '4995',
 '3147',
 '6539',
 '6874',
 '541',
 '4886',
 '1206',
 '1200',
 '780',
 '1258',
 '1208',
 '4963',
 '912',
 '595',
 '8961',
 '1222',
 '380',
 '750',
 '1682',
 '648',
 '1073',
 '4878',
 '377',
 '1097',
 '1580',
 '111',
 '924',
 '34',
 '5418',
 '5989',
 '33794',
 '60069',
 '7438',
 '2502',
 '778',
 '1617',
 '4011',
 '3793',
 '2716',
 '68954',
 '1732',
 '2997',
 '6',
 '5618',
 '1527',
 '3996',
 '1968',
 '68157',
 '223',
 '3949',
 '44191',
 '592',
 '2324',
 '733',
 '904',
 '165',
 '5445',
 '2918',
 '1259',
 '1721',
 '23

In [20]:
help(item2v.wv)

Help on KeyedVectors in module gensim.models.keyedvectors object:

class KeyedVectors(gensim.utils.SaveLoad)
 |  Serialize/deserialize objects from disk, by equipping them with the `save()` / `load()` methods.
 |  
 |  --------
 |  This uses pickle internally (among other techniques), so objects must not contain unpicklable attributes
 |  such as lambda functions etc.
 |  
 |  Method resolution order:
 |      KeyedVectors
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __contains__(self, key)
 |  
 |  __getitem__(self, key_or_keys)
 |      Get vector representation of `key_or_keys`.
 |      
 |      Parameters
 |      ----------
 |      key_or_keys : {str, list of str, int, list of int}
 |          Requested key or list-of-keys.
 |      
 |      Returns
 |      -------
 |      numpy.ndarray
 |          Vector representation for `key_or_keys` (1D if `key_or_keys` is single key, otherwise - 2D).
 |  
 |  __init__(self, vector_size, count=0, d

In [21]:
len(item2v.wv.get_normed_vectors())

1956

In [22]:
# Export the trained item vectors to "item2v.txt" in the word2vec format
# (written relative to the current working directory).
item2v.wv.save_word2vec_format(fname="item2v.txt")

In [23]:
len(item2v.wv.index_to_key)

1956

In [24]:
item2v.wv

<gensim.models.keyedvectors.KeyedVectors at 0x1ed6754fdd8>

In [25]:
help(item2v.wv.unit_normalize_all)

Help on method unit_normalize_all in module gensim.models.keyedvectors:

unit_normalize_all() method of gensim.models.keyedvectors.KeyedVectors instance
    Destructively scale all vectors to unit-length.
    
    You cannot sensibly continue training after such a step.



In [26]:
# Map every movie id in the model vocabulary to its embedding vector.
temp = {key: item2v.wv[key] for key in item2v.wv.index_to_key}

In [27]:
temp

{'318': array([-0.26765195,  0.36566406,  0.16020532,  0.42120892,  0.17477362,
        -0.7899041 ,  0.24813364,  1.0139135 , -0.4614174 , -0.3714401 ,
        -0.4700639 , -0.75197387, -0.370563  ,  0.18281002,  0.29947788,
        -0.24675854,  0.06587697, -0.50463206,  0.14057325, -0.875108  ,
         0.22207916,  0.11727619, -0.06861186, -0.40915707, -0.13602744,
        -0.09969606, -0.5249416 , -0.01299041, -0.48441985,  0.17093688,
         0.6522949 , -0.27157384, -0.12148076, -0.22043222, -0.13203163,
         0.6420953 ,  0.43510342, -0.59490657, -0.16469926, -0.8617905 ,
         0.05113297, -0.53878623, -0.16024064,  0.1786031 ,  0.26405242,
         0.00632328, -0.21793802,  0.12322978,  0.01488166,  0.25357795,
         0.00988423, -0.36907932, -0.27013364,  0.2917181 , -0.57912093,
         0.39912686,  0.29261142,  0.04140107, -0.42731377,  0.04911773,
         0.04629497,  0.3036391 , -0.21465842,  0.06242179, -0.31664312,
         0.36813235,  0.21400744,  0.1743630

In [28]:
pd.DataFrame.from_dict(temp,orient="index")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
318,-0.267652,0.365664,0.160205,0.421209,0.174774,-0.789904,0.248134,1.013914,-0.461417,-0.371440,...,0.401888,0.238992,-0.192188,0.155017,0.807175,0.281675,0.063094,-0.499734,0.318817,-0.005941
356,-0.284456,0.360290,0.168543,0.434942,0.219403,-0.796794,0.247805,1.009220,-0.440855,-0.370575,...,0.409074,0.197301,-0.197344,0.130949,0.782797,0.291503,0.094106,-0.482380,0.315338,0.001335
296,-0.267334,0.375213,0.144903,0.417853,0.206227,-0.783044,0.229496,1.035649,-0.492550,-0.374498,...,0.390729,0.270123,-0.184072,0.165484,0.819511,0.281559,0.078092,-0.505938,0.341217,-0.027884
593,-0.295092,0.382053,0.180560,0.148408,0.221315,-0.712468,0.291614,1.001207,-0.373189,-0.271004,...,0.404142,0.096571,-0.184633,0.090370,0.835191,0.471081,0.199219,-0.394485,0.361563,-0.050212
2571,-0.429888,0.625265,-0.116839,-0.051448,-0.136160,-0.937055,0.298402,1.045905,-0.356930,-0.165504,...,0.557058,0.154558,-0.163075,0.061338,0.935982,0.403253,0.359281,-0.608337,0.221908,-0.018150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7983,-0.046536,0.070413,-0.004825,-0.011414,-0.002056,-0.115772,0.021678,0.113188,-0.047682,-0.019376,...,0.057175,0.020318,-0.023513,0.016336,0.099339,0.053023,0.034367,-0.065912,0.020819,-0.008893
3018,-0.043771,0.072572,-0.000085,-0.006266,-0.016701,-0.118068,0.042700,0.135930,-0.051361,-0.026783,...,0.060309,0.029271,-0.019138,0.013108,0.101319,0.059863,0.032045,-0.076424,0.025366,0.002642
72733,-0.028427,0.068011,-0.005116,-0.003517,-0.017091,-0.091998,0.028977,0.099329,-0.029323,-0.018159,...,0.058500,0.015356,-0.025027,0.014570,0.082923,0.046211,0.024551,-0.064312,0.024717,0.000131
1797,-0.045132,0.055917,-0.008272,-0.008186,-0.015442,-0.092184,0.024764,0.106388,-0.031553,-0.026015,...,0.048948,0.016996,-0.023298,0.008397,0.088077,0.044923,0.037450,-0.063238,0.016330,-0.014734


In [29]:
# One row per movie id in the model vocabulary, in the vocabulary's own order.
df_user_movie = pd.DataFrame({"movie_id": item2v.wv.index_to_key})

In [30]:
df_user_movie

Unnamed: 0,movie_id
0,318
1,356
2,296
3,593
4,2571
...,...
1951,7983
1952,3018
1953,72733
1954,1797


In [31]:
# Attach each movie's embedding (a numpy array) as an object-typed column.
df_user_movie["movie_vec"] = df_user_movie["movie_id"].map(item2v.wv.get_vector)

In [32]:
df_user_movie

Unnamed: 0,movie_id,movie_vec
0,318,"[-0.26765195, 0.36566406, 0.16020532, 0.421208..."
1,356,"[-0.2844562, 0.36029017, 0.16854262, 0.4349422..."
2,296,"[-0.26733398, 0.37521252, 0.14490281, 0.417853..."
3,593,"[-0.2950925, 0.38205284, 0.18056022, 0.1484075..."
4,2571,"[-0.4298875, 0.625265, -0.11683946, -0.0514475..."
...,...,...
1951,7983,"[-0.04653603, 0.07041316, -0.0048250565, -0.01..."
1952,3018,"[-0.04377124, 0.07257209, -8.487021e-05, -0.00..."
1953,72733,"[-0.028427375, 0.06801068, -0.0051156725, -0.0..."
1954,1797,"[-0.045132343, 0.055917025, -0.00827237, -0.00..."


In [33]:
df_user_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1956 entries, 0 to 1955
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movie_id   1956 non-null   object
 1   movie_vec  1956 non-null   object
dtypes: object(2)
memory usage: 30.7+ KB


In [34]:
# Displays movie_id cast to str. The result is not assigned, so df_user_movie is
# left unchanged (the info() output shows the column already has object dtype).
df_user_movie["movie_id"].astype(str)

0         318
1         356
2         296
3         593
4        2571
        ...  
1951     7983
1952     3018
1953    72733
1954     1797
1955    52287
Name: movie_id, Length: 1956, dtype: object

In [35]:
df_user_movie[df_user_movie["movie_id"]=="318"]

Unnamed: 0,movie_id,movie_vec
0,318,"[-0.26765195, 0.36566406, 0.16020532, 0.421208..."


#### 电影数据集

In [36]:
# Load the MovieLens movie metadata table
# (columns shown in the output: movieId, title, genres).
df_movie = pd.read_csv(filepath_or_buffer="./dataset/datas/ml-latest-small/movies.csv")

In [37]:
df_movie

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [38]:
df_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [41]:
# Cast movieId from int64 (see the df_movie.info() output) to str so it can be
# joined against df_user_movie["movie_id"], whose values are the string tokens
# of the Word2Vec vocabulary. This overwrites the column on the existing frame.
df_movie["movieId"] = df_movie["movieId"].astype(str)

In [42]:
# Inner-join the embedding table with the movie metadata on the (string) movie id,
# so every embedded movie carries its title and genres.
df_merge = df_user_movie.merge(df_movie, left_on="movie_id", right_on="movieId")

In [43]:
df_merge

Unnamed: 0,movie_id,movie_vec,movieId,title,genres
0,318,"[-0.26765195, 0.36566406, 0.16020532, 0.421208...",318,"Shawshank Redemption, The (1994)",Crime|Drama
1,356,"[-0.2844562, 0.36029017, 0.16854262, 0.4349422...",356,Forrest Gump (1994),Comedy|Drama|Romance|War
2,296,"[-0.26733398, 0.37521252, 0.14490281, 0.417853...",296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,593,"[-0.2950925, 0.38205284, 0.18056022, 0.1484075...",593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
4,2571,"[-0.4298875, 0.625265, -0.11683946, -0.0514475...",2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
...,...,...,...,...,...
1951,7983,"[-0.04653603, 0.07041316, -0.0048250565, -0.01...",7983,Broadway Danny Rose (1984),Comedy
1952,3018,"[-0.04377124, 0.07257209, -8.487021e-05, -0.00...",3018,Re-Animator (1985),Comedy|Horror|Sci-Fi
1953,72733,"[-0.028427375, 0.06801068, -0.0051156725, -0.0...",72733,Invictus (2009),Drama
1954,1797,"[-0.045132343, 0.055917025, -0.00827237, -0.00...",1797,Everest (1998),Documentary|IMAX


In [46]:
movie_id = '318'

In [48]:
df_merge[df_merge["movieId"] == movie_id]["movie_vec"].iloc[0]

array([-0.26765195,  0.36566406,  0.16020532,  0.42120892,  0.17477362,
       -0.7899041 ,  0.24813364,  1.0139135 , -0.4614174 , -0.3714401 ,
       -0.4700639 , -0.75197387, -0.370563  ,  0.18281002,  0.29947788,
       -0.24675854,  0.06587697, -0.50463206,  0.14057325, -0.875108  ,
        0.22207916,  0.11727619, -0.06861186, -0.40915707, -0.13602744,
       -0.09969606, -0.5249416 , -0.01299041, -0.48441985,  0.17093688,
        0.6522949 , -0.27157384, -0.12148076, -0.22043222, -0.13203163,
        0.6420953 ,  0.43510342, -0.59490657, -0.16469926, -0.8617905 ,
        0.05113297, -0.53878623, -0.16024064,  0.1786031 ,  0.26405242,
        0.00632328, -0.21793802,  0.12322978,  0.01488166,  0.25357795,
        0.00988423, -0.36907932, -0.27013364,  0.2917181 , -0.57912093,
        0.39912686,  0.29261142,  0.04140107, -0.42731377,  0.04911773,
        0.04629497,  0.3036391 , -0.21465842,  0.06242179, -0.31664312,
        0.36813235,  0.21400744,  0.17436303, -0.39207643,  0.65

In [49]:
# Embedding vector of the query movie: first row of df_merge whose movieId
# equals movie_id (raises IndexError if no row matches).
movie_embed = df_merge.loc[df_merge["movieId"] == movie_id, "movie_vec"].iloc[0]

In [50]:
movie_embed

array([-0.26765195,  0.36566406,  0.16020532,  0.42120892,  0.17477362,
       -0.7899041 ,  0.24813364,  1.0139135 , -0.4614174 , -0.3714401 ,
       -0.4700639 , -0.75197387, -0.370563  ,  0.18281002,  0.29947788,
       -0.24675854,  0.06587697, -0.50463206,  0.14057325, -0.875108  ,
        0.22207916,  0.11727619, -0.06861186, -0.40915707, -0.13602744,
       -0.09969606, -0.5249416 , -0.01299041, -0.48441985,  0.17093688,
        0.6522949 , -0.27157384, -0.12148076, -0.22043222, -0.13203163,
        0.6420953 ,  0.43510342, -0.59490657, -0.16469926, -0.8617905 ,
        0.05113297, -0.53878623, -0.16024064,  0.1786031 ,  0.26405242,
        0.00632328, -0.21793802,  0.12322978,  0.01488166,  0.25357795,
        0.00988423, -0.36907932, -0.27013364,  0.2917181 , -0.57912093,
        0.39912686,  0.29261142,  0.04140107, -0.42731377,  0.04911773,
        0.04629497,  0.3036391 , -0.21465842,  0.06242179, -0.31664312,
        0.36813235,  0.21400744,  0.17436303, -0.39207643,  0.65

In [52]:
df_merge[df_merge["movieId"] == "1"]

Unnamed: 0,movie_id,movie_vec,movieId,title,genres
16,1,"[-0.15190507, 0.2586639, 0.06611691, 0.1700479...",1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [53]:
df_merge[df_merge["movieId"] == "2"]

Unnamed: 0,movie_id,movie_vec,movieId,title,genres
164,2,"[-0.11680536, 0.2070055, 0.012744192, 0.080876...",2,Jumanji (1995),Adventure|Children|Fantasy


In [51]:
item2v.wv.similarity("1","2")

0.9939973

In [63]:
help(item2v.wv.distance)

Help on method distance in module gensim.models.keyedvectors:

distance(w1, w2) method of gensim.models.keyedvectors.KeyedVectors instance
    Compute cosine distance between two keys.
    Calculate 1 - :meth:`~gensim.models.keyedvectors.KeyedVectors.similarity`.
    
    Parameters
    ----------
    w1 : str
        Input key.
    w2 : str
        Input key.
    
    Returns
    -------
    float
        Distance between `w1` and `w2`.



In [67]:
# Cosine similarity between the query movie and every embedded movie.
# Uses the `movie_id` variable set earlier in the notebook instead of repeating
# the literal "318", so changing the query id in one place updates this column.
# Calls similarity() directly: distance() is documented as 1 - similarity(), so
# the previous `1 - distance(...)` was a double negation of the same value.
df_merge["sim_value"] = df_merge["movie_id"].map(
    lambda other_id: item2v.wv.similarity(movie_id, other_id)
)

In [68]:
# Rank all movies from most to least similar to the query movie
# (the query movie itself comes first with similarity 1.0).
df_merge.sort_values(by="sim_value", ascending=False)

Unnamed: 0,movie_id,movie_vec,movieId,title,genres,sim_value
0,318,"[-0.26765195, 0.36566406, 0.16020532, 0.421208...",318,"Shawshank Redemption, The (1994)",Crime|Drama,1.000000
2,296,"[-0.26733398, 0.37521252, 0.14490281, 0.417853...",296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,0.999097
53,293,"[-0.21035893, 0.27823773, 0.11631062, 0.312766...",293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,0.998452
1,356,"[-0.2844562, 0.36029017, 0.16854262, 0.4349422...",356,Forrest Gump (1994),Comedy|Drama|Romance|War,0.997567
5,260,"[-0.26072028, 0.37798178, 0.12138332, 0.318804...",260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,0.996119
...,...,...,...,...,...,...
36,1197,"[-0.0523994, 0.7377284, 0.11406993, -0.3809206...",1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,0.722067
45,1213,"[-0.0013735311, 0.7160999, 0.064513594, -0.325...",1213,Goodfellas (1990),Crime|Drama,0.720990
14,1210,"[-0.009095999, 0.78071564, 0.08618086, -0.3732...",1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,0.719155
10,1198,"[-0.010971045, 0.7289689, 0.14374071, -0.41768...",1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,0.697334


### pyspark.ml word2vec