In [1]:
import pandas as pd
import numpy as np
dtype = [("userId", np.int32), ("movieId", np.int32), ("rating", np.float32)]
dataset = pd.read_csv('./dataset/ratings.csv', usecols=range(3), dtype=dict(dtype))

In [2]:
dataset.head(5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [3]:
users_ratings = dataset.groupby('userId').agg([list])
items_ratings = dataset.groupby('movieId').agg([list])
items_ratings

Unnamed: 0_level_0,userId,rating
Unnamed: 0_level_1,list,list
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2
1,"[1, 5, 7, 15, 17, 18, 19, 21, 27, 31, 32, 33, ...","[4.0, 4.0, 4.5, 2.5, 4.5, 3.5, 4.0, 3.5, 3.0, ..."
2,"[6, 8, 18, 19, 20, 21, 27, 51, 62, 68, 82, 91,...","[4.0, 4.0, 3.0, 3.0, 3.0, 3.5, 4.0, 4.5, 4.0, ..."
3,"[1, 6, 19, 32, 42, 43, 44, 51, 58, 64, 68, 91,...","[4.0, 5.0, 3.0, 3.0, 4.0, 5.0, 3.0, 4.0, 3.0, ..."
4,"[6, 14, 84, 162, 262, 411, 600]","[3.0, 3.0, 3.0, 3.0, 1.0, 2.0, 1.5]"
5,"[6, 31, 43, 45, 58, 66, 68, 84, 103, 107, 111,...","[5.0, 3.0, 5.0, 3.0, 4.0, 4.0, 2.0, 3.0, 4.0, ..."
...,...,...
193581,[184],[4.0]
193583,[184],[3.5]
193585,[184],[3.5]
193587,[184],[3.5]


In [4]:
#computer global mean
global_mean = dataset.rating.mean()

In [5]:
#init bu&bi
bu = dict(zip(users_ratings.index, np.zeros(len(users_ratings.index))))
bi = dict(zip(items_ratings.index, np.zeros(len(items_ratings.index))))

In [6]:
#use 梯度下降optimize
for i in range(20):
    print("iter{}".format(i))
    for uid, iid, real_rating in dataset.itertuples(index=False):
        error = real_rating - (global_mean + bu[uid] + bi[iid])
        bu[uid] += 0.21 * (error - 0.1*bu[uid])
        bi[iid] += 0.21 * (error - 0.1*bi[iid])

iter0
iter1
iter2
iter3
iter4
iter5
iter6
iter7
iter8
iter9
iter10
iter11
iter12
iter13
iter14
iter15
iter16
iter17
iter18
iter19


In [7]:
# predict score
def predict(uid, iid):
    predict_rating = global_mean + bi[iid] + bu[uid]
    return predict_rating

In [8]:
predict(1,1)

4.249813585315364

###交替最小二乘法###

In [9]:
bu = dict(zip(users_ratings.index, np.zeros(len(users_ratings.index))))
bi = dict(zip(items_ratings.index, np.zeros(len(items_ratings.index))))

In [10]:
for i in range(15):
    print('iter{}'.format(i))
    for iid, uids, ratings in items_ratings.itertuples(index=True):
        _sum = 0
        for uid, rating in zip(uids, ratings):
            _sum += rating - global_mean - bu[uid]
        bi[iid] = _sum/(0.1 + len(uids))
    for uid, iids, ratings in users_ratings.itertuples(index=True):
        _sum = 0
        for iid, rating in zip(iids, ratings):
            _sum += rating - global_mean - bi[iid]
        bu[uid] = _sum/(0.1 + len(iids))

iter0
iter1
iter2
iter3
iter4
iter5
iter6
iter7
iter8
iter9
iter10
iter11
iter12
iter13
iter14


In [11]:
predict(1,1)

4.670701061298773

###奇异值分解

In [12]:
users_ratings = dataset.groupby('userId').agg([list])
items_ratings = dataset.groupby('movieId').agg([list])

In [13]:
#初始化矩阵p和q
#each row corresponds to a user and each column corresponds to a random rating.
P = dict(zip(users_ratings.index, np.random.rand(len(users_ratings), 10).astype(np.float32)))

In [14]:
P[2]

array([0.4455267 , 0.65569925, 0.6300857 , 0.96242183, 0.18228525,
       0.7060573 , 0.70598936, 0.4112105 , 0.31449172, 0.68936825],
      dtype=float32)

In [15]:
Q = dict(zip(items_ratings.index, np.random.rand(len(items_ratings), 10).astype(np.float32)))

In [16]:
for i in range(15):
    print('*'*10,i)
    for uid, iid, real_rating in dataset.itertuples(index=False):
        #遍历用户、物品的评分数据 通过用户id到矩阵中获取用户向量
        v_puk = P[uid]
        v_qik = Q[iid]
        # count error
        error = real_rating - np.dot(v_puk,v_qik)
        #0.02lr 0.01reg
        v_puk += 0.02*(error*v_qik-0.01*v_puk)
        v_qik += 0.02*(error*v_puk-0.01*v_qik)

        P[uid] = v_puk
        Q[iid] = v_qik

********** 0
********** 1
********** 2
********** 3
********** 4
********** 5
********** 6
********** 7
********** 8
********** 9
********** 10
********** 11
********** 12
********** 13
********** 14


In [17]:
def predict_2(uid,iid):
    if uid not in users_ratings.index or iid not in items_ratings.index:
        return global_mean
    p_u = P[uid]
    q_i = Q[iid]
    return np.dot(p_u,q_i)

In [18]:
predict_2(1,1)

4.8230743

In [19]:
###TF-IDF

In [20]:
_tags = pd.read_csv('./dataset/tags.csv', usecols=range(1,3)).dropna()

In [21]:
tags = _tags.groupby('movieId').agg(list)

In [22]:
tags.sort_values(by='movieId', inplace=True)
tags

Unnamed: 0_level_0,tag
movieId,Unnamed: 1_level_1
1,"[pixar, pixar, fun]"
2,"[fantasy, magic board game, Robin Williams, game]"
3,"[moldy, old]"
5,"[pregnancy, remake]"
7,[remake]
...,...
183611,"[Comedy, funny, Rachel McAdams]"
184471,"[adventure, Alicia Vikander, video game adapta..."
187593,"[Josh Brolin, Ryan Reynolds, sarcasm]"
187595,"[Emilia Clarke, star wars]"


In [23]:
movies = pd.read_csv('./dataset/movies.csv', index_col='movieId').drop_duplicates()
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))

In [24]:
movies.sort_values(by='movieId', inplace=True)
movies

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
2,Jumanji (1995),"[Adventure, Children, Fantasy]"
3,Grumpier Old Men (1995),"[Comedy, Romance]"
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
5,Father of the Bride Part II (1995),[Comedy]
...,...,...
193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]"
193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]"
193585,Flint (2017),[Drama]
193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]"


In [25]:
movies_index = set(movies.index)&set(tags.index)

In [26]:
#tags多，movies少
new_tags = tags.loc[movies_index]
ret = movies.join(new_tags)

  new_tags = tags.loc[movies_index]


In [27]:
ret

Unnamed: 0_level_0,title,genres,tag
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[pixar, pixar, fun]"
2,Jumanji (1995),"[Adventure, Children, Fantasy]","[fantasy, magic board game, Robin Williams, game]"
3,Grumpier Old Men (1995),"[Comedy, Romance]","[moldy, old]"
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",
5,Father of the Bride Part II (1995),[Comedy],"[pregnancy, remake]"
...,...,...,...
193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]",
193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]",
193585,Flint (2017),[Drama],
193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]",


In [28]:
movie_dataset = pd.DataFrame(
    map(
        lambda x:(x[0], x[1], x[2], x[2]+x[3]) if x[3] is not np.nan else
(x[0], x[1], x[2], []), ret.itertuples())
    , columns = ['movieId','title','genres','tags']
)
movie_dataset.set_index('movieId', inplace=True)

In [29]:
movie_dataset

Unnamed: 0_level_0,title,genres,tags
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[Adventure, Animation, Children, Comedy, Fanta..."
2,Jumanji (1995),"[Adventure, Children, Fantasy]","[Adventure, Children, Fantasy, fantasy, magic ..."
3,Grumpier Old Men (1995),"[Comedy, Romance]","[Comedy, Romance, moldy, old]"
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",[]
5,Father of the Bride Part II (1995),[Comedy],"[Comedy, pregnancy, remake]"
...,...,...,...
193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]",[]
193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]",[]
193585,Flint (2017),[Drama],[]
193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]",[]


In [30]:
dataset = movie_dataset['tags'].values

In [31]:
dataset[20]

['Comedy', 'Crime', 'Thriller', 'Hollywood']

In [32]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

In [33]:
#创建dic对象
dct = Dictionary(dataset)

In [34]:
corpus = [dct.doc2bow(doc) for doc in dataset]#语料库

In [35]:
dataset[1]

['Adventure',
 'Children',
 'Fantasy',
 'fantasy',
 'magic board game',
 'Robin Williams',
 'game']

In [36]:
dct[3]

'Comedy'

In [37]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2)],
 [(0, 1), (2, 1), (4, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(3, 1), (11, 1), (12, 1), (13, 1)],
 [],
 [(3, 1), (14, 1), (15, 1)],
 [],
 [(3, 1), (11, 1), (15, 1)],
 [],
 [],
 [],
 [(3, 1), (11, 1), (16, 1), (17, 1), (18, 1)],
 [],
 [],
 [(16, 1), (17, 1), (18, 1)],
 [],
 [(16, 1), (19, 1), (20, 1)],
 [(11, 1), (16, 1), (21, 1)],
 [],
 [],
 [],
 [(3, 1), (19, 1), (22, 1), (23, 1)],
 [(16, 1), (19, 1), (23, 1), (24, 1), (25, 1), (26, 1)],
 [],
 [],
 [(11, 1), (16, 1), (27, 1)],
 [(16, 1), (28, 1)],
 [],
 [(11, 1), (16, 1), (21, 1), (29, 1)],
 [(0, 1), (4, 1), (16, 1), (25, 1), (30, 1), (31, 1)],
 [],
 [(16, 1), (32, 1), (33, 1)],
 [(15, 1),
  (23, 1),
  (25, 1),
  (30, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 3),
  (40, 1)],
 [(2, 1), (16, 1), (41, 1), (42, 1), (43, 1)],
 [(16, 1), (19, 1), (44, 1), (45, 1)],
 [(2, 1), (3, 1), (46, 1)],
 [(3, 1),
  (11, 1),
  (21, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  

In [38]:
#训练TF-IDF模型，即计算TF-IDF的zhi
model = TfidfModel(corpus)

In [39]:
model[corpus[4]]

[(3, 0.27865135629966864), (14, 0.7029528753875794), (15, 0.6543780838450272)]

In [40]:
_movie_profile = []
for i, data in enumerate(movie_dataset.itertuples()):
    mid = data[0]
    title = data[1]
    genres = data[2]
    vector = model[corpus[i]]
    movie_tags = sorted(vector, key = lambda x:x[1], reverse=True)[:30]
    topN_tags_weights = dict(map(lambda x:(dct[x[0]],x[1]), movie_tags))
    #将类别词的添加进去，设权重为1.0
    for g in genres:
        topN_tags_weights[g] = 1.0
    topN_tags = [i[0] for i in topN_tags_weights.items()]
    _movie_profile.append((mid, title, topN_tags, topN_tags_weights))

movie_profile = pd.DataFrame(_movie_profile, columns=["movieId","title","profile","weights"])
movie_profile.set_index("movieId", inplace=True)

In [41]:
movie_profile

Unnamed: 0_level_0,title,profile,weights
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[pixar, fun, Animation, Children, Fantasy, Adv...","{'pixar': 0.837374709121301, 'fun': 0.34531665..."
2,Jumanji (1995),"[game, magic board game, Robin Williams, fanta...","{'game': 0.49506005899914796, 'magic board gam..."
3,Grumpier Old Men (1995),"[moldy, old, Romance, Comedy]","{'moldy': 0.669101789463952, 'old': 0.66910178..."
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","{'Comedy': 1.0, 'Drama': 1.0, 'Romance': 1.0}"
5,Father of the Bride Part II (1995),"[pregnancy, remake, Comedy]","{'pregnancy': 0.7029528753875794, 'remake': 0...."
...,...,...,...
193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]","{'Action': 1.0, 'Animation': 1.0, 'Comedy': 1...."
193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]","{'Animation': 1.0, 'Comedy': 1.0, 'Fantasy': 1.0}"
193585,Flint (2017),[Drama],{'Drama': 1.0}
193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]","{'Action': 1.0, 'Animation': 1.0}"


In [42]:
#通过标签找到具体的电影
def create_inverted_table(movie_profile):
    inverted_table = {}
    for mid, weights in movie_profile["weights"].items():
        for tag, weight in weights.items():
            #dict用tag作为Key取值，取不到返回[]
            _ = inverted_table.get(tag, [])
            _.append((mid, weight))
            inverted_table.setdefault(tag,_)
    return inverted_table
inverted_table = create_inverted_table(movie_profile)

In [43]:
inverted_table

{'pixar': [(1, 0.837374709121301)],
 'fun': [(1, 0.34531665530514855),
  (89745, 0.3284369053807601),
  (108932, 0.31964654815096755),
  (122918, 0.747908115567127)],
 'Animation': [(1, 1.0),
  (13, 1.0),
  (48, 1.0),
  (239, 1.0),
  (313, 1.0),
  (364, 1.0),
  (551, 1.0),
  (558, 1.0),
  (588, 1.0),
  (594, 1.0),
  (595, 1.0),
  (596, 1.0),
  (610, 1.0),
  (616, 1.0),
  (631, 1.0),
  (661, 1.0),
  (673, 1.0),
  (709, 1.0),
  (720, 1.0),
  (741, 1.0),
  (745, 1.0),
  (783, 1.0),
  (888, 1.0),
  (1022, 1.0),
  (1023, 1.0),
  (1024, 1.0),
  (1025, 1.0),
  (1029, 1.0),
  (1030, 1.0),
  (1032, 1.0),
  (1033, 1.0),
  (1064, 1.0),
  (1148, 1.0),
  (1151, 1.0),
  (1223, 1.0),
  (1274, 1.0),
  (1282, 1.0),
  (1405, 1.0),
  (1489, 1.0),
  (1566, 1.0),
  (1688, 1.0),
  (1881, 1.0),
  (1907, 1.0),
  (1920, 1.0),
  (2018, 1.0),
  (2033, 1.0),
  (2048, 1.0),
  (2078, 1.0),
  (2080, 1.0),
  (2081, 1.0),
  (2085, 1.0),
  (2087, 1.0),
  (2089, 1.0),
  (2090, 1.0),
  (2092, 1.0),
  (2096, 1.0),
  (2099

In [44]:
inverted_table['Thriller']

[(6, 1.0),
 (10, 1.0),
 (20, 1.0),
 (21, 1.0),
 (22, 1.0),
 (23, 1.0),
 (32, 1.0),
 (45, 1.0),
 (47, 1.0),
 (50, 1.0),
 (61, 1.0),
 (66, 1.0),
 (70, 1.0),
 (76, 1.0),
 (78, 1.0),
 (79, 1.0),
 (89, 1.0),
 (92, 1.0),
 (95, 1.0),
 (100, 1.0),
 (103, 1.0),
 (111, 1.0),
 (132, 1.0),
 (145, 1.0),
 (149, 1.0),
 (161, 1.0),
 (164, 1.0),
 (165, 1.0),
 (170, 1.0),
 (172, 1.0),
 (183, 1.0),
 (185, 1.0),
 (190, 1.0),
 (198, 1.0),
 (217, 1.0),
 (225, 1.0),
 (227, 1.0),
 (229, 1.0),
 (230, 1.0),
 (240, 1.0),
 (257, 1.0),
 (259, 1.0),
 (280, 1.0),
 (288, 1.0),
 (291, 1.0),
 (292, 1.0),
 (293, 1.0),
 (296, 1.0),
 (303, 1.0),
 (311, 1.0),
 (315, 1.0),
 (319, 1.0),
 (320, 1.0),
 (328, 1.0),
 (335, 1.0),
 (338, 1.0),
 (349, 1.0),
 (350, 1.0),
 (353, 1.0),
 (366, 1.0),
 (373, 1.0),
 (376, 1.0),
 (377, 1.0),
 (379, 1.0),
 (380, 1.0),
 (382, 1.0),
 (407, 1.0),
 (415, 1.0),
 (420, 1.0),
 (422, 1.0),
 (423, 1.0),
 (426, 1.0),
 (427, 1.0),
 (434, 1.0),
 (436, 1.0),
 (454, 1.0),
 (456, 1.0),
 (457, 1.0),
 (459,

###创建用户画像###

In [45]:
watch_record = pd.read_csv('./dataset/ratings.csv', usecols=range(2),dtype={'userId':np.int32,'movieId':np.int32})
watch_record = watch_record.groupby('userId').agg(list)

In [46]:
from functools import reduce
import collections
watch_record

Unnamed: 0_level_0,movieId
userId,Unnamed: 1_level_1
1,"[1, 3, 6, 47, 50, 70, 101, 110, 151, 157, 163,..."
2,"[318, 333, 1704, 3578, 6874, 8798, 46970, 4851..."
3,"[31, 527, 647, 688, 720, 849, 914, 1093, 1124,..."
4,"[21, 32, 45, 47, 52, 58, 106, 125, 126, 162, 1..."
5,"[1, 21, 34, 36, 39, 50, 58, 110, 150, 153, 232..."
...,...
606,"[1, 7, 11, 15, 17, 18, 19, 28, 29, 32, 36, 46,..."
607,"[1, 11, 25, 34, 36, 86, 110, 112, 150, 153, 16..."
608,"[1, 2, 3, 10, 16, 19, 21, 24, 31, 32, 34, 39, ..."
609,"[1, 10, 110, 116, 137, 150, 161, 185, 208, 231..."


In [47]:
user_profile = {}

In [77]:
for uid, mids in watch_record.itertuples():
    #取出看过的所有电影
    record_movie_profile = movie_profile.loc[mids]
    counter = collections.Counter(reduce(lambda x,y :list(x)+list(y), record_movie_profile['profile'].values))
    #兴趣词
    interest_words = counter.most_common(10)
    maxcount = interest_words[0][1]
    interest_words = [(w,round(c/maxcount, 4)) for w,c in interest_words]
    user_profile[uid] = interest_words

In [78]:
user_profile

{1: [('Action', 1.0),
  ('Adventure', 0.9444),
  ('Comedy', 0.9222),
  ('Drama', 0.7556),
  ('Thriller', 0.6111),
  ('Fantasy', 0.5222),
  ('Crime', 0.5),
  ('Children', 0.4667),
  ('Sci-Fi', 0.4444),
  ('Animation', 0.3222)],
 2: [('Drama', 1.0),
  ('Action', 0.6471),
  ('Crime', 0.5882),
  ('Thriller', 0.5882),
  ('Comedy', 0.4118),
  ('Leonardo DiCaprio', 0.2941),
  ('IMAX', 0.2353),
  ('Sci-Fi', 0.2353),
  ('Adventure', 0.1765),
  ('suspense', 0.1765)],
 3: [('Drama', 1.0),
  ('Sci-Fi', 0.9375),
  ('Action', 0.875),
  ('Adventure', 0.6875),
  ('Comedy', 0.5625),
  ('Horror', 0.5),
  ('Thriller', 0.4375),
  ('War', 0.3125),
  ('Romance', 0.3125),
  ('Children', 0.3125)],
 4: [('Drama', 1.0),
  ('Comedy', 0.8667),
  ('Romance', 0.4833),
  ('Thriller', 0.3167),
  ('Adventure', 0.2417),
  ('Crime', 0.225),
  ('Action', 0.2083),
  ('Mystery', 0.1917),
  ('Fantasy', 0.1583),
  ('Musical', 0.1333)],
 5: [('Drama', 1.0),
  ('Comedy', 0.6),
  ('Crime', 0.48),
  ('Romance', 0.44),
  ('Childr

In [83]:
for uid, interest_words in user_profile.items():
    result_table = {}
    for word, weight0 in interest_words:
        related_movies = inverted_table[word]
        for mid, weight1 in related_movies:
            _ = result_table.get(mid, [])
            _.append(weight1)
            result_table.setdefault(mid, _)
            # from collections import defaultdict
            # result_table = defaultdict(list)
            # for mid, weight in related_movies:
            #     result_table[mid].append(weight)
    rs_result = map(lambda x:(x[0], sum(x[1])), result_table.items())
    rs_result = sorted(rs_result, key=lambda x:x[1],reverse=True)[:10]
    print(uid)
    print(rs_result)

1
[(546, 6.0), (4719, 6.0), (6350, 6.0), (26340, 6.0), (26590, 6.0), (26701, 6.0), (27155, 6.0), (40339, 6.0), (51939, 6.0), (52287, 6.0)]
2
[(79132, 6.300311497026073), (81132, 6.0), (49530, 5.3422397407098465), (20, 5.0), (145, 5.0), (198, 5.0), (459, 5.0), (519, 5.0), (1396, 5.0), (1432, 5.0)]
3
[(4956, 6.0), (26236, 6.0), (71999, 6.0), (81132, 6.0), (459, 5.0), (1907, 5.0), (2890, 5.0), (4719, 5.0), (4800, 5.0), (6395, 5.0)]
4
[(81132, 7.0), (459, 6.0), (4719, 6.0), (4956, 6.0), (6902, 6.0), (7835, 6.0), (31921, 6.0), (71999, 6.0), (117646, 6.0), (31367, 6.0)]
5
[(4719, 7.0), (459, 6.0), (1907, 6.0), (4956, 6.0), (71999, 6.0), (81132, 6.0), (108540, 6.0), (117646, 6.0), (134853, 6.0), (2987, 6.0)]
6
[(71999, 7.0), (459, 6.0), (4719, 6.0), (4956, 6.0), (6902, 6.0), (81132, 6.0), (117646, 6.0), (148775, 6.0), (546, 6.0), (31367, 6.0)]
7
[(71999, 7.0), (459, 6.0), (546, 6.0), (4719, 6.0), (4956, 6.0), (31367, 6.0), (81132, 6.0), (117646, 6.0), (164226, 6.0), (6902, 6.0)]
8
[(71999, 7.

In [88]:
import gensim
sentence = list(movie_profile['profile'].values)
model = gensim.models.Word2Vec(sentence, window=3, min_count=1, epochs=20)

In [91]:
model.wv.most_similar(positive=['sex'], topn=10)

[('dreamlike', 0.9968480467796326),
 ('satirical', 0.9965980052947998),
 ('Leonardo DiCaprio', 0.9965884685516357),
 ('psychological', 0.9965869784355164),
 ('Tom Hanks', 0.9965709447860718),
 ('philosophy', 0.9965360760688782),
 ('thought-provoking', 0.9965320229530334),
 ('good dialogue', 0.9965268969535828),
 ('mindfuck', 0.9965223670005798),
 ('chick flick', 0.9965112209320068)]

In [92]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
documents = [TaggedDocument(words, [movie_id]) for movie_id, words in movie_profile['profile'].items()]
model = Doc2Vec(documents, vector_size=100, window=3, min_count=1, workers=4, epochs=25)

In [93]:
words = movie_profile['profile'].loc[6]

In [98]:
words

['Action', 'Crime', 'Thriller']

In [95]:
inferred_vector = model.infer_vector(words)
sims = model.docvecs.most_similar([inferred_vector], topn=10)

  sims = model.docvecs.most_similar([inferred_vector], topn=10)


In [96]:
sims

[(4848, 0.9646806120872498),
 (4298, 0.9629760384559631),
 (2206, 0.962874174118042),
 (164, 0.9626970887184143),
 (52604, 0.9602718949317932),
 (5291, 0.9600585699081421),
 (32587, 0.9590525031089783),
 (8507, 0.9589979648590088),
 (4037, 0.9587830305099487),
 (1248, 0.9585187435150146)]

In [103]:
movie_profile['profile'].loc[4848]

['Crime', 'Drama', 'Film-Noir', 'Mystery', 'Thriller']