# Description:
这个 Jupyter notebook 实现基于内容的推荐，使用的数据集依然是 MovieLens 数据集。总体流程如下：<br>

 ①建立物品画像

- 数据来源有两部分：用户给电影打的 tag，以及电影的分类值（genres）
- 根据电影的id 把tag和分类值合并起来 求tf-idf
- 根据tf-idf的结果 为每一部电影筛选出 top-n（tf-idf比较大的）个关键词
- 电影id-关键词-关键词权重

② 建立倒排索引

- 通过关键词找到电影
- 遍历 电影id-关键词-关键词权重 数据， 读取每一个关键词，用关键词作为key [(关键词对应的电影id,tfidf)] 作为value 保存到dict当中

③ 用户画像

- 看用户看过哪些电影， 到电影的 电影id-关键词-关键词权重 数据中 找到电影所对应的关键词  
- 把用户看过的所有的关键词放到一起 统计词频 每个词出现了几次
- 出现次数多的关键词 作为用户的兴趣词，这个兴趣词实际上就是用户画像的关键词

④ 根据用户的兴趣词 找到兴趣词对应的电影 多个兴趣词可能对应一个电影 {电影id：[关键词1权重，关键词2权重]}

- 把每一部电影对应的关键词权重求和之后 排序  权重比较高的排在前面 推荐给用户

In [1]:
import numpy as np
import pandas as pd

# 物品画像

## 构建数据集

In [2]:
# Read columns 1-2 of all-tags.csv (shown below as movieId and tag) and drop rows with missing values.
_tags = pd.read_csv("ml-latest-small/all-tags.csv", usecols=range(1, 3)).dropna()
# Collect all tags of a movie into one list per movieId.
tags = _tags.groupby("movieId").agg(list)

In [3]:
tags.head()

Unnamed: 0_level_0,tag
movieId,Unnamed: 1_level_1
1,"[animated, buddy movie, Cartoon, cgi, comedy, ..."
2,"[fantasy, adapted from:book, animals, bad cgi,..."
3,"[moldy, old, Ann Margaret, Burgess Meredith, D..."
4,"[characters, girl movie, characters, chick fli..."
5,"[steve martin, steve martin, pregnancy, remake..."


In [4]:
# Load the movie list, indexed by movieId.
movies = pd.read_csv("ml-latest-small/movies.csv", index_col="movieId")
# Split the pipe-separated genre string into a list of genre names.
movies['genres'] = movies['genres'].apply(lambda x: x.split("|"))

In [5]:
movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
2,Jumanji (1995),"[Adventure, Children, Fantasy]"
3,Grumpier Old Men (1995),"[Comedy, Romance]"
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
5,Father of the Bride Part II (1995),[Comedy]


In [6]:
# Movie ids that occur both in the movie list and in the tag table.
movies_index = set(movies.index) & set(tags.index)
new_tags = tags.loc[list(movies_index)]
# Join on the index; movies that have no row in new_tags get NaN in the tag column.
ret = movies.join(new_tags)

In [7]:
ret.head()

Unnamed: 0_level_0,title,genres,tag
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[animated, buddy movie, Cartoon, cgi, comedy, ..."
2,Jumanji (1995),"[Adventure, Children, Fantasy]","[fantasy, adapted from:book, animals, bad cgi,..."
3,Grumpier Old Men (1995),"[Comedy, Romance]","[moldy, old, Ann Margaret, Burgess Meredith, D..."
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","[characters, girl movie, characters, chick fli..."
5,Father of the Bride Part II (1995),[Comedy],"[steve martin, steve martin, pregnancy, remake..."


In [8]:
# Merge genres and user tags per movie; movies without tags (NaN after the join) get an empty list.
# Built as a list rather than a lazy map so the cell that turns it into a DataFrame can be
# re-run without finding the iterator already exhausted.
df = [
    (row[0], row[1], row[2], (row[2] + row[3]) if isinstance(row[3], list) else [])
    for row in ret.itertuples()
]

In [9]:
movies_dataset = pd.DataFrame(df, columns=['movieId', 'title', 'genres', 'tags'])

In [10]:
movies_dataset.head()

Unnamed: 0,movieId,title,genres,tags
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[Adventure, Animation, Children, Comedy, Fanta..."
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[Adventure, Children, Fantasy, fantasy, adapte..."
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[Comedy, Romance, moldy, old, Ann Margaret, Bu..."
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","[Comedy, Drama, Romance, characters, girl movi..."
4,5,Father of the Bride Part II (1995),[Comedy],"[Comedy, steve martin, steve martin, pregnancy..."


In [11]:
def get_movie_dataset(data_dir="ml-latest-small"):
    """Build the per-movie dataset that feeds the TF-IDF model.

    Reads ``all-tags.csv`` and ``movies.csv`` from *data_dir* and merges them
    on the movie id.

    :param data_dir: directory holding the two CSV files; defaults to the
        location the notebook has always used.
    :return: DataFrame indexed by ``movieId`` with the columns ``title``,
        ``genres`` (list of genre names) and ``tags`` (genres followed by the
        user tags, or an empty list when the movie has no user tags).
    """
    import os

    # Columns 1-2 of all-tags.csv are the movie id and the tag text; rows with
    # a missing value are discarded before grouping.
    _tags = pd.read_csv(os.path.join(data_dir, "all-tags.csv"), usecols=range(1, 3)).dropna()
    tags = _tags.groupby("movieId").agg(list)

    # Load the movie list and split the pipe-separated genre string.
    movies = pd.read_csv(os.path.join(data_dir, "movies.csv"), index_col="movieId")
    movies["genres"] = movies["genres"].apply(lambda x: x.split("|"))

    # Left join on the index: every movie is kept, untagged movies get NaN.
    # (Tag rows for unknown movie ids are ignored by the join itself, so no
    # explicit index intersection is needed.)
    ret = movies.join(tags)

    # Four fields per movie: id, title, genres, merged tags.
    # An untagged movie keeps the original behaviour of an empty tag list.
    # isinstance() is used instead of an identity test against np.nan, which
    # only works when the missing value happens to be that exact object.
    rows = []
    for row in ret.itertuples():
        genres, tag_list = row[2], row[3]
        merged = genres + tag_list if isinstance(tag_list, list) else []
        rows.append((row[0], row[1], genres, merged))

    movies_dataset = pd.DataFrame(rows, columns=["movieId", "title", "genres", "tags"])
    movies_dataset.set_index("movieId", inplace=True)
    return movies_dataset

In [12]:
movies_dataset = get_movie_dataset()

In [13]:
movies_dataset.head()

Unnamed: 0_level_0,title,genres,tags
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[Adventure, Animation, Children, Comedy, Fanta..."
2,Jumanji (1995),"[Adventure, Children, Fantasy]","[Adventure, Children, Fantasy, fantasy, adapte..."
3,Grumpier Old Men (1995),"[Comedy, Romance]","[Comedy, Romance, moldy, old, Ann Margaret, Bu..."
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","[Comedy, Drama, Romance, characters, girl movi..."
5,Father of the Bride Part II (1995),[Comedy],"[Comedy, steve martin, steve martin, pregnancy..."


## Tfidf Model

In [14]:
from gensim.models import TfidfModel
from pprint import pprint
from gensim.corpora import Dictionary

In [15]:
dataset = movies_dataset["tags"].values

In [16]:
# Build a bag-of-words vocabulary from the dataset, count term frequencies, and store all words in a dictionary that is addressed by index
dct = Dictionary(dataset)

In [17]:
# For every record, return the word indices together with their term frequencies
corpus = [dct.doc2bow(line) for line in dataset]     # term frequency (tf) of each word within each movie

In [18]:
# With these term frequencies the TF-IDF model can be trained, i.e. the TF-IDF values computed
model = TfidfModel(corpus)

In [19]:
model[corpus[0]]

[(0, 0.011056904421060245),
 (1, 0.022544275919360972),
 (2, 0.007079278641715789),
 (3, 0.002798045765174126),
 (4, 0.012559237823372017),
 (5, 0.011503590086751218),
 (6, 0.003822990021240867),
 (7, 0.006436556765484401),
 (8, 0.012559237823372017),
 (9, 0.010358359169695198),
 (10, 0.007294227222687766),
 (11, 0.012559237823372017),
 (12, 0.003052808998969177),
 (13, 0.06290655524434838),
 (14, 0.0036895245437134857),
 (15, 0.0013298049743059352),
 (16, 0.010358359169695198),
 (17, 0.003681211552798544),
 (18, 0.2598207646720874),
 (19, 0.0070304347783136335),
 (20, 0.011611370980069282),
 (21, 0.0034440300273526517),
 (22, 0.008395905773059107),
 (23, 0.029694717526112635),
 (24, 0.004848884825145518),
 (25, 0.007448038929756369),
 (26, 0.006809393011219959),
 (27, 0.5233581922752641),
 (28, 0.02748351220336432),
 (29, 0.011056904421060245),
 (30, 0.05370223399240885),
 (31, 0.2552979527940718),
 (32, 0.012559237823372017),
 (33, 0.008395905773059107),
 (34, 0.0080522376164367),
 (

In [20]:
# Keep the 30 tags with the highest tf-idf value for every movie
movie_profile = {}
for position, mid in enumerate(movies_dataset.index):
    # Rank this movie's (token id, tf-idf) pairs from the highest score down
    ranked = sorted(model[corpus[position]], key=lambda pair: pair[1], reverse=True)
    # Map the ids of the 30 best-scoring tokens back to their tag text
    movie_profile[mid] = {dct[token_id]: score for token_id, score in ranked[:30]}

In [21]:
pprint(movie_profile[2])

{'Adaptation of Book': 0.034075799351485185,
 'Children': 0.050052200548347164,
 'Chris Van Allsburg': 0.05999931871653004,
 'Filmed in BC': 0.05620872438935659,
 'For children': 0.028932283348594864,
 'Joe Johnston': 0.05272105069429941,
 'Kirsten Dunst': 0.110664625189357,
 'Lebbat': 0.03150404135004002,
 'Robin Williams': 0.695468714308112,
 'adapted from:book': 0.033531509912372595,
 'animals': 0.21493519083390042,
 'bad cgi': 0.1991963253408426,
 'board game': 0.3156946502624451,
 'childhood recaptured': 0.08999897807479505,
 'fantasy': 0.28312224495427457,
 'fiction': 0.044568770707859084,
 'game': 0.044568770707859084,
 'giant insect': 0.028104362194678295,
 'herds of CGI animals': 0.05999931871653004,
 'jungle': 0.037407363762693525,
 'kid flick': 0.20738815492035886,
 'magic board game': 0.15752020675020012,
 'monkey': 0.08193209561708079,
 'new home': 0.028104362194678295,
 'not for kids': 0.07136630203711364,
 'scary': 0.13475111914523905,
 'see also:Zathura': 0.034075799351

In [22]:
# Complete item-profile extraction
def create_movie_profile(movie_dataset, topn=30):
    """Extract the top keywords of every movie with TF-IDF.

    :param movie_dataset: DataFrame indexed by movie id whose first two
        columns are the title and the genre list, and which has a ``tags``
        column holding a list of tag strings per movie.
    :param topn: number of highest-scoring TF-IDF tags to keep per movie
        (the genre names are added on top of these).
    :return: DataFrame indexed by ``movieId`` with the columns ``title``,
        ``profile`` (list of keywords) and ``weights`` (keyword -> weight).
    """
    from gensim.corpora import Dictionary
    from gensim.models import TfidfModel

    dataset = movie_dataset["tags"].values

    # Build the vocabulary, then describe every movie as (token id, count) pairs.
    dct = Dictionary(dataset)
    corpus = [dct.doc2bow(line) for line in dataset]
    # Train the TF-IDF model, i.e. compute the TF-IDF values.
    model = TfidfModel(corpus)

    _movie_profile = []
    for bow, row in zip(corpus, movie_dataset.itertuples()):
        mid, title, genres = row[0], row[1], row[2]
        # Keep the `topn` tags with the highest TF-IDF score.
        ranked = sorted(model[bow], key=lambda x: x[1], reverse=True)[:topn]
        topN_tags_weights = {dct[token_id]: weight for token_id, weight in ranked}
        # Genre names always belong to the profile, with a fixed weight of 1.0
        # (this overrides the TF-IDF score if a genre is also a top tag).
        for g in genres:
            topN_tags_weights[g] = 1.0
        topN_tags = list(topN_tags_weights)
        _movie_profile.append((mid, title, topN_tags, topN_tags_weights))

    movie_profile = pd.DataFrame(_movie_profile, columns=["movieId", "title", "profile", "weights"])
    movie_profile.set_index("movieId", inplace=True)
    return movie_profile

In [23]:
movie_dataset = get_movie_dataset()
movie_profile = create_movie_profile(movie_dataset)

In [24]:
movie_profile.head()

Unnamed: 0_level_0,title,profile,weights
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[Pixar, pixar, animation, toys, Disney, Tom Ha...","{'Pixar': 0.5233581922752641, 'pixar': 0.43327..."
2,Jumanji (1995),"[Robin Williams, board game, time travel, fant...","{'Robin Williams': 0.695468714308112, 'board g..."
3,Grumpier Old Men (1995),"[moldy, Walter Matthau, Jack Lemmon, fishing, ...","{'moldy': 0.4257623617953107, 'Walter Matthau'..."
4,Waiting to Exhale (1995),"[chick flick, girl movie, characters, interrac...","{'chick flick': 0.6111589664927171, 'girl movi..."
5,Father of the Bride Part II (1995),"[steve martin, pregnancy, childhood classics, ...","{'steve martin': 0.7577224298058415, 'pregnanc..."


## 建立倒排索引

In [25]:
# Build the inverted index so that the movies matching a given keyword can be found quickly.
# Data is normally stored keyed by item id, and the id is used to fetch the item's other attributes.
# An inverted index does the opposite: an attribute (here a profile tag) is the key, and the
# value is the list of items that carry it.
def create_inverted_table(movie_profile):
    """Map every profile tag to the movies that carry it.

    :param movie_profile: DataFrame indexed by movie id with a ``weights``
        column holding a ``{tag: weight}`` dict per movie.
    :return: plain dict ``{tag: [(movie_id, weight), ...]}``; the entries of a
        tag are in the row order of *movie_profile*.
    """
    inverted_table = {}
    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
    for mid, weights in movie_profile['weights'].items():
        for tag, weight in weights.items():
            # Create the tag's list on first sight, then append to it.
            inverted_table.setdefault(tag, []).append((mid, weight))
    return inverted_table

In [26]:
inverted_table = create_inverted_table(movie_profile)

In [27]:
# NOTE: printing the whole table is very large; the recorded output below was cut off with "IOPub data rate exceeded".
pprint(inverted_table)         # movies can now be recommended directly from a tag; this is used once the user profile has been built

{"!950's Superman TV show": [(47950, 0.15688786635723084)],
 '!David O. Russell': [(2890, 0.035465033313768844)],
 '!George Clooney': [(2890, 0.035465033313768844)],
 '"A Christmas Carol"': [(4023, 0.08038874529231568)],
 '"A MÃ£o-de-Deus"': [(5294, 0.04869940486459242)],
 '"Duck and cover!"': [(5288, 0.14443385314067872)],
 '"Ghost for adults"': [(5034, 0.2510549234741949)],
 '"I need a computer."': [(172, 0.04131594342445676)],
 '"It\'s not an American story': [(1488, 0.2736749468050325)],
 '"Jack black"': [(93287, 0.08251504656723087)],
 '"Jessica Biel is hot like hell"': [(54004, 0.2290141122401853)],
 '"May"-weirdness -- not as good': [(8959, 0.28993124656045793)],
 '"Show me the money."': [(1393, 0.12645332355409533)],
 '"The Hunter"': [(2490, 0.05965362602447225), (26172, 0.47853223553525415)],
 '"Tonite': [(71838, 0.025611331269783485)],
 '"artsy"': [(4552, 0.30637610777969454)],
 '"bad cia"-- too simplistic': [(135861, 0.13714241487167472)],
 '"based on true events"': [(6746, 

                 (88140, 0.26898725183715716),
                 (89745, 0.08888323942746236),
                 (110102, 0.37576048276475715),
                 (122920, 0.23986433509879965),
                 (139747, 0.1455424552604479),
                 (170697, 0.10731691729660146)],
 'Chris Eyre': [(1914, 0.21818418377404905)],
 'Chris Farley': [(88, 0.6287472460895912),
                  (333, 0.7911567794261843),
                  (1431, 0.6229813289149064),
                  (1887, 0.20235118179751485),
                  (3253, 0.0538619448092196)],
 'Chris Gerolmo': [(51357, 0.22433964080091956)],
 'Chris Hegedus': [(556, 0.30013309016141304), (4304, 0.4206428126128029)],
 'Chris Hemsworth': [(86332, 0.21020511262100952),
                     (94780, 0.13205208491255335),
                     (98239, 0.3893067892941706),
                     (106072, 0.3227722493140481),
                     (117466, 0.07502516070902965),
                     (122916, 0.22698016044103017),
      

 'Ed Harris': [(150, 0.01912602821269583),
               (257, 0.3032909703406125),
               (276, 0.2877326270159409),
               (733, 0.21157436939765686),
               (1095, 0.06064378621238361),
               (1127, 0.24083163799013105),
               (1231, 0.15365314370073224),
               (1459, 0.14405261506042424),
               (2120, 0.13051616271320626),
               (2432, 0.21174181193750904),
               (4017, 0.14818760646553938),
               (4223, 0.17928605835845446),
               (6567, 0.10308876205248559),
               (6887, 0.14341006505904647),
               (6927, 0.13536857003562952),
               (26704, 0.13933363612032867),
               (37733, 0.056321616113212236),
               (43914, 0.1467441068438473),
               (55290, 0.20676319404965035),
               (56775, 0.09750411345306408),
               (61986, 0.264663839259568),
               (83369, 0.10373780151665642),
               (160730, 0.2960551

 'Katharina Thalbach': [(1161, 0.0956755730315048)],
 'Katharine Hepburn': [(898, 0.1873148618719804),
                       (955, 0.3341565881684067),
                       (969, 0.14320843834120597),
                       (1124, 0.08733608329016354),
                       (4427, 0.22377037983316475),
                       (6970, 0.357323507822084),
                       (7121, 0.12136849058340721)],
 'Katharine Isabelle': [(6615, 0.09286695433744402),
                        (6820, 0.08453932203133416),
                        (27746, 0.15211808303125096),
                        (27778, 0.17824317279771276),
                        (32914, 0.2397835002044247)],
 'Katharine McPhee': [(61250, 0.05336747687200694)],
 'Katharine Ross': [(1247, 0.09671953130415331)],
 'Katherine De Hetre': [(2526, 0.27463357182738846)],
 'Katherine Heigl': [(42013, 0.13425906230470058),
                     (52973, 0.4004968699865862),
                     (56949, 0.6067277634591536),
             

 'Painting': [(851, 0.15244345791219185),
              (106889, 0.21287188919370628),
              (118985, 0.06731236477380584)],
 'Pakistan': [(77800, 0.09024379124695213),
              (98961, 0.13883725388112825),
              (101076, 0.10698598427475066),
              (168326, 0.05881135804036108)],
 'Palahnuik': [(65126, 0.12379900074218751)],
 'Palermo': [(1172, 0.03132832116610716)],
 'Palestine': [(41997, 0.073705076136311)],
 "Palme d'Or": [(509, 0.12243747231231178),
                (665, 0.054441876718237255),
                (1041, 0.1393089924306264),
                (1859, 0.11143463177539677),
                (2512, 0.20250986975909632),
                (3010, 0.17309605667584932),
                (3910, 0.061990912640502785),
                (5073, 0.16677964071952187),
                (6101, 0.09381318022925313),
                (6890, 0.04111027311315743),
                (44937, 0.15226678659546986),
                (47894, 0.06426155835342565),
              

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



(5538, 0.24536455223985063),
                     (5651, 0.207159179961312),
                     (6251, 0.1562751404373503),
                     (6377, 0.13747643022549305),
                     (8360, 0.1633752374920412),
                     (8372, 0.09673306395065295),
                     (8907, 0.17612704718123443),
                     (26593, 0.11242978836487794),
                     (26662, 0.05521921176928343),
                     (26999, 0.23754150400309498),
                     (27619, 0.2656380322851941),
                     (31193, 0.09919504785731521),
                     (31223, 0.2103214377560762),
                     (33615, 0.1263694029703725),
                     (38038, 0.07442411920842794),
                     (41566, 0.11621690589198526),
                     (44022, 0.1502578102301913),
                     (45074, 0.18476370896695474),
                     (45431, 0.46216856899956776),
                     (48414, 0.10137329477215266),
                

         (32770, 0.10535151535016633),
         (32892, 0.0750847826039297),
         (33162, 0.07591197696856694),
         (33493, 0.04914468948135276),
         (34536, 0.07822841202033583),
         (36529, 0.10462486939231024),
         (37240, 0.11095411545934611),
         (40278, 0.4252517504757243),
         (40414, 0.06558807549281401),
         (42943, 0.1551021517948372),
         (43677, 0.08033930205545392),
         (47202, 0.1503699166070252),
         (48319, 0.08731869051640076),
         (48596, 0.1506560651881072),
         (48774, 0.04308166468567249),
         (48783, 0.05171041700059969),
         (49530, 0.1366252272282927),
         (50068, 0.1817188913228693),
         (51562, 0.17965452349142247),
         (51662, 0.19604188955063137),
         (55052, 0.1700512094053697),
         (55116, 0.12405215890982078),
         (55442, 0.03981953780636436),
         (55946, 0.1992813918862861),
         (56921, 0.19235463870667058),
         (58293, 0.023291416698233

# 用户画像
构建步骤：
1. 根据用户的评分历史，结合物品画像，将有观影记录的电影的画像标签作为初始标签反打到用户身上
2. 通过对用户观影标签的次数进行统计，计算用户的每个初始标签的权重值，排序后选取TOP-N作为用户最终的画像标签

In [28]:
import collections
from functools import reduce

In [29]:
# Read the first two columns of ratings.csv (userId, movieId) as 32-bit integers.
watch_record = pd.read_csv("ml-latest-small/ratings.csv", usecols=range(2), dtype={"userId":np.int32, "movieId": np.int32})
# One row per user, holding the list of movie ids found in that user's rating records.
watch_record = watch_record.groupby("userId").agg(list)

In [30]:
watch_record.head()

Unnamed: 0_level_0,movieId
userId,Unnamed: 1_level_1
1,"[1, 3, 6, 47, 50, 70, 101, 110, 151, 157, 163,..."
2,"[318, 333, 1704, 3578, 6874, 8798, 46970, 4851..."
3,"[31, 527, 647, 688, 720, 849, 914, 1093, 1124,..."
4,"[21, 32, 45, 47, 52, 58, 106, 125, 126, 162, 1..."
5,"[1, 21, 34, 36, 39, 50, 58, 110, 150, 153, 232..."


In [31]:
movie_profile.head()

Unnamed: 0_level_0,title,profile,weights
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[Pixar, pixar, animation, toys, Disney, Tom Ha...","{'Pixar': 0.5233581922752641, 'pixar': 0.43327..."
2,Jumanji (1995),"[Robin Williams, board game, time travel, fant...","{'Robin Williams': 0.695468714308112, 'board g..."
3,Grumpier Old Men (1995),"[moldy, Walter Matthau, Jack Lemmon, fishing, ...","{'moldy': 0.4257623617953107, 'Walter Matthau'..."
4,Waiting to Exhale (1995),"[chick flick, girl movie, characters, interrac...","{'chick flick': 0.6111589664927171, 'girl movi..."
5,Father of the Bride Part II (1995),"[steve martin, pregnancy, childhood classics, ...","{'steve martin': 0.7577224298058415, 'pregnanc..."


# 产生Top-N推荐

In [32]:
user_profile = {}
for uid, mids in watch_record.itertuples():
    # Profiles of the movies this user has a record for
    record_movie_profile = movie_profile.loc[list(mids)]
    # Count how often each profile tag occurs across those movies.
    # The tag lists are flattened lazily; concatenating them pairwise with
    # reduce() would copy the growing list once per movie.
    counter = collections.Counter(
        tag for tags in record_movie_profile['profile'].values for tag in tags
    )

    # Interest words: the 50 most frequent tags
    interest_words = counter.most_common(50)
    if not interest_words:
        # No tag at all for this user: store an empty profile instead of failing on [0]
        user_profile[uid] = []
        continue
    maxcount = interest_words[0][1]
    # Normalise the counts by the most frequent tag so the weights fall in (0, 1]
    interest_words = [(w, round(c / maxcount, 4)) for w, c in interest_words]
    user_profile[uid] = interest_words

In [33]:
user_profile[1]        # 用户1感兴趣的词

[('Action', 1.0),
 ('Adventure', 0.9556),
 ('Comedy', 0.9222),
 ('Drama', 0.7556),
 ('classic', 0.6222),
 ('Thriller', 0.6222),
 ('Fantasy', 0.5222),
 ('Crime', 0.5),
 ('Children', 0.4667),
 ('Sci-Fi', 0.4444),
 ('action', 0.4),
 ('adventure', 0.3556),
 ('comedy', 0.3556),
 ('fantasy', 0.3444),
 ('Animation', 0.3222),
 ('sci-fi', 0.3222),
 ('funny', 0.3111),
 ('imdb top 250', 0.2889),
 ('Romance', 0.2889),
 ('cult film', 0.2889),
 ('Disney', 0.2778),
 ('quirky', 0.2778),
 ('based on a book', 0.2778),
 ('War', 0.2556),
 ('Musical', 0.2444),
 ('animation', 0.2333),
 ('violence', 0.2333),
 ('surreal', 0.2333),
 ('crime', 0.2222),
 ('great soundtrack', 0.2222),
 ('atmospheric', 0.2111),
 ('war', 0.2111),
 ('humorous', 0.2),
 ('violent', 0.2),
 ('Mystery', 0.2),
 ('thriller', 0.1889),
 ('Horror', 0.1889),
 ('aliens', 0.1889),
 ('talking animals', 0.1778),
 ('music', 0.1778),
 ('Nudity (Topless)', 0.1667),
 ('satire', 0.1667),
 ('musical', 0.1667),
 ('suspense', 0.1556),
 ('tense', 0.1556),


In [34]:
# Produce recommendations for the users
for uid, interest_words in user_profile.items():
    # movie id -> score accumulated over all interest words that lead to the movie
    result_table = {}
    for interest_word, interest_weight in interest_words:
        for mid, related_weight in inverted_table[interest_word]:
            # Scoring used here: only the user's interest in the word.
            # Alternatives:
            #   related_weight                    -> only how strongly the word relates to the movie
            #   interest_weight * related_weight  -> both factors
            result_table[mid] = result_table.get(mid, 0) + interest_weight

    # NOTE(review): movies the user has already rated are not filtered out
    # (movie 1 tops user 1's list in the recorded output although it is in
    # that user's watch record) — confirm whether that is intended.
    rs_result = sorted(result_table.items(), key=lambda x: x[1], reverse=True)[:100]
    print(uid)
    pprint(rs_result)
    # Demo only: stop after the first user
    break

1
[(1, 6.3222),
 (1197, 5.9444),
 (6539, 5.633400000000001),
 (85261, 5.611099999999999),
 (48774, 5.5889),
 (380, 5.5556),
 (81132, 5.5445),
 (8961, 5.533399999999999),
 (3000, 5.5333),
 (296, 5.4777),
 (2987, 5.4111),
 (588, 5.3111999999999995),
 (1196, 5.2444),
 (187031, 5.2334),
 (7099, 5.233299999999999),
 (1206, 5.1998999999999995),
 (2617, 5.144500000000001),
 (7153, 5.1445),
 (1215, 5.133400000000001),
 (1200, 5.1334),
 (1210, 5.066599999999999),
 (34405, 5.0445),
 (49530, 5.0445),
 (26340, 5.0443999999999996),
 (117529, 5.011199999999999),
 (1907, 4.999999999999999),
 (6902, 4.9666),
 (6350, 4.955499999999999),
 (53125, 4.9334),
 (80219, 4.9334),
 (1136, 4.9334),
 (6016, 4.9333),
 (88125, 4.922299999999999),
 (780, 4.8556),
 (52462, 4.8555),
 (79132, 4.855499999999999),
 (4306, 4.8334),
 (908, 4.8223),
 (4956, 4.8223),
 (40339, 4.8111),
 (26236, 4.8001000000000005),
 (5027, 4.7778),
 (117646, 4.7778),
 (7235, 4.7667),
 (51939, 4.7556),
 (71129, 4.7332),
 (480, 4.7223),
 (2000,

# 物品冷启动处理
这里主要包括两个很厉害的技术：
* Word2Vec: 可以训练得到电影标签的词向量，根据词向量就能计算 tag 之间的相似性。这样就能根据用户看过的某部电影得到它的标签，再找到与这些标签近似的标签，最后把这些近似标签下的电影推荐给该用户
* Doc2Vec：可以根据电影的所有标签训练一个模型，得到每部电影的影片向量。根据这个向量就能直接计算用户看过的某部电影与其他电影的相似性，然后给用户推荐最相似的几部电影。  

这两个的区别体会一下， 下面就看具体的使用

## Word2Vec

In [35]:
from gensim.models import Word2Vec

In [36]:
# The item profile already holds the tags of every movie, so a Word2Vec model can be built on it directly to compute tag embeddings
sentences = list(movie_profile["profile"].values)   # list of lists; every element is a tag string

In [38]:
# Build the model directly from the tag lists
# NOTE: this rebinds `model`, which earlier in the notebook held the TF-IDF model
model = Word2Vec(sentences, window=3, min_count=1, epochs=20)

In [39]:
# Once the model is built, the N tags most similar to a given tag of a movie can be looked up
words = input("words: ")
ret = model.wv.most_similar(positive=[words], topn=10)   # the n most similar tags
print(ret)

words: action
[('martial arts', 0.9353725910186768), ('very cool movie', 0.9265753626823425), ('too much action', 0.9161287546157837), ('mini coopers', 0.9140243530273438), ('I AM  THE LAW', 0.9118229746818542), ('Joseph Kosinski', 0.9077568054199219), ('perfect movie to watch when having hangover', 0.9071716070175171), ('kung fu', 0.9028152823448181), ('kungfu hackers', 0.9019882082939148), ('Nick Fury', 0.897875189781189)]


## Doc2Vec

In [40]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile

In [41]:
# Build the documents: `words` are a movie's profile tags, the document tag is the movie id
# (Series.items() replaces Series.iteritems(), which pandas 2.0 removed)
documents = [TaggedDocument(words, [movie_id]) for movie_id, words in movie_profile["profile"].items()]

In [42]:
documents

[TaggedDocument(words=['Pixar', 'pixar', 'animation', 'toys', 'Disney', 'Tom Hanks', 'computer animation', 'children', 'witty', 'funny', 'animated', 'family', 'friendship', 'clever', 'adventure', 'comedy', 'humorous', 'Cartoon', 'toy', 'time travel', 'buddy movie', 'Tim Allen', 'classic', 'cgi', 'imdb top 250', 'fun', 'fantasy', 'kids', 'John Lasseter', 'Pixar animation', 'Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'], tags=[1]),
 TaggedDocument(words=['Robin Williams', 'board game', 'time travel', 'fantasy', 'animals', 'kid flick', 'bad cgi', 'magic board game', 'scary', 'Kirsten Dunst', 'childhood recaptured', 'monkey', 'not for kids', 'Chris Van Allsburg', 'herds of CGI animals', 'Filmed in BC', 'Joe Johnston', 'Children', 'fiction', 'game', 'time', 'jungle', 'Adaptation of Book', 'see also:Zathura', 'adapted from:book', 'Lebbat', 'thrill', 'For children', 'giant insect', 'new home', 'Adventure', 'Fantasy'], tags=[2]),
 TaggedDocument(words=['moldy', 'Walter Matthau', 'J

In [43]:
# Train the Doc2Vec model
model = Doc2Vec(documents, vector_size=100, window=3, min_count=1, workers=4, epochs=20)

# Save the model
# fname = get_tmpfile("my_doc2vec_model")
# model.save(fname)

In [44]:
# Fetch the tags of one movie
words = movie_profile["profile"].loc[6]
print(words)
# Infer the Doc2Vec vector for this tag list
inferred_vector = model.infer_vector(words)
# `dv` is the gensim 4 name for the document vectors; `docvecs` is a deprecated alias
sims = model.dv.most_similar([inferred_vector], topn=10)
print(sims)

['Al Pacino', 'Robert De Niro', 'realistic action', 'Michael Mann', 'great acting', 'Val Kilmer', 'bank robbery', 'suspense', 'Natalie Portman', 'tense', 'crime', 'atmospheric', 'gunfight', 'dialogue', 'long', 'career criminal', 'philosophy', 'overrated', 'realistic', 'Los Angeles', 'visceral', 'complex characters', 'Bank Heist', 'professionals', 'police', 'too long', 'amazing cast', 'Ashley Judd', 'loner', 'Mary Kircher', 'Action', 'Crime', 'Thriller']
[(1617, 0.9730480909347534), (27773, 0.9584764838218689), (47, 0.9581588506698608), (4262, 0.956497311592102), (55820, 0.9555869102478027), (50, 0.9536582827568054), (111, 0.9530277252197266), (6, 0.9526984095573425), (1252, 0.9491648077964783), (1645, 0.9478868246078491)]


  sims = model.docvecs.most_similar([inferred_vector], topn=10)
