In [2]:
import pandas as pd
import numpy as np
import time
import os

In [3]:

movie_all_info_path = 'data/m_all_info.csv'
movie_all_column_names = ['movie_id', 'title', 'director', 'scriptwriter', 'actors', 'genres',
                          'place', 'languages', 'time', 'duration', 'other_names', '_']
user_movie_score_path = 'data/u_score.csv' 
user_movie_column_names = ['movie_id', 'user_id', 'rating']

ITEM_ITEM_SIMMAT_PATH = 'data/ITEM_ITEM_SIMMAT.csv'

In [4]:
ITEM_ITEM_SIMMAT = pd.read_csv(ITEM_ITEM_SIMMAT_PATH, index_col=0)
ITEM_ITEM_SIMMAT.index

Int64Index([30163509,  4840388, 26369709, 30287729, 30349667, 27622447,
            25986662, 26374197, 30331149, 27109679,
            ...
             1291818,  1291828, 26389601, 25798222,  1305164,  1966460,
             3070921,  1299436,  1294540, 27126359],
           dtype='int64', name='movie_id', length=1663)

In [9]:
def init_movie_data():
    """加载所有电影数据"""
    data = pd.read_csv(movie_all_info_path, names=movie_all_column_names,
        sep=r',\s*', engine='python', encoding='utf-8')
    return data


def init_rating_data():
    """加载电影评分数据"""
    data = pd.read_csv(user_movie_score_path, names=user_movie_column_names, usecols=(0, 1, 2))
    return data


def init_user_item_mat():
    """
    初始化User对Item的评分矩阵。并全局存储。
    """
    # 获取不重复的user_id列表与movie_id列表作为矩阵行索引标签与列索引标签
    unique_movieids = rating_data['movie_id'].unique()
    unique_userids = rating_data['user_id'].unique()
    user_item_mat = pd.DataFrame(
        0, index=unique_userids, columns=unique_movieids)
    # 遍历rating_data中的每一行数据，充实评分矩阵的内容
    for index, row in rating_data.iterrows():
        user_item_mat[row['movie_id']][row['user_id']] = row['rating']
    return user_item_mat


In [10]:
movie_data = init_movie_data()
rating_data = init_rating_data()
user_item_mat = init_user_item_mat()


In [14]:
movie_data

Unnamed: 0,movie_id,title,director,scriptwriter,actors,genres,place,languages,time,duration,other_names,_
0,30163509,飞驰人生,韩寒,韩寒,沈腾 / 黄景瑜 / 尹正 / 张本煜 / 尹昉 / 田雨 / 魏翔 / 赵文瑄 / 腾格尔...,剧情 / 励志 / 喜剧,中国大陆,汉语普通话 / 英语,2019-02-05(中国大陆),98分钟,Pegasus,tt9597190
1,4840388,新喜剧之王,周星驰,周星驰,王宝强 / 鄂靖文 / 张全蛋 / 景如洋 / 张琪 / 袁兴哲 / 田启文 / 黄骁鹏 /...,剧情 / 喜剧,中国大陆 / 香港,汉语普通话 / 粤语,2019-02-05(中国大陆),91分钟,喜剧之王2 / D计划 / The New King of Comedy / King of...,tt9368628
2,26369709,欢迎来到马文镇 Welcome to Marwen,罗伯特·泽米吉斯,罗伯特·泽米吉斯 / 卡罗琳·汤普森,史蒂夫·卡瑞尔 / 法尔克·亨特切尔 / 马特·欧莱瑞 / 尼古莱·维切尔 / 帕特里克·罗...,剧情 / 传记,美国,英语,2018-12-21(美国),116分钟,马克的异乡世界(台) / 马文科尔 / Marwencol / The Women of M...,tt3289724
3,30287729,正义联盟大战致命五人组 Justice League vs. The Fatal Five,刘山姆,Jim Shooter,凯文·康瑞 / 乔治·纽伯恩 / 苏珊·爱森伯格 / 丹妮拉·玻芭迪拉 / 素玛立·蒙塔诺 ...,动画,美国,英语,2019-03-30(美国),Nan,Nan,tt8752474
4,30349667,指挥家 De dirigent,玛利亚·彼特斯,玛利亚·彼特斯,克里斯蒂亚娜 ‧德‧布奥恩 / 本杰明·温赖特 / 斯科特·特纳·菲尔德 / 塞马斯·F·萨...,传记,荷兰,英语,2018-10-25(荷兰),137分钟,The Conductor,tt6932818
...,...,...,...,...,...,...,...,...,...,...,...,...
1981,26661189,脱单告急,柯孟融,任鹏,董子健 / 钟楚曦 / 春夏 / 袁福福 / 靳锦 / 尹雨航 / 吴昱瑶 / 刘梦梦 / 郗婧妍,喜剧 / 爱情,中国大陆,汉语普通话,2018-04-20(中国大陆) / 2018-04-15(北京国际电影节),106分钟,完全男生手册 / Dude's Manual,tt8319694
1982,27145025,精灵宝可梦：大家的故事 劇場版 ポケットモンスター みんなの物語,矢岛哲生,梅原英司 / 高羽彩,松本梨香 / 大谷育江 / 林原惠美 / 三木真一郎 / 犬山犬子 / 石冢运升 / 川荣李...,动画 / 奇幻 / 冒险,日本,日语,2018-07-13(日本),97分钟,精灵宝可梦剧场版：大家的故事 / 剧场版精灵宝可梦：我们的故事(港) / 剧场版口袋妖怪 大...,tt8108230
1983,27107625,代号基亚斯：反叛的鲁路修3之皇道 コードギアス 反逆のルルーシュⅢ 皇道,谷口悟朗,大河内一楼 / 谷口悟朗,福山润 / 樱井孝宏 / 野上尤加奈 / 小清水亚美 / 名塚佳织 / 绿川光 / 金井美香...,动作 / 科幻 / 动画,日本,日语,2018-05-26(日本),140分钟,Nan,Nan
1984,27611038,猫是要抱着的 猫は抱くもの,犬童一心,高田亮 / 大山淳子,泽尻英龙华 / 吉泽亮 / 峯田和伸 / 岩松了,剧情,日本,日语,2018-06-18(上海电影节) / 2018-06-23(日本),110分钟,猫是用来抱的(台) / 拥抱猫咪 / The Cat In Their Arms,tt9742106


In [6]:
user_item_mat.head()

Unnamed: 0,26411377,30267308,30181455,26776469,1292271,6973376,11502153,3771562,3319755,1297052,...,1291818,1291828,26389601,25798222,1305164,1966460,3070921,1299436,1294540,27126359
165852925,4,3,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,5,5,3,5,5,5,...,0,0,0,0,0,0,0,0,0,0
130012755,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150932792,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
189793987,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
itemids = user_item_mat.columns
ITEM_ITEM_SIMMAT = pd.DataFrame(0.0, index=itemids, columns=itemids, dtype='f8')

In [8]:
movie_feature_map = pd.get_dummies(movie_data, columns=['director']) 
movie_feature_map.head(2)

Unnamed: 0,movie_id,title,scriptwriter,actors,genres,place,languages,time,duration,other_names,...,director_黄泰来,director_黄渤,director_黄瀬和哉,director_黄荣昇,director_黄进,director_黎继强,director_黑泽明,director_黑泽清,director_黛布拉·格兰尼克,director_민철기 / 노시용
0,30163509,飞驰人生,韩寒,沈腾 / 黄景瑜 / 尹正 / 张本煜 / 尹昉 / 田雨 / 魏翔 / 赵文瑄 / 腾格尔...,剧情 / 励志 / 喜剧,中国大陆,汉语普通话 / 英语,2019-02-05(中国大陆),98分钟,Pegasus,...,0,0,0,0,0,0,0,0,0,0
1,4840388,新喜剧之王,周星驰,王宝强 / 鄂靖文 / 张全蛋 / 景如洋 / 张琪 / 袁兴哲 / 田启文 / 黄骁鹏 /...,剧情 / 喜剧,中国大陆 / 香港,汉语普通话 / 粤语,2019-02-05(中国大陆),91分钟,喜剧之王2 / D计划 / The New King of Comedy / King of...,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# 整理导演字段多标签二值化处理
L = pd.DataFrame(movie_data['actors'].str.split(' / ', expand=True)).fillna('').values
import sklearn.preprocessing as sp
mlb = sp.MultiLabelBinarizer()
res = pd.DataFrame(mlb.fit_transform(L), columns=mlb.classes_)
movie_feature_map2 = pd.concat([movie_feature_map, res], axis=1)
print(movie_feature_map2.shape)

(1986, 15881)


In [10]:
# 整理电影类型字段多标签二值化处理
G = pd.DataFrame(movie_data['genres'].str.split(' / ', expand=True)).fillna('').values
import sklearn.preprocessing as sp
mlb = sp.MultiLabelBinarizer()
res = pd.DataFrame(mlb.fit_transform(G), columns=mlb.classes_)
movie_feature_map3 = pd.concat([movie_feature_map2, res], axis=1)
print(movie_feature_map3.shape)
movie_feature_map3.head(1)

(1986, 15915)


Unnamed: 0,movie_id,title,scriptwriter,actors,genres,place,languages,time,duration,other_names,...,犯罪,真人秀,短片,科幻,纪录片,脱口秀,西部,运动,音乐,黑色电影
0,30163509,飞驰人生,韩寒,沈腾 / 黄景瑜 / 尹正 / 张本煜 / 尹昉 / 田雨 / 魏翔 / 赵文瑄 / 腾格尔...,剧情 / 励志 / 喜剧,中国大陆,汉语普通话 / 英语,2019-02-05(中国大陆),98分钟,Pegasus,...,0,0,0,0,0,0,0,0,0,0


In [11]:
movie_feature_map3.drop(movie_feature_map3.columns[np.arange(len(movie_all_column_names))], axis=1, inplace=True)
movie_feature_map3.T


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985
director_Antonio Carluccio,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
director_Anu Menon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
director_Athanassios Vakalis,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
director_Babis Makridis,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
director_Charles Haid / Chris Long,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
脱口秀,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
西部,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
运动,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
音乐,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:

MOVIE_DATA_PATH = 'data/m_all_info.csv'
MOVIE_DATA_COLUMNS = ['movie_id', 'title', 'director', 'scriptwriter', 'actors', 'genres',
                          'place', 'languages', 'time', 'duration', 'other_names', '_']
RATING_DATA_PATH = 'data/u_score.csv' 
RATING_DATA_CLOLUMNS = ['movie_id', 'user_id', 'rating']

ITEM_ITEM_SIMMAT_PATH = 'data/ITEM_ITEM_SIMMAT.csv'

def init_movie_data():
    """加载所有电影数据"""
    data = pd.read_csv(MOVIE_DATA_PATH, names=MOVIE_DATA_COLUMNS,
        sep=r',\s*', engine='python', encoding='utf-8')
    data.drop_duplicates(subset=['movie_id'], keep='first', inplace=True)
    print('Movie Data loaded:', data.shape)
    return data

def init_rating_data():
    """加载电影评分数据"""
    data = pd.read_csv(RATING_DATA_PATH, names=RATING_DATA_CLOLUMNS, usecols=(0, 1, 2))
    print('Rating Data loaded:', data.shape)
    return data


def init_user_item_mat():
    """
    初始化User对Item的评分矩阵。并全局存储。
    """
    # 获取不重复的user_id列表与movie_id列表作为矩阵行索引标签与列索引标签
    unique_movieids = rating_data['movie_id'].unique()
    unique_userids = rating_data['user_id'].unique()
    user_item_mat = pd.DataFrame(
        0, index=unique_userids, columns=unique_movieids)
    # 遍历rating_data中的每一行数据，充实评分矩阵的内容
    for index, row in rating_data.iterrows():
        user_item_mat[row['movie_id']][row['user_id']] = row['rating']
    print('User_item_mat loaded:', user_item_mat.shape)
    return user_item_mat

movie_data = init_movie_data()
rating_data = init_rating_data()
user_item_mat = init_user_item_mat()
ITEM_ITEM_SIMMAT = None
if os.path.exists(ITEM_ITEM_SIMMAT_PATH):
    print('load model ITEM_ITEM_SIMMAT')
    ITEM_ITEM_SIMMAT = pd.read_csv(ITEM_ITEM_SIMMAT_PATH, index_col=0, header=0)


movie_data.index = movie_data['movie_id']
movie_feature_map = pd.get_dummies(movie_data, columns=['director']) 
print(movie_feature_map.shape)
movie_feature_map.index

Movie Data loaded: (1663, 12)
Rating Data loaded: (4053, 3)
User_item_mat loaded: (401, 1551)
(1663, 1238)


Int64Index([30163509,  4840388, 26369709, 30287729, 30349667, 27622447,
            25986662, 26374197, 30331149, 27109679,
            ...
             1291818,  1291828, 26389601, 25798222,  1305164,  1966460,
             3070921,  1299436,  1294540, 27126359],
           dtype='int64', name='movie_id', length=1663)

In [35]:
L = pd.DataFrame(movie_data['actors'].str.split(' / ', expand=True)).fillna('').values
import sklearn.preprocessing as sp
mlb = sp.MultiLabelBinarizer()
res = pd.DataFrame(mlb.fit_transform(L), columns=mlb.classes_, index=movie_data['movie_id'])
movie_feature_map2 = pd.concat([movie_feature_map, res], axis=1)
movie_feature_map2

Unnamed: 0_level_0,movie_id,title,scriptwriter,actors,genres,place,languages,time,duration,other_names,...,미나,미리아,민영,사나,소미,정연,정원,지효,쯔위,채영
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30163509,30163509,飞驰人生,韩寒,沈腾 / 黄景瑜 / 尹正 / 张本煜 / 尹昉 / 田雨 / 魏翔 / 赵文瑄 / 腾格尔...,剧情 / 励志 / 喜剧,中国大陆,汉语普通话 / 英语,2019-02-05(中国大陆),98分钟,Pegasus,...,0,0,0,0,0,0,0,0,0,0
4840388,4840388,新喜剧之王,周星驰,王宝强 / 鄂靖文 / 张全蛋 / 景如洋 / 张琪 / 袁兴哲 / 田启文 / 黄骁鹏 /...,剧情 / 喜剧,中国大陆 / 香港,汉语普通话 / 粤语,2019-02-05(中国大陆),91分钟,喜剧之王2 / D计划 / The New King of Comedy / King of...,...,0,0,0,0,0,0,0,0,0,0
26369709,26369709,欢迎来到马文镇 Welcome to Marwen,罗伯特·泽米吉斯 / 卡罗琳·汤普森,史蒂夫·卡瑞尔 / 法尔克·亨特切尔 / 马特·欧莱瑞 / 尼古莱·维切尔 / 帕特里克·罗...,剧情 / 传记,美国,英语,2018-12-21(美国),116分钟,马克的异乡世界(台) / 马文科尔 / Marwencol / The Women of M...,...,0,0,0,0,0,0,0,0,0,0
30287729,30287729,正义联盟大战致命五人组 Justice League vs. The Fatal Five,Jim Shooter,凯文·康瑞 / 乔治·纽伯恩 / 苏珊·爱森伯格 / 丹妮拉·玻芭迪拉 / 素玛立·蒙塔诺 ...,动画,美国,英语,2019-03-30(美国),Nan,Nan,...,0,0,0,0,0,0,0,0,0,0
30349667,30349667,指挥家 De dirigent,玛利亚·彼特斯,克里斯蒂亚娜 ‧德‧布奥恩 / 本杰明·温赖特 / 斯科特·特纳·菲尔德 / 塞马斯·F·萨...,传记,荷兰,英语,2018-10-25(荷兰),137分钟,The Conductor,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1966460,1966460,图雅的婚事,芦苇 / 王全安,余男 / 巴特尔 / 森格,剧情 / 爱情,中国大陆,汉语普通话,2006-09-21(中国大陆),95分钟 / 86分钟(中国大陆),萨仁图雅 / Tuya's Marriage,...,0,0,0,0,0,0,0,0,0,0
3070921,3070921,饥饿 Hunger,恩达·沃尔什 / 史蒂夫·麦奎因,斯图尔特·格雷厄姆 / 莱恩·梅加 / 布赖恩·米利根 / 利亚姆·麦克马洪 / 凯伦·汉森...,剧情 / 传记,英国 / 爱尔兰,英语 / 爱尔兰语,2008-05-15(戛纳电影节) / 2008-10-31(英国/爱尔兰),96分钟,大绝食(港) / 饥饿宣言(台) / 绝食,...,0,0,0,0,0,0,0,0,0,0
1299436,1299436,风柜来的人 風櫃來的人,朱天文,钮承泽 / 张世 / 庹宗华 / 林秀玲 / 杨丽音 / 张纯芳 / 陈博正,剧情,台湾,汉语普通话 / 闽南语,1983(台湾) / 1990-07-21(日本),101分钟,The Boys From Fengkuei / All the Youthful Days,...,0,0,0,0,0,0,0,0,0,0
1294540,1294540,你那边几点 你那邊幾點,蔡明亮 / 杨璧莹,李康生 / 陈湘琪 / 陆弈静 / 苗天 / 叶童 / 让-皮埃尔·利奥德 / Chao-j...,剧情 / 爱情,法国 / 台湾,英语 / 法语 / 汉语普通话 / 台语,2001-09-26,116 分钟,你那边几点？ / What Time Is It There? / 7 to 400 Blows,...,0,0,0,0,0,0,0,0,0,0


In [9]:
import pandas as pd
import numpy as np
import time
import os

MOVIE_DATA_PATH = 'data/m_all_info.csv'
MOVIE_DATA_COLUMNS = ['movie_id', 'title', 'director', 'scriptwriter', 'actors', 'genres',
                          'place', 'languages', 'time', 'duration', 'other_names', '_']
RATING_DATA_PATH = 'data/u_score.csv' 
RATING_DATA_CLOLUMNS = ['movie_id', 'user_id', 'rating']

ITEM_ITEM_SIMMAT_PATH = 'data/ITEM_ITEM_SIMMAT.csv'
USER_USER_SIMMAT_PATH = 'data/USER_USER_SIMMAT.csv'
PERSONA_TAG_TABLE_PATH = 'terdata/user_tag.csv'
USER_INDEX_PATH = 'terdata/u_idx.csv'

In [10]:
# 加载用户索引与id信息
users = pd.read_csv(USER_INDEX_PATH, index_col=0, header=False)
# 加载用户标签信息
tag_table = pd.read_csv(PERSONA_TAG_TABLE_PATH, header=0)
# 合并两张表
user_tag_table = pd.merge(users, tag_table, left_index=True, right_index=True)

In [12]:
users

Unnamed: 0_level_0,165852925
0,Unnamed: 1_level_1
1,0
2,130012755
3,150932792
4,189793987
5,142889559
...,...
396,140956290
397,188123312
398,173449675
399,137960270
