In [18]:
import pandas as pd
movies=pd.read_csv('movies.csv')

### 1.数据清洗

In [19]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140502 entries, 0 to 140501
Data columns (total 21 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   MOVIE_ID       140502 non-null  int64  
 1   NAME           140502 non-null  object 
 2   ALIAS          30322 non-null   object 
 3   ACTORS         82851 non-null   object 
 4   COVER          50654 non-null   object 
 5   DIRECTORS      70244 non-null   object 
 6   DOUBAN_SCORE   140502 non-null  float64
 7   DOUBAN_VOTES   140502 non-null  float64
 8   GENRES         136452 non-null  object 
 9   IMDB_ID        113256 non-null  object 
 10  LANGUAGES      131442 non-null  object 
 11  MINS           140502 non-null  float64
 12  OFFICIAL_SITE  9821 non-null    object 
 13  REGIONS        136501 non-null  object 
 14  RELEASE_DATE   77687 non-null   object 
 15  SLUG           140502 non-null  object 
 16  STORYLINE      87052 non-null   object 
 17  TAGS           97398 non-null

删除无用列并只保留评分大于6.5的电影

In [20]:
movies = movies.drop(
    columns=['COVER','IMDB_ID','MINS','OFFICIAL_SITE','RELEASE_DATE','SLUG','ACTOR_IDS','DIRECTOR_IDS','LANGUAGES','GENRES','ALIAS']
)
movies = movies[movies['DOUBAN_SCORE'] >= 6.5]

In [21]:
movies.isnull().sum()

MOVIE_ID           0
NAME               0
ACTORS          2023
DIRECTORS       2688
DOUBAN_SCORE       0
DOUBAN_VOTES       0
REGIONS            0
STORYLINE        492
TAGS               0
YEAR               0
dtype: int64

筛选评分人数大于5000的电影并降序排列

In [22]:
movies[(movies['DOUBAN_VOTES'] >= 5000)].sort_values(by=['DOUBAN_SCORE','DOUBAN_VOTES'], ascending=[False,False])[['MOVIE_ID','NAME','DOUBAN_SCORE','DOUBAN_VOTES','DIRECTORS','TAGS','REGIONS']].head(50)

Unnamed: 0,MOVIE_ID,NAME,DOUBAN_SCORE,DOUBAN_VOTES,DIRECTORS,TAGS,REGIONS
56283,1291546,霸王别姬,9.6,1167141.0,陈凯歌,经典/人性/文艺/爱情/人生/同志/剧情/文革,中国大陆 / 中国香港
28985,24882562,悲惨世界：25周年纪念演唱会,9.6,6430.0,尼克·莫里斯,音乐剧/悲惨世界/英国/经典/音乐/LesMisérables/2010/西区,英国
8031,27121232,夏目友人帐 第六季 特别篇 铃响的残株,9.6,5955.0,大森贵弘/出合小都美,夏目友人帐/治愈/动漫/日本/动画/温情/日本动画/治愈系,日本
55895,1295124,辛德勒的名单,9.5,637119.0,史蒂文·斯皮尔伯格,人性/二战/经典/斯皮尔伯格/战争/辛德勒的名单/美国/剧情,美国
81761,10583098,十二怒汉（电视版）,9.5,12646.0,富兰克林·沙夫纳,法律/经典/人性/悬疑/美国/剧情/十二怒汉/犯罪,美国
65811,5294851,控方证人,9.5,7272.0,艾伦·吉布森,悬疑/推理/经典/阿加莎·克里斯蒂/法律/犯罪/美国/美国电影,美国 / 英国
9683,27011740,夏目友人帐 五 特别篇：一夜酒杯,9.5,5629.0,大森贵弘/出合小都美,夏目友人帐/治愈/日本/动漫/动画/日本动画/温情/OVA,日本
80491,1293182,十二怒汉,9.4,261934.0,西德尼·吕美特,经典/人性/美国/黑白/剧情/法律/1957/推理,美国
55698,1307856,背靠背，脸对脸,9.4,38410.0,黄建新/杨亚洲,政治/经典/大陆/中国电影/中国/剧情/1994/人性,中国大陆 / 中国香港
44473,1291831,灿烂人生,9.4,34451.0,马可·图利奥·吉奥达纳,意大利/意大利电影/人生/灿烂人生/成长/文艺/文艺片/剧情,意大利


### 2.余弦相似度模型构建

In [23]:
movies['TAGS'] = (
    movies['STORYLINE'].fillna('').astype(str) + " " +
    movies['TAGS'].fillna('').astype(str) + " " +
    movies['YEAR'].fillna('').astype(str)
)

In [24]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15846 entries, 1378 to 137531
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MOVIE_ID      15846 non-null  int64  
 1   NAME          15846 non-null  object 
 2   ACTORS        13823 non-null  object 
 3   DIRECTORS     13158 non-null  object 
 4   DOUBAN_SCORE  15846 non-null  float64
 5   DOUBAN_VOTES  15846 non-null  float64
 6   REGIONS       15846 non-null  object 
 7   STORYLINE     15354 non-null  object 
 8   TAGS          15846 non-null  object 
 9   YEAR          15846 non-null  float64
dtypes: float64(3), int64(1), object(6)
memory usage: 1.3+ MB


In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
cv=CountVectorizer(max_features=20000,stop_words='english') #只考虑词频最高的前20000个特征词，自动移除英语停用词（如 "the", "and", "is" 等）

In [27]:
vector=cv.fit_transform(movies['TAGS'].values.astype('U')).toarray()

In [28]:
vector.shape

(15846, 20000)

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
similarity=cosine_similarity(vector)  #索引有些问题，之后再搞

### 后续需添加导演和时代标签等关系网