In [1]:
!pip install beautifulsoup4 lxml requests



In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from bs4 import BeautifulSoup
import requests
import time
import os


In [3]:
def get_title_from_index(df, index):
	"""Return the ``title`` of the first row of ``df`` whose index label equals ``index``.

	Raises ``IndexError`` when no row carries that label.
	"""
	row_mask = df.index == index
	matching_titles = df.loc[row_mask, "title"]
	return matching_titles.values[0]

def get_index_from_title(df, title):
	"""Return the ``index`` column value of the first movie titled ``title``.

	The comparison is case-insensitive on both sides: the dataset titles and
	the requested ``title`` are lower-cased before matching, so callers do not
	have to normalise the query themselves.

	Parameters:
		df: DataFrame with a string ``title`` column and an ``index`` column.
		title: Movie title to look up, in any letter case.

	Raises:
		IndexError: if no row has a matching title.
	"""
	wanted_title = title.lower()
	matching_indices = df.loc[df["title"].str.lower() == wanted_title, "index"]
	return matching_indices.values[0]

In [4]:
# Load the movie metadata; the path is relative to the notebook's working directory.
movie_data = pd.read_csv('movie_dataset.csv')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
movie_data.info()
movie_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [5]:
# Text columns that are merged into a single "document" per movie.
features = ['keywords', 'cast', 'genres', 'director']

# Missing values become empty strings so the concatenation below never yields NaN.
movie_data[features] = movie_data[features].fillna('')

# Join the feature columns left to right, separated by single spaces.
combined_text = movie_data[features[0]]
for column_name in features[1:]:
  combined_text = combined_text + ' ' + movie_data[column_name]
movie_data['combine_sentence'] = combined_text

# Bag-of-words counts: one row per movie, one column per vocabulary term.
cv = CountVectorizer()
count_matrix = cv.fit_transform(movie_data['combine_sentence'])

In [6]:
# Toy corpus illustrating what CountVectorizer produces. A separate vectorizer
# is fitted here: calling fit_transform on the shared `cv` would overwrite the
# vocabulary learned from the movie data, leaving `cv` out of sync with
# `count_matrix`.
test = ['London London Paris', 'Paris Paris London', 'LA Paris']
test_cv = CountVectorizer()
test_matrix = test_cv.fit_transform(test)
print(test_matrix.toarray())

[[0 2 1]
 [0 1 2]
 [1 0 1]]


In [7]:
# Pairwise cosine similarity between the rows of the count matrix:
# cosine_sim[i][j] compares movie row i with movie row j.
cosine_sim = cosine_similarity(X=count_matrix)

In [8]:
# Number of titles to suggest.
max_recommendations = 10
movie_recommendations = []

print("Enter a movie you like: ")
movie_user_likes = input("").strip().lower()

# get_index_from_title raises IndexError when no title matches; report that
# to the user instead of ending the cell with a traceback.
try:
  movie_index = get_index_from_title(movie_data, movie_user_likes)
except IndexError:
  movie_index = None
  print("Sorry, that movie is not in the dataset.")

if movie_index is not None:
  # Pair every movie position with its similarity to the chosen movie and
  # rank the pairs from most to least similar.
  similar_movies = list(enumerate(cosine_sim[movie_index]))
  sorted_list = sorted(similar_movies, key=lambda pair: pair[1], reverse=True)

  print()
  print("Perhaps you like these movies: ")
  for candidate_index, _score in sorted_list:
    # Look the title up once per candidate and reuse it.
    candidate_title = get_title_from_index(movie_data, candidate_index)
    if candidate_title.lower() == movie_user_likes:
      # Skip the movie the user entered (and any same-titled entries).
      continue
    movie_recommendations.append(candidate_title)
    print(candidate_title)
    # Count actual recommendations rather than loop iterations, so skipped
    # rows do not reduce the number of suggestions.
    if len(movie_recommendations) >= max_recommendations:
      break
  print("")

Enter a movie you like: 
Alien

Perhaps you like these movies: 
Aliens
Alien³
Moonraker
Planet of the Apes
Avatar
Gravity
Jason X
The Helix... Loaded
Alien: Resurrection
The Thing



In [67]:
def get_zhihu_urls(movie):
  """Search Google for Zhihu question pages about ``movie`` and return their URLs.

  Parameters:
    movie: Movie title used as the search term. It is passed through
      ``requests``' ``params`` so spaces, ``&``, ``#`` and similar characters
      are URL-encoded instead of corrupting the query string.

  Returns:
    A list of unique URL strings, in the order they appear in the result page.
    Only links containing ``zhihu.com/question/`` and containing neither
    ``img`` nor ``answer`` are kept.
  """
  search_query = movie + ' site:zhihu.com'
  search_params = {
    'q': search_query,
    'oq': search_query,
    'aqs': 'chrome..69i57.6845j0j7',
    'sourceid': 'chrome',
    'ie': 'UTF-8',
  }
  # A timeout keeps the notebook from hanging forever on a stalled connection.
  response = requests.get('https://www.google.com/search', params=search_params, timeout=10).text
  soup = BeautifulSoup(response, 'lxml')

  # Result links are expected to look like '/url?q=<target>&sa=...'; strip the
  # prefix only when it is really there instead of blindly dropping 7 characters.
  redirect_prefix = '/url?q='

  zhihu_urls = []
  for a in soup.find_all('a', href=True):
    href = a['href']
    if 'zhihu.com/question/' not in href or 'img' in href or 'answer' in href:
      continue
    if href.startswith(redirect_prefix):
      href = href[len(redirect_prefix):]
    zhihu_url = href.partition('&sa')[0]
    # The same question can be linked several times on one page; keep one copy
    # so callers do not fetch the same answers repeatedly.
    if zhihu_url not in zhihu_urls:
      zhihu_urls.append(zhihu_url)

  return zhihu_urls

In [109]:
def get_zhihu_question_answers(zhihu_question_id, answer_list, limit=5, offset=5):
  """Fetch answers of a Zhihu question and append their HTML content to ``answer_list``.

  Parameters:
    zhihu_question_id: Question identifier used in the API path.
    answer_list: List that is extended in place with each answer's ``content``.
    limit: Value of the ``limit`` query parameter (default 5, as before).
    offset: Value of the ``offset`` query parameter (default 5, as before).
      NOTE(review): an offset of 5 presumably skips the first five answers;
      confirm whether 0 was intended.

  Returns:
    None. Nothing is appended when the response carries no ``data`` entries.
  """
  headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
  }

  answer_api = 'https://www.zhihu.com/api/v4/questions/' + str(zhihu_question_id) + '/answers'
  query = {
    'include': 'content',
    'limit': limit,
    'offset': offset,
    'platform': 'desktop',
    'sort_by': 'default',
  }

  # A timeout keeps the notebook from hanging forever on a stalled connection.
  response = requests.get(answer_api, headers=headers, params=query, timeout=10)

  # Decode the body once; a payload without a 'data' list (for example an error
  # object) is treated as "no answers" rather than raising KeyError.
  payload = response.json()
  answers = payload.get('data') if isinstance(payload, dict) else None
  if not answers:
    return

  answer_list.extend(answer['content'] for answer in answers)

In [112]:
test_zhihu_urls = get_zhihu_urls('Dune')

# Collect the answers of every question found for the search term.
answer_list = []
for url in test_zhihu_urls:
  # Everything after '/question/' is used as the question id.
  qid = url.partition('/question/')[2]
  get_zhihu_question_answers(qid, answer_list)

for answer in answer_list:
  print(answer)

if answer_list:
  # Wrap the first answer in a minimal HTML page. The charset declaration and
  # the explicit file encoding keep the non-ASCII (Chinese) text intact
  # regardless of the platform's default encoding.
  message = """<html>
<head><meta charset="utf-8"></head>
<body>""" + answer_list[0] + """</body>
</html>"""

  # The context manager closes the file even if the write fails.
  with open('test.html', 'w', encoding='utf-8') as f:
    f.write(message)
else:
  # Scraping can legitimately return nothing; avoid IndexError on answer_list[0].
  print('No answers were retrieved; test.html was not written.')

<p>谈一谈个人的粗浅看法。</p><p>丹尼斯维伦纽瓦的电影，有几个特点。它们让他成为了电影版《沙丘》可能是最佳的导演选择。它具备《沙丘》的诸多要素，同时也是维伦纽瓦自己的《沙丘》。它寄予了科幻电影“似乎必备”的宏大场景、美术设定，但却不是为了类型化目的，而是实现维伦纽瓦的作者表达。它是科幻电影，但却绝对不够商业。</p><p>维伦纽瓦非常重视环境与人物的呼应关系，并以这种呼应关系作为对人物内心变化的持续性表达。由此，也就诞生了第二个特点：围绕人物内心世界的主题。无论是他的题材设定有多宏大，剧情逻辑有多复杂，涉及知识有多硬核，内在却永远是指向人物内心描写的。</p><figure data-size="normal"><noscript><img src="https://pic2.zhimg.com/50/v2-33f185b8b4533cd605e4254897697d50_720w.jpg?source=1940ef5c" data-rawwidth="690" data-rawheight="966" data-size="normal" data-default-watermark-src="https://pic2.zhimg.com/50/v2-f7bb56c850758023d9d40c83d77d922f_720w.jpg?source=1940ef5c" class="origin_image zh-lightbox-thumb" width="690" data-original="https://pica.zhimg.com/v2-33f185b8b4533cd605e4254897697d50_r.jpg?source=1940ef5c"/></noscript><img src="data:image/svg+xml;utf8,&lt;svg xmlns=&#39;http://www.w3.org/2000/svg&#39; width=&#39;690&#39; height=&#39;966&#39;&gt;&lt;/svg&gt;" data-rawwidth="690" data-rawheight="966" data-size="normal" data-default-watermark-src="https://pic