大一小学期/Week_3-Python/hw/2020/papersnake/scrapy/douban_movie.py

#!/usr/bin/python3
import os
import sys
import ast
import subprocess
import requests
import json
import re
import time
import threading
from fake_useragent import UserAgent
import MySQLdb
from bs4 import BeautifulSoup
import inspect
import ctypes
import asyncio
import aiohttp

class DouBanMovieSpider:
	def __init__(self):
		self.headers = {
			'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
			'Accept-Encoding': 'gzip, deflate',
			'Accept-Language': 'zh,en;q=0.9,zh-CN;q=0.8,en-US;q=0.7',
			'Cache-Control': 'max-age=0',
			'Connection': 'keep-alive',
			'Upgrade-Insecure-Requests': '1',
			'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
		}
		self.ua = UserAgent()
		self.ip_pool = []
		self.proxy = ""
		self.fail = 3
		self.changeIP()
		self.db = MySQLdb.connect("127.0.0.1", "douban", "", "douban", charset='utf8' )
		print(self.db)
		self.execute_list = []
		self.create_movie = "CREATE TABLE IF NOT EXISTS `movie`(id INT(10) NOT NULL AUTO_INCREMENT PRIMARY KEY,title TEXT COMMENT '标题',directors TEXT COMMENT '导演',authors TEXT COMMENT '编剧',actors TEXT COMMENT '主演',datePublished TEXT COMMENT '上映日期',genre TEXT COMMENT '类型',duration TEXT COMMENT '片长',description TEXT COMMENT '剧情简介',comments TEXT COMMENT '评论',info_link TEXT,pic_link TEXT,rate float COMMENT '评分',rate_count NUMERIC COMMENT '评分数',flag NUMERIC)"
		self.create_celebrity = "CREATE TABLE IF NOT EXISTS celebrity(id INT(10) NOT NULL AUTO_INCREMENT PRIMARY KEY,title TEXT COMMENT '姓名',img_url TEXT,info TEXT COMMENT '影人简介',movies TEXT COMMENT '全部作品',actors TEXT COMMENT '合作影人')"
		self.execute_list.append([self.create_movie])
		self.execute_list.append([self.create_celebrity])
		self.commit()
	def __del__(self):
		self.db.close()
	def commit(self):
		self.cur = self.db.cursor()
		for sql in self.execute_list:
			try:
				self.cur.execute(*sql)
			except:
				print("Error",sql)
		self.db.commit()
		self.execute_list.clear()
	def changeIP(self):
		pass
	def get_new_ip(self):
		pass
	async def get_html(self,url,check_url):
		while(True):
			print(url,check_url)
			try:
				async with aiohttp.ClientSession(trust_env=True, headers=self.headers) as session:
					async with session.get(url, timeout=5) as response:
						text = await response.text()
						if check_url in text:
							return text
						else:
							print("Ban")
			except:
				print("Ban")
		
		
	def get_movie_info(self,movie_list):
		jobs = []
		loop = asyncio.get_event_loop()
		for id in movie_list:
			url = "https://movie.douban.com/subject/" + str(id)
			jobs.append(self.get_html(url,"电影"))
		tasks = loop.run_until_complete(asyncio.gather(*jobs))
		for i in range(len(tasks)):
			id = int(movie_list[i])
			response = tasks[i]
			if not response:
				continue
			soup = BeautifulSoup(response,"lxml")
			content = soup.find("script",type="application/ld+json").string.replace("\n","")
			try:
				content = json.loads(content, strict=False)
			except:
				continue
			sql = "replace into movie(id,title,directors,authors,actors,datePublished,genre,duration,description,info_link,pic_link,rate,rate_count,comments,flag) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,1)"
			title = content["name"]
			directors = []
			authors = []
			actors = []
			comments = []
			for director in content["director"]:
				url = re.match("/celebrity/(\d+)/",director["url"]).group(1)
				name = director["name"]
				directors.append([url,name])
			for author in content["author"]:
				url = re.match("/celebrity/(\d+)/",author["url"]).group(1)
				name = author["name"]
				authors.append([url,name])
			for actor in content["actor"]:
				url = re.match("/celebrity/(\d+)/",actor["url"]).group(1)
				name = actor["name"]
				actors.append([url,name])
			directors = str(directors)
			authors = str(authors)
			actors = str(actors)
			datePublished = content["datePublished"];
			genre = str(content["genre"]);
			duration = content["duration"]
			description = content["description"]
			info_link = "https://movie.douban.com" + content["url"]
			pic_link = content["image"]
			rate = content["aggregateRating"]["ratingValue"]
			rate_count = content["aggregateRating"]["ratingCount"]
			comment_list = soup.find_all("div",class_="comment")
			for comment in comment_list:			
				try:
					vote = comment.find(class_="votes").text;
					short = comment.find(class_="short")
					info = comment.find(class_="comment-info");
					name = info.find('a').text
					flag = info.find_all('span')[0].text
					star = info.find_all('span')[1].get('class')[0]
					time = info.find(class_="comment-time").get('title')
				except:
					continue
					
				comments.append([name,flag,star,vote,time,short])
			comments = str(comments)
	#		print(sql % (id,title,directors,authors,actors,datePublished,genre,duration,description,info_link,pic_link,rate,rate_count,comments))
			self.execute_list.append([sql , [id,title,directors,authors,actors,datePublished,genre,duration,description,info_link,pic_link,rate,rate_count,comments]])
			if len(self.execute_list) > 8:
				self.commit()
	def get_celebrity_info(self,celebrity_id):
		update_id = []
		for id in celebrity_id:
			self.cur.execute("SELECT id from celebrity where id=%s" , [id])
			celebrity_cursor = self.cur.fetchall()
			if not len(celebrity_cursor):
				update_id.append(id)
		jobs = []
		loop = asyncio.get_event_loop()
		for id in update_id:
			url = "https://movie.douban.com/celebrity/" + str(id)
			jobs.append(self.get_html(url,"影人"))
		tasks = loop.run_until_complete(asyncio.gather(*jobs))
		for i in range(len(tasks)):
			id = int(update_id[i])
			response = tasks[i]
			soup = BeautifulSoup(response,"lxml")
			content = soup.find("div",id="content")
			name = content.find("h1").text
			img_url = content.find("a",class_="nbg").get('href')
			info = content.find("div",class_="bd")
			start = 0
			movies = []
			actors = []
			while(True):
				if(start>=50):
					break
				url = "https://movie.douban.com/celebrity/%d/movies?start=%d&format=text&sortby=vote" % (id,start)
				loop = asyncio.get_event_loop()
				response = loop.run_until_complete(self.get_html(url,"作品"))
				soup = BeautifulSoup(response,"lxml")
				content = soup.find_all("td",headers="m_name")
				if not len(content):
					break
				for item in content:
					movie_url = item.find("a").get('href')
					movie_id = re.search("\d+",movie_url).group(0)
					if movie_id:
						movies.append(movie_id)
				start += 25
			url = "https://movie.douban.com/celebrity/%d/partners" % id
			loop = asyncio.get_event_loop()
			response = loop.run_until_complete(self.get_html(url,"影人"))
			soup = BeautifulSoup(response,"lxml")
			content = soup.find_all("div",class_="info")
			for item in content:
				actor_url = item.find("h2").find("a").get('href')
				actor_id = re.search("\d+",actor_url).group(0)
				actor_cor_info = str(item.find_all("li")[1])
				if actor_id:
					actors.append([actor_id,actor_cor_info])
			info = str(info)
			movies = str(movies)
			actors = str(actors)
			sql = "replace into celebrity(id,title,img_url,info,movies,actors) values(%s,%s,%s,%s,%s,%s)"
			self.execute_list.append([sql , [id,name,img_url,info,movies,actors]])
			if len(self.execute_list) > 8:
				self.commit()
	def auto_get_detail(self,start=0):
		while(True):
			try:
				self.cur.execute("SELECT id,directors,authors,actors,flag from movie where flag=0 order by id limit %s,30" , [start])
			except:
				print("Error",sys._getframe().f_code.co_name)
				print("SELECT id,directors,authors,actors,flag from movie where flag=0 order by id limit 10")
				return
			cursor = self.cur.fetchall()
			movie_update_id = []
			flag = 1
			for item in cursor:
				if item[4] == 0:
					flag = 0
					movie_update_id.append(item[0])
			self.get_movie_info(movie_update_id)
			self.commit()
			if(flag):
				break


spider = DouBanMovieSpider()
spider.auto_get_detail()