From 0568a4c246094113aaf7aad69468053daf9a6434 Mon Sep 17 00:00:00 2001 From: yyyz <904108079@qq.com> Date: Mon, 14 Feb 2022 00:18:58 +0800 Subject: [PATCH] =?UTF-8?q?release=204.3.0=20=E6=96=B0=E5=A2=9Emongodb?= =?UTF-8?q?=E5=AD=98=E5=82=A8=EF=BC=8C=E5=92=8C=E5=AF=B9=E5=BA=94=E7=9A=84?= =?UTF-8?q?api=20=E6=96=B0=E5=A2=9Eapi=E6=96=B9=E5=BC=8F=E7=9A=84=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=E9=A1=B9=E5=8F=8B=E9=93=BE=20=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E8=BF=87=E6=9C=9F=E6=96=87=E7=AB=A0=E6=B8=85=E9=99=A4=E4=B8=8D?= =?UTF-8?q?=E7=94=9F=E6=95=88=E7=9A=84=E9=97=AE=E9=A2=98=20=E5=85=B6?= =?UTF-8?q?=E5=AE=83bug=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/main.yml | 4 +- Dockerfile | 2 + README.md | 10 +- api/main.py | 2 + api/mongodbapi.py | 122 +++++++++++++++ .../pipelines/leancloud_pipe.py | 26 ++-- .../pipelines/mongodb_pipe.py | 142 ++++++++++++++++++ hexo_circle_of_friends/pipelines/pipelines.py | 18 ++- hexo_circle_of_friends/pipelines/sql_pipe.py | 39 +++-- hexo_circle_of_friends/requirements.txt | 4 +- hexo_circle_of_friends/run.py | 3 +- hexo_circle_of_friends/settings.py | 20 +-- .../spiders/hexo_circle_of_friends.py | 29 ++-- requirements.txt | 4 +- server.sh | 3 +- 15 files changed, 354 insertions(+), 74 deletions(-) create mode 100644 api/mongodbapi.py create mode 100644 hexo_circle_of_friends/pipelines/mongodb_pipe.py diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d959f94ada91..c05676e2ff21 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -28,8 +28,8 @@ env: GITHUB_NAME: ${{ secrets.GH_NAME }} # 你的github昵称 GITHUB_EMAIL: ${{ secrets.GH_EMAIL }} # 你的github邮箱 GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} # github token - - + # mongodb配置 + MONGODB_URI: ${{ secrets.MONGODB_URI }} # mongodb URI jobs: build: diff --git a/Dockerfile b/Dockerfile index 2e1d5d583502..713d40aec190 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,8 @@ ENV APPKEY="" #ENV MYSQL_PASSWORD="" #ENV MYSQL_IP="" #ENV MYSQL_DB="" +### mongodb配置 +#ENV MONGODB_URI="" EXPOSE 8000 WORKDIR / RUN cd ./hexo_circle_of_friends && pip3 install -r requirements.txt -i https://pypi.douban.com/simple/ diff --git a/README.md b/README.md index 09adfd1ca8a8..e393e84b94bd 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ ⭐从4.1.3版本开始,一定要在配置项中配置友链页的获取策略 ``` -目前 release 4.2.6 版本: +目前 release 4.3.0 版本: - 支持 gitee 和 github 上的 issuse 友链获取 - 支持butterfly、volantis、matery、sakura、fluid、nexmoe、Yun、stun、stellar、next主题的友链和文章获取 - 支持feed订阅规则,如atom、rss等规则(支持wordpress类型的博客) @@ -16,12 +16,13 @@ - 支持未适配的hexo主题和非hexo用户使用,在配置项选择开启配置项友链 - 额外的友链页同时爬取 - 支持添加HTTP代理 -- 新增数据存储配置,提供多种存储方式 -- 新增部署方式配置,可部署在本地服务端 +- 多种数据存储,提供leancloud,mysql,sqlite,mongodb存储方式 +- 多种方式部署,提供github,server,docker部署方式 - 将api整合到主仓库 - 新增友链获取策略的common规则 +- 新增api方式的配置项友链 -bug修复: +bug修复和改动: - wordpress类型博客的时间格式问题 - butterfly主题友链页解析不再抓取背景图片了 - 修复了github和gitee对volantis主题的友链获取 @@ -36,5 +37,6 @@ bug修复: - 移除旧订阅规则解析 - 修复butterfly的时间获取 - 额外友链页也可以配置获取策略 +- 修复过期文章清除不生效的问题 ``` diff --git a/api/main.py b/api/main.py index 3410b5228757..fab66a35df79 100644 --- a/api/main.py +++ b/api/main.py @@ -14,6 +14,8 @@ from api.leancloudapi import * elif settings.DATABASE == "mysql" or settings.DATABASE == "sqlite": from api.sqlapi import * +elif settings.DATABASE == "mongodb": + from api.mongodbapi import * app = FastAPI() diff --git a/api/mongodbapi.py b/api/mongodbapi.py new file mode 100644 index 000000000000..ba079002710f --- /dev/null +++ b/api/mongodbapi.py @@ -0,0 +1,122 @@ +# -*- coding:utf-8 -*- +# Author:yyyz +import os +import random + +from urllib import parse +from hexo_circle_of_friends import settings +from pymongo import MongoClient + + +def db_init(): + if settings.DEBUG: + URI = "mongodb+srv://root:@cluster0.wgfbv.mongodb.net/myFirstDatabase?retryWrites=true&w=majority" + else: + URI = os.environ.get("MONGODB_URI") + client = MongoClient(URI) + db = client.fcircle + posts = db.Post + friends = db.Friend + return posts, friends + + +def query_all(list, start: int = 0, end: int = -1, rule: str = "updated"): + post_collection, friend_db_collection = db_init() + article_num = post_collection.count_documents({}) + if end == -1: + end = min(article_num, 1000) + if start < 0 or start >= min(article_num, 1000): + return {"message": "start error"} + if end <= 0 or end > min(article_num, 1000): + return {"message": "end error"} + if rule != "created" and rule != "updated": + return {"message": "rule error, please use 'created'/'updated'"} + + posts = post_collection.find({}, {'_id': 0, "rule": 0}).sort([(rule, -1)]).limit(end - start).skip(start) + last_update_time = "1970-01-01 00:00:00" + post_data = [] + for k, post in enumerate(posts): + last_update_time = max(last_update_time, post.pop("createdAt")) + item = {'floor': start + k + 1} + item.update(post) + post_data.append(item) + + friends_num = friend_db_collection.count_documents({}) + active_num = friend_db_collection.count_documents({"error": False}) + error_num = friends_num - active_num + + data = {} + data['statistical_data'] = { + 'friends_num': friends_num, + 'active_num': active_num, + 'error_num': error_num, + 'article_num': article_num, + 'last_updated_time': last_update_time + } + + data['article_data'] = post_data + return data + + +def query_friend(): + _, friend_db_collection = db_init() + friends = friend_db_collection.find({}, {"_id": 0, "createdAt": 0, "error": 0}) + friend_list_json = [] + if friends: + for friend in friends: + friend_list_json.append(friend) + else: + # friends为空直接返回 + return {"message": "not found"} + return friend_list_json + + +def query_random_friend(): + _, friend_db_collection = db_init() + friends = friend_db_collection.find({}, {"_id": 0, "createdAt": 0, "error": 0}) + friends_num = friend_db_collection.count_documents({}) + random_friend = friends[random.randint(0, friends_num - 1)] + + return random_friend if random_friend else {"message": "not found"} + + +def query_random_post(): + post_collection, _ = db_init() + posts = post_collection.find({}, {'_id': 0, "rule": 0, "createdAt": 0}) + posts_num = post_collection.count_documents({}) + random_post = posts[random.randint(0, posts_num - 1)] + return random_post if random_post else {"message": "not found"} + + +def query_post(link, num, rule): + post_collection, friend_db_collection = db_init() + if link is None: + friend = query_random_friend() + domain = parse.urlsplit(friend.get("link")).netloc + else: + domain = parse.urlsplit(link).netloc + friend = friend_db_collection.find_one({'link': {'$regex': domain}}, {"_id": 0, "createdAt": 0, "error": 0}) + + if rule != "created" and rule != "updated": + return {"message": "rule error, please use 'created'/'updated'"} + + posts = post_collection.find( + {'link': {'$regex': domain}}, + {'_id': 0, "rule": 0, "createdAt": 0, "avatar": 0, "author": 0} + ).sort([(rule, -1)]).limit(num if num > 0 else 0) + + data = [] + for floor, post in enumerate(posts): + post["floor"] = floor + 1 + data.append(post) + if friend: + friend["article_num"] = len(data) + api_json = {"statistical_data": friend, "article_data": data} + else: + # 如果user为空直接返回 + return {"message": "not found"} + return api_json + + +def query_post_json(jsonlink, list, start, end, rule): + return {"message": "not found"} diff --git a/hexo_circle_of_friends/pipelines/leancloud_pipe.py b/hexo_circle_of_friends/pipelines/leancloud_pipe.py index 8db9f584b94b..1bb463e2b773 100644 --- a/hexo_circle_of_friends/pipelines/leancloud_pipe.py +++ b/hexo_circle_of_friends/pipelines/leancloud_pipe.py @@ -2,13 +2,13 @@ # Author:yyyz import os import leancloud -import datetime import re from .. import settings -from datetime import datetime,timedelta +from datetime import datetime, timedelta today = (datetime.now() + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S') + class LeancloudPipeline: def __init__(self): self.userdata = [] @@ -48,17 +48,17 @@ def process_item(self, item, spider): return item if "title" in item.keys(): - if item["name"] in self.nonerror_data: + if item["author"] in self.nonerror_data: pass else: # 未失联的人 - self.nonerror_data.add(item["name"]) + self.nonerror_data.add(item["author"]) # print(item) for query_item in self.query_post_list: try: if query_item.get("link") == item["link"]: - item["time"] = min(item['time'], query_item.get('created')) + item["created"] = min(item['created'], query_item.get('created')) delete = self.Friendspoor.create_without_data(query_item.get('objectId')) delete.destroy() # print("----deleted %s ----"%item["title"]) @@ -106,10 +106,10 @@ def outdate_clean(self, time_limit): out_date_post = 0 for query_i in self.query_post_list: - created = query_i.get('created') + updated = query_i.get('updated') try: - query_time = datetime.datetime.strptime(created, "%Y-%m-%d") - if (datetime.datetime.today() - query_time).days > time_limit: + query_time = datetime.strptime(updated, "%Y-%m-%d") + if (datetime.today() + timedelta(hours=8) - query_time).days > time_limit: delete = self.Friendspoor.create_without_data(query_i.get('objectId')) out_date_post += 1 delete.destroy() @@ -151,14 +151,14 @@ def friendlist_push(self): def friendpoor_push(self, item): friendpoor = self.Friendspoor() friendpoor.set('title', item['title']) - friendpoor.set('created', item['time']) + friendpoor.set('created', item['created']) friendpoor.set('updated', item['updated']) friendpoor.set('link', item['link']) - friendpoor.set('author', item['name']) - friendpoor.set('avatar', item['img']) + friendpoor.set('author', item['author']) + friendpoor.set('avatar', item['avatar']) friendpoor.set('rule', item['rule']) friendpoor.save() print("----------------------") - print(item["name"]) - print("《{}》\n文章发布时间:{}\t\t采取的爬虫规则为:{}".format(item["title"], item["time"], item["rule"])) + print(item["author"]) + print("《{}》\n文章发布时间:{}\t\t采取的爬虫规则为:{}".format(item["title"], item["created"], item["rule"])) self.total_post_num += 1 diff --git a/hexo_circle_of_friends/pipelines/mongodb_pipe.py b/hexo_circle_of_friends/pipelines/mongodb_pipe.py new file mode 100644 index 000000000000..214afd296114 --- /dev/null +++ b/hexo_circle_of_friends/pipelines/mongodb_pipe.py @@ -0,0 +1,142 @@ +# -*- coding:utf-8 -*- +# Author:yyyz +import os +import re +from datetime import datetime, timedelta +from pymongo import MongoClient +from .. import settings + +today = (datetime.now() + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S') + + +class MongoDBPipeline: + def __init__(self): + self.userdata = [] + self.nonerror_data = set() # 能够根据友链link获取到文章的人 + self.query_post_list = [] + + def open_spider(self, spider): + + if settings.DEBUG: + URI = "mongodb+srv://root:@cluster0.wgfbv.mongodb.net/myFirstDatabase?retryWrites=true&w=majority" + else: + URI = os.environ.get("MONGODB_URI") + client = MongoClient(URI) + db = client.fcircle + self.posts = db.Post + self.friends = db.Friend + self.query_post_num = self.posts.count_documents({}) + + for post in self.posts.find(): + self.query_post_list.append(post) + + self.friends.delete_many({}) + print("Initialization complete") + + def process_item(self, item, spider): + if "userdata" in item.keys(): + li = [] + li.append(item["name"]) + li.append(item["link"]) + li.append(item["img"]) + self.userdata.append(li) + # print(item) + return item + + if "title" in item.keys(): + if item["author"] in self.nonerror_data: + pass + else: + # 未失联的人 + self.nonerror_data.add(item["author"]) + + # print(item) + for query_item in self.query_post_list[:self.query_post_num]: + try: + if query_item.get("link") == item["link"]: + query_item['created'] = min(item['created'], query_item.get("created")) + post_id = query_item.get("_id") + self.posts.delete_one({"_id": post_id}) + return item + except: + pass + + self.friendpoor_save(item) + + return item + + def close_spider(self, spider): + # print(self.nonerror_data) + # print(self.userdata) + + count, error_num = self.friendlist_push() + self.outdate_clean(settings.OUTDATE_CLEAN) + num = self.friendpoor_push() + print("----------------------") + print("友链总数 : %d" % count) + print("失联友链数 : %d" % error_num) + print("共 %d 篇文章" % num) + print("最后运行于:%s" % today) + print("done!") + + def outdate_clean(self, time_limit): + out_date_post = 0 + for query_item in self.query_post_list: + updated = query_item.get("updated") + try: + query_time = datetime.strptime(updated, "%Y-%m-%d") + if (datetime.today() + timedelta(hours=8) - query_time).days > time_limit: + self.posts.delete_one({"_id": query_item.get("_id")}) + query_item.clear() + out_date_post += 1 + except: + self.posts.delete_one({"_id": query_item.get("_id")}) + query_item.clear() + out_date_post += 1 + # print('\n') + # print('共删除了%s篇文章' % out_date_post) + # print('\n') + # print('-------结束删除规则----------') + + def friendlist_push(self): + friends = [] + error_num = 0 + for user in self.userdata: + friend = { + "name": user[0], + "link": user[1], + "avatar": user[2], + "createdAt": today, + } + if user[0] in self.nonerror_data: + # print("未失联的用户") + friend["error"] = False + elif settings.BLOCK_SITE: + error = True + for url in settings.BLOCK_SITE: + if re.match(url, friend["link"]): + friend["error"] = False + error = False + if error: + print("请求失败,请检查链接: %s" % friend["link"]) + friend["error"] = True + error_num += 1 + else: + print("请求失败,请检查链接: %s" % friend["link"]) + friend["error"] = True + error_num += 1 + friends.append(friend) + self.friends.insert_many(friends) + return len(friends), error_num + + def friendpoor_push(self): + post_list = [item for item in self.query_post_list if item] + self.posts.insert_many(post_list) + return len(post_list) + + def friendpoor_save(self, item): + item["createdAt"] = today + self.query_post_list.append(item.copy()) + print("----------------------") + print(item["author"]) + print("《{}》\n文章发布时间:{}\t\t采取的爬虫规则为:{}".format(item["title"], item["created"], item["rule"])) diff --git a/hexo_circle_of_friends/pipelines/pipelines.py b/hexo_circle_of_friends/pipelines/pipelines.py index eb37976d151e..ed6579a99beb 100644 --- a/hexo_circle_of_friends/pipelines/pipelines.py +++ b/hexo_circle_of_friends/pipelines/pipelines.py @@ -5,10 +5,12 @@ from scrapy.exceptions import DropItem from ..utils import process_time + class DuplicatesPipeline: def __init__(self): - self.data_set = set() # posts filter set 用于对post文章数据的去重 - self.friends_set = set() # friends filter set 用于对friends的去重 + self.data_set = set() # posts filter set 用于对post文章数据的去重 + self.friends_set = set() # friends filter set 用于对friends的去重 + def process_item(self, item, spider): if "userdata" in item.keys(): # userdata filter @@ -18,22 +20,22 @@ def process_item(self, item, spider): self.friends_set.add(link) return item - link = item['title'] - if link in self.data_set or link=="": + link = item['link'] + if link in self.data_set or link == "": # 重复数据清洗 raise DropItem("Duplicate found:%s" % link) if not item["title"]: raise DropItem("missing fields :'title'") - elif not re.match("^http.?://",item["link"]): + elif not re.match("^http.?://", item["link"]): # 链接必须是http开头,不能是相对地址 raise DropItem("invalid link ") # 时间检查 - if not process_time.format_check(item["time"],item["updated"]): + if not process_time.format_check(item["created"], item["updated"]): raise DropItem("invalid time ") - if not process_time.content_check(item["time"],item["updated"]): + if not process_time.content_check(item["created"], item["updated"]): raise DropItem("invalid time ") self.data_set.add(link) - return item \ No newline at end of file + return item diff --git a/hexo_circle_of_friends/pipelines/sql_pipe.py b/hexo_circle_of_friends/pipelines/sql_pipe.py index c7c3e4ec5a3b..04ca179e12c0 100644 --- a/hexo_circle_of_friends/pipelines/sql_pipe.py +++ b/hexo_circle_of_friends/pipelines/sql_pipe.py @@ -1,7 +1,6 @@ # -*- coding:utf-8 -*- # Author:yyyz import os -import datetime import re from .. import models, settings @@ -56,20 +55,18 @@ def process_item(self, item, spider): return item if "title" in item.keys(): - if item["name"] in self.nonerror_data: + if item["author"] in self.nonerror_data: pass else: # 未失联的人 - self.nonerror_data.add(item["name"]) + self.nonerror_data.add(item["author"]) # print(item) for query_item in self.query_post_list: try: if query_item.link == item["link"]: - item["time"] = min(item['time'], query_item.created) - self.session.query(models.Post).filter_by(id=query_item.id).delete() - self.session.commit() - # print("----deleted %s ----"%item["title"]) + item["created"] = min(item['created'], query_item.created) + self.session.query(models.Post).filter_by(link=query_item.link).delete() except: pass @@ -82,13 +79,12 @@ def close_spider(self, spider): # print(self.userdata) self.friendlist_push() - self.outdate_clean(settings.OUTDATE_CLEAN) print("----------------------") print("友链总数 : %d" % self.session.query(models.Friend).count()) print("失联友链数 : %d" % self.session.query(models.Friend).filter_by(error=True).count()) print("共 %d 篇文章" % self.session.query(models.Post).count()) - self.session.close() + print("最后运行于:%s" % today) print("done!") @@ -100,18 +96,19 @@ def query_post(self): def outdate_clean(self, time_limit): out_date_post = 0 + self.query_post() for query_item in self.query_post_list: - created = query_item.created + updated = query_item.updated try: - query_time = datetime.datetime.strptime(created, "%Y-%m-%d") - if (datetime.datetime.today() - query_time).days > time_limit: - self.session.query(models.Post).filter_by(id=query_item.id).delete() + query_time = datetime.strptime(updated, "%Y-%m-%d") + if (datetime.today()+timedelta(hours=8) - query_time).days > time_limit: + self.session.query(models.Post).filter_by(link=query_item.link).delete() out_date_post += 1 - self.session.commit() except: - self.session.query(models.Post).filter_by(id=query_item.id).delete() - self.session.commit() + self.session.query(models.Post).filter_by(link=query_item.link).delete() out_date_post += 1 + self.session.commit() + self.session.close() # print('\n') # print('共删除了%s篇文章' % out_date_post) # print('\n') @@ -145,15 +142,15 @@ def friendlist_push(self): def friendpoor_push(self, item): post = models.Post( title=item['title'], - created=item['time'], + created=item['created'], updated=item['updated'], link=item['link'], - author=item['name'], - avatar=item['img'], + author=item['author'], + avatar=item['avatar'], rule=item['rule'] ) self.session.add(post) self.session.commit() print("----------------------") - print(item["name"]) - print("《{}》\n文章发布时间:{}\t\t采取的爬虫规则为:{}".format(item["title"], item["time"], item["rule"])) + print(item["author"]) + print("《{}》\n文章发布时间:{}\t\t采取的爬虫规则为:{}".format(item["title"], item["created"], item["rule"])) diff --git a/hexo_circle_of_friends/requirements.txt b/hexo_circle_of_friends/requirements.txt index 631a684f03db..de8d01cf46dd 100644 --- a/hexo_circle_of_friends/requirements.txt +++ b/hexo_circle_of_friends/requirements.txt @@ -56,4 +56,6 @@ w3lib==1.22.0 Werkzeug==1.0.1 zope.event==4.5.0 zope.interface==5.4.0 -schedule==1.1.0 \ No newline at end of file +schedule==1.1.0 +pymongo==4.0.1 +dnspython==2.2.0 \ No newline at end of file diff --git a/hexo_circle_of_friends/run.py b/hexo_circle_of_friends/run.py index 5fa60c0e93a9..30bb726cef1c 100644 --- a/hexo_circle_of_friends/run.py +++ b/hexo_circle_of_friends/run.py @@ -33,7 +33,8 @@ def initsettings(setting): setting["ITEM_PIPELINES"]["hexo_circle_of_friends.pipelines.leancloud_pipe.LeancloudPipeline"] = 300 elif DATABASE == 'mysql' or DATABASE == "sqlite": setting["ITEM_PIPELINES"]["hexo_circle_of_friends.pipelines.sql_pipe.SQLPipeline"] = 300 - + elif DATABASE == "mongodb": + setting["ITEM_PIPELINES"]["hexo_circle_of_friends.pipelines.mongodb_pipe.MongoDBPipeline"] = 300 if __name__ == '__main__': if DEPLOY_TYPE == "docker" or DEPLOY_TYPE == "server": diff --git a/hexo_circle_of_friends/settings.py b/hexo_circle_of_friends/settings.py index 10b96d00e831..351aade36169 100644 --- a/hexo_circle_of_friends/settings.py +++ b/hexo_circle_of_friends/settings.py @@ -26,14 +26,16 @@ } # 配置项友链 -# 格式:["name", "link", "avatar","suffix"], -# 参数说明: -# name:必填,友链的名字 -# link:必填,友链主页地址 -# avatar:必填,头像地址 -# suffix:选填,自定义订阅后缀,主要针对不规范的网站订阅后缀,见示例2 +# enable:# 是否启用配置项友链 True/False(针对还未适配主题或者有定制需求的用户) +# json_api:通过api获取配置项友链,返回格式必须为:{"friends":[[友链1],[友链2],[友链3],[友链4]....]},友链内容同list字段格式 +# list字段填写格式:["name", "link", "avatar","suffix"],其中: +# name:必填,友链的名字 +# link:必填,友链主页地址 +# avatar:必填,头像地址 +# suffix:选填,自定义订阅后缀,主要针对不规范的网站订阅后缀,见示例2 SETTINGS_FRIENDS_LINKS = { - "enable": False, # 是否启用配置项友链 True/False(此项针对还未适配的主题用户) + "enable": False, + "json_api":"", "list": [ # 示例1: ["贰猹の小窝", "https://noionion.top/", "https://pub-noionion.oss-cn-hangzhou.aliyuncs.com/head.jpg"], @@ -89,7 +91,7 @@ # } ] -# 存储方式,可选项:leancloud,mysql, sqlite;默认为leancloud +# 存储方式,可选项:leancloud,mysql,sqlite,mongodb;默认为leancloud DATABASE = "leancloud" # 部署方式,可选项:github,server,docker;默认为github @@ -100,7 +102,7 @@ ##############################除非您了解本项目,否则请勿修改以下内容################################ -VERSION = "4.2.6" +VERSION = "4.3.0" # debug # debug模式 diff --git a/hexo_circle_of_friends/spiders/hexo_circle_of_friends.py b/hexo_circle_of_friends/spiders/hexo_circle_of_friends.py index ff0793fba44a..cee9ac8ac082 100644 --- a/hexo_circle_of_friends/spiders/hexo_circle_of_friends.py +++ b/hexo_circle_of_friends/spiders/hexo_circle_of_friends.py @@ -1,5 +1,5 @@ # -*- coding:utf-8 -*- - +# Author:yyyz import datetime import os import scrapy @@ -50,12 +50,10 @@ def start_requests(self): # 从配置文件导入友链列表 if settings.SETTINGS_FRIENDS_LINKS['enable']: for li in settings.SETTINGS_FRIENDS_LINKS["list"]: - # user_info = [li[0],li[1],li[2]] - # print('----------------------') - # print('好友名%r' % li[0]) - # print('头像链接%r' % li[2]) - # print('主页链接%r' % li[1]) self.friend_poor.put(li) + if re.match("^http.?://", settings.SETTINGS_FRIENDS_LINKS["json_api"]): + yield Request(settings.SETTINGS_FRIENDS_LINKS["json_api"], callback=self.settings_friends_json_parse) + if settings.GITEE_FRIENDS_LINKS['enable']: for number in range(1, 100): domain = 'https://gitee.com' @@ -76,6 +74,15 @@ def start_requests(self): for i, url in enumerate(self.start_urls): yield Request(url, callback=self.friend_poor_parse, meta={"theme": friendpage_theme[i]}) + def settings_friends_json_parse(self, response): + import json + try: + friends = json.loads(response)["friends"] + for friend in friends: + self.friend_poor.put(friend) + except: + pass + def init_start_urls(self): friendpage_link = [] friendpage_theme = [] @@ -392,8 +399,8 @@ def process_theme_postinfo(self, friend, links, titles, createds, updateds, rule def init_post_info(self, friend, rule): post_info = { - "name": friend[0], - "img": friend[2], + "author": friend[0], + "avatar": friend[2], "rule": rule } return post_info @@ -443,7 +450,7 @@ def process_time(self, createds, updateds, lenth): def generate_postinfo(self, init_post_info, title, created, updated, link): post_info = init_post_info post_info["title"] = title - post_info["time"] = created + post_info["created"] = created post_info["updated"] = updated post_info["link"] = link return post_info @@ -456,7 +463,3 @@ def errback_handler(self, error): # request = error.request # meta = error.request.meta pass - - def typecho_errback_handler(self, error): - yield Request(error.request.url, callback=self.post_feed_parse, dont_filter=True, meta=error.request.meta, - errback=self.errback_handler) diff --git a/requirements.txt b/requirements.txt index 991e5a833f86..1bfd1602f940 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,6 @@ uvicorn==0.14.0 requests PyMySQL==1.0.2 SQLAlchemy==1.4.31 -lxml==4.6.4 \ No newline at end of file +lxml==4.6.4 +pymongo==4.0.1 +dnspython==2.2.0 \ No newline at end of file diff --git a/server.sh b/server.sh index 0b3f7b415721..34b5d9869328 100644 --- a/server.sh +++ b/server.sh @@ -14,6 +14,7 @@ export APPKEY="" #export MYSQL_PASSWORD="" #export MYSQL_IP="" #export MYSQL_DB="" - +### mongodb配置 +#export MONGODB_URI="" nohup python3 -u ./hexo_circle_of_friends/run.py > /tmp/crawler.log 2>&1 & nohup python3 -u ./api/main.py > /tmp/api.log 2>&1 & \ No newline at end of file