-
Notifications
You must be signed in to change notification settings - Fork 544
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
新增mongodb存储,和对应的api 新增api方式的配置项友链 修复过期文章清除不生效的问题 其它bug修复
- Loading branch information
Showing
15 changed files
with
354 additions
and
74 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
# -*- coding:utf-8 -*- | ||
# Author:yyyz | ||
import os | ||
import random | ||
|
||
from urllib import parse | ||
from hexo_circle_of_friends import settings | ||
from pymongo import MongoClient | ||
|
||
|
||
# Process-wide MongoDB client, created lazily by db_init() and then reused.
_MONGO_CLIENT = None


def db_init():
    """Return the ``Post`` and ``Friend`` collections of the ``fcircle`` database.

    The MongoDB client is created on the first call and reused afterwards, so
    repeated API queries share one connection pool instead of opening (and
    never closing) a new one per call.

    The connection string is the built-in debug URI when ``settings.DEBUG`` is
    truthy, otherwise the value of the ``MONGODB_URI`` environment variable.

    Returns:
        tuple: ``(posts, friends)`` collection objects.
    """
    global _MONGO_CLIENT
    if _MONGO_CLIENT is None:
        if settings.DEBUG:
            uri = "mongodb+srv://root:@cluster0.wgfbv.mongodb.net/myFirstDatabase?retryWrites=true&w=majority"
        else:
            uri = os.environ.get("MONGODB_URI")
        _MONGO_CLIENT = MongoClient(uri)
    db = _MONGO_CLIENT.fcircle
    return db.Post, db.Friend
|
||
|
||
def query_all(list, start: int = 0, end: int = -1, rule: str = "updated"):
    """Return a page of posts plus overall friend/article statistics.

    Args:
        list: Not used by this function; kept so the signature stays unchanged.
        start: Zero-based index of the first post to return.
        end: Index one past the last post to return; ``-1`` means "up to the
            first 1000 posts".
        rule: Field to sort by (descending): ``"created"`` or ``"updated"``.

    Returns:
        dict: ``{"statistical_data": {...}, "article_data": [...]}`` on
        success, or ``{"message": ...}`` describing the invalid argument.
    """
    # A negative start is invalid whatever the collection holds, so reject it
    # before spending a database round trip.
    if start < 0:
        return {"message": "start error"}

    post_collection, friend_db_collection = db_init()
    article_num = post_collection.count_documents({})
    max_end = min(article_num, 1000)
    if end == -1:
        end = max_end
    if start >= max_end:
        return {"message": "start error"}
    # end must lie strictly after start: a limit of 0 means "no limit" to
    # MongoDB and a negative limit is applied by absolute value, so an empty
    # or inverted range would otherwise return unintended documents.
    if end <= start or end > max_end:
        return {"message": "end error"}
    if rule not in ("created", "updated"):
        return {"message": "rule error, please use 'created'/'updated'"}

    posts = (
        post_collection.find({}, {'_id': 0, "rule": 0})
        .sort([(rule, -1)])
        .skip(start)
        .limit(end - start)
    )
    last_update_time = "1970-01-01 00:00:00"
    post_data = []
    for offset, post in enumerate(posts):
        # createdAt is only used for the statistics; it is removed from the
        # returned article entry.
        created_at = post.pop("createdAt", None)
        if created_at:
            last_update_time = max(last_update_time, created_at)
        item = {'floor': start + offset + 1}
        item.update(post)
        post_data.append(item)

    friends_num = friend_db_collection.count_documents({})
    active_num = friend_db_collection.count_documents({"error": False})

    return {
        'statistical_data': {
            'friends_num': friends_num,
            'active_num': active_num,
            'error_num': friends_num - active_num,
            'article_num': article_num,
            'last_updated_time': last_update_time,
        },
        'article_data': post_data,
    }
|
||
|
||
def query_friend():
    """Return every friend document without ``_id``, ``createdAt`` and ``error``.

    Returns:
        list | dict: The list of friend documents, or
        ``{"message": "not found"}`` when the collection is empty.
    """
    _, friend_db_collection = db_init()
    cursor = friend_db_collection.find({}, {"_id": 0, "createdAt": 0, "error": 0})
    # Materialise first: the emptiness check has to be made on the fetched
    # documents, not on the cursor object itself.
    friend_list_json = [friend for friend in cursor]
    if not friend_list_json:
        # No friends stored: return the "not found" message directly.
        return {"message": "not found"}
    return friend_list_json
|
||
|
||
def query_random_friend():
    """Return one randomly chosen friend document.

    The document is returned without ``_id``, ``createdAt`` and ``error``.

    Returns:
        dict: The friend document, or ``{"message": "not found"}`` when the
        collection is empty (or the chosen document vanished between the
        count and the fetch).
    """
    _, friend_db_collection = db_init()
    friends_num = friend_db_collection.count_documents({})
    if friends_num <= 0:
        # Nothing to pick from; picking an index in an empty range would raise.
        return {"message": "not found"}

    random_friend = friend_db_collection.find_one(
        {},
        {"_id": 0, "createdAt": 0, "error": 0},
        skip=random.randrange(friends_num),
    )
    return random_friend if random_friend else {"message": "not found"}
|
||
|
||
def query_random_post():
    """Return one randomly chosen post document.

    The document is returned without ``_id``, ``rule`` and ``createdAt``.

    Returns:
        dict: The post document, or ``{"message": "not found"}`` when the
        collection is empty (or the chosen document vanished between the
        count and the fetch).
    """
    post_collection, _ = db_init()
    posts_num = post_collection.count_documents({})
    if posts_num <= 0:
        # Nothing to pick from; picking an index in an empty range would raise.
        return {"message": "not found"}

    random_post = post_collection.find_one(
        {},
        {'_id': 0, "rule": 0, "createdAt": 0},
        skip=random.randrange(posts_num),
    )
    return random_post if random_post else {"message": "not found"}
|
||
|
||
def query_post(link, num, rule):
    """Return a friend's info together with that friend's latest posts.

    Args:
        link: URL of the friend's site; ``None`` selects a random friend.
        num: Maximum number of posts to return; values <= 0 apply no limit.
        rule: Field to sort by (descending): ``"created"`` or ``"updated"``.

    Returns:
        dict: ``{"statistical_data": friend, "article_data": posts}`` on
        success, a rule-error message for an invalid ``rule``, or
        ``{"message": "not found"}`` when no matching friend exists.
    """
    # Local import: the module-level imports do not provide ``re``.
    import re

    # Validate the cheap argument before touching the database.
    if rule != "created" and rule != "updated":
        return {"message": "rule error, please use 'created'/'updated'"}

    post_collection, friend_db_collection = db_init()

    if link is None:
        friend = query_random_friend()
        # A "not found" result carries no "link" key; treat it as no domain.
        domain = parse.urlsplit(friend.get("link") or "").netloc
    else:
        domain = parse.urlsplit(link).netloc
        friend = None

    if not domain:
        # An empty pattern would match every document, so stop here.
        return {"message": "not found"}

    # The domain is matched as literal text: escape regex metacharacters such
    # as "." so caller-supplied input cannot act as a pattern.
    domain_pattern = re.escape(domain)

    if link is not None:
        friend = friend_db_collection.find_one(
            {'link': {'$regex': domain_pattern}},
            {"_id": 0, "createdAt": 0, "error": 0},
        )
    if not friend:
        # No friend matches the requested link.
        return {"message": "not found"}

    posts = post_collection.find(
        {'link': {'$regex': domain_pattern}},
        {'_id': 0, "rule": 0, "createdAt": 0, "avatar": 0, "author": 0},
    ).sort([(rule, -1)]).limit(max(num, 0))

    data = []
    for floor, post in enumerate(posts, start=1):
        post["floor"] = floor
        data.append(post)

    friend["article_num"] = len(data)
    return {"statistical_data": friend, "article_data": data}
|
||
|
||
def query_post_json(jsonlink, list, start, end, rule):
    """Placeholder endpoint: ignores every argument and reports "not found"."""
    response = {"message": "not found"}
    return response
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
# -*- coding:utf-8 -*- | ||
# Author:yyyz | ||
import os | ||
import re | ||
from datetime import datetime, timedelta | ||
from pymongo import MongoClient | ||
from .. import settings | ||
|
||
# Timestamp of this run, formatted once at import time.
# NOTE(review): shifts the host's local clock by 8 hours - presumably to get
# UTC+8 on a host running in UTC; confirm for other deployments.
today = datetime.strftime(datetime.now() + timedelta(hours=8), '%Y-%m-%d %H:%M:%S')
|
||
|
||
class MongoDBPipeline:
    """Scrapy item pipeline that stores friends and their posts in MongoDB.

    Posts already in the ``Post`` collection are loaded when the spider opens,
    merged in memory with the crawled items, and written back when the spider
    closes. The ``Friend`` collection is emptied on open and rebuilt on close.
    """

    def __init__(self):
        self.userdata = []  # [name, link, avatar] for every friend seen
        self.nonerror_data = set()  # authors for whom at least one post was crawled
        self.query_post_list = []  # stored posts followed by newly crawled ones
        self.query_post_num = 0  # number of stored posts loaded by open_spider
        self._existing_by_link = {}  # link -> stored post, for O(1) matching
        self.client = None
        self.posts = None
        self.friends = None

    def open_spider(self, spider):
        """Connect to MongoDB, load the stored posts and empty the friend list."""
        if settings.DEBUG:
            URI = "mongodb+srv://root:@cluster0.wgfbv.mongodb.net/myFirstDatabase?retryWrites=true&w=majority"
        else:
            URI = os.environ.get("MONGODB_URI")
        # Keep the client so close_spider can release its connections.
        self.client = MongoClient(URI)
        db = self.client.fcircle
        self.posts = db.Post
        self.friends = db.Friend

        stored_posts = list(self.posts.find())
        self.query_post_list.extend(stored_posts)
        self.query_post_num = len(stored_posts)
        for post in stored_posts:
            # The first stored post wins when several share one link.
            self._existing_by_link.setdefault(post.get("link"), post)

        self.friends.delete_many({})
        print("Initialization complete")

    def process_item(self, item, spider):
        """Collect a friend entry, or merge a crawled post with the stored ones.

        Items with a ``"userdata"`` key are friend entries; items with a
        ``"title"`` key are posts. The item is always returned unchanged so
        later pipelines still receive it.
        """
        if "userdata" in item.keys():
            self.userdata.append([item["name"], item["link"], item["img"]])
            return item

        if "title" in item.keys():
            # The author produced at least one post, so the friend is reachable.
            self.nonerror_data.add(item["author"])

            existing = self._existing_by_link.get(item.get("link"))
            if existing is not None:
                # Already stored: keep the earliest known creation date. The
                # stored document itself is written back by friendpoor_push.
                self._keep_earliest_created(existing, item.get("created"))
                return item

            self.friendpoor_save(item)

        return item

    @staticmethod
    def _keep_earliest_created(stored, crawled_created):
        """Set ``stored["created"]`` to the earlier of its value and the crawled one."""
        stored_created = stored.get("created")
        if not crawled_created:
            return
        if not stored_created:
            stored["created"] = crawled_created
            return
        try:
            stored["created"] = min(crawled_created, stored_created)
        except TypeError:
            # Values of different types cannot be ordered; keep the stored one.
            pass

    def close_spider(self, spider):
        """Write friends and posts back to MongoDB and print a summary."""
        try:
            count, error_num = self.friendlist_push()
            self.outdate_clean(settings.OUTDATE_CLEAN)
            num = self.friendpoor_push()
            print("----------------------")
            print("友链总数 : %d" % count)
            print("失联友链数 : %d" % error_num)
            print("共 %d 篇文章" % num)
            print("最后运行于:%s" % today)
            print("done!")
        finally:
            if self.client is not None:
                self.client.close()

    def outdate_clean(self, time_limit):
        """Drop posts whose ``updated`` date is older than ``time_limit`` days.

        Posts whose ``updated`` value is missing or not in ``YYYY-MM-DD`` form
        are dropped as well. Dropped entries are emptied in
        ``query_post_list`` (so ``friendpoor_push`` skips them) and, if they
        were already stored, deleted from the collection.

        Returns:
            int: Number of posts dropped.
        """
        now = datetime.today() + timedelta(hours=8)
        stale_ids = []
        out_date_post = 0
        for query_item in self.query_post_list:
            if not query_item:
                continue  # already emptied
            try:
                query_time = datetime.strptime(query_item.get("updated"), "%Y-%m-%d")
            except (TypeError, ValueError):
                outdated = True
            else:
                outdated = (now - query_time).days > time_limit
            if not outdated:
                continue
            if "_id" in query_item:
                # Only stored documents have an _id to delete.
                stale_ids.append(query_item["_id"])
            query_item.clear()
            out_date_post += 1

        if stale_ids:
            self.posts.delete_many({"_id": {"$in": stale_ids}})
        return out_date_post

    def friendlist_push(self):
        """Insert one document per collected friend into the ``Friend`` collection.

        A friend counts as reachable when a post by that name was crawled or
        when the link matches one of the ``settings.BLOCK_SITE`` patterns.

        Returns:
            tuple: ``(number of friends, number of unreachable friends)``.
        """
        friends = []
        error_num = 0
        for name, link, avatar in self.userdata:
            reachable = name in self.nonerror_data or any(
                re.match(pattern, link) for pattern in (settings.BLOCK_SITE or ())
            )
            if not reachable:
                print("请求失败,请检查链接: %s" % link)
                error_num += 1
            friends.append({
                "name": name,
                "link": link,
                "avatar": avatar,
                "createdAt": today,
                "error": not reachable,
            })
        if friends:
            # insert_many rejects an empty list, so skip it when nothing was crawled.
            self.friends.insert_many(friends)
        return len(friends), error_num

    def friendpoor_push(self):
        """Write every remaining post to the ``Post`` collection.

        Posts that already carry an ``_id`` replace their stored version (or
        are re-created if it is gone); new posts are inserted. This avoids
        duplicate-key failures for stored posts that are still present.

        Returns:
            int: Number of posts written.
        """
        post_list = [item for item in self.query_post_list if item]
        if not post_list:
            return 0

        # Local import: the module-level imports only bring in MongoClient.
        from pymongo import InsertOne, ReplaceOne

        operations = [
            ReplaceOne({"_id": post["_id"]}, post, upsert=True)
            if "_id" in post else InsertOne(post)
            for post in post_list
        ]
        self.posts.bulk_write(operations, ordered=False)
        return len(post_list)

    def friendpoor_save(self, item):
        """Stamp a newly crawled post, queue a copy for storage and log it."""
        item["createdAt"] = today
        self.query_post_list.append(item.copy())
        print("----------------------")
        print(item["author"])
        print("《{}》\n文章发布时间:{}\t\t采取的爬虫规则为:{}".format(item["title"], item["created"], item["rule"]))
Oops, something went wrong.