Skip to content

Commit

Permalink
release 4.3.0
Browse files Browse the repository at this point in the history
新增mongodb存储,和对应的api
新增api方式的配置项友链
修复过期文章清除不生效的问题
其它bug修复
  • Loading branch information
hiltay committed Feb 13, 2022
1 parent 3307d94 commit 0568a4c
Show file tree
Hide file tree
Showing 15 changed files with 354 additions and 74 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ env:
GITHUB_NAME: ${{ secrets.GH_NAME }} # 你的github昵称
GITHUB_EMAIL: ${{ secrets.GH_EMAIL }} # 你的github邮箱
GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} # github token


# mongodb配置
MONGODB_URI: ${{ secrets.MONGODB_URI }} # mongodb URI

jobs:
build:
Expand Down
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ ENV APPKEY=""
#ENV MYSQL_PASSWORD=""
#ENV MYSQL_IP=""
#ENV MYSQL_DB=""
### mongodb配置
#ENV MONGODB_URI=""
EXPOSE 8000
WORKDIR /
RUN cd ./hexo_circle_of_friends && pip3 install -r requirements.txt -i https://pypi.douban.com/simple/
Expand Down
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

⭐从4.1.3版本开始,一定要在配置项中配置友链页的获取策略
```
目前 release 4.2.6 版本:
目前 release 4.3.0 版本:
- 支持 gitee 和 github 上的 issue 友链获取
- 支持butterfly、volantis、matery、sakura、fluid、nexmoe、Yun、stun、stellar、next主题的友链和文章获取
- 支持feed订阅规则,如atom、rss等规则(支持wordpress类型的博客)
Expand All @@ -16,12 +16,13 @@
- 支持未适配的hexo主题和非hexo用户使用,在配置项选择开启配置项友链
- 额外的友链页同时爬取
- 支持添加HTTP代理
- 新增数据存储配置,提供多种存储方式
- 新增部署方式配置,可部署在本地服务端
- 多种数据存储,提供leancloud,mysql,sqlite,mongodb存储方式
- 多种方式部署,提供github,server,docker部署方式
- 将api整合到主仓库
- 新增友链获取策略的common规则
- 新增api方式的配置项友链
bug修复
bug修复和改动
- wordpress类型博客的时间格式问题
- butterfly主题友链页解析不再抓取背景图片了
- 修复了github和gitee对volantis主题的友链获取
Expand All @@ -36,5 +37,6 @@ bug修复:
- 移除旧订阅规则解析
- 修复butterfly的时间获取
- 额外友链页也可以配置获取策略
- 修复过期文章清除不生效的问题
```

2 changes: 2 additions & 0 deletions api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from api.leancloudapi import *
elif settings.DATABASE == "mysql" or settings.DATABASE == "sqlite":
from api.sqlapi import *
elif settings.DATABASE == "mongodb":
from api.mongodbapi import *

app = FastAPI()

Expand Down
122 changes: 122 additions & 0 deletions api/mongodbapi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# -*- coding:utf-8 -*-
# Author:yyyz
import os
import random

from urllib import parse
from hexo_circle_of_friends import settings
from pymongo import MongoClient


# Process-wide MongoDB client, created lazily on the first db_init() call.
# A MongoClient owns a connection pool, so building a new one for every API
# request (and never closing it) leaks connections.
_MONGO_CLIENT = None


def db_init():
    """Return the ``Post`` and ``Friend`` collections of the ``fcircle`` database.

    The connection URI is the hard-coded debug cluster when ``settings.DEBUG``
    is truthy, otherwise the value of the ``MONGODB_URI`` environment variable.
    The underlying client is created once and reused by later calls.

    Returns:
        A ``(posts, friends)`` tuple of collection handles.
    """
    global _MONGO_CLIENT
    if _MONGO_CLIENT is None:
        if settings.DEBUG:
            uri = "mongodb+srv://root:@cluster0.wgfbv.mongodb.net/myFirstDatabase?retryWrites=true&w=majority"
        else:
            uri = os.environ.get("MONGODB_URI")
        _MONGO_CLIENT = MongoClient(uri)
    db = _MONGO_CLIENT.fcircle
    return db.Post, db.Friend


def query_all(list, start: int = 0, end: int = -1, rule: str = "updated"):
    """Return a page of posts plus overall friend/post statistics.

    Args:
        list: Not used by this backend; kept so the signature stays unchanged.
        start: Zero-based index of the first post to return.
        end: Exclusive end index; ``-1`` means "up to the maximum", which is
            the smaller of the number of stored posts and 1000.
        rule: Field to sort by in descending order, ``"created"`` or
            ``"updated"``.

    Returns:
        A dict with ``statistical_data`` and ``article_data`` keys, or a
        ``{"message": ...}`` dict describing the invalid argument.
    """
    post_collection, friend_db_collection = db_init()
    article_num = post_collection.count_documents({})
    upper_bound = min(article_num, 1000)
    if end == -1:
        end = upper_bound
    if start < 0 or start >= upper_bound:
        return {"message": "start error"}
    # ``end`` must lie strictly after ``start``: limit(0) means "no limit" in
    # MongoDB and a negative limit is taken as an absolute value, so an empty
    # or reversed range would otherwise return the wrong page.
    if end <= start or end > upper_bound:
        return {"message": "end error"}
    if rule not in ("created", "updated"):
        return {"message": "rule error, please use 'created'/'updated'"}

    posts = (
        post_collection.find({}, {'_id': 0, "rule": 0})
        .sort([(rule, -1)])
        .skip(start)
        .limit(end - start)
    )
    last_update_time = "1970-01-01 00:00:00"
    post_data = []
    for offset, post in enumerate(posts):
        # createdAt is only used for the statistics block, not sent per post.
        last_update_time = max(last_update_time, post.pop("createdAt"))
        entry = {'floor': start + offset + 1}
        entry.update(post)
        post_data.append(entry)

    friends_num = friend_db_collection.count_documents({})
    active_num = friend_db_collection.count_documents({"error": False})

    return {
        'statistical_data': {
            'friends_num': friends_num,
            'active_num': active_num,
            'error_num': friends_num - active_num,
            'article_num': article_num,
            'last_updated_time': last_update_time,
        },
        'article_data': post_data,
    }


def query_friend():
    """Return every stored friend as a list of dicts.

    The ``_id``, ``createdAt`` and ``error`` fields are excluded.

    Returns:
        A list of friend documents, or ``{"message": "not found"}`` when the
        collection holds no friends.
    """
    _, friend_db_collection = db_init()
    cursor = friend_db_collection.find({}, {"_id": 0, "createdAt": 0, "error": 0})
    # A cursor object is always truthy, so emptiness has to be checked on the
    # materialised result rather than on the cursor itself.
    friend_list_json = [friend for friend in cursor]
    if not friend_list_json:
        return {"message": "not found"}
    return friend_list_json


def query_random_friend():
    """Return one randomly chosen friend document.

    The ``_id``, ``createdAt`` and ``error`` fields are excluded.

    Returns:
        A friend dict, or ``{"message": "not found"}`` when no friend can be
        fetched.
    """
    _, friend_db_collection = db_init()
    friends_num = friend_db_collection.count_documents({})
    if friends_num <= 0:
        # random.randint(0, -1) would raise ValueError on an empty collection.
        return {"message": "not found"}

    friends = friend_db_collection.find({}, {"_id": 0, "createdAt": 0, "error": 0})
    try:
        random_friend = friends[random.randint(0, friends_num - 1)]
    except IndexError:
        # The collection shrank between counting and fetching.
        return {"message": "not found"}

    return random_friend if random_friend else {"message": "not found"}


def query_random_post():
    """Return one randomly chosen post document.

    The ``_id``, ``rule`` and ``createdAt`` fields are excluded.

    Returns:
        A post dict, or ``{"message": "not found"}`` when no post can be
        fetched.
    """
    post_collection, _ = db_init()
    posts_num = post_collection.count_documents({})
    if posts_num <= 0:
        # random.randint(0, -1) would raise ValueError on an empty collection.
        return {"message": "not found"}

    posts = post_collection.find({}, {'_id': 0, "rule": 0, "createdAt": 0})
    try:
        random_post = posts[random.randint(0, posts_num - 1)]
    except IndexError:
        # The collection shrank between counting and fetching.
        return {"message": "not found"}

    return random_post if random_post else {"message": "not found"}


def query_post(link, num, rule):
    """Return one friend together with that friend's posts.

    Args:
        link: URL of the friend's site; its network location (domain) is what
            gets matched. ``None`` selects a random friend instead.
        num: Maximum number of posts to return; values ``<= 0`` mean no limit.
        rule: Field to sort by in descending order, ``"created"`` or
            ``"updated"``.

    Returns:
        ``{"statistical_data": friend, "article_data": posts}``, or a
        ``{"message": ...}`` dict when ``rule`` is invalid or no friend matches.
    """
    import re  # local import: only needed to escape the domain for $regex

    if rule != "created" and rule != "updated":
        return {"message": "rule error, please use 'created'/'updated'"}

    post_collection, friend_db_collection = db_init()

    if link is None:
        friend = query_random_friend()
        # query_random_friend() may hand back a {"message": ...} dict.
        friend_link = friend.get("link")
        if not friend_link:
            return {"message": "not found"}
        domain = parse.urlsplit(friend_link).netloc
    else:
        friend = None
        domain = parse.urlsplit(link).netloc

    if not domain:
        # An empty pattern would match every friend and every post.
        return {"message": "not found"}

    # The domain is caller-supplied text, not a pattern: escape it so that
    # "." and other metacharacters only match themselves.
    domain_pattern = re.escape(domain)

    if link is not None:
        friend = friend_db_collection.find_one(
            {'link': {'$regex': domain_pattern}},
            {"_id": 0, "createdAt": 0, "error": 0},
        )
    if not friend:
        return {"message": "not found"}

    posts = post_collection.find(
        {'link': {'$regex': domain_pattern}},
        {'_id': 0, "rule": 0, "createdAt": 0, "avatar": 0, "author": 0}
    ).sort([(rule, -1)]).limit(num if num > 0 else 0)

    data = []
    for floor, post in enumerate(posts, start=1):
        post["floor"] = floor
        data.append(post)

    friend["article_num"] = len(data)
    return {"statistical_data": friend, "article_data": data}


def query_post_json(jsonlink, list, start, end, rule):
    """Placeholder for the JSON-link query: always reports "not found".

    None of the arguments are read.
    """
    response = {"message": "not found"}
    return response
26 changes: 13 additions & 13 deletions hexo_circle_of_friends/pipelines/leancloud_pipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
# Author:yyyz
import os
import leancloud
import datetime
import re
from .. import settings
from datetime import datetime,timedelta
from datetime import datetime, timedelta

today = (datetime.now() + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')


class LeancloudPipeline:
def __init__(self):
self.userdata = []
Expand Down Expand Up @@ -48,17 +48,17 @@ def process_item(self, item, spider):
return item

if "title" in item.keys():
if item["name"] in self.nonerror_data:
if item["author"] in self.nonerror_data:
pass
else:
# 未失联的人
self.nonerror_data.add(item["name"])
self.nonerror_data.add(item["author"])

# print(item)
for query_item in self.query_post_list:
try:
if query_item.get("link") == item["link"]:
item["time"] = min(item['time'], query_item.get('created'))
item["created"] = min(item['created'], query_item.get('created'))
delete = self.Friendspoor.create_without_data(query_item.get('objectId'))
delete.destroy()
# print("----deleted %s ----"%item["title"])
Expand Down Expand Up @@ -106,10 +106,10 @@ def outdate_clean(self, time_limit):
out_date_post = 0
for query_i in self.query_post_list:

created = query_i.get('created')
updated = query_i.get('updated')
try:
query_time = datetime.datetime.strptime(created, "%Y-%m-%d")
if (datetime.datetime.today() - query_time).days > time_limit:
query_time = datetime.strptime(updated, "%Y-%m-%d")
if (datetime.today() + timedelta(hours=8) - query_time).days > time_limit:
delete = self.Friendspoor.create_without_data(query_i.get('objectId'))
out_date_post += 1
delete.destroy()
Expand Down Expand Up @@ -151,14 +151,14 @@ def friendlist_push(self):
def friendpoor_push(self, item):
friendpoor = self.Friendspoor()
friendpoor.set('title', item['title'])
friendpoor.set('created', item['time'])
friendpoor.set('created', item['created'])
friendpoor.set('updated', item['updated'])
friendpoor.set('link', item['link'])
friendpoor.set('author', item['name'])
friendpoor.set('avatar', item['img'])
friendpoor.set('author', item['author'])
friendpoor.set('avatar', item['avatar'])
friendpoor.set('rule', item['rule'])
friendpoor.save()
print("----------------------")
print(item["name"])
print("《{}》\n文章发布时间:{}\t\t采取的爬虫规则为:{}".format(item["title"], item["time"], item["rule"]))
print(item["author"])
print("《{}》\n文章发布时间:{}\t\t采取的爬虫规则为:{}".format(item["title"], item["created"], item["rule"]))
self.total_post_num += 1
142 changes: 142 additions & 0 deletions hexo_circle_of_friends/pipelines/mongodb_pipe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# -*- coding:utf-8 -*-
# Author:yyyz
import os
import re
from datetime import datetime, timedelta
from pymongo import MongoClient
from .. import settings

today = (datetime.now() + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')


class MongoDBPipeline:
    """Item pipeline that stores crawled friends and posts in MongoDB.

    Posts already in the ``Post`` collection are loaded when the spider opens.
    Crawled items either match a stored post by link, in which case only the
    earliest ``created`` value is kept, or are queued as new posts. When the
    spider closes, the friend list is written, outdated posts are removed and
    the queued new posts are inserted.
    """

    def __init__(self):
        self.userdata = []
        self.nonerror_data = set()  # authors for whom at least one post was crawled
        self.query_post_list = []
        # link -> stored post document, so each crawled item is matched with
        # one dict lookup instead of a scan over every stored post.
        self._stored_by_link = {}

    def open_spider(self, spider):
        """Connect to MongoDB, load the stored posts and empty the friend collection."""
        if settings.DEBUG:
            URI = "mongodb+srv://root:@cluster0.wgfbv.mongodb.net/myFirstDatabase?retryWrites=true&w=majority"
        else:
            URI = os.environ.get("MONGODB_URI")
        client = MongoClient(URI)
        db = client.fcircle
        self.posts = db.Post
        self.friends = db.Friend

        for post in self.posts.find():
            self.query_post_list.append(post)
            # setdefault keeps the first stored document for a given link.
            self._stored_by_link.setdefault(post.get("link"), post)
        self.query_post_num = len(self.query_post_list)

        self.friends.delete_many({})
        print("Initialization complete")

    def process_item(self, item, spider):
        """Collect a friend record, or merge/queue a crawled post.

        Items carrying a ``userdata`` key are friend records; items carrying a
        ``title`` key are posts. The item is always returned unchanged.
        """
        if "userdata" in item.keys():
            self.userdata.append([item["name"], item["link"], item["img"]])
            return item

        if "title" in item.keys():
            # The author produced at least one post, so the friend is reachable.
            self.nonerror_data.add(item["author"])

            link = item.get("link")
            stored = self._stored_by_link.get(link) if link is not None else None
            if stored is not None:
                self._keep_earliest_created(stored, item)
                return item

            self.friendpoor_save(item)

        return item

    def _keep_earliest_created(self, stored, item):
        """Lower the stored post's ``created`` value if the crawled one is earlier."""
        try:
            earliest = min(item["created"], stored.get("created"))
        except (KeyError, TypeError):
            # One side has no comparable value; leave the stored post as is.
            return
        if earliest == stored.get("created"):
            return
        stored["created"] = earliest
        # Update in place instead of deleting and re-inserting the document.
        self.posts.update_one({"_id": stored["_id"]}, {"$set": {"created": earliest}})

    def close_spider(self, spider):
        """Write friends, drop outdated posts, insert new posts and print a summary."""
        count, error_num = self.friendlist_push()
        self.outdate_clean(settings.OUTDATE_CLEAN)
        num = self.friendpoor_push()
        print("----------------------")
        print("友链总数 : %d" % count)
        print("失联友链数 : %d" % error_num)
        print("共 %d 篇文章" % num)
        print("最后运行于:%s" % today)
        print("done!")

    def outdate_clean(self, time_limit):
        """Remove posts whose ``updated`` date is more than ``time_limit`` days old.

        Posts whose ``updated`` value is missing or not in ``%Y-%m-%d`` format
        are removed as well. Removed entries are emptied in
        ``query_post_list`` so that they are not written back later.
        """
        now = datetime.today() + timedelta(hours=8)
        for query_item in self.query_post_list:
            if not query_item:
                continue
            try:
                query_time = datetime.strptime(query_item.get("updated"), "%Y-%m-%d")
            except (TypeError, ValueError):
                expired = True
            else:
                expired = (now - query_time).days > time_limit
            if not expired:
                continue
            post_id = query_item.get("_id")
            if post_id is not None:
                # Only posts that came from the database have an _id to delete.
                self.posts.delete_one({"_id": post_id})
            query_item.clear()

    def friendlist_push(self):
        """Insert every collected friend and return ``(total, unreachable)``.

        A friend counts as reachable when a post by that name was crawled or
        when the link matches one of the ``settings.BLOCK_SITE`` patterns.
        """
        friends = []
        error_num = 0
        for user in self.userdata:
            name, link, avatar = user[0], user[1], user[2]
            reachable = name in self.nonerror_data or bool(
                settings.BLOCK_SITE
                and any(re.match(url, link) for url in settings.BLOCK_SITE)
            )
            if not reachable:
                print("请求失败,请检查链接: %s" % link)
                error_num += 1
            friends.append({
                "name": name,
                "link": link,
                "avatar": avatar,
                "createdAt": today,
                "error": not reachable,
            })
        if friends:
            # insert_many() rejects an empty list.
            self.friends.insert_many(friends)
        return len(friends), error_num

    def friendpoor_push(self):
        """Insert the newly crawled posts and return the total number of kept posts."""
        post_list = [post for post in self.query_post_list if post]
        # Documents loaded from the database still exist there under the same
        # _id; inserting them again would fail with a duplicate-key error.
        new_posts = [post for post in post_list if "_id" not in post]
        if new_posts:
            self.posts.insert_many(new_posts)
        return len(post_list)

    def friendpoor_save(self, item):
        """Queue a crawled post for insertion and print a short summary of it."""
        item["createdAt"] = today
        self.query_post_list.append(item.copy())
        print("----------------------")
        print(item["author"])
        print("《{}》\n文章发布时间:{}\t\t采取的爬虫规则为:{}".format(item["title"], item["created"], item["rule"]))

0 comments on commit 0568a4c

Please sign in to comment.