release 4.1.3
hiltay committed Jan 30, 2022
1 parent 3b1b385 commit d27616d
Showing 6 changed files with 243 additions and 252 deletions.
14 changes: 8 additions & 6 deletions README.md
@@ -2,36 +2,38 @@

Do you often find yourself with too many friend links and no time to browse them? 友链朋友圈 (Friend-Links-Circle) solves this pain point: you can fetch the latest posts from your friends' sites at any time and see how active each friend is.

⭐ Starting from version 4.1.3, you must configure the friend-page fetch strategy in the settings.
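For example (this mirrors the `FRIENDPAGE_STRATEGY` block added to `hexo_circle_of_friends/settings.py` in this release; see the settings diff below):

```python
# Friend-page fetch strategy, required since 4.1.3
FRIENDPAGE_STRATEGY = {
    "strategy": "default",  # "default": your friend page uses a supported theme
    "theme": "butterfly"    # butterfly / fluid / matery / nexmoe / stun / sakura / volantis / Yun / stellar
}
```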
```
-Current release 4.1.2:
+Current release 4.1.3:
- Fetches friend links from gitee issues
- Fetches friend links from github issues
- Fetches the latest posts for the butterfly, volantis, matery, sakura, and fluid themes
- Added the most broadly applicable atom and rss rules
- Site blocking, enabled in the settings
- Code rebuilt on scrapy
- Supports sorting by update time or creation time
- Supports unadapted hexo themes and non-hexo users via settings-based friend links, enabled in the settings
- Supports crawling typecho blogs
- Added crawling for the nexmoe, Yun, and stun themes
- Supports crawling wordpress blogs
- Improved post deduplication rules
- Added simultaneous crawling of an extra friend-link page, enabled in the settings
- Added crawling for the stellar theme
- Supports an HTTP proxy, enabled in the settings
- Added settings-based friend-link options: custom subscription suffix and parser type
- Logic refactored; added the friend-page fetch-strategy setting
Bug fixes:
- Time-format issue for wordpress blogs
- The butterfly friend-page parser no longer picks up background images
- Fixed friend-link fetching for the volantis theme via github and gitee
- Blocked sites no longer count toward the lost-contact tally
- Fixed occasional errors with the sakura and nexmoe themes
- The Yun theme's external JSON friend links can now be fetched
```

# Version updates

-After a new release, you only need to click fetch in your forked repository to update to the latest version.
+Versions 4.0 and later: after a new release, you only need to click fetch in your forked repository to update to the latest version.
![img.png](img.png)
Upgrading from a version before 4.0: re-forking is recommended.

If you like this project, please give it a ⭐Star; it supports us and lets you follow updates to 友链朋友圈 at any time.

17 changes: 14 additions & 3 deletions hexo_circle_of_friends/run.py
@@ -1,5 +1,16 @@
-from scrapy.cmdline import execute
+from scrapy.utils.project import get_project_settings
+from scrapy.crawler import CrawlerProcess

+def main():
+    setting = get_project_settings()
+    process = CrawlerProcess(setting)
+    didntWorkSpider = ['xiaoso',]
+    for spider_name in process.spiders.list():
+        if spider_name in didntWorkSpider:
+            continue
+        # print("Running spider %s" % (spider_name))
+        process.crawl(spider_name)
+    process.start()

-# execute(['scrapy','crawl','test'])
-execute(['scrapy','crawl','hexo_circle_of_friends'])
+if __name__ == '__main__':
+    main()
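`run.py` now iterates over every spider registered in the project and skips the names listed in `didntWorkSpider`, instead of launching a single hard-coded spider via `scrapy.cmdline.execute`. A minimal sketch of the same pattern; note that newer Scrapy releases expose the loader as `spider_loader` (the `spiders` attribute used above is a deprecated alias):

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_all(skip=("xiaoso",)):
    """Run every spider in the project except the ones named in skip."""
    process = CrawlerProcess(get_project_settings())
    for name in process.spider_loader.list():  # all registered spider names
        if name in skip:                       # exclude known-broken spiders
            continue
        process.crawl(name)
    process.start()                            # blocks until all crawls finish

if __name__ == "__main__":
    run_all()
```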
28 changes: 25 additions & 3 deletions hexo_circle_of_friends/settings.py
@@ -78,13 +78,33 @@



-################################ The following may be modified ################################
-# leancloud post data outdate_clean
+################################ Please modify the following ################################
+# outdate_clean
+# purge posts older than this many days
OUTDATE_CLEAN = 60
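`OUTDATE_CLEAN` is a retention window in days. A sketch of the intended check, with a hypothetical helper name and the `YYYY-MM-DD` format assumed from the parsers below:

```python
from datetime import datetime, timedelta

def is_outdated(updated, days=60):
    """Hypothetical helper: True if a post's update date is older than the window."""
    return datetime.strptime(updated, "%Y-%m-%d") < datetime.now() - timedelta(days=days)
```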

+# Friend-page fetch strategy
+# Required since version 4.1.3 so that the program can parse your theme precisely
+# Parameters:
+# strategy: required; one of:
+#   - default: specify the theme your friend page uses. Example: if your friend page is https://www.yyyzyyyz.cn/link/, choose butterfly, and so on
+#   - incompat: choose this if your theme is not in the list below; settings-based friend links are recommended in that case
+# theme: required; one of the currently supported themes:
+#   - butterfly
+#   - fluid
+#   - matery
+#   - nexmoe
+#   - stun
+#   - sakura
+#   - volantis
+#   - Yun
+#   - stellar
+FRIENDPAGE_STRATEGY = {
+    "strategy": "default",
+    "theme": "butterfly"  # change this to your theme
+}
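The other documented mode, sketched here under the assumption that `theme` is simply not consulted when `strategy` is `"incompat"` (the comments above only say settings-based friend links are recommended then):

```python
FRIENDPAGE_STRATEGY = {
    "strategy": "incompat",  # friend-page theme is not in the supported list
    "theme": "butterfly"     # assumed to be ignored in this mode
}
```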

# get links from settings
+# settings-based friend links
# Format: ["name", "link", "avatar", "suffix", "rules"],
# Parameters:
# name: required; the friend's name
@@ -107,6 +127,7 @@


# get links from gitee
+# fetch friend links from gitee issues
GITEE_FRIENDS_LINKS={
    "enable": False,  # True enables gitee issue compatibility
    "type": "normal",  # volantis/stellar users: put "volantis" here
@@ -117,6 +138,7 @@


# get links from github
+# fetch friend links from github issues
GITHUB_FRIENDS_LINKS = {
    "enable": False,  # True enables github issue compatibility
    "type": "normal",  # volantis/stellar users: put "volantis" here
112 changes: 49 additions & 63 deletions hexo_circle_of_friends/spiders/hexo_circle_of_friends.py
@@ -1,14 +1,13 @@
# -*- coding:utf-8 -*-

import calendar
import datetime
import time
import scrapy
import queue
from scrapy.http.request import Request
from hexo_circle_of_friends import settings
from bs4 import BeautifulSoup
-from hexo_circle_of_friends.utils.get_theme_url import *
+from hexo_circle_of_friends.utils.get_url import get_theme_url,Yun_async_link_handler
from hexo_circle_of_friends.utils.regulations import *
import sys

@@ -50,7 +49,7 @@ def start_requests(self):
"state"] + '&page=' + str(number)
yield Request(url, callback=self.friend_poor_parse, meta={"github": {"domain": domain}})
if settings.DEBUG:
friendpage_link = settings.FRIENPAGE_LINK
friendpage_link = settings.FRIENDPAGE_LINK
else:
friendpage_link = []
friendpage_link.append(sys.argv[3])
@@ -109,30 +108,16 @@ def friend_poor_parse(self, response):
            pass

        if "theme" in response.meta.keys():
-            user_info = []
-            link = get_link_url(response)
-            avatar = get_avatar_url(response)
-            name = get_name_url(response)
-            # print(link)
-            # print(avatar)
-            # print(name)
-            if len(link) == len(avatar) == len(name):
-                for i in range(len(link)):
-                    if link[i] == "":
-                        continue
-                    user_info.append(name[i])
-                    user_info.append(link[i])
-                    user_info.append(avatar[i])
-                    self.friend_poor.put(user_info)
-                    user_info = []
-                    # print("""------------------------\n
-                    # name:%s
-                    # avatar:%s
-                    # link_list:%s
-                    # """%(name[i],avatar[i],link[i]))
-            # print("total:%d"%i)
-
-            # print(self.friend_poor)
+            if settings.FRIENDPAGE_STRATEGY["strategy"] == "default":
+                theme = settings.FRIENDPAGE_STRATEGY["theme"]
+                async_link = get_theme_url(theme, response, self.friend_poor)
+                if async_link:
+                    # temporary workaround for the Yun theme's async_link
+                    yield Request(async_link, callback=self.friend_poor_parse, meta={"async_link": async_link}, dont_filter=True)
+            else:
+                pass
+        if "async_link" in response.meta.keys():
+            Yun_async_link_handler(response, self.friend_poor)

        # to add support for another theme, add a request here
        while not self.friend_poor.empty():
@@ -230,7 +215,7 @@ def post_rss2_parse(self, response):
        link = sel.css("item guid::text").extract()
        pubDate = sel.css("item pubDate::text").extract()
        if len(link) > 0:
-            l = len(title) if len(title) < 5 else 5
+            l = len(link) if len(link) < 5 else 5
            try:
                for i in range(l):
                    m = pubDate[i].split(" ")
@@ -259,7 +244,7 @@ def post_wordpress_parse(self, response):
        link = [comm.split("#comments")[0] for comm in sel.css("item link+comments::text").extract()]
        pubDate = sel.css("item pubDate::text").extract()
        if len(link) > 0:
-            l = len(title) if len(title) < 5 else 5
+            l = len(link) if len(link) < 5 else 5
            try:
                for i in range(l):
                    m = pubDate[i].split(" ")
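Both feed parsers now bound the loop by the number of extracted links rather than titles, avoiding an IndexError when a feed yields fewer `<guid>`/`<comments>` entries than `<title>` entries. The bound is equivalent to a `min`, as in this sketch with hypothetical data:

```python
links = ["https://example.com/a", "https://example.com/b"]  # hypothetical extracted <guid> values
for i in range(min(len(links), 5)):  # parse at most the 5 newest items
    print(links[i])
```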
@@ -420,29 +405,29 @@ def theme_sakura_parse(self, response):
        main_content = soup.find_all(id='main')
        time_excit = soup.find_all('div', {"class": "post-date"})
        if main_content and time_excit:
-            link_list = main_content[0].find_all('div', {"class": "post-date"})
-            lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d")
-            for index, item in enumerate(link_list):
-                date = item.text
-                date = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", date).group(0)
-                if lasttime < datetime.datetime.strptime(date, "%Y-%m-%d"):
-                    lasttime = datetime.datetime.strptime(date, "%Y-%m-%d")
-            lasttime = lasttime.strftime('%Y-%m-%d')
-            # print('latest time:', lasttime)
-            last_post_list = main_content[0].find_all('article', {"class": "post"})
-            for item in last_post_list:
-                time_created = item.find('div', {"class": "post-date"}).text.strip()
-                time_created = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", time_created).group(0)
-                time_created = datetime.datetime.strptime(time_created, "%Y-%m-%d").strftime("%Y-%m-%d")
-                if time_created == lasttime:
-                    a = item.find('a')
-                    alink = a['href']
-                    alinksplit = alink.split("/", 1)
-                    stralink = alinksplit[1].strip()
-                    if link[-1] != '/':
-                        link = link + '/'
-                    link = link.split('/')[0]
-                    try:
+            try:
+                link_list = main_content[0].find_all('div', {"class": "post-date"})
+                lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d")
+                for index, item in enumerate(link_list):
+                    date = item.text
+                    date = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", date).group(0)
+                    if lasttime < datetime.datetime.strptime(date, "%Y-%m-%d"):
+                        lasttime = datetime.datetime.strptime(date, "%Y-%m-%d")
+                lasttime = lasttime.strftime('%Y-%m-%d')
+                # print('latest time:', lasttime)
+                last_post_list = main_content[0].find_all('article', {"class": "post"})
+                for item in last_post_list:
+                    time_created = item.find('div', {"class": "post-date"}).text.strip()
+                    time_created = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", time_created).group(0)
+                    time_created = datetime.datetime.strptime(time_created, "%Y-%m-%d").strftime("%Y-%m-%d")
+                    if time_created == lasttime:
+                        a = item.find('a')
+                        alink = a['href']
+                        alinksplit = alink.split("/", 1)
+                        stralink = alinksplit[1].strip()
+                        if link[-1] != '/':
+                            link = link + '/'
+                        link = link.split('/')[0]
                        post_info = {
                            'title': item.find('h3').text.strip(),
                            'time': lasttime,
@@ -453,8 +438,8 @@ def theme_sakura_parse(self, response):
                            'rule': "sakura"
                        }
                        yield post_info
-                    except:
-                        pass
+            except:
+                pass
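The sakura parser's whole body is now wrapped in a single try/except, so a page missing the expected `.post-date` markup skips that site instead of raising; this is the "occasional sakura and nexmoe errors" fix from the release notes. A minimal sketch of the guard pattern, with hypothetical names:

```python
import re

def latest_date(texts):
    """Hypothetical helper: newest YYYY-M-D date found in texts, or None on malformed markup."""
    try:
        dates = [re.search(r"\d{4}-\d{1,2}-\d{1,2}", t).group(0) for t in texts]
        return max(dates, key=lambda s: tuple(int(p) for p in s.split("-")))
    except Exception:  # e.g. re.search returned None: skip this site, don't crash the spider
        return None
```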

    def theme_volantis_parse(self, response):
        # print("theme_volantis_parse---------->" + response.url)
@@ -510,13 +495,14 @@ def theme_nexmoe_parse(self, response):
        partial_l = response.css("section.nexmoe-posts .nexmoe-post>a::attr(href)").extract()
        title = response.css("section.nexmoe-posts .nexmoe-post h1::text").extract()
        date = response.css("section.nexmoe-posts .nexmoe-post-meta a:first-child::text").extract()
-        if len(partial_l) == len(title) == len(date):
-            for i in range(len(partial_l)):
-                partial_l[i] = partial_l[i].lstrip("/")
-                r = re.split(r"[年月日]", date[i])
-                y, m, d = r[0], r[1], r[2]
-                date = y + "-" + m + "-" + d
-                try:
+        if len(partial_l) > 0:
+            try:
+                l = len(partial_l) if len(partial_l) < 5 else 5
+                for i in range(l):
+                    partial_l[i] = partial_l[i].lstrip("/")
+                    r = re.split(r"[年月日]", date[i])
+                    y, m, d = r[0], r[1], r[2]
+                    date = y + "-" + m + "-" + d
                    post_info = {
                        'title': title[i],
                        'time': date,
@@ -527,8 +513,8 @@ def theme_nexmoe_parse(self, response):
                        'rule': "nexmoe"
                    }
                    yield post_info
-                except:
-                    pass
+            except:
+                pass
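One caveat that survives this commit: `date = y + "-" + m + "-" + d` rebinds `date`, which until then held the extracted list, so `date[i]` on the next iteration indexes into a string, raises, and is swallowed by the except, leaving only the first post yielded when a page lists several. A hypothetical fix (not part of this commit) keeps the list and the formatted string separate:

```python
import re

dates = ["2022年1月30日", "2022年1月12日"]  # hypothetical extracted values
for i in range(min(len(dates), 5)):
    y, m, d = re.split(r"[年月日]", dates[i])[:3]
    post_date = "%s-%s-%s" % (y, m, d)  # does not clobber the dates list
    print(post_date)
```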

    def theme_Yun_parse(self, response):
        # print("theme_Yun_parse---------->" + response.url)
