diff --git a/README.md b/README.md index cb2fb38437ce..552d80239b7f 100644 --- a/README.md +++ b/README.md @@ -2,36 +2,38 @@ 你是否经常烦恼于友链过多但没有时间浏览?那么友链朋友圈将解决这一痛点。你可以随时获取友链网站的更新内容,并了解友链的活跃情况 。 +⭐从4.1.3版本开始,一定要在配置项中配置友链页的获取策略 ``` -目前 release 4.1.2 版本: +目前 release 4.1.3 版本: - 支持 gitee 上的 issuse 友链获取 - 支持 github 上的 issuse 友链获取 - 支持 butterfly、volantis、matery、sakura、fluid主题的最新文章获取 - 新增目前最通用的atom和rss规则 - 支持站点屏蔽,在配置项选择开启 -- 代码基于scrapy重构 - 支持更新时间和创建时间排序 - 支持未适配的hexo主题和非hexo用户使用,在配置项选择开启配置项友链 -- 支持爬取typecho类型的博客 -- 新增对nexmoe、Yun、stun主题的爬取 - 支持爬取wordpress类型的博客 -- 优化文章去重规则 +- 新增对nexmoe、Yun、stun主题的爬取 - 新增额外的友链页同时爬取,在配置项选择开启 - 新增对stellar主题的爬取 - 支持添加HTTP代理,在配置项选择开启 - 新增配置项友链选项,自定义订阅后缀和解析类型 +- 逻辑重构,新增友链页获取策略配置 bug修复: - wordpress类型博客的时间格式问题 - butterfly主题友链页解析不再抓取背景图片了 - 修复了github和gitee对volantis主题的友链获取 - 屏蔽站点现在不计入失联数 +- 修复了sakura主题和nexmoe主题偶尔报错的问题 +- 现在可以获取Yun主题的外置JSON友链 ``` # 版本更新 -发布新版本后,您只需要在您fork的仓库点击fetch即可更新到最新版本。 +4.0以后的版本:发布新版本后,您只需要在您fork的仓库点击fetch即可更新到最新版本。 ![img.png](img.png) +4.0以前的版本升级:建议重新fork 如果觉得本项目不错,请帮忙点个⭐Star,既是对我们的支持,还可以随时关注友链朋友圈的更新情况。 diff --git a/hexo_circle_of_friends/run.py b/hexo_circle_of_friends/run.py index 71bcfec483f6..dd669633cf84 100644 --- a/hexo_circle_of_friends/run.py +++ b/hexo_circle_of_friends/run.py @@ -1,5 +1,16 @@ -from scrapy.cmdline import execute +from scrapy.utils.project import get_project_settings +from scrapy.crawler import CrawlerProcess +def main(): + setting = get_project_settings() + process = CrawlerProcess(setting) + didntWorkSpider = ['xiaoso',] + for spider_name in process.spiders.list(): + if spider_name in didntWorkSpider : + continue + # print("Running spider %s" % (spider_name)) + process.crawl(spider_name) + process.start() -# execute(['scrapy','crawl','test']) -execute(['scrapy','crawl','hexo_circle_of_friends']) \ No newline at end of file +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/hexo_circle_of_friends/settings.py b/hexo_circle_of_friends/settings.py index 23fb4194110f..e7aa7a78356b 100644 --- a/hexo_circle_of_friends/settings.py +++ b/hexo_circle_of_friends/settings.py @@ -78,13 +78,33 @@ -################################以下可以修改################################ -# leancloud post data outdate_clean +################################请修改以下内容################################ +# outdate_clean # 过期文章清除(天) OUTDATE_CLEAN = 60 +# 友链页的获取策略 +# 从4.1.3版本开始,为了程序更精准解析您的主题,需要配置此项 +# 参数说明: +# strategy:必填,可选参数如下: +# - default:默认。指定友链页主题。示例:如果您的友链页为https://www.yyyzyyyz.cn/link/,请选择butterfly,以此类推 +# - incompat:如果theme中不支持您的主题,请选择此项。此时建议使用配置项友链 +# theme:必填,可选参数如下(这些是目前支持的主题): +# - butterfly:butterfly主题 +# - fluid:fluid主题 +# - matery:matery主题 +# - nexmoe:nexmoe主题 +# - stun:stun主题 +# - sakura: sakura主题 +# - volantis:volantis主题 +# - Yun:Yun主题 +# - stellar:stellar主题 +FRIENDPAGE_STRATEGY={ + "strategy": "default", + "theme": "butterfly" # 请修改为您的主题 +} -# get links from settings +# 配置项友链 # 格式:["name", "link", "avatar","suffix","rules"], # 参数说明: # name:必填,友链的名字 @@ -107,6 +127,7 @@ # get links from gitee +# 从gitee issue中获取友链 GITEE_FRIENDS_LINKS={ "enable": False, # True 开启gitee issue兼容 "type": "normal", # volantis/stellar用户请在这里填写volantis @@ -117,6 +138,7 @@ # get links from github +# 从github issue中获取友链 GITHUB_FRIENDS_LINKS = { "enable": False, # True 开启github issue兼容 "type": "normal", # volantis/stellar用户请在这里填写volantis diff --git a/hexo_circle_of_friends/spiders/hexo_circle_of_friends.py b/hexo_circle_of_friends/spiders/hexo_circle_of_friends.py index 134696235cbf..11866445a529 100644 --- a/hexo_circle_of_friends/spiders/hexo_circle_of_friends.py +++ b/hexo_circle_of_friends/spiders/hexo_circle_of_friends.py @@ -1,6 +1,5 @@ # -*- coding:utf-8 -*- -import calendar import datetime import time import scrapy @@ -8,7 +7,7 @@ from scrapy.http.request import Request from hexo_circle_of_friends import settings from bs4 import BeautifulSoup -from hexo_circle_of_friends.utils.get_theme_url import * +from hexo_circle_of_friends.utils.get_url import get_theme_url,Yun_async_link_handler from hexo_circle_of_friends.utils.regulations import * import sys @@ -50,7 +49,7 @@ def start_requests(self): "state"] + '&page=' + str(number) yield Request(url, callback=self.friend_poor_parse, meta={"github": {"domain": domain}}) if settings.DEBUG: - friendpage_link = settings.FRIENPAGE_LINK + friendpage_link = settings.FRIENDPAGE_LINK else: friendpage_link = [] friendpage_link.append(sys.argv[3]) @@ -109,30 +108,16 @@ def friend_poor_parse(self, response): pass if "theme" in response.meta.keys(): - user_info = [] - link = get_link_url(response) - avatar = get_avatar_url(response) - name = get_name_url(response) - # print(link) - # print(avatar) - # print(name) - if len(link) == len(avatar) == len(name): - for i in range(len(link)): - if link[i] == "": - continue - user_info.append(name[i]) - user_info.append(link[i]) - user_info.append(avatar[i]) - self.friend_poor.put(user_info) - user_info = [] - # print("""------------------------\n - # name:%s - # avatar:%s - # link_list:%s - # """%(name[i],avatar[i],link[i])) - # print("total:%d"%i) - - # print(self.friend_poor) + if settings.FRIENDPAGE_STRATEGY["strategy"] =="default": + theme = settings.FRIENDPAGE_STRATEGY["theme"] + async_link = get_theme_url(theme,response,self.friend_poor) + if async_link: + # Yun主题的async_link临时解决 + yield Request(async_link,callback=self.friend_poor_parse,meta={"async_link":async_link},dont_filter=True) + else: + pass + if "async_link" in response.meta.keys(): + Yun_async_link_handler(response,self.friend_poor) # 要添加主题扩展,在这里添加一个请求 while not self.friend_poor.empty(): @@ -230,7 +215,7 @@ def post_rss2_parse(self, response): link = sel.css("item guid::text").extract() pubDate = sel.css("item pubDate::text").extract() if len(link)>0: - l = len(title) if len(title) < 5 else 5 + l = len(link) if len(link) < 5 else 5 try: for i in range(l): m = pubDate[i].split(" ") @@ -259,7 +244,7 @@ def post_wordpress_parse(self, response): link = [comm.split("#comments")[0] for comm in sel.css("item link+comments::text").extract()] pubDate = sel.css("item pubDate::text").extract() if len(link)>0: - l = len(title) if len(title) < 5 else 5 + l = len(link) if len(link) < 5 else 5 try: for i in range(l): m = pubDate[i].split(" ") @@ -420,29 +405,29 @@ def theme_sakura_parse(self, response): main_content = soup.find_all(id='main') time_excit = soup.find_all('div', {"class": "post-date"}) if main_content and time_excit: - link_list = main_content[0].find_all('div', {"class": "post-date"}) - lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d") - for index, item in enumerate(link_list): - date = item.text - date = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", date).group(0) - if lasttime < datetime.datetime.strptime(date, "%Y-%m-%d"): - lasttime = datetime.datetime.strptime(date, "%Y-%m-%d") - lasttime = lasttime.strftime('%Y-%m-%d') - # print('最新时间是', lasttime) - last_post_list = main_content[0].find_all('article', {"class": "post"}) - for item in last_post_list: - time_created = item.find('div', {"class": "post-date"}).text.strip() - time_created = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", time_created).group(0) - time_created = datetime.datetime.strptime(time_created, "%Y-%m-%d").strftime("%Y-%m-%d") - if time_created == lasttime: - a = item.find('a') - alink = a['href'] - alinksplit = alink.split("/", 1) - stralink = alinksplit[1].strip() - if link[-1] != '/': - link = link + '/' - link = link.split('/')[0] - try: + try: + link_list = main_content[0].find_all('div', {"class": "post-date"}) + lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d") + for index, item in enumerate(link_list): + date = item.text + date = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", date).group(0) + if lasttime < datetime.datetime.strptime(date, "%Y-%m-%d"): + lasttime = datetime.datetime.strptime(date, "%Y-%m-%d") + lasttime = lasttime.strftime('%Y-%m-%d') + # print('最新时间是', lasttime) + last_post_list = main_content[0].find_all('article', {"class": "post"}) + for item in last_post_list: + time_created = item.find('div', {"class": "post-date"}).text.strip() + time_created = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", time_created).group(0) + time_created = datetime.datetime.strptime(time_created, "%Y-%m-%d").strftime("%Y-%m-%d") + if time_created == lasttime: + a = item.find('a') + alink = a['href'] + alinksplit = alink.split("/", 1) + stralink = alinksplit[1].strip() + if link[-1] != '/': + link = link + '/' + link = link.split('/')[0] post_info = { 'title': item.find('h3').text.strip(), 'time': lasttime, @@ -453,8 +438,8 @@ def theme_sakura_parse(self, response): 'rule': "sakura" } yield post_info - except: - pass + except: + pass def theme_volantis_parse(self, response): # print("theme_volantis_parse---------->" + response.url) @@ -510,13 +495,14 @@ def theme_nexmoe_parse(self, response): partial_l = response.css("section.nexmoe-posts .nexmoe-post>a::attr(href)").extract() title = response.css("section.nexmoe-posts .nexmoe-post h1::text").extract() date = response.css("section.nexmoe-posts .nexmoe-post-meta a:first-child::text").extract() - if len(partial_l) == len(title) == len(date): - for i in range(len(partial_l)): - partial_l[i] = partial_l[i].lstrip("/") - r = re.split(r"[年月日]", date[i]) - y, m, d = r[0], r[1], r[2] - date = y + "-" + m + "-" + d - try: + if len(partial_l)>0: + try: + l = len(partial_l) if len(partial_l) < 5 else 5 + for i in range(l): + partial_l[i] = partial_l[i].lstrip("/") + r = re.split(r"[年月日]", date[i]) + y, m, d = r[0], r[1], r[2] + date = y + "-" + m + "-" + d post_info = { 'title': title[i], 'time': date, @@ -527,8 +513,8 @@ def theme_nexmoe_parse(self, response): 'rule': "nexmoe" } yield post_info - except: - pass + except: + pass def theme_Yun_parse(self, response): # print("theme_Yun_parse---------->" + response.url) diff --git a/hexo_circle_of_friends/utils/get_theme_url.py b/hexo_circle_of_friends/utils/get_theme_url.py deleted file mode 100644 index 97a79461aa94..000000000000 --- a/hexo_circle_of_friends/utils/get_theme_url.py +++ /dev/null @@ -1,177 +0,0 @@ -# 获取朋友列表的url、name、avatar -# 每个函数负责一个功能 -def get_avatar_url(response): - # ------------- butterfly begin --------------- # - avatar = response.css('.flink-list .info img::attr(data-lazy-src)').extract() - if not avatar: - avatar = response.css('.flink-list a img::attr(data-lazy-src)').extract() - if not avatar: - avatar = response.css('.flink-list a .info img::attr(src)').extract() - if not avatar: - avatar = response.css('.flink-list a img::attr(src)').extract() - if not avatar: - avatar = response.css('.flink .site-card .info img::attr(data-lazy-src)').extract() - # ------------- butterfly end --------------- # - - # ------------- fluid begin --------------- # - if not avatar: - avatar = response.css('.link-avatar img::attr(src)').extract() - # ------------- fluid end --------------- # - - # ------------- matery begin --------------- # - if not avatar: - avatar = response.css('#friends-link .friend-div img::attr(src)').extract() - - # ------------- sakura begin --------------- # - if not avatar: - avatar = response.css('.link-item img::attr(src)').extract() - # ------------- sakura end --------------- # - - - # ------------- volantis begin --------------- # - if not avatar: - avatar = response.css('a.simpleuser img::attr(src)').extract() - if not avatar: - avatar = response.css('a.site-card img::attr(src)').extract() - if not avatar: - avatar = response.css('a.friend-card img::attr(src)').extract() - # ------------- volantis end --------------- # - - # ------------- nexmoe begin --------------- # - if not avatar: - avatar = response.css('.nexmoe-py ul img::attr(data-src)').extract() - # ------------- nexmoe end --------------- # - - # ------------- Yun begin --------------- # - if not avatar: - avatar = response.css('#links a img::attr(src)').extract() - # ------------- Yun end --------------- # - - # ------------- stun begin --------------- # - if not avatar: - avatar = response.css('.friends-plugin__item img::attr(data-src)').extract() - # ------------- stun end --------------- # - - # ------------- stellar begin --------------- # - if not avatar: - avatar = response.css('.card-link img::attr(data-src)').extract() - # ------------- stellar end --------------- # - # print(avatar) - return avatar - - -def get_link_url(response): - # ------------- butterfly begin --------------- # - link = response.css('.flink-list a::attr(href)').extract() - if not link: - link = response.css('.flink .site-card::attr(href)').extract() - # ------------- butterfly end --------------- # - - # ------------- fluid begin --------------- # - if not link: - link = response.css('.row.links a::attr(href)').extract() - # ------------- fluid end --------------- # - - # ------------- matery begin --------------- # - if not link: - link = response.css('#friends-link .friend-button>a::attr(href)').extract() - # ------------- matery end --------------- # - - # ------------- sakura begin --------------- # - if not link: - link = response.css('.link-item a::attr(href)').extract() - # ------------- sakura end --------------- # - - # ------------- volantis begin --------------- # - if not link: - link = response.css('a.simpleuser::attr(href)').extract() - if not link: - link = response.css('a.site-card::attr(href)').extract() - if not link: - link = response.css('a.friend-card::attr(href)').extract() - # ------------- volantis end --------------- # - - # ------------- nexmoe begin --------------- # - if not link: - link = response.css('.nexmoe-py ul a::attr(href)').extract() - # ------------- nexmoe end --------------- # - - - # ------------- Yun begin --------------- # - if not link: - link = response.css('#links a::attr(href)').extract() - # ------------- Yun end --------------- # - - # ------------- stun begin --------------- # - if not link: - link = response.css('.friends-plugin__item::attr(href)').extract() - # ------------- stun end --------------- # - - # ------------- stellar begin --------------- # - if not link: - link = response.css('.card-link::attr(href)').extract() - # ------------- stellar end --------------- # - # print(link) - return link - - -def get_name_url(response): - # ------------- butterfly begin --------------- # - name = response.css('.flink-list .flink-sitename::text').extract() - if not name: - name = response.css('.flink-list a .flink-item-name::text').extract() - if not name: - name = response.css('.flink .site-card .info .title::text').extract() - # ------------- butterfly end --------------- # - - # ------------- fluid begin --------------- # - if not name: - name = response.css('.link-title::text').extract() - # ------------- fluid end --------------- # - - # ------------- matery begin --------------- # - if not name: - name = response.css('#friends-link .friend-name::text').extract() - # ------------- matery end --------------- # - - # ------------- sakura begin --------------- # - if not name: - name = response.css('.link-item .sitename::text').extract() - if name: - for i ,n in enumerate(name): - name[i ] =name[i].strip("\n ") - # ------------- sakura end --------------- # - - # ------------- volantis begin --------------- # - if not name: - name = response.css('a.simpleuser span::text').extract() - if not name: - name = response.css('a.site-card span::text').extract() - if not name: - name = response.css('a.friend-card span::text').extract() - if not name: - name = response.css('a.friend-card p::text').extract() - # ------------- volantis end --------------- # - - - # ------------- nexmoe begin --------------- # - if not name: - name = response.css('.nexmoe-py ul a::attr(title)').extract() - # ------------- nexmoe end --------------- # - - # ------------- Yun begin --------------- # - if not name: - name = response.css('#links a::attr(title)').extract() - # ------------- Yun end --------------- # - - # ------------- stun begin --------------- # - if not name: - name = response.css('.friends-plugin__item-info__name::attr(title)').extract() - # ------------- stun end --------------- # - - # ------------- stellar begin --------------- # - if not name: - name = response.css('.card-link span::text').extract() - # ------------- stellar end --------------- # - # print(name) - return name \ No newline at end of file diff --git a/hexo_circle_of_friends/utils/get_url.py b/hexo_circle_of_friends/utils/get_url.py new file mode 100644 index 000000000000..f7c7aade1f3a --- /dev/null +++ b/hexo_circle_of_friends/utils/get_url.py @@ -0,0 +1,147 @@ +from scrapy import Request +import json +def get_theme_url(theme,response,queue): + # 根据主题获取要爬取的的友链列表,保存到user_info中 + if theme == "butterfly": + get_butterfly_url(response,queue) + if theme == "fluid": + get_fluid_url(response,queue) + if theme == "matery": + get_matery_url(response,queue) + if theme == "nexmoe": + get_nexmoe_url(response,queue) + if theme == "stun": + get_stun_url(response,queue) + if theme == "sakura": + get_sakura_url(response,queue) + if theme == "volantis": + get_volantis_url(response,queue) + if theme == "Yun": + async_link = get_Yun_url(response,queue) + return async_link + if theme == "stellar": + get_stellar_url(response,queue) + + +def get_butterfly_url(response,queue): + avatar = response.css('.flink-list .info img::attr(data-lazy-src)').extract() + if not avatar: + avatar = response.css('.flink-list a img::attr(data-lazy-src)').extract() + if not avatar: + avatar = response.css('.flink-list a .info img::attr(src)').extract() + if not avatar: + avatar = response.css('.flink-list a img::attr(src)').extract() + if not avatar: + avatar = response.css('.flink .site-card .info img::attr(data-lazy-src)').extract() + + link = response.css('.flink-list a::attr(href)').extract() + if not link: + link = response.css('.flink .site-card::attr(href)').extract() + + name = response.css('.flink-list .flink-sitename::text').extract() + if not name: + name = response.css('.flink-list a .flink-item-name::text').extract() + if not name: + name = response.css('.flink .site-card .info .title::text').extract() + handle(avatar,link,name,queue) + +def get_fluid_url(response,queue): + avatar = response.css('.link-avatar img::attr(src)').extract() + link = response.css('.row.links a::attr(href)').extract() + name = response.css('.link-title::text').extract() + handle(avatar,link,name,queue) + +def get_matery_url(response,queue): + avatar = response.css('#friends-link .friend-div img::attr(src)').extract() + link = response.css('#friends-link .friend-button>a::attr(href)').extract() + name = response.css('#friends-link .friend-name::text').extract() + handle(avatar,link,name,queue) + +def get_nexmoe_url(response,queue): + avatar = response.css('.nexmoe-py ul img::attr(data-src)').extract() + link = response.css('.nexmoe-py ul a::attr(href)').extract() + name = response.css('.nexmoe-py ul a::attr(title)').extract() + handle(avatar,link,name,queue) + +def get_stun_url(response,queue): + avatar = response.css('.friends-plugin__item img::attr(data-src)').extract() + link = response.css('.friends-plugin__item::attr(href)').extract() + name = response.css('.friends-plugin__item-info__name::attr(title)').extract() + handle(avatar,link,name,queue) + +def get_sakura_url(response,queue): + avatar = response.css('.link-item img::attr(src)').extract() + link = response.css('.link-item a::attr(href)').extract() + name = response.css('.link-item .sitename::text').extract() + if name: + for i ,n in enumerate(name): + name[i ] =name[i].strip("\n ") + handle(avatar, link, name,queue) + +def get_volantis_url(response,queue): + avatar = response.css('a.simpleuser img::attr(src)').extract() + if not avatar: + avatar = response.css('a.site-card img::attr(src)').extract() + if not avatar: + avatar = response.css('a.friend-card img::attr(src)').extract() + + link = response.css('a.simpleuser::attr(href)').extract() + if not link: + link = response.css('a.site-card::attr(href)').extract() + if not link: + link = response.css('a.friend-card::attr(href)').extract() + + name = response.css('a.simpleuser span::text').extract() + if not name: + name = response.css('a.site-card span::text').extract() + if not name: + name = response.css('a.friend-card span::text').extract() + if not name: + name = response.css('a.friend-card p::text').extract() + handle(avatar,link,name,queue) + +def get_Yun_url(response,queue): + async_link = response.css("#links script::text").re("https://.*links\.json")[0] + if async_link: + return async_link + avatar = response.css('#links a img::attr(src)').extract() + link = response.css('#links a::attr(href)').extract() + name = response.css('#links a::attr(title)').extract() + + handle(avatar,link,name,queue) + +def get_stellar_url(response,queue): + avatar = response.css('.card-link img::attr(data-src)').extract() + link = response.css('.card-link::attr(href)').extract() + name = response.css('.card-link span::text').extract() + handle(avatar,link,name,queue) + +def handle(avatar,link,name,queue): + user_info = [] + print(avatar) + print(link) + print(name) + n = min(len(avatar),len(link),len(name)) + if n != 0: + for i in range(n): + if link[i] == "": + # 初步筛选掉不符合规则的link + continue + user_info.append(name[i]) + user_info.append(link[i]) + user_info.append(avatar[i]) + queue.put(user_info) + user_info = [] + +def Yun_async_link_handler(response,queue): + user_info = [] + friends = json.loads(response.text) + for friend in friends: + name = friend["name"] + link = friend["url"] + avatar = friend["avatar"] + user_info.append(name) + user_info.append(link) + user_info.append(avatar) + queue.put(user_info) + user_info = [] \ No newline at end of file