release 4.1.3
hiltay committed Jan 30, 2022
1 parent 3b1b385 commit d27616d
Showing 6 changed files with 243 additions and 252 deletions.
14 changes: 8 additions & 6 deletions README.md
@@ -2,36 +2,38 @@

Do you often find yourself with too many friend links and no time to browse them? 友链朋友圈 (Friend-Links-Circle) solves this pain point: you can fetch the latest posts from your friends' sites at any time and see how active each friend is.

⭐ Starting from version 4.1.3, you must configure the friend-page fetch strategy in the settings.
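For example (this mirrors the `FRIENDPAGE_STRATEGY` block added to `hexo_circle_of_friends/settings.py` in this release; see the settings diff below):

```python
# Friend-page fetch strategy, required since 4.1.3
FRIENDPAGE_STRATEGY = {
    "strategy": "default",  # "default": your friend page uses a supported theme
    "theme": "butterfly"    # butterfly / fluid / matery / nexmoe / stun / sakura / volantis / Yun / stellar
}
```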
```
-Current release 4.1.2:
+Current release 4.1.3:
- Fetches friend links from gitee issues
- Fetches friend links from github issues
- Fetches the latest posts for the butterfly, volantis, matery, sakura, and fluid themes
- Added the most broadly applicable atom and rss rules
- Site blocking, enabled in the settings
- Code rebuilt on scrapy
- Supports sorting by update time or creation time
- Supports unadapted hexo themes and non-hexo users via settings-based friend links, enabled in the settings
- Supports crawling typecho blogs
- Added crawling for the nexmoe, Yun, and stun themes
- Supports crawling wordpress blogs
- Improved post deduplication rules
- Added simultaneous crawling of an extra friend-link page, enabled in the settings
- Added crawling for the stellar theme
- Supports an HTTP proxy, enabled in the settings
- Added settings-based friend-link options: custom subscription suffix and parser type
- Logic refactored; added the friend-page fetch-strategy setting
Bug fixes:
- Time-format issue for wordpress blogs
- The butterfly friend-page parser no longer picks up background images
- Fixed friend-link fetching for the volantis theme via github and gitee
- Blocked sites no longer count toward the lost-contact tally
- Fixed occasional errors with the sakura and nexmoe themes
- The Yun theme's external JSON friend links can now be fetched
```

# Version updates

-After a new release, you only need to click fetch in your forked repository to update to the latest version.
+Versions 4.0 and later: after a new release, you only need to click fetch in your forked repository to update to the latest version.
![img.png](img.png)
Upgrading from a version before 4.0: re-forking is recommended.

If you like this project, please give it a ⭐Star; it supports us and lets you follow updates to 友链朋友圈 at any time.

17 changes: 14 additions & 3 deletions hexo_circle_of_friends/run.py
@@ -1,5 +1,16 @@
-from scrapy.cmdline import execute
+from scrapy.utils.project import get_project_settings
+from scrapy.crawler import CrawlerProcess

+def main():
+    setting = get_project_settings()
+    process = CrawlerProcess(setting)
+    didntWorkSpider = ['xiaoso',]
+    for spider_name in process.spiders.list():
+        if spider_name in didntWorkSpider:
+            continue
+        # print("Running spider %s" % (spider_name))
+        process.crawl(spider_name)
+    process.start()

-# execute(['scrapy','crawl','test'])
-execute(['scrapy','crawl','hexo_circle_of_friends'])
+if __name__ == '__main__':
+    main()
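`run.py` now iterates over every spider registered in the project and skips the names listed in `didntWorkSpider`, instead of launching a single hard-coded spider via `scrapy.cmdline.execute`. A minimal sketch of the same pattern; note that newer Scrapy releases expose the loader as `spider_loader` (the `spiders` attribute used above is a deprecated alias):

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_all(skip=("xiaoso",)):
    """Run every spider in the project except the ones named in skip."""
    process = CrawlerProcess(get_project_settings())
    for name in process.spider_loader.list():  # all registered spider names
        if name in skip:                       # exclude known-broken spiders
            continue
        process.crawl(name)
    process.start()                            # blocks until all crawls finish

if __name__ == "__main__":
    run_all()
```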
28 changes: 25 additions & 3 deletions hexo_circle_of_friends/settings.py
@@ -78,13 +78,33 @@



-################################ The following may be modified ################################
-# leancloud post data outdate_clean
+################################ Please modify the following ################################
+# outdate_clean
+# purge posts older than this many days
OUTDATE_CLEAN = 60
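`OUTDATE_CLEAN` is a retention window in days. A sketch of the intended check, with a hypothetical helper name and the `YYYY-MM-DD` format assumed from the parsers below:

```python
from datetime import datetime, timedelta

def is_outdated(updated, days=60):
    """Hypothetical helper: True if a post's update date is older than the window."""
    return datetime.strptime(updated, "%Y-%m-%d") < datetime.now() - timedelta(days=days)
```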

+# Friend-page fetch strategy
+# Required since version 4.1.3 so that the program can parse your theme precisely
+# Parameters:
+# strategy: required; one of:
+#   - default: specify the theme your friend page uses. Example: if your friend page is https://www.yyyzyyyz.cn/link/, choose butterfly, and so on
+#   - incompat: choose this if your theme is not in the list below; settings-based friend links are recommended in that case
+# theme: required; one of the currently supported themes:
+#   - butterfly
+#   - fluid
+#   - matery
+#   - nexmoe
+#   - stun
+#   - sakura
+#   - volantis
+#   - Yun
+#   - stellar
+FRIENDPAGE_STRATEGY = {
+    "strategy": "default",
+    "theme": "butterfly"  # change this to your theme
+}
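The other documented mode, sketched here under the assumption that `theme` is simply not consulted when `strategy` is `"incompat"` (the comments above only say settings-based friend links are recommended then):

```python
FRIENDPAGE_STRATEGY = {
    "strategy": "incompat",  # friend-page theme is not in the supported list
    "theme": "butterfly"     # assumed to be ignored in this mode
}
```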

# get links from settings
+# settings-based friend links
# Format: ["name", "link", "avatar", "suffix", "rules"],
# Parameters:
# name: required; the friend's name
@@ -107,6 +127,7 @@


# get links from gitee
+# fetch friend links from gitee issues
GITEE_FRIENDS_LINKS={
    "enable": False,  # True enables gitee issue compatibility
    "type": "normal",  # volantis/stellar users: put "volantis" here
@@ -117,6 +138,7 @@


# get links from github
+# fetch friend links from github issues
GITHUB_FRIENDS_LINKS = {
    "enable": False,  # True enables github issue compatibility
    "type": "normal",  # volantis/stellar users: put "volantis" here
112 changes: 49 additions & 63 deletions hexo_circle_of_friends/spiders/hexo_circle_of_friends.py
@@ -1,14 +1,13 @@
# -*- coding:utf-8 -*-

import calendar
import datetime
import time
import scrapy
import queue
from scrapy.http.request import Request
from hexo_circle_of_friends import settings
from bs4 import BeautifulSoup
-from hexo_circle_of_friends.utils.get_theme_url import *
+from hexo_circle_of_friends.utils.get_url import get_theme_url,Yun_async_link_handler
from hexo_circle_of_friends.utils.regulations import *
import sys

@@ -50,7 +49,7 @@ def start_requests(self):
"state"] + '&page=' + str(number)
yield Request(url, callback=self.friend_poor_parse, meta={"github": {"domain": domain}})
if settings.DEBUG:
friendpage_link = settings.FRIENPAGE_LINK
friendpage_link = settings.FRIENDPAGE_LINK
else:
friendpage_link = []
friendpage_link.append(sys.argv[3])
@@ -109,30 +108,16 @@ def friend_poor_parse(self, response):
            pass

        if "theme" in response.meta.keys():
-            user_info = []
-            link = get_link_url(response)
-            avatar = get_avatar_url(response)
-            name = get_name_url(response)
-            # print(link)
-            # print(avatar)
-            # print(name)
-            if len(link) == len(avatar) == len(name):
-                for i in range(len(link)):
-                    if link[i] == "":
-                        continue
-                    user_info.append(name[i])
-                    user_info.append(link[i])
-                    user_info.append(avatar[i])
-                    self.friend_poor.put(user_info)
-                    user_info = []
-                    # print("""------------------------\n
-                    # name:%s
-                    # avatar:%s
-                    # link_list:%s
-                    # """%(name[i],avatar[i],link[i]))
-            # print("total:%d"%i)
-
-            # print(self.friend_poor)
+            if settings.FRIENDPAGE_STRATEGY["strategy"] == "default":
+                theme = settings.FRIENDPAGE_STRATEGY["theme"]
+                async_link = get_theme_url(theme, response, self.friend_poor)
+                if async_link:
+                    # temporary workaround for the Yun theme's async_link
+                    yield Request(async_link, callback=self.friend_poor_parse, meta={"async_link": async_link}, dont_filter=True)
+            else:
+                pass
+        if "async_link" in response.meta.keys():
+            Yun_async_link_handler(response, self.friend_poor)

        # to add support for another theme, add a request here
        while not self.friend_poor.empty():
@@ -230,7 +215,7 @@ def post_rss2_parse(self, response):
        link = sel.css("item guid::text").extract()
        pubDate = sel.css("item pubDate::text").extract()
        if len(link) > 0:
-            l = len(title) if len(title) < 5 else 5
+            l = len(link) if len(link) < 5 else 5
            try:
                for i in range(l):
                    m = pubDate[i].split(" ")
@@ -259,7 +244,7 @@ def post_wordpress_parse(self, response):
        link = [comm.split("#comments")[0] for comm in sel.css("item link+comments::text").extract()]
        pubDate = sel.css("item pubDate::text").extract()
        if len(link) > 0:
-            l = len(title) if len(title) < 5 else 5
+            l = len(link) if len(link) < 5 else 5
            try:
                for i in range(l):
                    m = pubDate[i].split(" ")
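Both feed parsers now bound the loop by the number of extracted links rather than titles, avoiding an IndexError when a feed yields fewer `<guid>`/`<comments>` entries than `<title>` entries. The bound is equivalent to a `min`, as in this sketch with hypothetical data:

```python
links = ["https://example.com/a", "https://example.com/b"]  # hypothetical extracted <guid> values
for i in range(min(len(links), 5)):  # parse at most the 5 newest items
    print(links[i])
```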
@@ -420,29 +405,29 @@ def theme_sakura_parse(self, response):
        main_content = soup.find_all(id='main')
        time_excit = soup.find_all('div', {"class": "post-date"})
        if main_content and time_excit:
-            link_list = main_content[0].find_all('div', {"class": "post-date"})
-            lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d")
-            for index, item in enumerate(link_list):
-                date = item.text
-                date = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", date).group(0)
-                if lasttime < datetime.datetime.strptime(date, "%Y-%m-%d"):
-                    lasttime = datetime.datetime.strptime(date, "%Y-%m-%d")
-            lasttime = lasttime.strftime('%Y-%m-%d')
-            # print('latest time:', lasttime)
-            last_post_list = main_content[0].find_all('article', {"class": "post"})
-            for item in last_post_list:
-                time_created = item.find('div', {"class": "post-date"}).text.strip()
-                time_created = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", time_created).group(0)
-                time_created = datetime.datetime.strptime(time_created, "%Y-%m-%d").strftime("%Y-%m-%d")
-                if time_created == lasttime:
-                    a = item.find('a')
-                    alink = a['href']
-                    alinksplit = alink.split("/", 1)
-                    stralink = alinksplit[1].strip()
-                    if link[-1] != '/':
-                        link = link + '/'
-                    link = link.split('/')[0]
-                    try:
+            try:
+                link_list = main_content[0].find_all('div', {"class": "post-date"})
+                lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d")
+                for index, item in enumerate(link_list):
+                    date = item.text
+                    date = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", date).group(0)
+                    if lasttime < datetime.datetime.strptime(date, "%Y-%m-%d"):
+                        lasttime = datetime.datetime.strptime(date, "%Y-%m-%d")
+                lasttime = lasttime.strftime('%Y-%m-%d')
+                # print('latest time:', lasttime)
+                last_post_list = main_content[0].find_all('article', {"class": "post"})
+                for item in last_post_list:
+                    time_created = item.find('div', {"class": "post-date"}).text.strip()
+                    time_created = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", time_created).group(0)
+                    time_created = datetime.datetime.strptime(time_created, "%Y-%m-%d").strftime("%Y-%m-%d")
+                    if time_created == lasttime:
+                        a = item.find('a')
+                        alink = a['href']
+                        alinksplit = alink.split("/", 1)
+                        stralink = alinksplit[1].strip()
+                        if link[-1] != '/':
+                            link = link + '/'
+                        link = link.split('/')[0]
                        post_info = {
                            'title': item.find('h3').text.strip(),
                            'time': lasttime,
@@ -453,8 +438,8 @@ def theme_sakura_parse(self, response):
                            'rule': "sakura"
                        }
                        yield post_info
-                    except:
-                        pass
+            except:
+                pass
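The sakura parser's whole body is now wrapped in a single try/except, so a page missing the expected `.post-date` markup skips that site instead of raising; this is the "occasional sakura and nexmoe errors" fix from the release notes. A minimal sketch of the guard pattern, with hypothetical names:

```python
import re

def latest_date(texts):
    """Hypothetical helper: newest YYYY-M-D date found in texts, or None on malformed markup."""
    try:
        dates = [re.search(r"\d{4}-\d{1,2}-\d{1,2}", t).group(0) for t in texts]
        return max(dates, key=lambda s: tuple(int(p) for p in s.split("-")))
    except Exception:  # e.g. re.search returned None: skip this site, don't crash the spider
        return None
```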

    def theme_volantis_parse(self, response):
        # print("theme_volantis_parse---------->" + response.url)
@@ -510,13 +495,14 @@ def theme_nexmoe_parse(self, response):
        partial_l = response.css("section.nexmoe-posts .nexmoe-post>a::attr(href)").extract()
        title = response.css("section.nexmoe-posts .nexmoe-post h1::text").extract()
        date = response.css("section.nexmoe-posts .nexmoe-post-meta a:first-child::text").extract()
-        if len(partial_l) == len(title) == len(date):
-            for i in range(len(partial_l)):
-                partial_l[i] = partial_l[i].lstrip("/")
-                r = re.split(r"[年月日]", date[i])
-                y, m, d = r[0], r[1], r[2]
-                date = y + "-" + m + "-" + d
-                try:
+        if len(partial_l) > 0:
+            try:
+                l = len(partial_l) if len(partial_l) < 5 else 5
+                for i in range(l):
+                    partial_l[i] = partial_l[i].lstrip("/")
+                    r = re.split(r"[年月日]", date[i])
+                    y, m, d = r[0], r[1], r[2]
+                    date = y + "-" + m + "-" + d
                    post_info = {
                        'title': title[i],
                        'time': date,
@@ -527,8 +513,8 @@ def theme_nexmoe_parse(self, response):
                        'rule': "nexmoe"
                    }
                    yield post_info
-                except:
-                    pass
+            except:
+                pass
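One caveat that survives this commit: `date = y + "-" + m + "-" + d` rebinds `date`, which until then held the extracted list, so `date[i]` on the next iteration indexes into a string, raises, and is swallowed by the except, leaving only the first post yielded when a page lists several. A hypothetical fix (not part of this commit) keeps the list and the formatted string separate:

```python
import re

dates = ["2022年1月30日", "2022年1月12日"]  # hypothetical extracted values
for i in range(min(len(dates), 5)):
    y, m, d = re.split(r"[年月日]", dates[i])[:3]
    post_date = "%s-%s-%s" % (y, m, d)  # does not clobber the dates list
    print(post_date)
```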

    def theme_Yun_parse(self, response):
        # print("theme_Yun_parse---------->" + response.url)
