
MOD: some regex, some strategy, fuck x-art

commit 5dbca03fbc98b120df481671944c45edc413156e (1 parent 0e19a3d), wangxu committed Feb 17, 2017
Showing with 40 additions and 18 deletions.
  1. +0 −1 conf.py
  2. +19 −14 crawler.py
  3. +21 −3 tidier.py
conf.py
@@ -3,5 +3,4 @@
root = "D:/Downloads/video/x-art"
pic_root = os.path.join(root, "update")
-goagent_abs_path = "D:/Downloads/goagent/local/goagent.exe"
proxy = None
crawler.py
@@ -17,7 +17,9 @@ def __crawl(url, proxy=None, timeout=30):
opener = urllib.request.build_opener(urllib.request.ProxyHandler({"http":proxy}))
else:
opener = urllib.request.build_opener()
- response = opener.open(url, timeout=timeout)
+ headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
+ response = opener.open(urllib.request.Request(url, headers=headers), timeout=timeout)
if response.getheader("Content-Encoding") == "gzip":
return gzip.decompress(response.read())
else:
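
The change above swaps the bare opener.open(url) for a urllib.request.Request carrying a browser User-Agent, since the site evidently rejects the default Python-urllib agent. A minimal standalone sketch of the same pattern (the fetch name is illustrative, not repo code):

    import gzip
    import urllib.request

    def fetch(url, proxy=None, timeout=30):
        # Route through an HTTP proxy only when one is configured.
        if proxy:
            opener = urllib.request.build_opener(
                urllib.request.ProxyHandler({"http": proxy}))
        else:
            opener = urllib.request.build_opener()
        # Present a browser-like User-Agent instead of "Python-urllib/3.x".
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/51.0.2704.103 Safari/537.36"}
        request = urllib.request.Request(url, headers=headers)
        response = opener.open(request, timeout=timeout)
        try:
            body = response.read()
        finally:
            response.close()
        # The server may gzip the body; decompress before returning.
        if response.getheader("Content-Encoding") == "gzip":
            return gzip.decompress(body)
        return body
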
@@ -31,6 +33,8 @@ def __crawl(url, proxy=None, timeout=30):
response.close()
except:
return None
+
+
def crawl(url, charset=None, proxy=conf.proxy, retry=3, timeout=10):
retry = 1 if retry < 1 else retry
for i in range(0, retry):
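
crawl() clamps retry to at least one attempt and (per the next hunk) returns None once every attempt has failed, so callers should guard the result; the __main__ block below feeds update_page straight into re.findall, which would raise TypeError if the fetch failed. An illustrative guard:

    page = crawl(update_url, "utf-8", retry=3, timeout=10)
    if page is None:
        raise SystemExit("update page unreachable, giving up")
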
@@ -42,36 +46,38 @@ def crawl(url, charset=None, proxy=conf.proxy, retry=3, timeout=10):
return None
update_url = "http://www.x-art.com/updates/"
+# update_url = "http://www.x-art.com/index.php?show=galleries&pref=items&page=1&catname=all&order=recent"
+
class regexs:
meta = "".join([r"<li>[\s]*?<a.*?href=['\"](.*?)['\"][^>]*?>[\s]*?<div class=\"item\" data-equalizer-watch>[\s\S]*?",
- r"<div class=\"item-img\">[\s]*?<img.*\[(.*?), \(large\)\].*?>[\s\S]*?",
+ r"<div class=\"item-img\">[\s]*?<img.*?data-interchange=\".*\[(.*?),\ \(large\)\]\".*?>[\s\S]*?",
r"<h1>(.*?)</h1>[\s]*?<h2>([\s\S]*?)(</h2>)?[\s]*?<h2>(.*?)</h2>[\s\S]*?"])
rate = r"<h2>(.*?)\(\d+ votes\).*?</h2>"
model_list = r"<h2><span>featuring</span>((\s*?<a.*?>(.*?)</a>\s*?\|?)*)</h2>"
model = r"<a.*?>(.*?)</a>"
- #comment = r"<p>\s*?<p>\s*?<span[^>]*?>(.*?)</span></p>\s*?</p>"
- comment = r"<p>([^<]*?)</p>"
+ comment = r"<p>(.*?)</p>"
+
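
Two regex changes here: the item-image pattern now anchors on the data-interchange attribute and pulls the (large) variant (the greedy .* before \[ means the last bracket group wins), and comment is widened from <p>([^<]*?)</p> to <p>(.*?)</p> so paragraphs containing inline tags still match. A quick illustration against made-up markup shaped like the update page:

    img_html = ('<div class="item-img"> <img '
                'data-interchange="[//cdn/small.jpg, (default)], '
                '[//cdn/large.jpg, (large)]">')
    pat = (r"<div class=\"item-img\">[\s]*?<img.*?"
           r"data-interchange=\".*\[(.*?),\ \(large\)\]\".*?>")
    re.search(pat, img_html).group(1)   # '//cdn/large.jpg'
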
def _seek_comment(s):
- comment = None
- for m in re.findall(regexs.comment, s):
- tmp = re.sub(r"<[^>]*>", "", m.strip().replace("\n", "").replace("&nbsp;", " "))
- if comment is None or len(tmp) > len(comment):
- comment = tmp
- return comment if comment is not None else "NO COMMENT!!"
+ cmt = None
+ for mth in re.findall(regexs.comment, s):
+ tmp = re.sub(r"<[^>]*>", "", mth.strip().replace("\n", "").replace("&nbsp;", " "))
+ if cmt is None or len(tmp) > len(cmt):
+ cmt = tmp
+ return cmt if cmt else "NO COMMENT!!"
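
The widened pattern is what makes _seek_comment useful: it collects every paragraph, strips residual tags and entities, and keeps the longest candidate. For example (illustrative input):

    html = ('<p>short</p>'
            '<p>A longer blurb with an <a href="#">inline link</a> inside.</p>')
    _seek_comment(html)   # 'A longer blurb with an inline link inside.'
    # the old [^<]*? pattern could not match a paragraph containing
    # any tag, so only 'short' would have been found
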
if __name__ == "__main__":
- update_page = crawl(update_url, "utf-8")
+ update_page = crawl(update_url, "utf-8") # .replace("\\n", "\n").replace("\\", "")
add_cnt = 0
for m in re.findall(regexs.meta, update_page):
detail_url = m[0].replace(" ", "%20")
pic_url = m[1].replace(" ", "%20")
name, tp, time = m[2], m[3], m[5]
- comment = re.sub(r"<[^>]*>", "", m[5].replace("\n", "").replace("&nbsp;", " "))
if "first_item" not in locals():
first_item = "The first item is {%s, %s, %s}" % (name, time, tp.strip())
if "HD video".lower() not in tp.lower():
+ print("wrong type %s, ignore %s - %s" % (tp, time, name))
continue
parse_r = urlparse(pic_url)
pic_type = parse_r.path[parse_r.path.rfind(".")+1:]
@@ -80,7 +86,6 @@ def _seek_comment(s):
pic_name = "%s - %s.%s" % (ntime, name, pic_type)
pic_abs_path = os.path.join(conf.pic_root, pic_name)
if not os.path.exists(pic_abs_path) or os.path.getsize(pic_abs_path) == 0:
- print(detail_url)
detail_page = crawl(detail_url, "utf-8")
m = re.search(regexs.rate, detail_page)
rate = m.group(1) if m else None
@@ -97,7 +102,7 @@ def _seek_comment(s):
print("\tmodels:%s comment:%s rate:%s" % (models, comment, rate))
add_cnt += 1
img_data = crawl(pic_url)
- with open(pic_abs_path, "wb") as wp:
+ with open(pic_abs_path.replace("?", ""), "wb") as wp:
exif.copy_on_write(io.BytesIO(img_data), wp, {"model":models, "rate":rate, "desc":comment})
if add_cnt == 0:
print("first item is %s" % first_item)
tidier.py
@@ -1,5 +1,6 @@
import os
import re
+from datetime import datetime, timedelta
import conf
import exif
@@ -25,17 +26,20 @@ def read_info(root):
infos.append(info)
return infos
+
def safe_get(d, k):
if k in d:
return d[k]
else:
return None
-
+
+
def info2name(info):
return "%s - %s - %s" % (safe_get(info, "time"),
safe_get(info, "name"),
safe_get(info, "model"))
+
def guess_name(infos, fn):
idx = fn.rfind(".")
name = fn[0:idx]
@@ -52,6 +56,7 @@ def guess_name(infos, fn):
new_name = "%s.%s" % (info2name(info), ext)
return new_name
+
def update_pic(pic_abs_path, exif_meta):
tmp_abs_path = "%s.tmp" % pic_abs_path
with open(pic_abs_path, "rb") as rp:
@@ -61,6 +66,8 @@ def update_pic(pic_abs_path, exif_meta):
os.renames(tmp_abs_path, pic_abs_path)
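
update_pic rewrites the EXIF block through a .tmp sibling and only then renames over the original, so an interrupted write never truncates the picture. The generic shape of that pattern, with transform as a stand-in for the repo's exif.copy_on_write:

    def atomic_rewrite(path, transform):
        tmp = "%s.tmp" % path
        # write the modified bytes next to the original...
        with open(path, "rb") as rp, open(tmp, "wb") as wp:
            transform(rp, wp)
        # ...then swap in one step; unlike os.rename/os.renames,
        # os.replace overwrites an existing destination on Windows too
        os.replace(tmp, path)
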
ignore_tokens = ["x-art", "-"]
+
+
def do_seg(s):
raw = re.findall(r"[a-z0-9\-]+", s)
for banned in ignore_tokens:
@@ -82,7 +89,7 @@ def do_seg(s):
os.renames(abs_path, new_abs_path)
print("%s --> %s" % (fn, new_name))
- statistic = {"true":0, "never":0, "undefined":0, "dup":0}
+ statistic = {"true": 0, "never": 0, "undefined": 0, "dup": 0}
for info in infos:
if "status" in info and info["status"] in statistic:
statistic[info["status"]] += 1
@@ -102,5 +109,16 @@ def do_seg(s):
break
else:
print("undefined art: %s - %s" % (info["name"], info["time"]))
- statistic["undefined"] += 1
+ pic_name = "%s - %s.%s" % (info["time"], info["name"], "jpg")
+ pic_abs_path = os.path.join(conf.pic_root, pic_name)
+ ctime = datetime.fromtimestamp(os.path.getctime(pic_abs_path))
+ if (datetime.now() - ctime) > timedelta(days=10):
+ print("\tundefine time too long, change to never")
+ with open(pic_abs_path, "rb") as fp:
+ exif_meta = exif.parse_exif(fp)
+ exif_meta["status"] = "never"
+ update_pic(pic_abs_path, exif_meta)
+ statistic["never"] += 1
+ else:
+ statistic["undefined"] += 1
print(statistic)
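
The new branch ages out stale entries: an art still "undefined" whose picture file is more than ten days old is flipped to "never" and its EXIF status rewritten. The date check, isolated (too_old is an illustrative name):

    from datetime import datetime, timedelta
    import os

    def too_old(path, days=10):
        # getctime is creation time on Windows (the repo's platform,
        # judging by the D:/ paths); on Unix it is the inode change time
        ctime = datetime.fromtimestamp(os.path.getctime(path))
        return datetime.now() - ctime > timedelta(days=days)
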
