-
Notifications
You must be signed in to change notification settings - Fork 4
/
cangls_spider.py
124 lines (106 loc) · 3.94 KB
/
cangls_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# coding:utf-8
import requests
from lxml import html
from lxml import etree
import os
from threading import Thread
import re
base_url = "http://www.cangls.com/tag/"
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/61.0.3163.100 Safari/537.36"}
nr_girl_nude_photo = 1
def getxpath(html):
return etree.HTML(html)
def get_nr_page_url(girlname,nr):
if nr == 0:
url = base_url + girlname + ".html"
# http://www.cangls.com/tag/长泽梓.html
else:
url = base_url + girlname + "_"+str(nr)+".html"
# http://www.cangls.com/tag/长泽梓_1.html
return url
# response = requests.get(url).content
# selector = html.fromstring(response)
def get_girl_names():
girl_names = []
with open("./pornstar_names.txt",'r') as porn_names_file:
for line in porn_names_file.readlines():
name = line.strip()
if name.find("_") == -1:
girl_names.append(name)
return girl_names
def download_img(img_title,img_detail_websites):
# num = 1
global nr_girl_nude_photo
try :
for url,title in zip(img_detail_websites,img_title):
filename = title
print(filename+": "+url)
with open(str(nr_girl_nude_photo)+"__"+filename+".jpg",'wb') as f:
f.write(requests.get(url,headers=headers,timeout=5).content)
nr_girl_nude_photo += 1
# num += 1
except Exception as e:
print(repr(e))
return download_img
def get_str_nr(nr):
# 937 japanese girls
if(nr<10):
return "00"+str(nr)
elif(nr<100):
return "0"+str(nr)
return str(nr)
def get_girl_page(name,page_img_url):
try:
response = requests.get(page_img_url,headers=headers,timeout=7).content
selector = html.fromstring(response)
img_detail_websites = selector.xpath("//img/@src")
# print(img_detail_websites)
# we should delete the logo picture:/picture/logo-3-1.png
img_detail_websites.remove('/picture/logo-3-1.png')
fanhao = selector.xpath("//h2/a/text()")
# print(fanhao)
print("Begin to download...")
download_img(fanhao,img_detail_websites)
except Exception as e:
print(repr(e))
return get_girl_names
# # url = "http://www.cangls.com/tag/长泽梓.html"
# url = "http://www.cangls.com/tag/波多野结衣.html"
# response = requests.get(url).content
# selector = html.fromstring(response)
# img_detail_websites = selector.xpath("//img/@src")
# # print(img_detail_websites)
# # we should delete the logo picture:/picture/logo-3-1.png
# img_detail_websites.remove('/picture/logo-3-1.png')
# fanhao = selector.xpath("//h2/a/text()")
# # print(fanhao)
# print("Begin to download...")
# # download_img(fanhao,img_detail_websites)
if not os.path.exists("girl_images"):
os.makedirs("girl_images")
girl_names = get_girl_names()
print(girl_names)
print("total: "+str(len(girl_names))+" beautiful girls")
os.system("pwd")
print("changing directory: girl_images/")
os.chdir("girl_images"),os.system("pwd")
nr_girl = 1
for name in girl_names:
if not os.path.exists(get_str_nr(nr_girl)+"__"+name):
os.makedirs(get_str_nr(nr_girl)+"__"+name)
print("pornstar"+get_str_nr(nr_girl)+": "+name)
os.chdir(get_str_nr(nr_girl)+"__"+name),os.system("pwd")
nr_girl_nude_photo = 1
for i in range(0,35):
print(" "+get_nr_page_url(name,i))
get_girl_page(name,get_nr_page_url(name,i))
os.chdir("..")
nr_girl += 1
print(" OK, nude photos of beatiful girls have been successfully saved!!! : )")
print(" FBI WARNING: Copyright (C) 2018 Frank Curie (邱日)")
# reference: - https://zhuanlan.zhihu.com/p/26395979
# - https://zhuanlan.zhihu.com/p/25572729
# - https://zhuanlan.zhihu.com/p/25784651
# learn Xpath
# Python爬虫跳过异常