# A simple web crawler I wrote myself, intended to scrape everything under xmarks.

require 'open-uri'
require 'thread' # loaded, but no threads are actually used below

# Regexes for pulling topic links, page counts, and site URLs out of the HTML.
$regexp_words = /<a class\=\"topic-instance\" href\=\"([\s\S]*?)\">/
$regexp_numbers = /:: Page 1 of (.*) ::/
$regexp_pages = /<b>Page 1<\/b>/ # defined but unused below
$links = []                      # defined but unused below
$regexp_websites = /<span class\=\"title\">([\s\S]*?)<\/span>/
$regexp_web = /<a href\=\"([\s\S]*?)" rel=\"nofollow\">/
$counter = 0
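
# A minimal sketch of what the topic regexes above pull out, run against a
# hypothetical fragment of an xmarks topic page (the sample markup below is
# assumed for illustration, not copied from the live site):
#
#   sample = '<a class="topic-instance" href="/topic/programming/sites">programming</a>'
#   sample.scan($regexp_words).flatten
#   # => ["/topic/programming/sites"]
#   sample.scan($regexp_words).join("/").scan(/\/topic\/([\s\S]*?)\//).flatten
#   # => ["programming"]
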
class Crawl
  def initialize(uri)
    @url = uri
    @words = []
  end

  def get_links(char)
    number = 0
    @words = [] # reset per letter so earlier letters are not crawled again
    href = @url + "/topics/" + char + "/1" # seed page for this letter
    begin
      URI.open(href) do |f| # read the letter's seed page and grab the page count
        number = f.read.scan($regexp_numbers).flatten.first.to_i
        p number
      end

      1.upto(number) do |i|
        href = @url + "/topics/" + char + "/" + i.to_s
        URI.open(href) do |f| # read each page and collect every topic word on it
          f.read.scan($regexp_words).join("/").scan(/\/topic\/([\s\S]*?)\//) { |(word)| @words.push(word) }
        end
        p @words
      end
    rescue
      p "error! #{$!}"
    end
  end

  def crawl
    begin
      @words.each do |item|
        $counter += 1
        File.open("websites.txt", "a") { |file| file.puts $counter.to_s + "." + item }
        Crawl.get_websites(1, item)
      end
    rescue
      p "error! #{$!}"
    end
  end

  def self.get_websites(pager, item)
    href = "http://www.xmarks.com" + "/topic/sites/" + pager.to_s + "/" + item + "?created=all"
    p href
    begin
      URI.open(href) do |f|
        html = f.read
        File.open("websites.txt", "a") do |file|
          file.puts html.scan($regexp_websites).flatten.join(" ").scan($regexp_web).flatten.join("\n")
        end
        if html.slice(/\">Next/) # if there is another page, keep going
          self.get_websites(pager + 1, item)
        end
      end
    rescue
      p "error! #{$!}"
    end
  end
end

c = Crawl.new("http://www.xmarks.com")

'a'.upto('z') do |i|
  c.get_links(i)
  c.crawl
end