-
Notifications
You must be signed in to change notification settings - Fork 2
/
bookmarks_checker.py
243 lines (162 loc) · 5.69 KB
/
bookmarks_checker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" Bookmarks Checker """
import argparse
import html
import os
import re
import threading
import time
import urllib.error
import urllib.request
class BookmarksChecker:
    """
    Bookmarks Checker
    Verify links in a Chrome or Firefox exported bookmarks file.

    Every anchor in the file is requested once, each on its own thread, with
    at most NUMBER_THREADS requests in flight. Failures are printed as they
    occur and a summary is printed when all threads have finished.

    Usage               python bookmarks_checker.py [-f file]
    Python Version      3.x
    Author              Martin Latter
    Copyright           Martin Latter 21/09/2017
    Version             0.05
    Credits             Doug Hellmann (threading usage)
    License             GNU GPL version 3.0 (GPL v3); http://www.gnu.org/licenses/gpl.html
    Link                https://github.com/Tinram/Bookmarks-Checker.git
    """

    # When True, successful links are listed too and failures include the reason.
    DEBUG = False
    USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0'
    # Maximum number of URL checks running at the same time.
    NUMBER_THREADS = 16
    # Seconds to wait on a connection before the link is treated as failed;
    # without a limit a single unresponsive host would stall the whole run.
    TIMEOUT = 30

    # One anchor element: group 1 is the href value, group 2 the link text.
    # The text group is non-greedy so several anchors on one line stay separate.
    LINK_PATTERN = re.compile(r'<a\s[^>]*href="([^"]*)"[^>]*>(.*?)</a>', re.I)

    num_urls = 0
    dead_link_counter = 0
    url_parse_time = 0
    parse_flag = False
    # Default only; parse_file() binds a fresh per-instance dict of url -> link text.
    url_index = {}

    # Serialises failure counting and console output across the worker threads.
    _report_lock = threading.Lock()

    def __init__(self):
        """ Initialise and execute methods. """
        filename = self.get_args()
        self.check_file(filename)
        self.parse_file(filename)

    def get_args(self, argv=None):
        """
        Parse the command line arguments.

        Args:
            argv: optional list of argument strings; when None the process
                command line is parsed.

        Returns:
            The bookmarks filename ('bookmarks.html' when -f is not given).
        """
        parser = argparse.ArgumentParser()
        parser.add_argument(
            '-f', '--file',
            dest='filename',
            help='Specify filename of the bookmarks file to load',
            default='bookmarks.html',
            type=str,
            action='store')
        return parser.parse_args(argv).filename

    def check_file(self, filename):
        """
        Check bookmark file existence and access.

        Args:
            filename: name of bookmarks file.

        Raises:
            SystemExit: with status -1 when the name is not a readable file.
        """
        if not (os.path.isfile(filename) and os.access(filename, os.R_OK)):
            print('\n %s cannot be found or cannot be read.\n' % filename)
            # SystemExit lets the interpreter flush the message printed above;
            # os._exit() would discard it whenever stdout is buffered (piped).
            raise SystemExit(-1)

    def _extract_links(self, filename):
        """
        Collect every link in the bookmarks file.

        Args:
            filename: name of bookmarks file.

        Returns:
            A list of (url, link text) tuples in file order, duplicates included.
        """
        links = []
        # Undecodable bytes are replaced rather than aborting the whole run.
        with open(filename, encoding='utf-8', errors='replace') as bmfile:
            for line in bmfile:
                for match in self.LINK_PATTERN.finditer(line):
                    # Attribute values are HTML-escaped in the file (e.g. '&amp;');
                    # the request must use the real URL.
                    links.append((html.unescape(match.group(1)), match.group(2)))
        return links

    def parse_file(self, filename):
        """
        Parse the file, extract links, and set-up threads.

        Blocks until every link has been checked, then prints the summary.

        Args:
            filename: name of bookmarks file.

        Raises:
            SystemExit: with status -1 when the file contains no links.
        """
        links = self._extract_links(filename)

        if not links:
            print('\n No links extracted from %s\n' % filename)
            raise SystemExit(-1)

        urls = [url for url, _ in links]
        # For a URL that occurs more than once the last link text wins.
        self.url_index = dict(links)
        self.num_urls = len(urls)
        self.dead_link_counter = 0

        pool = ActivePool()
        semaphore = threading.Semaphore(self.NUMBER_THREADS)
        self.url_parse_time = time.time()

        # One thread per link, named after its URL; the semaphore limits how
        # many of them perform a request at once.
        thread_holder = [
            threading.Thread(
                target=self.activate_thread,
                name=url,
                args=(semaphore, pool, url)
            )
            for url in urls
        ]

        print('\n %i links being checked ...' % self.num_urls)

        if not self.DEBUG:
            print('\n failures:\n')

        for thrd in thread_holder:
            thrd.start()

        for thrd in thread_holder:
            thrd.join()

        self.display_final_info()

    def activate_thread(self, semaphore, pool, url):
        """
        Activate thread to check a URL.

        Args:
            semaphore: threading semaphore.
            pool: instance of ActivePool()
            url: a single URL.
        """
        with semaphore:
            name = threading.current_thread().name
            pool.activate(name)
            try:
                self.check_url(url)
            finally:
                # Always unregister, even if the check raised unexpectedly.
                pool.deactivate(name)

    def check_url(self, url):
        """
        Thread method to check URL access.

        A link that cannot be opened for any reason is counted in
        dead_link_counter and reported on the console; nothing is raised.

        Args:
            url: a single URL.
        """
        headers = {'User-Agent': self.USER_AGENT}
        url_name = self.url_index.get(url, '')

        try:
            req = urllib.request.Request(url, None, headers)
            # The context manager closes the connection as soon as the status is known.
            with urllib.request.urlopen(req, timeout=self.TIMEOUT):
                pass
        except urllib.error.HTTPError as err:
            # The server answered with an error status. Reported in both modes
            # so that DEBUG output accounts for every link.
            self._record_failure(' F: %s | %s -- %s' % (url_name, url, str(err.code)))
            err.close()
        except urllib.error.URLError as err:
            self._record_failure(self._failure_line(url_name, url, err.reason))
        except Exception as err:  # worker-thread boundary: never lose a link silently
            # Malformed URLs, resets, timeouts, protocol errors: the link could
            # not be verified, so it must not be counted as verified.
            self._record_failure(self._failure_line(url_name, url, err))
        else:
            if self.DEBUG:
                with self._report_lock:
                    print(' ok: %s | %s' % (url_name, url))

    def _failure_line(self, url_name, url, reason):
        """ Build the console line for a link that could not be reached. """
        if self.DEBUG:
            return ' F: %s | %s -- %s' % (url_name, url, str(reason))
        return '\t %s | %s' % (url_name, url)

    def _record_failure(self, message):
        """ Count one dead link and print its message, atomically across threads. """
        with self._report_lock:
            self.dead_link_counter += 1
            print(message)

    def display_final_info(self):
        """ Display dead link count and URL parse time. """
        print('\n %i links failed' % self.dead_link_counter)
        print(' %i links verified\n' % (self.num_urls - self.dead_link_counter))
        print(' URL parse time: %.5f secs\n' % (time.time() - self.url_parse_time))
# end class
class ActivePool:
    """
    Active pool of threads.

    Keeps the names of the threads that are currently running in the list
    `active`; every change to the list is made while holding `lock`.

    Python Version      3.x
    Author              Doug Hellmann
    """

    def __init__(self):
        super().__init__()
        self.active = []
        self.lock = threading.Lock()

    def activate(self, name):
        """ Add `name` to the list of active threads. """
        self.lock.acquire()
        try:
            self.active.append(name)
        finally:
            self.lock.release()

    def deactivate(self, name):
        """ Remove the first occurrence of `name` from the list of active threads. """
        self.lock.acquire()
        try:
            self.active.remove(name)
        finally:
            self.lock.release()
# end class
def main():
    """ Script entry point: run the bookmarks check. """
    # Constructing the checker is what runs it: its __init__ reads the
    # arguments, validates the file and checks every link.
    BookmarksChecker()


# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()