-
Notifications
You must be signed in to change notification settings - Fork 0
/
Main.py
815 lines (655 loc) · 31 KB
/
Main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# #
# Python Crawler and Site Map Generator v1.2.1 #
# #
# Copyright 2016, PedroHenriques #
# http://www.pedrojhenriques.com #
# https://github.com/PedroHenriques #
# #
# Free to use under the MIT license. #
# http://www.opensource.org/licenses/mit-license.php #
# #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
import traceback, threading, queue, os, time, tkinter as tk, subprocess, platform, urllib.parse
from classes import general, GUI, MapGenerator, Crawler
class Application :
"""This is the application's main class.
Contains the GUI and handles passing the information to and from the sitemap generator code."""
# class constant with the default # of threads to use
DEFAULT_NUM_THREADS_ = 8
# class constant with #secods between a save of data to files (in automated tasks)
SAVE_STATE_SECONDS_ = 5
# class constant with #secods between a save of data to files (in tasks when the user changes an input value)
SAVE_TMP_FILE_SECONDS_ = 30
# class constant with #secods between a GUI update (when needed)
GUI_UPDATE_SECONDS_ = 1
def __init__(self, app_window) :
# instance variable containing the # of threads to be used when crawling
self.num_threads = self.DEFAULT_NUM_THREADS_
# create the instance variables for data storage
self.resetDataVariables()
# instance variable with the path to the projects folder
self.path_projects = os.getcwd() + "/projects/"
# instance variables with the partial path to the temp sitemap file
self.path_tmp_folder = "/tmp/"
self.file_tmp_save_sitemap = "tmp_save_sitemap.txt"
# instance variable pointing to an instance of the GUI class
self.gui = GUI.GUI(self, app_window)
# this method creates/resets the instance variables used to store the
# data and their situation for the current project
def resetDataVariables(self) :
# instance variable with the currently opened project's data
self.root_url = ""
self.root_domain = ""
# instance variables used by the crawlers to store all URLs
# that have been found (they are either in the queue or processed)
# and the URLs that have been crawled already
self.known_urls = set()
self.crawled_urls = set()
# instance variable used by the crawlers to store URLs that were not
# possible to either access or get the html
self.failed_urls = set()
# instance variable used by the crawlers to store URLs pointing
# to external websites. NOTE: these links will not be crawled!
self.external_urls = set()
# instance variable used by the crawlers with the URLs still not crawled
self.url_queue = set()
# instance variable used to store the site map items and sorting info
self.site_map_items = []
self.site_map_items_sorting = []
# determined the domain from a URL
def getDomain(self, url) :
# parse the URL
parsed_url = urllib.parse.urlparse(self.root_url)
# build the domain
# if the url has a subdomain in the folder format, convert it to the subdomain format
if (parsed_url.path != "") :
domain = general.convertFoldersToSubdomain(parsed_url)
else :
# remove the port information, if it's present
port_index = parsed_url.netloc.find(":")
if (port_index != -1) :
domain = parsed_url.netloc[:port_index]
else :
domain = parsed_url.netloc
domain += parsed_url.path
# if the url passed ended with a / remove it
if (domain.endswith("/")) :
domain = domain[:-1]
return(domain)
# validate root URL
# return True if valid, False if not valid
def validateRootURL(self) :
# strip whitespaces from start and end of the root URL
self.root_url = self.root_url.strip()
# add the http:// to the start of the root URL, if not there already
if (not self.root_url.startswith("http://") and not self.root_url.startswith("https://")) :
self.root_url = "http://" + self.root_url
# parse the URL
parsed_url = urllib.parse.urlparse(self.root_url)
# check if there a main body to the URL and that there are no queries in the URL
if (parsed_url.netloc == "" or parsed_url.query != "") :
# there isn't a main body or there is a query, so fail validation
return(False)
# store the tentative final URL
self.root_url = parsed_url.geturl()
# local variables used to know if any changes to path were made below and upsate the final URL
update_url = False
cur_path = parsed_url.path
# if the URL ends with "index.[file type]" remove it
if (parsed_url.path.count(".") > 0) :
# remove everything after the last /, making sure it doesn't end with a /
new_path = parsed_url.path[:parsed_url.path.rindex("/")]
update_url = True
elif (parsed_url.path.endswith("/")) :
# remove the ending / from path
new_path = parsed_url.path[:-1]
update_url = True
# if needed, update the path with the changes made above
if (update_url) :
self.root_url = self.root_url.replace(cur_path, new_path)
return(True)
# this method scans the projects folder and returns a list of all the
# projects currently created
def projectList(self) :
# local variable where the project names will be added
projects = []
# loop each item and if it's a directory add it to the list
for item in os.listdir(self.path_projects) :
# check if this item is a folder
if (os.path.isdir(self.path_projects + item)) :
projects.append(item)
# return the project list, sorted by alphabetical order
projects.sort(key=str.lower)
return(projects)
# create the folder for a new project
def createProject(self) :
# grab the URL inserted by the user on the UI
self.root_url = self.gui.getRootURL()
# validate the root URL
if (not self.validateRootURL()) :
# the root URL is not valid, so give error
self.gui.showErrorMsgBox(title="An error occured", message="The root URL inserted is not valid.")
return
# get the domain name for the root url
self.root_domain = self.getDomain(self.root_url)
# build the folder and file paths for this project
self.buildFilePaths()
# check if the project path already exists
if (os.path.isdir(self.paths["project_folder"])) :
# the project already exists, so give error
self.gui.showErrorMsgBox(title="An error occured", message="A project already exists for the inserted root url!")
return
# project path doesn't exist, so create the necessary directory and files
os.mkdir(self.paths["project_folder"])
general.writeToFile(self.paths["webpages"], "w", "")
general.writeToFile(self.paths["queue"], "w", self.root_url + "\n")
general.writeToFile(self.paths["external_links"], "w", "")
general.writeToFile(self.paths["failed_links"], "w", "")
# open the newly created project
self.openProject(self.root_domain)
# open a project
def openProject(self, root_domain) :
# check if the project folder exists
if (not os.path.isdir(self.path_projects + root_domain)) :
self.gui.showErrorMsgBox(title="Operation Failed", message="The project folder does not exist.")
return
# reset the instance variables used to store the data of the project
self.resetDataVariables()
# set the new root domain
self.root_domain = root_domain
# build the folder and file paths for this project
self.buildFilePaths()
# set the root URL for this project
self.root_url = self.root_domain
self.validateRootURL()
# determine if this project has already been crawled
crawled = False
if (os.path.isfile(self.paths["queue"]) and os.path.getsize(self.paths["queue"]) == 0) :
# the only way the queue.txt file exists and is empty is if all pages have been crawled
crawled = True
# do the necessary GUI updates
self.gui.projectOpened(self.root_domain, crawled)
# operation completed, show complete message
self.gui.showInfoMsgBox(title="Operation Completed", message="The project has been successfully opened.")
# delete a project
def deleteProject(self, root_domain) :
path = self.path_projects + root_domain
# check if the project folder exists
if (not os.path.isdir(path)) :
self.gui.showErrorMsgBox(title="Operation Failed", message="The project folder does not exist.")
return
# delete the project
if (general.deleteDir(path)) :
# if the project selected to be deleted is the currenly opened project, start by closing it
if (self.root_domain == root_domain) :
# reset the instance variables used to store the data of the project
self.resetDataVariables()
# reset the GUI variables and frames
self.gui.resetAllVariables(True)
# update the application window title
self.gui.updateWindowTitle("")
# do the necessary GUI updates
self.gui.updateProjectList("delete_proj")
# operation completed, show complete message
self.gui.showInfoMsgBox(title="Operation Completed", message="The project has been successfully deleted.")
else :
# couldn't delete the selected project
self.gui.showErrorMsgBox(title="Operation Failed", message="The project could not be deleted.\nPlease try again.")
# this method builds the initial version of the site map, using the URLs stored in the .txt file
# those URLs will be cross-referenced with the robots.txt of the target domain
def buildSiteMapData(self) :
# if the current project's URL state variables are not set, do so
if (len(self.known_urls) == 0) :
self.setCrawlVariables(True)
# if the queue isn't empty, then this project hasn't been fully crawled
# so bail out
if (len(self.url_queue) != 0) :
self.gui.showErrorMsgBox(title="Operation Failed", message="The currently opened project hasn't been completely crawled.\n\nPlease do so before generating a site map.")
return
# local variable used by the crawlers to control access to the queue
queue_lock = threading.Lock()
# local list variable used as the queue, built from the known_urls set
queue = []
for item in self.known_urls :
queue.append(item)
# sort the queue by URL asc
queue.sort(key=str.lower, reverse=True)
# reset the instance variable with the site map data
self.site_map_items = []
# reset the instance variable with the site map data's sorting
# in the form [[col_alias, True/False], [col_alias, True/False]]
# the default is to sort by url asc
self.site_map_items_sorting = [["url", False]]
# local variable used to store all the crawler objects created
threads = []
# local variable with the current system date as YYYY-MM-DD
local_time_aux = time.localtime()
local_time_str = "{0}-{1}-{2}".format(str(local_time_aux.tm_year), str(local_time_aux.tm_mon).zfill(2), str(local_time_aux.tm_mday).zfill(2))
# disable all task buttons in the GUI
self.gui.setTaskButtons("disable_all")
# execute GUI changes for a building site map task
self.gui.manageTaskSiteMapInputElems("working")
# clear any site map data already drawn to the screen
self.gui.redrawSiteMapItems(None)
# set the MapGenerator's class variables, to keep all instances in sync
MapGenerator.MapGenerator.terminate_thread_ = False
MapGenerator.MapGenerator.queue_ = queue
MapGenerator.MapGenerator.queue_lock_ = queue_lock
MapGenerator.MapGenerator.root_url_ = self.root_url
MapGenerator.MapGenerator.local_time_ = local_time_str
MapGenerator.MapGenerator.site_map_items_ = self.site_map_items
# local variables used to control the periodic update of the GUI with
# the site map items
last_time_call = time.monotonic()
gui_update_cooldown = self.GUI_UPDATE_SECONDS_
# grab the # of threads the user wants to have created
self.getNumThreads()
# get from the GUI class the use/nouse of robots.txt file value
use_robots_file = self.gui.getRobotsUse()
# instantiate each MapGenerator thread, start it and store the object reference
for _ in range(self.num_threads) :
thread = MapGenerator.MapGenerator(use_robots_file)
thread.start()
threads.append(thread)
# loop until all items have been processed
# (checked after updating UI to make sure it updates with last items processed)
while True :
# update the gui update timers
gui_update_cooldown -= time.monotonic() - last_time_call
last_time_call = time.monotonic()
# check if it's time to update the GUI
if (gui_update_cooldown <= 0) :
# send the call for the GUI to update the site map items on the screen
self.gui.updateSiteMapItems(self.site_map_items)
# reset the gui update cooldown
gui_update_cooldown = self.GUI_UPDATE_SECONDS_
# check if all processing is done
if (len(self.known_urls) == len(self.site_map_items)) :
# all done
# send the call for the GUI to update the site map items on the screen
self.gui.updateSiteMapItems(self.site_map_items)
# exit loop
break
# wait 1 second before checking again
time.sleep(1)
# signal all instances of MapGenerator to terminate
MapGenerator.MapGenerator.terminate_thread_ = True
# wait for all the MapGenerator instances to terminate
for t in threads:
t.join()
# update the arrows in the table header of the GUI
self.gui.updateSortArrows()
# activate all task buttons in the GUI
self.gui.setTaskButtons("activate_all")
# execute GUI changes for a completed site map task
self.gui.manageTaskSiteMapInputElems("continued")
# show message to user to indicate that the process is complete
self.gui.showInfoMsgBox(message="The list of webpages for this project has been compiled.\n\nThe values for \"last modified\", \"change frequency\" and \"priority\" have been given default values, based on the best guess of the application.\n\nYou should confirm and, if necessary, change them before requesting the creation of the Site Map file.", title="Operation Completed")
# sorts the site map items based on the passed column sorting (in a list of lists)
# the columns should be passed with index=0 being the primary sorting col,
# index=1 being the secondary sorting col, and so on
# and for each col there should be an asc/desc bool value (asc=False, desc=True)
# sort_cols should be in the form [[col_alias, True/False], [col_alias, True/False]]
# the method returns True if successful, False otherwise
def sortSiteMapItems(self, new_sort) :
# try to do the requested sorting
try:
# create a copy of the site map items as they are right now
sorted_data = self.site_map_items.copy()
# create a reversed copy of new_sort, because we need to execute the sorting
# from the lowest to primary
new_sort_rev = new_sort.copy()
new_sort_rev.reverse()
# run each sort, starting from the lowest to the primary sorting
for sort_info in new_sort_rev :
# sort the data, using for each row the column and asc/desc indicated
sorted_data.sort(key=lambda data_row: data_row[sort_info[0]], reverse=sort_info[1])
except Exception as e:
return(False)
# if everything went ok, then store the new site map data and sorting
self.site_map_items = sorted_data.copy()
self.site_map_items_sorting = new_sort.copy()
return(True)
# this method will be called by the GUI whenever the user changes any of the site map's data
# it will check if the data should be saved to a temp file, based on a timer
def saveSiteMapData(self, timed) :
# build the path to the temp file
file_path = self.paths["project_folder"] + self.path_tmp_folder + self.file_tmp_save_sitemap
# if the temp file doesn't exist or we want to save now or the last save was longer than the save cooldown
if (not os.path.isfile(file_path) or not timed or time.time() - os.path.getmtime(file_path) >= self.SAVE_TMP_FILE_SECONDS_) :
# execute GUI changes for a loading site map task
self.gui.manageTaskSiteMapInputElems("working")
# save the current site map data to the temp file
general.saveTmpSiteMap(self.paths["project_folder"], self.file_tmp_save_sitemap, "w", self.site_map_items, self.site_map_items_sorting)
# update the button's state
self.gui.manageTaskSiteMapInputElems("continued")
# this method returns the last modified time stamp of the temp file with the site map data
def lastSiteMapSaveTime(self) :
file_path = self.paths["project_folder"] + self.path_tmp_folder + self.file_tmp_save_sitemap
# check if the the temp file exists
if (not os.path.isfile(file_path)) :
# if it doesn't exist, return empty string
return("")
# at this point, the file exists, so grab the modified time stamp
# local variable with the last modified time stamp
ts = os.path.getmtime(file_path)
ts_aux = time.localtime(ts)
return("{0}-{1}-{2} {3}:{4}:{5}".format(str(ts_aux.tm_year), str(ts_aux.tm_mon).zfill(2), str(ts_aux.tm_mday).zfill(2), str(ts_aux.tm_hour).zfill(2), str(ts_aux.tm_min).zfill(2), str(ts_aux.tm_sec).zfill(2)))
# this method checks if the temporary save file with site map data exists and has data
# returns True if the file eists and has data, False otherwise
def tempSaveSiteMapExists(self, file_path="") :
# if needed, build the path to the temp file
if (file_path == "") :
file_path = self.paths["project_folder"] + self.path_tmp_folder + self.file_tmp_save_sitemap
# if the temp file doesn't exist or it's empty, return False
if (not os.path.isfile(file_path) or os.path.getsize(file_path) == 0) :
return(False)
# return True if all checks pass
return(True)
# this method will populate the site_map_items variable with the data in the site map temp file
def loadSiteMapData(self) :
# build the path to the temp file
file_path = self.paths["project_folder"] + self.path_tmp_folder + self.file_tmp_save_sitemap
# if the temp file doesn't exist or it's empty, skip
if (not self.tempSaveSiteMapExists(file_path)) :
return
# execute GUI changes for a loading site map task
self.gui.manageTaskSiteMapInputElems("working")
# empty the current site map data and sorting info
self.site_map_items.clear()
self.site_map_items_sorting.clear()
# local bool variable to know if we've loaded the sorting info
first_line = True
with open(file_path, "r") as f :
# loop each line
for line in f :
# remove any new line characters returned
line = line.replace("\n", "")
if (line == "") :
continue
# split the line into the several information
line_parts = line.split("||")
# if we're on the 1st line, then we have the sorting info
if (first_line) :
# loop through each sorting item
for sorting_item in line_parts :
if (sorting_item == "") :
continue
# split this item into the 2 parts
sorting_item_parts = sorting_item.split(";")
# add it to the instance variable
self.site_map_items_sorting.append([sorting_item_parts[0], bool(int(sorting_item_parts[1]))])
first_line = False
else :
# we're not on the 1st line, so we have sitemap data
# build this item's data and add it to the site map data variable
self.site_map_items.append({"url" : line_parts[0], "lastmod" : line_parts[1], "changefreq" : line_parts[2], "priority" : line_parts[3]})
# update the button's state
self.gui.manageTaskSiteMapInputElems("continued")
# this method will be called when the save, load or create XML buttons are pressed
# it will call the necessary code for the requested operation
def processSiteMapFileOperations(self, command) :
if (command == "save") :
# save the current input values to the temp file
self.saveSiteMapData(False)
# give feedback to the user
self.gui.showInfoMsgBox(title="Data Saved", message="The current state of the data has been saved to a temporary file.")
elif (command == "load") :
# execute GUI changes for a loading site map task
self.gui.manageTaskSiteMapInputElems("working")
# load from the temp file the input values
self.loadSiteMapData()
# update the GUI with the loaded data
self.gui.redrawSiteMapItems(self.site_map_items)
# update the arrows in the table header of the GUI
self.gui.updateSortArrows()
# update the button's state
self.gui.manageTaskSiteMapInputElems("continued")
# give feedback to the user
self.gui.showInfoMsgBox(title="Data Loaded", message="The data has been updated with the information of the last save.")
elif (command == "xml") :
# execute GUI changes for a working site map task
self.gui.manageTaskSiteMapInputElems("working")
# save the current input values to the temp file
# this way the load command will laod the same data that was placed in the XML file
self.saveSiteMapData(False)
# we want to create the final site map XML file
general.createSiteMapXML(self.paths["project_folder"], self.site_map_items)
# update the button's state
self.gui.manageTaskSiteMapInputElems("continued")
# give feedback to the user
self.gui.showInfoMsgBox(title="Site Map File Created", message="The sitemap.xml file for this project has been created.\n\nThe file has been stored in the project's folder \"" + self.paths["project_folder"] + "\"")
# returns a dictionary with the number of URL known, crawled, external and failed for the current project
def getURLCounts(self) :
counts = {}
# count
c_known = len(self.known_urls)
c_crawled = len(self.crawled_urls)
c_external = len(self.external_urls)
c_failed = len(self.failed_urls)
# if the crawl instance variables aren't set yet, do so
if (c_known == 0 and c_crawled == 0 and c_external == 0 and c_failed == 0) :
# set the variables
self.setCrawlVariables(True)
# count again
c_known = len(self.known_urls)
c_crawled = len(self.crawled_urls)
c_external = len(self.external_urls)
c_failed = len(self.failed_urls)
counts["known"] = c_known
counts["crawled"] = c_crawled
counts["external"] = c_external
counts["failed"] = c_failed
return(counts)
# sets instance variables used in the crawl task
# if read_only is True, the variables will be set based on the files.
# if False, the treatment will depend on whether the project has been fully crawled already or not
# if it has, then it will reset to the initial values, else it will set from where we left off
def setCrawlVariables(self, read_only) :
# if the folder or any of the files don't exist, show error and return
if (not os.path.isdir(self.paths["project_folder"]) or not os.path.isfile(self.paths["webpages"]) or not os.path.isfile(self.paths["queue"]) or not os.path.isfile(self.paths["external_links"]) or not os.path.isfile(self.paths["failed_links"])) :
self.gui.showErrorMsgBox(title="An error occured", message="Some of the required files for this project could not be found.")
return(False)
# clear all the sets of past data
self.known_urls.clear()
self.crawled_urls.clear()
self.url_queue.clear()
self.external_urls.clear()
self.failed_urls.clear()
# if not in read_only mode and the queue.txt is empty, then build the variables
# with the initial values (as if the project was created) to crawl the website again
if (not read_only and os.path.getsize(self.paths["queue"]) == 0) :
self.known_urls.add(self.root_url)
self.url_queue.add(self.root_url)
else :
# build the variables based on the file's content
with open(self.paths["webpages"], "r") as f :
# loop each line
for line in f :
# remove any new line characters returned
line = line.replace("\n", "")
if (line == "") :
continue
# add this URL to the sets
self.crawled_urls.add(line)
self.known_urls.add(line)
with open(self.paths["queue"], "r") as f :
# loop each line
for line in f :
# remove any new line characters returned
line = line.replace("\n", "")
if (line == "") :
continue
# add this URL to the sets
self.url_queue.add(line)
self.known_urls.add(line)
with open(self.paths["external_links"], "r") as f :
# loop each line
for line in f :
# remove any new line characters returned
line = line.replace("\n", "")
if (line == "") :
continue
# add this URL to the set
self.external_urls.add(line)
with open(self.paths["failed_links"], "r") as f :
# loop each line
for line in f :
# remove any new line characters returned
line = line.replace("\n", "")
if (line == "") :
continue
# add this URL to the set
self.failed_urls.add(line)
return(True)
# grabs the number of threads input from the GUI and sets
# the relevant instance variable
def getNumThreads(self) :
# grab the # of threads the user wants to have created
try :
self.num_threads = self.gui.getNumThreads()
#validate the value
if (self.num_threads < 1) :
# use default value
raise(ValueError)
except ValueError :
# if int() throws an exception, use default value
self.num_threads = self.DEFAULT_NUM_THREADS_
# crawls the inserted website and gathers all the links (just internal or all)
# the links will be stored in a file inside a folder with the project's name
def crawlSite(self) :
# make sure a root URL is set
if (self.root_url == "") :
self.gui.showErrorMsgBox(title="An error occured", message="The root URL for this project could not be found.\n\nPlease make sure a project is opened.")
return
# grab the # of threads the user wants to have created
self.getNumThreads()
# local variables used by the crawlers to control access to the queue
queue_lock = threading.Lock()
# local variable used to store all the crawler objects created
threads = []
# if the queue is empty, then reset the data to start crawling again
# NOTE: if we reach ths code, we want to crawl the site
# either continue an incomplete crawl, or start from scratch
if (len(self.url_queue) == 0) :
# this project has been fully crawled before, so reset
if (not self.setCrawlVariables(False)) :
# the variables couldn't be set, so bail out
return
# update the GUI with the initial values
self.gui.updateCrawlTaskElems(len(self.crawled_urls), len(self.known_urls), len(self.external_urls), len(self.failed_urls))
# disable all task buttons in the GUI
self.gui.setTaskButtons("disable_all")
# execute GUI changes for a running crawl task
self.gui.manageTaskCrawlElems("continued")
# set the crawler's class variables, to keep all crawlers in sync
Crawler.Crawler.terminate_thread_ = False
Crawler.Crawler.known_urls_ = self.known_urls
Crawler.Crawler.crawled_urls_ = self.crawled_urls
Crawler.Crawler.external_urls_ = self.external_urls
Crawler.Crawler.failed_urls_ = self.failed_urls
Crawler.Crawler.queue_ = self.url_queue
Crawler.Crawler.queue_lock_ = queue_lock
Crawler.Crawler.root_url_ = self.root_url
# local variables used to control the periodic save of the queue to a file
last_time_call = time.monotonic()
state_save_cooldown = self.SAVE_STATE_SECONDS_
# instance boolean variable used to exit the loop below if the PAUSE button is pressed
self.paused = False
# instantiate each crawler, start it and store the object reference
for _ in range(self.num_threads) :
thread = Crawler.Crawler()
thread.start()
threads.append(thread)
# loop until all items have been processed
# (checked after updating UI to make sure it updates with last items processed)
while True :
# update the UI with the URLs crawled and found
self.gui.updateCrawlTaskElems(len(self.crawled_urls), len(self.known_urls), len(self.external_urls), len(self.failed_urls))
# update the queue save timers
state_save_cooldown -= time.monotonic() - last_time_call
last_time_call = time.monotonic()
# check if it's time to store the queue in a file
if (state_save_cooldown <= 0) :
# save the queue to the file
general.saveSetToFile(self.paths["queue"], "w", self.url_queue)
# save the processed URLs to the file
general.saveSetToFile(self.paths["webpages"], "w", self.crawled_urls)
# save the external URLs to the file
general.saveSetToFile(self.paths["external_links"], "w", self.external_urls)
# save the failed URLs to the file
general.saveSetToFile(self.paths["failed_links"], "w", self.failed_urls)
# reset the queue save cooldown
state_save_cooldown = self.SAVE_STATE_SECONDS_
# if all processing is done OR the pause button was pressed
# do final operations and exit loop
if ((len(self.known_urls) == len(self.crawled_urls) + len(self.failed_urls)) or self.paused) :
# all done
# save the queue to the file (to empty it)
general.saveSetToFile(self.paths["queue"], "w", self.url_queue)
# save the processed URLs to the file
general.saveSetToFile(self.paths["webpages"], "w", self.crawled_urls)
# save the external URLs to the file
general.saveSetToFile(self.paths["external_links"], "w", self.external_urls)
# save the failed URLs to the file
general.saveSetToFile(self.paths["failed_links"], "w", self.failed_urls)
# exit loop
break
# wait 1 second before checking again
time.sleep(1)
# signal all instances of the crawler to terminate
Crawler.Crawler.terminate_thread_ = True
# wait for all the crawlers to terminate
for t in threads:
t.join()
# do final operations for this method
if (len(self.known_urls) == len(self.crawled_urls) + len(self.failed_urls)) :
# the loop ended because the website was completely crawled
# operation completed, show complete message
self.gui.showInfoMsgBox(title="Operation Completed", message="The website has been successfully crawled.\n\n" + str(len(self.known_urls)) + " interal webpages found!\n\n" + str(len(self.failed_urls)) + " URLs could not be successfully crawled!\n\n" + str(len(self.external_urls)) + " external webpages found!")
# update the task buttons' state now that the current project has been crawled
self.gui.setTaskButtons("crawl_competed")
# execute GUI changes for a compleated(i.e., paused) crawl task
self.gui.manageTaskCrawlElems("completed")
else :
# the loop ended because the pause button was pressed
# update the task buttons' state for a not fully crawled project
self.gui.setTaskButtons("open_not_crawled")
# execute GUI changes for a paused crawl task
self.gui.manageTaskCrawlElems("paused")
# operation paused, show message
self.gui.showInfoMsgBox(title="Operation Paused", message="The crawl task has been paused.\nYou can continue crawling by clicking the \"Continue Crawl\" button.")
# builds the paths for a project's files
def buildFilePaths(self) :
self.paths = {}
self.paths["project_folder"] = self.path_projects + self.root_domain
self.paths["webpages"] = self.paths["project_folder"] + "/webpages.txt"
self.paths["queue"] = self.paths["project_folder"] + "/queue.txt"
self.paths["external_links"] = self.paths["project_folder"] + "/external_links.txt"
self.paths["failed_links"] = self.paths["project_folder"] + "/failed_links.txt"
# define the method to open a folder using the user's OS default browser
if (platform.system() == "Windows") :
def openFolder(self, path) :
os.startfile(path)
elif (platform.system() == "Darwin") :
def openFolder(self, path) :
subprocess.Popen(["open", path])
else :
def openFolder(self, path) :
subprocess.Popen(["xdg-open", path])
# code that starts the entire application
try:
# create the application window
app_window = tk.Tk()
app_window.minsize(960, 600)
app_window.state("zoomed")
app_window.rowconfigure(0, weight=1)
app_window.columnconfigure(0, weight=1)
# instantiate the application's main class
app = Application(app_window)
# start the GUI event listener loop
app_window.mainloop()
except Exception as e:
traceback.print_exc()
print("\n")