From f2673cd48408d02fb0faf5740333716a171a6479 Mon Sep 17 00:00:00 2001 From: Kenneth Chew Date: Sat, 16 Sep 2017 16:02:44 +0800 Subject: [PATCH 1/5] Changed from search num of results to num of pages --- .idea/vcs.xml | 6 + .idea/workspace.xml | 550 ++--- ENV/spider.py | 104 +- .../googlehtml/googleresultspage1.html | 1674 ++++++--------- dac1/googlehtml/googleresultspage2.html | 1884 +++++++++++++++++ dac1/googlehtml/googleresultspage3.html | 1870 ++++++++++++++++ dac1/googlelinks.txt | 28 + text.html | 771 ------- 8 files changed, 4629 insertions(+), 2258 deletions(-) create mode 100644 .idea/vcs.xml rename googleresults.html => dac1/googlehtml/googleresultspage1.html (62%) create mode 100644 dac1/googlehtml/googleresultspage2.html create mode 100644 dac1/googlehtml/googleresultspage3.html create mode 100644 dac1/googlelinks.txt delete mode 100644 text.html diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml index fbc4fe7..39765d1 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -1,7 +1,12 @@ - + + + + + + @@ -247,11 +44,15 @@ Skip to cookie cnt + class="r" + + @@ -271,6 +73,7 @@ + @@ -293,6 +96,11 @@ + + + + + @@ -419,11 +227,11 @@ - + - + - + @@ -543,33 +351,13 @@ - - - - - - - - - - - - - - - - - - - - - + - + @@ -579,7 +367,7 @@ - + @@ -587,192 +375,124 @@ - + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + diff --git a/ENV/spider.py b/ENV/spider.py index 918808c..8c0a871 100644 --- a/ENV/spider.py +++ b/ENV/spider.py @@ -3,35 +3,105 @@ import requests import json from bs4 import BeautifulSoup +import os +import math def print_greeting(): print("Crawler running....") - return input("Please enter your search terms: ") + # Replace any spaces in the directory name with underscores + dir_name = input("Please enter your project name: ").replace(" ", "_") + num = input("Please enter the number of pages to be searched: ") + queries = input("Please enter your search terms: ") + return dir_name,num,queries -def query_google(params): - search_term = {"q" : params} +def query_google(dir_name,num,params): + + + + # Create project directory + if not os.path.exists(dir_name): + print("Creating directory : " + dir_name + "....") + os.makedirs(dir_name) + print("Directory created") + else: + print("Directory already exists, proceeding with web scraping...") + + # Create dir to store raw html + search_dir = dir_name + "/googlehtml" + if not os.path.exists(search_dir): + os.makedirs(search_dir) + else: + print("Dir for Google search html already exists,proceeding with web scraping...") + + # Query Google user_agent = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"} - google = "https://www.google.com.sg/search" - # Spoofs a browser visit that times out after 5 sec(no reply within 5 sec) - r = requests.get(google, headers=user_agent, params=search_term, timeout=5.0) - print(r.request.url) - google_results = r.text.encode("utf-8") - soup = BeautifulSoup(google_results, "html.parser") - # Write to file - with open("googleresults.html", "wb") as file: - file.write(soup.prettify().encode("utf-8")) - return soup + + # Get number of pages to search -> If we want to search by num of results instead of pages + # results_per_page = 10 + # num = int(num) + # if(num % results_per_page is 0): + # pages = int(num / results_per_page) + # else: + # pages = int(math.ceil(num/10) % results_per_page) + # print("Number of google pages to search: " + str(pages)) + + # NOTE: There are 10 results per page, but not all 10 are URLs. + # E.g some may be link to gallery of images + results_per_page = 10 + # Return a set from this function containing the file names of all the pages scraped + file_names = set() + + for page in range(int(num)): + + cur_page = results_per_page * (page) + search_term = {"q": params, "start": cur_page} + # Spoofs a browser visit that times out after 5 sec(no reply within 5 sec) + url = "https://www.google.com.sg/search" + r = requests.get(url, headers=user_agent, params=search_term, timeout=5.0) + print("Searching page " + str(page + 1) + " ....\n") + # Add page to file_name set + google_results = r.text.encode("utf-8") + soup = BeautifulSoup(google_results, "html.parser") + # To be returned from this function + file_names.add(soup) + # Used to store the html files to be saved + file_name = search_dir + "/googleresultspage" + str(page + 1) + ".html" + + # Write to file + with open(file_name, "wb") as file: + file.write(soup.prettify().encode("utf-8")) + + + print("Saving results.....") + print("Results saved.\n\n") + return file_names + def get_search_links(search_results): - for result in search_results.body.find_all("a"): - print(result.get("href")) + links = set() + for result in search_results: + # For each

tag, find the tag and get the href value + link_set = result.body.find_all("h3", class_="r") + + # Add the href value to a set that will be saved to a file + for link in link_set: + print("Extracting link...\n") + links.add(link.find("a").get("href")) + + with open(dir_name + "/googlelinks.txt","a") as f: + print(str(len(links)) + " links extracted.") + print("Saving links...") + for each in links: + f.write(each + "\n") + print("Links saved.") + if __name__ == "__main__": - user_input = print_greeting() - query_results = query_google(user_input) + dir_name,num,params = print_greeting() + query_results = query_google(dir_name,num,params) get_search_links(query_results) \ No newline at end of file diff --git a/googleresults.html b/dac1/googlehtml/googleresultspage1.html similarity index 62% rename from googleresults.html rename to dac1/googlehtml/googleresultspage1.html index da4107b..e6fdd02 100644 --- a/googleresults.html +++ b/dac1/googlehtml/googleresultspage1.html @@ -5,10 +5,10 @@ - spider - Google Search + xianxia - Google Search + + + +
+
-
+

- - World's Largest Spider - National Geographic Video + + World of Xianxia - Novel Updates

- video.nationalgeographic.com/video/tarantula_goliath + www.novelupdates.com/series/world-of-xianxia/ - - The goliath birdeater tarantula of South America is arguably the biggest - - spider - - in the world. Watch as one hapless mouse wanders into a - - spider's - - deadly trap, ... - -
-
-
-
- -
-
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
- - People also ask - -
-
-
- -
-
- - Loading... - -
-
-
- - - -
-
+ + The moment we stepped on the road of cultivation, there will be thousands of immortals upfront, but you are just like an ant wandering in there. The moment ... + +
-
-
+
-
-
-
- -
+
+ +
+
+

+ + Zenith Novels - Fantasy Chinese/Japanese Wuxia & Xianxia Light ... + +

+
+
+
+ + zenithnovels.com/ + +
+ + +
+
+ + Fantasy Chinese/Japanese Wuxia & + + Xianxia + + Light Novels Translation Updates. + +
- - Feedback -
+
-
-
-
-
- -
-
-

- - Spiders - Reddit - -

-
-
-
- - https://www.reddit.com/r/spiders/ - -
- -
@@ -1103,81 +1226,82 @@

-
+
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
-
-
-
- -
-
- - - Image result for spider - - - -
-
- {"cb":3,"clt":"n","ct":3,"id":"395EhSgioGESGM:","ml":{"278":{"bh":99,"bw":66},"366":{"bh":94,"bw":60},"454":{"bh":95,"bw":71}},"oh":321,"ou":"https://upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Theraphosa_blondi_MHNT.jpg/220px-Theraphosa_blondi_MHNT.jpg","ow":220,"pt":"Spider - Wikipedia","rh":"en.wikipedia.org","rid":"zqfcXCZTEAMlqM","rmt":0,"rt":0,"ru":"https://en.wikipedia.org/wiki/Spider","s":"","st":"Wikipedia","th":120,"tu":"https://encrypted-tbn0.gstatic.com/images?q\u003dtbn:ANd9GcS5H96Unh1PMCSRQsfnadnlC7XCPfjPnVTwodvmiSeDvv9bj2eVvGdkRzs","tw":82} -
-
- - -
-
- - - Image result for spider - - - -
-
- {"cl":6,"clt":"n","cr":3,"id":"CZ7L15cOy_SfsM:","ml":{"278":{"bh":99,"bw":117},"366":{"bh":94,"bw":107},"454":{"bh":95,"bw":131}},"oh":410,"ou":"http://i3.mirror.co.uk/incoming/article11010615.ece/ALTERNATES/s615b/Jumping-spider.jpg","ow":615,"pt":"Is spider season coming early? Huge creature discovered in east ...","rh":"mirror.co.uk","rid":"nky1zW4MCndO-M","rmt":0,"rt":0,"ru":"http://www.mirror.co.uk/news/uk-news/spider-season-coming-early-huge-11010490","s":"","st":"Mirror","th":104,"tu":"https://encrypted-tbn0.gstatic.com/images?q\u003dtbn:ANd9GcQmqfdgpX_b8kRtWOUJ0KQ3q56lvcsiCn9GJe-bSoE2Ny7ln2LOsSXTOBc","tw":156} -
-
- - -
- - -
- -
- {"cb":3,"cl":3,"clt":"n","cr":3,"ct":3,"id":"OPsL_Ncmc6uWiM:","ml":{"278":{"bh":86,"bw":131},"366":{"bh":94,"bw":111},"454":{"bh":95,"bw":140}},"oh":605,"ou":"https://cdn.orkin.com/images/spiders/brown-recluse-spider-illustration_1017x605.jpg","ow":1017,"pt":"Brown Recluse Spiders: Facts, Identification, Behavior \u0026 Control","rh":"orkin.com","rid":"sMDxmIzLUkme2M","rmt":0,"rt":0,"ru":"https://www.orkin.com/other/spiders/brown-recluse-spiders/","s":"","st":"Orkin","th":104,"tu":"https://encrypted-tbn0.gstatic.com/images?q\u003dtbn:ANd9GcTiGnStYGHUQv3oRrFK08nFOg0A5jgbBv7AdvuLXEtcrNOCHaqqECsyZJY","tw":175} -
-
- - -
-
- - - Image result for spider - - - -
-
- {"cb":15,"cl":6,"clt":"n","cr":3,"id":"-vWMveNH2RlNFM:","ml":{"278":{"bh":86,"bw":146},"366":{"bh":91,"bw":144},"454":{"bh":90,"bw":126}},"oh":426,"ou":"http://www.burkemuseum.org/sites/default/files/white-tailed-spider%20%281%29.jpg","ow":641,"pt":"Spider Myths | Burke Museum","rh":"burkemuseum.org","rid":"PcOEYfdnIjJt9M","rmt":0,"rt":0,"ru":"http://www.burkemuseum.org/blog/curated/spider-myths","s":"","st":"Burke Museum","th":97,"tu":"https://encrypted-tbn0.gstatic.com/images?q\u003dtbn:ANd9GcS0Q8sxmlA6VU4-zovPx00i2bKy0cHayEhlajNLnjzKYS3ZbyUOt7vdhuA","tw":146} -
-
- - - - - - - - - - -
-
-
-
-
-
-
-
- - -
-
-
-
-
-
- -
-
- -
- - - - - - -
- -
-
-
-
- - - -
-
-
-
-
-
-
-
-
- - Spider - -
-
- - Animal - -
-
-
- -
-
-
-
-
-
-
- -
-
-
- - Spiders are air-breathing arthropods that have eight legs and chelicerae with fangs that inject venom. They are the largest order of arachnids and rank seventh in total species diversity among all other orders of organisms. - - - - - - Wikipedia - - -
-
-
- -
-
- - -
-
- -
-
- - - Scientific name - - : - - - Araneae - -
-
- -
-
- -
-
- - - Class - - : - - - Arachnida - -
-
- -
-
- -
-
- - - Order - - : - - - Araneae; Clerck, 1757 - -
-
- -
-
- - - -
-
- -
-
- - - Lifespan - - : - - - - Brown recluse spider - - : 1 – 2 years, - - Goliath birdeater - - : 15 – 25 years, - - Southern black widow - - : 1 – 3 years - -
-
- -
-
- -
-
- - - Diet - - : - - - - Carnivore - - -
-
- -
-
-
- -
-
-
-
-
-
-
-
-
- -
-
- - Feedback - -
-
-
-
@@ -1850,12 +1440,12 @@

-
-
+
+ + + + + + +
+
+
+ +
+ + + +
+ +
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+ + Sign in + +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+ +
+ + +
+
+
+
+ +
+
+
+
+
+ Page 2 of about 571,000 results + + (0.27 seconds) + +
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+ +
+
+
+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+
+ +
+

+ Searches related to xianxia +

+
+ + +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+
+
+
+
+
+ + + +
+
+
+
+ + +
+
+ +
+
+
+
+
+
+ +
+
+
+
+
+
+ +
+ +
+ + + + + + \ No newline at end of file diff --git a/dac1/googlehtml/googleresultspage3.html b/dac1/googlehtml/googleresultspage3.html new file mode 100644 index 0000000..6c229f4 --- /dev/null +++ b/dac1/googlehtml/googleresultspage3.html @@ -0,0 +1,1870 @@ + + + + + + + + xianxia - Google Search + + + + + + + + + +
+
+
+ +
+ + + +
+ +
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+ + Sign in + +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+ +
+ + +
+
+
+
+ +
+
+
+
+
+ Page 3 of about 571,000 results + + (0.28 seconds) + +
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+ +
+
+
+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+
+ +
+

+ Searches related to xianxia +

+
+ + +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+
+
+
+
+
+ + + +
+
+
+
+ + +
+
+ +
+
+
+
+
+
+ +
+
+
+
+
+
+ +
+ +
+ + + + + + \ No newline at end of file diff --git a/dac1/googlelinks.txt b/dac1/googlelinks.txt new file mode 100644 index 0000000..d91bb38 --- /dev/null +++ b/dac1/googlelinks.txt @@ -0,0 +1,28 @@ +http://cocxianxia.wikia.com/wiki/CoC-Xianxia_Wiki +http://robynpaterson.com/?p=3823 +http://tvtropes.org/pmwiki/pmwiki.php/Main/Xianxia +http://www.books.shushengbar.com/?cat=4 +http://www.jesperkyd.com/music/world-of-xian-xia-2/ +http://www.novelupdates.com/genre/xianxia/ +http://www.novelupdates.com/series-ranking/ +http://www.novelupdates.com/series/world-of-xianxia/ +http://www.wuxiaworld.com/general-glossary-of-terms/ +http://www.wuxiaworld.com/wuxia-vs-xianxia/ +http://www.xianxiaworld.net/ +http://zenithnovels.com/ +https://chinese.yabla.com/chinese-english-pinyin-dictionary.php?define=xianxia +https://en.wikipedia.org/wiki/Xianxia_novel +https://forum.royalroadl.com/showthread.php?tid=97232 +https://orcid.org/0000-0003-4426-9784 +https://ro.my.com/forum/thread/1830-xianxia-wuxia-stories/ +https://twitter.com/search?q=%23XianXia&lang=en +https://wuxiadream.net/xianxia/ +https://www.deviantart.com/tag/xianxia +https://www.pinterest.com/mrbaibaoding/xianxia-wuxia-martial-arts-and-fantasy/ +https://www.readlightnovel.org/category/xianxia +https://www.reddit.com/r/noveltranslations/comments/44afq0/rec_any_good_wuxiaxianxia_novels_to_recommend/ +https://www.reddit.com/r/noveltranslations/comments/5bexwt/xianxia_vs_xuanhuan/ +https://www.wattpad.com/tags/xianxia/hot +https://www.webnovel.com/popular/xianxia +https://www.youtube.com/watch?v=b2ABmPalvv4 +https://xianxiafr.com/ diff --git a/text.html b/text.html deleted file mode 100644 index f5d3cd1..0000000 --- a/text.html +++ /dev/null @@ -1,771 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - The world's leading software development platform · GitHub - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- Skip to content -
- - - - - - - - - - -
- -
- -
-
- - - -
- - -
-
-
-
-

Built for developers

-

- GitHub is a development platform inspired by the way you work. From open source to business, you can host and review code, manage projects, and build software alongside millions of other developers. -

-
-
-
-
-
- -
-
-
- - - -
-
-
- GitHub for teams -
-

- A better way to work together -

-

- GitHub brings teams together to work through problems, move ideas forward, and learn from each other along the way. -

- -
- -
-
-
-
- -
-
-

Write better code

-

- Collaboration makes perfect. The conversations and code reviews that happen in Pull Requests help your team share the weight of your work and improve the software you build. -

-

- Learn about code review on GitHub -

-
-
- -
-
- -
-
-

Manage your chaos

-

- Take a deep breath. On GitHub, project management happens in Issues and Projects, right alongside your code. All you have to do is mention a teammate to get them involved. -

-

- Learn about project management on GitHub -

-
-
-
-
- -
- -
-
- security-admin - -
- Security and administration -
-

- Boxes? Check. -

-

- We worried about your administrative and security needs so you don’t have to. From flexible hosting to authentication options, GitHub can help you meet your team’s requirements. -

- -

- - Learn about GitHub for Business - -

- -
-
- -
-
-

Code security

-

- Prevent problems before they happen. Protected branches, signed commits, and required status checks protect your work and help you maintain a high standard for your code. -

- -

Access controlled

-

- Encourage teams to work together while limiting access to those who need it with granular permissions and authentication through SAML/SSO and LDAP. -

- -

Hosted where you need it

-

- Securely and reliably host your work on GitHub.com. Or, deploy GitHub Enterprise on your own servers or in a private cloud using Amazon Web Services, Azure or Google Cloud Platform. -

-
-
-
-
- -
-
-
- integrations - -
- Integrations -
-

- Build on GitHub -

-

- Customize your process with GitHub apps and an intuitive API. Integrate the tools you already use or discover new favorites to create a happier, more efficient way of working. -

-

- Learn about integrations -

-
- -
-
-
-
-
-
-
-
-
- -
-

- Sometimes, there’s more than one tool for the job. Why not try something new? -

-

- Browse GitHub Marketplace -

-
-
-
- - - -
-
-

- Get started for free — join the millions of developers already using GitHub to share their code, work together, and build amazing things. -

-
-
-
-
- - - - -
- - - - - - -
- - - You can't perform that action at this time. -
- - - - - - - - - - -
- - You signed in with another tab or window. Reload to refresh your session. - You signed out in another tab or window. Reload to refresh your session. -
- - - - - - From d32875bc5d9a58478fe083c6025c7eda12edf5f2 Mon Sep 17 00:00:00 2001 From: Kenneth Chew Date: Sat, 16 Sep 2017 16:05:35 +0800 Subject: [PATCH 2/5] changed to search by num of pages --- .idea/workspace.xml | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 39765d1..b7f3c96 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,10 +2,7 @@ - - - + @@ -228,7 +229,7 @@ - + @@ -488,8 +489,8 @@ - - + + From 1474d558913f02133edc839e07fe6100560d828f Mon Sep 17 00:00:00 2001 From: Kenneth Chew Date: Sat, 16 Sep 2017 16:36:09 +0800 Subject: [PATCH 4/5] Test commit2 --- ENV/spider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ENV/spider.py b/ENV/spider.py index 8c0a871..71d8450 100644 --- a/ENV/spider.py +++ b/ENV/spider.py @@ -5,7 +5,7 @@ from bs4 import BeautifulSoup import os import math - +# TEST def print_greeting(): print("Crawler running....") # Replace any spaces in the directory name with underscores From 65845e7291af67cd11dba3e215b0370f1cc0d979 Mon Sep 17 00:00:00 2001 From: Kenneth Chew Date: Sat, 16 Sep 2017 20:14:21 +0800 Subject: [PATCH 5/5] Basic Functionality Up and Running --- .DS_Store | Bin 6148 -> 8196 bytes .idea/workspace.xml | 51 +- ENV/spider.py | 82 +- dac1/googlehtml/googleresultspage1.html | 2209 ----------------------- dac1/googlehtml/googleresultspage2.html | 1884 ------------------- dac1/googlehtml/googleresultspage3.html | 1870 ------------------- dac1/googlelinks.txt | 28 - 7 files changed, 107 insertions(+), 6017 deletions(-) delete mode 100644 dac1/googlehtml/googleresultspage1.html delete mode 100644 dac1/googlehtml/googleresultspage2.html delete mode 100644 dac1/googlehtml/googleresultspage3.html delete mode 100644 dac1/googlelinks.txt diff --git a/.DS_Store b/.DS_Store index bb5933c3349ba21216ed43d5d4c6318a760453a1..58af5eb2732a7a6b449a27506b320eaf3e33ee8c 100644 GIT binary patch literal 8196 zcmeHMU2GLa6rS(4z^*KGTB{W8hFc3#no?Ry+7iXw{sH~7)t3H1q1=0SOE+wH%iX;# zP|+erW5Sbv@#lePG)8$*gO6bJ3H8a?gn-cp<;54&2YtXqJ+redw59rlAz>%k`DW&v zIcLt~-PX|@O6@XA& z0fItpE*gaK5ECIzOC$*u$f<;KD#9xUO=jOW4M!zZq_lpz)pA4AEmj0 zqhVe&HMO1Vz)I zcJ1t#;|=rtbka^|?WAuu*tSrnu{$w@fDKIw7 zdDb(ZG|84VVK$RRGF2eFb&U%%UZ0uq2ul-HGC1|@>NV@umu=o!_h_PJCXeSA+*_z= zd+9wmzBOW*nLU%1;g1d`y|n2VmUF0o!t$(?ZT33JF*6?(sydTbwDy4qi*d3!jxqfpZa((c$8h3JK24UM{f zK)CHldVbfq>Cloc3f~mkPFm8SkTn>FSddbh=#t4RO_YTV35*3u?0MkVUdjGfqp-FO1~ zaR9>@#TZD{yI5624$!;MT#4}XY@y>)A*7w_8L+NQPK@l3zGTGIQz z``5%blyBL#{jtV58Z3DCH#0Dxo#a)(w^=>~uX-MdXXHN6*3Isbg&SfI(bYVvh+V^1 zNbQG;*DJ&{uE_4j;!O(e1s)5!T19cCN{r_@Qd+ImD8y~POiF9DdX?DEbEWhVZHGc6 z=gXy3*BVrnGuhc#+@dNR|4rS0CGI!u3j3M;&aM+-mthqOunuLYLM;*jS=-Q#PV}K4 zgV>89m>412S~v_JSxn+6QTG&{!}E9nFA{ZM4ygM&-o`t47w-{;Kf=fOWRAk0;|qL^ z8GMf)7ZA2%K4J0G4Z=?JWE|Ud4$;Ugz2_@Oz1xw29BH&j-2Ypa{{8=UxQ=W_8Hh4) zhhzY`%{|SHw3wwnzT&POr{^F&q6ocdiNO@Ago+psuH%Gre;86b9?B9SPD>;SmH+*R Pfav^>&i{aUcX#y<&%_{0 delta 125 zcmZp1XfcprU|?W$DortDU=RQ@Ie-{MGjdEU6q~50$jCG?VE1GL8J5ZX0?#&25G!Zg z*f5)AF*^r`ATv-I5D0Js30IJgjfLNtC-ci#Du9F;85o$LGz*9ZazJK-tYg?5&ohS^ E0Att_6951J diff --git a/.idea/workspace.xml b/.idea/workspace.xml index bb978dc..25fce27 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,7 +2,12 @@ + + + + + - - + - + @@ -358,7 +357,7 @@ - + @@ -378,7 +377,7 @@ - + @@ -479,20 +478,36 @@ + + + + + + + + + + + + + + + + - - + + - - + + - + diff --git a/ENV/spider.py b/ENV/spider.py index 71d8450..305e094 100644 --- a/ENV/spider.py +++ b/ENV/spider.py @@ -1,11 +1,11 @@ # -*- coding: utf-8 -*- -import re + import requests -import json from bs4 import BeautifulSoup import os -import math -# TEST +import sys + + def print_greeting(): print("Crawler running....") # Replace any spaces in the directory name with underscores @@ -72,9 +72,7 @@ def query_google(dir_name,num,params): with open(file_name, "wb") as file: file.write(soup.prettify().encode("utf-8")) - - print("Saving results.....") - print("Results saved.\n\n") + print("Saving results.....\n Results saved.\n\n") return file_names @@ -96,12 +94,80 @@ def get_search_links(search_results): for each in links: f.write(each + "\n") print("Links saved.") + return links + +def crawl_links(set_of_links): + + # Create results dir + results_dir = dir_name + "/results" + if not os.path.exists(results_dir): + print("Creating directory to store results") + os.makedirs(results_dir) + print("Directory created.") + else: + print("Directory already exists, continuing with web scraping") + + user_agent = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"} + results_set = set() + + for each in set_of_links: + print("Crawling through link...") + r = requests.get(each, headers=user_agent, timeout=10.0) + html = r.text.encode("utf-8") + soup = BeautifulSoup(html,"html.parser") + body = soup.body.encode("utf-8") + + # Get file name(E.g google from https://ww.google.com + print("Original: " + str(each)) + splitted = each.split(".") + # E.g http://www.abc.com/example/year/month/day/page.html -> length of 4 + # splitted[-3] => abc + if(len(splitted) > 3): + file_name = splitted[-3] + else: + # E.g http://abc.com OR http://www.abc.com + # splitted[-2] => abc + file_name = splitted[-2] + if "http" in file_name: + file_name = file_name.split("//")[1] + print("Domain Name: " + str(file_name)) + results_set.add((file_name,body)) + + # Used to check duplicate file names arising from multiple links from the same domain name + file_counter = 0 + + # Save results to file + for tup in results_set: + + # Check if name is too long(Indicative of error in getting file name above) + if len(tup[0]) > 20: + name = tup[0][:20] + else: + name = tup[0] + + # Replaces any / in the name with _ if they exists + if "/" in name: + name = name.replace("/", "_") + + # Check if file name already exists(Multiple pages from same site) + if os.path.exists(results_dir + "/" + name + ".html"): + file_counter += 1 + save_name = results_dir + "/" + name + str(file_counter) + ".html" + + else: + file_counter = 0 + save_name = results_dir + "/" + name + ".html" + + print("Saving file: " + name) + with open(save_name,"w") as f: + f.write(str(tup[1])) if __name__ == "__main__": dir_name,num,params = print_greeting() query_results = query_google(dir_name,num,params) - get_search_links(query_results) \ No newline at end of file + search_links = get_search_links(query_results) + crawl_links(search_links) \ No newline at end of file diff --git a/dac1/googlehtml/googleresultspage1.html b/dac1/googlehtml/googleresultspage1.html deleted file mode 100644 index e6fdd02..0000000 --- a/dac1/googlehtml/googleresultspage1.html +++ /dev/null @@ -1,2209 +0,0 @@ - - - - - - - - xianxia - Google Search - - - - - - - - - -
-
-
- -
- - - -
- -
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
-
-
-
- - Sign in - -
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
-
-
-
-
-
- -
-
-
-
-
-
-
-
- -
- - -
-
-
-
- -
-
-
-
-
- About 571,000 results - - (0.28 seconds) - -
-
-
-
- -
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
- -
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
-
- -
-

- Searches related to xianxia -

-
- - -
-
- -
-
-
-
- -
-
-
-
-
-
- -
-
-
-
-
-
-
- - - -
-
-
-
- - -
-
- -
-
-
-
-
-
- -
-
-
-
-
-
- -
- -
- - - - - - \ No newline at end of file diff --git a/dac1/googlehtml/googleresultspage2.html b/dac1/googlehtml/googleresultspage2.html deleted file mode 100644 index 391a1c1..0000000 --- a/dac1/googlehtml/googleresultspage2.html +++ /dev/null @@ -1,1884 +0,0 @@ - - - - - - - - xianxia - Google Search - - - - - - - - - -
-
-
- -
- - - -
- -
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
-
-
-
- - Sign in - -
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
-
-
-
-
-
- -
-
-
-
-
-
-
-
- -
- - -
-
-
-
- -
-
-
-
-
- Page 2 of about 571,000 results - - (0.27 seconds) - -
-
-
-
- -
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
- -
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
-
- -
-

- Searches related to xianxia -

-
- - -
-
- -
-
-
-
- -
-
-
-
-
-
- -
-
-
-
-
-
-
- - - -
-
-
-
- - -
-
- -
-
-
-
-
-
- -
-
-
-
-
-
- -
- -
- - - - - - \ No newline at end of file diff --git a/dac1/googlehtml/googleresultspage3.html b/dac1/googlehtml/googleresultspage3.html deleted file mode 100644 index 6c229f4..0000000 --- a/dac1/googlehtml/googleresultspage3.html +++ /dev/null @@ -1,1870 +0,0 @@ - - - - - - - - xianxia - Google Search - - - - - - - - - -
-
-
- -
- - - -
- -
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
-
-
-
- - Sign in - -
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
-
-
-
-
-
- -
-
-
-
-
-
-
-
- -
- - -
-
-
-
- -
-
-
-
-
- Page 3 of about 571,000 results - - (0.28 seconds) - -
-
-
-
- -
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
- -
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
-
- -
-

- Searches related to xianxia -

-
- - -
-
- -
-
-
-
- -
-
-
-
-
-
- -
-
-
-
-
-
-
- - - -
-
-
-
- - -
-
- -
-
-
-
-
-
- -
-
-
-
-
-
- -
- -
- - - - - - \ No newline at end of file diff --git a/dac1/googlelinks.txt b/dac1/googlelinks.txt deleted file mode 100644 index d91bb38..0000000 --- a/dac1/googlelinks.txt +++ /dev/null @@ -1,28 +0,0 @@ -http://cocxianxia.wikia.com/wiki/CoC-Xianxia_Wiki -http://robynpaterson.com/?p=3823 -http://tvtropes.org/pmwiki/pmwiki.php/Main/Xianxia -http://www.books.shushengbar.com/?cat=4 -http://www.jesperkyd.com/music/world-of-xian-xia-2/ -http://www.novelupdates.com/genre/xianxia/ -http://www.novelupdates.com/series-ranking/ -http://www.novelupdates.com/series/world-of-xianxia/ -http://www.wuxiaworld.com/general-glossary-of-terms/ -http://www.wuxiaworld.com/wuxia-vs-xianxia/ -http://www.xianxiaworld.net/ -http://zenithnovels.com/ -https://chinese.yabla.com/chinese-english-pinyin-dictionary.php?define=xianxia -https://en.wikipedia.org/wiki/Xianxia_novel -https://forum.royalroadl.com/showthread.php?tid=97232 -https://orcid.org/0000-0003-4426-9784 -https://ro.my.com/forum/thread/1830-xianxia-wuxia-stories/ -https://twitter.com/search?q=%23XianXia&lang=en -https://wuxiadream.net/xianxia/ -https://www.deviantart.com/tag/xianxia -https://www.pinterest.com/mrbaibaoding/xianxia-wuxia-martial-arts-and-fantasy/ -https://www.readlightnovel.org/category/xianxia -https://www.reddit.com/r/noveltranslations/comments/44afq0/rec_any_good_wuxiaxianxia_novels_to_recommend/ -https://www.reddit.com/r/noveltranslations/comments/5bexwt/xianxia_vs_xuanhuan/ -https://www.wattpad.com/tags/xianxia/hot -https://www.webnovel.com/popular/xianxia -https://www.youtube.com/watch?v=b2ABmPalvv4 -https://xianxiafr.com/