diff --git a/.DS_Store b/.DS_Store index bb5933c..58af5eb 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml index fbc4fe7..25fce27 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -1,7 +1,14 @@ - + + + + + + + + + + @@ -271,6 +75,7 @@ + @@ -415,15 +220,15 @@ - + - + - + - + @@ -543,43 +348,23 @@ - - - - - - - - - - - - - - - - - - - - - + - + - + - + @@ -587,194 +372,142 @@ - + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - + diff --git a/ENV/spider.py b/ENV/spider.py index 918808c..305e094 100644 --- a/ENV/spider.py +++ b/ENV/spider.py @@ -1,37 +1,173 @@ # -*- coding: utf-8 -*- -import re + import requests -import json from bs4 import BeautifulSoup +import os +import sys + def print_greeting(): print("Crawler running....") - return input("Please enter your search terms: ") + # Replace any spaces in the directory name with underscores + dir_name = input("Please enter your project name: ").replace(" ", "_") + num = input("Please enter the number of pages to be searched: ") + queries = input("Please enter your search terms: ") + return dir_name,num,queries + + +def query_google(dir_name,num,params): -def query_google(params): - search_term = {"q" : params} + + # Create project directory + if not os.path.exists(dir_name): + print("Creating directory : " + dir_name + "....") + os.makedirs(dir_name) + print("Directory created") + else: + print("Directory already exists, proceeding with web scraping...") + + # Create dir to store raw html + search_dir = dir_name + "/googlehtml" + if not os.path.exists(search_dir): + os.makedirs(search_dir) + else: + print("Dir for Google search html already exists,proceeding with web scraping...") + + # Query Google user_agent = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"} - google = "https://www.google.com.sg/search" - # Spoofs a browser visit that times out after 5 sec(no reply within 5 sec) - r = requests.get(google, headers=user_agent, params=search_term, timeout=5.0) - print(r.request.url) - google_results = r.text.encode("utf-8") - soup = BeautifulSoup(google_results, "html.parser") - # Write to file - with open("googleresults.html", "wb") as file: - file.write(soup.prettify().encode("utf-8")) - return soup + + # Get number of pages to search -> If we want to search by num of results instead of pages + # results_per_page = 10 + # num = int(num) + # if(num % results_per_page is 0): + # pages = int(num / results_per_page) + # else: + # pages = int(math.ceil(num/10) % results_per_page) + # print("Number of google pages to search: " + str(pages)) + + # NOTE: There are 10 results per page, but not all 10 are URLs. + # E.g some may be link to gallery of images + results_per_page = 10 + # Return a set from this function containing the file names of all the pages scraped + file_names = set() + + for page in range(int(num)): + + cur_page = results_per_page * (page) + search_term = {"q": params, "start": cur_page} + # Spoofs a browser visit that times out after 5 sec(no reply within 5 sec) + url = "https://www.google.com.sg/search" + r = requests.get(url, headers=user_agent, params=search_term, timeout=5.0) + print("Searching page " + str(page + 1) + " ....\n") + # Add page to file_name set + google_results = r.text.encode("utf-8") + soup = BeautifulSoup(google_results, "html.parser") + # To be returned from this function + file_names.add(soup) + # Used to store the html files to be saved + file_name = search_dir + "/googleresultspage" + str(page + 1) + ".html" + + # Write to file + with open(file_name, "wb") as file: + file.write(soup.prettify().encode("utf-8")) + + print("Saving results.....\n Results saved.\n\n") + return file_names + def get_search_links(search_results): - for result in search_results.body.find_all("a"): - print(result.get("href")) + links = set() + for result in search_results: + # For each

tag, find the tag and get the href value + link_set = result.body.find_all("h3", class_="r") + + # Add the href value to a set that will be saved to a file + for link in link_set: + print("Extracting link...\n") + links.add(link.find("a").get("href")) + + with open(dir_name + "/googlelinks.txt","a") as f: + print(str(len(links)) + " links extracted.") + print("Saving links...") + for each in links: + f.write(each + "\n") + print("Links saved.") + return links + + +def crawl_links(set_of_links): + + # Create results dir + results_dir = dir_name + "/results" + if not os.path.exists(results_dir): + print("Creating directory to store results") + os.makedirs(results_dir) + print("Directory created.") + else: + print("Directory already exists, continuing with web scraping") + + user_agent = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"} + + results_set = set() + + for each in set_of_links: + print("Crawling through link...") + r = requests.get(each, headers=user_agent, timeout=10.0) + html = r.text.encode("utf-8") + soup = BeautifulSoup(html,"html.parser") + body = soup.body.encode("utf-8") + + # Get file name(E.g google from https://ww.google.com + print("Original: " + str(each)) + splitted = each.split(".") + # E.g http://www.abc.com/example/year/month/day/page.html -> length of 4 + # splitted[-3] => abc + if(len(splitted) > 3): + file_name = splitted[-3] + else: + # E.g http://abc.com OR http://www.abc.com + # splitted[-2] => abc + file_name = splitted[-2] + if "http" in file_name: + file_name = file_name.split("//")[1] + print("Domain Name: " + str(file_name)) + results_set.add((file_name,body)) + + # Used to check duplicate file names arising from multiple links from the same domain name + file_counter = 0 + + # Save results to file + for tup in results_set: + + # Check if name is too long(Indicative of error in getting file name above) + if len(tup[0]) > 20: + name = tup[0][:20] + else: + name = tup[0] + + # Replaces any / in the name with _ if they exists + if "/" in name: + name = name.replace("/", "_") + + # Check if file name already exists(Multiple pages from same site) + if os.path.exists(results_dir + "/" + name + ".html"): + file_counter += 1 + save_name = results_dir + "/" + name + str(file_counter) + ".html" + + else: + file_counter = 0 + save_name = results_dir + "/" + name + ".html" + print("Saving file: " + name) + with open(save_name,"w") as f: + f.write(str(tup[1])) if __name__ == "__main__": - user_input = print_greeting() - query_results = query_google(user_input) - get_search_links(query_results) \ No newline at end of file + dir_name,num,params = print_greeting() + query_results = query_google(dir_name,num,params) + search_links = get_search_links(query_results) + crawl_links(search_links) \ No newline at end of file diff --git a/googleresults.html b/googleresults.html deleted file mode 100644 index da4107b..0000000 --- a/googleresults.html +++ /dev/null @@ -1,2645 +0,0 @@ - - - - - - - - spider - Google Search - - - - - - - - - -
-
-
- -
- - -
-
- -
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
-
-
-
- - Sign in - -
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
-
-
-
-
-
- -
-
-
-
-
-
-
-
- -
- - -
-
-
-
- -
-
-
-
-
- About 449,000,000 results - - (0.58 seconds) - -
-
-
-
- -
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
- -
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
-
- -
-

- Searches related to spider -

-
- - -
-
- -
-
-
-
- -
-
-
-
- -
-
- -
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
-
-
-
- -
-
- - - Image result for spider - - - -
-
- {"cb":3,"clt":"n","ct":3,"id":"395EhSgioGESGM:","ml":{"278":{"bh":99,"bw":66},"366":{"bh":94,"bw":60},"454":{"bh":95,"bw":71}},"oh":321,"ou":"https://upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Theraphosa_blondi_MHNT.jpg/220px-Theraphosa_blondi_MHNT.jpg","ow":220,"pt":"Spider - Wikipedia","rh":"en.wikipedia.org","rid":"zqfcXCZTEAMlqM","rmt":0,"rt":0,"ru":"https://en.wikipedia.org/wiki/Spider","s":"","st":"Wikipedia","th":120,"tu":"https://encrypted-tbn0.gstatic.com/images?q\u003dtbn:ANd9GcS5H96Unh1PMCSRQsfnadnlC7XCPfjPnVTwodvmiSeDvv9bj2eVvGdkRzs","tw":82} -
-
- - -
-
- - - Image result for spider - - - -
-
- {"cl":6,"clt":"n","cr":3,"id":"CZ7L15cOy_SfsM:","ml":{"278":{"bh":99,"bw":117},"366":{"bh":94,"bw":107},"454":{"bh":95,"bw":131}},"oh":410,"ou":"http://i3.mirror.co.uk/incoming/article11010615.ece/ALTERNATES/s615b/Jumping-spider.jpg","ow":615,"pt":"Is spider season coming early? Huge creature discovered in east ...","rh":"mirror.co.uk","rid":"nky1zW4MCndO-M","rmt":0,"rt":0,"ru":"http://www.mirror.co.uk/news/uk-news/spider-season-coming-early-huge-11010490","s":"","st":"Mirror","th":104,"tu":"https://encrypted-tbn0.gstatic.com/images?q\u003dtbn:ANd9GcQmqfdgpX_b8kRtWOUJ0KQ3q56lvcsiCn9GJe-bSoE2Ny7ln2LOsSXTOBc","tw":156} -
-
- - -
-
- - - Image result for spider - - - -
-
- {"cb":6,"cl":3,"clt":"n","cr":3,"ct":3,"id":"IjiAvEaH7AEJAM:","ml":{"278":{"bh":99,"bw":93},"366":{"bh":94,"bw":85},"454":{"bh":95,"bw":109}},"oh":600,"ou":"http://i1.mirror.co.uk/incoming/article4200837.ece/ALTERNATES/s615b/Tube-Web-spider.jpg","ow":615,"pt":"10 common spiders found in and around Britain\u0027s homes - but are they ...","rh":"mirror.co.uk","rid":"pHy9B9T8S46HCM","rmt":0,"rt":0,"ru":"http://www.mirror.co.uk/news/uk-news/10-common-spiders-found-around-4200933","s":"","st":"Mirror","th":126,"tu":"https://encrypted-tbn0.gstatic.com/images?q\u003dtbn:ANd9GcSRG8PWXRGUoc3TSnMNvpl31PqGdetu16xRLVMhyfxEulRAwOlYmw5kYRs","tw":129} -
-
- - -
-
- - - Image result for spider - - - -
-
- {"cb":3,"cl":3,"clt":"n","cr":3,"ct":3,"id":"OPsL_Ncmc6uWiM:","ml":{"278":{"bh":86,"bw":131},"366":{"bh":94,"bw":111},"454":{"bh":95,"bw":140}},"oh":605,"ou":"https://cdn.orkin.com/images/spiders/brown-recluse-spider-illustration_1017x605.jpg","ow":1017,"pt":"Brown Recluse Spiders: Facts, Identification, Behavior \u0026 Control","rh":"orkin.com","rid":"sMDxmIzLUkme2M","rmt":0,"rt":0,"ru":"https://www.orkin.com/other/spiders/brown-recluse-spiders/","s":"","st":"Orkin","th":104,"tu":"https://encrypted-tbn0.gstatic.com/images?q\u003dtbn:ANd9GcTiGnStYGHUQv3oRrFK08nFOg0A5jgbBv7AdvuLXEtcrNOCHaqqECsyZJY","tw":175} -
-
- - -
-
- - - Image result for spider - - - -
-
- {"cb":15,"cl":6,"clt":"n","cr":3,"id":"-vWMveNH2RlNFM:","ml":{"278":{"bh":86,"bw":146},"366":{"bh":91,"bw":144},"454":{"bh":90,"bw":126}},"oh":426,"ou":"http://www.burkemuseum.org/sites/default/files/white-tailed-spider%20%281%29.jpg","ow":641,"pt":"Spider Myths | Burke Museum","rh":"burkemuseum.org","rid":"PcOEYfdnIjJt9M","rmt":0,"rt":0,"ru":"http://www.burkemuseum.org/blog/curated/spider-myths","s":"","st":"Burke Museum","th":97,"tu":"https://encrypted-tbn0.gstatic.com/images?q\u003dtbn:ANd9GcS0Q8sxmlA6VU4-zovPx00i2bKy0cHayEhlajNLnjzKYS3ZbyUOt7vdhuA","tw":146} -
-
- - - - - - - - - - -
-
-
-
-
-
-
-
- - -
-
-
-
-
-
- -
-
- -
- - - - - - -
- -
-
-
-
- - - -
-
-
-
-
-
-
-
-
- - Spider - -
-
- - Animal - -
-
-
- -
-
-
-
-
-
-
- -
-
-
- - Spiders are air-breathing arthropods that have eight legs and chelicerae with fangs that inject venom. They are the largest order of arachnids and rank seventh in total species diversity among all other orders of organisms. - - - - - - Wikipedia - - -
-
-
- -
-
- - -
-
- -
-
- - - Scientific name - - : - - - Araneae - -
-
- -
-
- -
-
- - - Class - - : - - - Arachnida - -
-
- -
-
- -
-
- - - Order - - : - - - Araneae; Clerck, 1757 - -
-
- -
-
- - - -
-
- -
-
- - - Lifespan - - : - - - - Brown recluse spider - - : 1 – 2 years, - - Goliath birdeater - - : 15 – 25 years, - - Southern black widow - - : 1 – 3 years - -
-
- -
-
- -
-
- - - Diet - - : - - - - Carnivore - - -
-
- -
-
-
- -
-
-
-
-
-
-
-
-
- -
-
- - Feedback - -
-
-
-
-
-
-
-
-
-
-
- - - -
-
-
-
- - -
-
- -
-
-
-
-
-
- -
-
-
-
-
-
- -
- -
- - - - - - \ No newline at end of file diff --git a/text.html b/text.html deleted file mode 100644 index f5d3cd1..0000000 --- a/text.html +++ /dev/null @@ -1,771 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - The world's leading software development platform · GitHub - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- Skip to content -
- - - - - - - - - - -
- -
- -
-
- - - -
- - -
-
-
-
-

Built for developers

-

- GitHub is a development platform inspired by the way you work. From open source to business, you can host and review code, manage projects, and build software alongside millions of other developers. -

-
-
-
-
-
- -
-
-
- - - -
-
-
- GitHub for teams -
-

- A better way to work together -

-

- GitHub brings teams together to work through problems, move ideas forward, and learn from each other along the way. -

- -
- -
-
-
-
- -
-
-

Write better code

-

- Collaboration makes perfect. The conversations and code reviews that happen in Pull Requests help your team share the weight of your work and improve the software you build. -

-

- Learn about code review on GitHub -

-
-
- -
-
- -
-
-

Manage your chaos

-

- Take a deep breath. On GitHub, project management happens in Issues and Projects, right alongside your code. All you have to do is mention a teammate to get them involved. -

-

- Learn about project management on GitHub -

-
-
-
-
- -
- -
-
- security-admin - -
- Security and administration -
-

- Boxes? Check. -

-

- We worried about your administrative and security needs so you don’t have to. From flexible hosting to authentication options, GitHub can help you meet your team’s requirements. -

- -

- - Learn about GitHub for Business - -

- -
-
- -
-
-

Code security

-

- Prevent problems before they happen. Protected branches, signed commits, and required status checks protect your work and help you maintain a high standard for your code. -

- -

Access controlled

-

- Encourage teams to work together while limiting access to those who need it with granular permissions and authentication through SAML/SSO and LDAP. -

- -

Hosted where you need it

-

- Securely and reliably host your work on GitHub.com. Or, deploy GitHub Enterprise on your own servers or in a private cloud using Amazon Web Services, Azure or Google Cloud Platform. -

-
-
-
-
- -
-
-
- integrations - -
- Integrations -
-

- Build on GitHub -

-

- Customize your process with GitHub apps and an intuitive API. Integrate the tools you already use or discover new favorites to create a happier, more efficient way of working. -

-

- Learn about integrations -

-
- -
-
-
-
-
-
-
-
-
- -
-

- Sometimes, there’s more than one tool for the job. Why not try something new? -

-

- Browse GitHub Marketplace -

-
-
-
- - - -
-
-

- Get started for free — join the millions of developers already using GitHub to share their code, work together, and build amazing things. -

-
-
-
-
- - - - -
- - - - - - -
- - - You can't perform that action at this time. -
- - - - - - - - - - -
- - You signed in with another tab or window. Reload to refresh your session. - You signed out in another tab or window. Reload to refresh your session. -
- - - - - -