In [1]:
#Reference: https://github.com/parshva45/Web-Crawler

import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import time
import os
from urllib3.exceptions import InsecureRequestWarning
from urllib3 import disable_warnings

# keyword_matches returns True if one of the following is True:
# - Anchor text starts with the keyword
# - Anchor text contains a space followed by the keyword followed by a space
# - Anchor text contains a space followed by the keyword
# - Anchor text ends with a space followed by the keyword
# - Text within the URL starts with the keyword
# - Text within the URL contains an underscore followed by the keyword followed by an underscore
# - Text within the URL contains an underscore followed by the keyword
# - Text within the URL ends with an underscore followed by the keyword

# Root directory under which the crawler creates its Logs and
# Raw_TXT_Downloads folders. It must end with a path separator because the
# crawler builds paths by plain string concatenation (working_dir + 'Logs').
working_dir = 'C:/Users/JWeinstein/Capstone-main/src/'

def keyword_matches(keyword, url_text, href_url):
    """Tell whether *keyword* starts the anchor text / URL text or one of its words.

    The comparison is case-sensitive. Words are delimited by a space in the
    anchor text (*url_text*) and by an underscore in the URL text (*href_url*).

    Returns True when the anchor text starts with the keyword or contains it
    right after a space, or when the URL text starts with the keyword or
    contains it right after an underscore; False otherwise.
    """
    # "separator + keyword" containment already covers the cases where the
    # keyword is additionally followed by a separator or ends the string.
    if url_text.startswith(keyword) or (' ' + keyword) in url_text:
        return True
    return href_url.startswith(keyword) or ('_' + keyword) in href_url


def _download_article(url, index):
    """Fetch *url* and save its paragraph text under Raw_TXT_Downloads.

    The file is named "<index>) <text after the last '/' of url>.txt" and
    contains the URL, the page title and the cleaned paragraph text.
    """
    name = url[url.rfind('/')+1:]
    with urlopen(url) as response:
        soup = BeautifulSoup(response.read(), 'lxml')

    # Plain text of every <p> element, concatenated.
    text = ''.join(paragraph.text for paragraph in soup.find_all('p'))
    # Remove bracketed spans (e.g. "[1]" citation markers) and newlines.
    text = re.sub(r'\[.*?\]+', '', text)
    text = text.replace('\n', '')

    # Drops the last 12 characters of <title> (presumably the " - Wikipedia"
    # suffix).
    page_title = soup.find_all('title')[0].text[:-12]

    target = working_dir + "Raw_TXT_Downloads/"+str(index)+") "+name+".txt"
    with open(target, "w", encoding='utf-8') as article_file:
        article_file.write(url + '\n' + page_title + '\n' + text)


def spider(seed_url, keyword, crawl_depth, max_urls=1000):
    """Run a breadth-first focused crawl of Wikipedia starting at *seed_url*.

    The seed page is depth 1. For every following depth, each page of the
    previous depth is fetched and every '/wiki/' link whose anchor text or URL
    text satisfies keyword_matches() is downloaded, unless it is an
    administrative link (contains ':'), the Wiki Main Page, or a page that was
    already downloaded. Fragments ('#...') are stripped before comparing.

    Args:
        seed_url: URL of the first page to download.
        keyword: Keyword passed to keyword_matches() for every link.
        crawl_depth: Deepest level to crawl (the seed is depth 1).
        max_urls: Stop once this many pages have been downloaded
            (default 1000).

    Side effects:
        Creates <working_dir>/Logs and <working_dir>/Raw_TXT_Downloads if
        needed, writes one text file per downloaded page, writes
        Logs/focused_crawler_log.txt and prints progress to stdout. The log
        file is closed even if a fetch raises.
    """
    # Wiki Main Page, which needs to be excluded while crawling
    main_page = 'https://en.wikipedia.org/wiki/Main_Page'
    wiki_root = 'https://en.wikipedia.org'
    # Only links whose href starts with '/wiki/' are considered.
    wiki_link = re.compile('^/wiki/')

    for folder in ('Logs', 'Raw_TXT_Downloads'):
        os.makedirs(working_dir + folder, exist_ok=True)

    # The seed is counted as the first crawled URL.
    crawled_count = 1
    max_depth = 1
    # True until either max_urls is reached or a depth yields no new URL.
    crawling = True

    # Pages whose links are scanned at the current depth.
    frontier_urls = [seed_url]
    # Every URL downloaded so far (seed included); replaces the separate
    # frontier/extracted/seen list lookups with one O(1) membership test.
    visited = {seed_url}

    with open(working_dir + "Logs/focused_crawler_log.txt", "w") as focused_crawler_log:
        focused_crawler_log.write("Seed : "+seed_url+"\n")
        focused_crawler_log.write("Keyword : "+keyword+"\n\n")
        focused_crawler_log.write("Depth 1 :\n\n")
        focused_crawler_log.write(str(crawled_count)+") "+seed_url+"\n\n")

        print("\n----------------------------------------- At depth 1--------------------------------------------------------")
        print(str(crawled_count)+") "+seed_url)
        _download_article(seed_url, crawled_count)

        # NOTE(review): frontier pages are requested with verify=False, which
        # turns off TLS certificate verification; kept from the original so
        # existing environments keep working - confirm it is still required.
        disable_warnings(InsecureRequestWarning)

        for depth in range(2, crawl_depth+1):
            print("\n----------------------------------------- At depth "+str(depth)+"--------------------------------------------------------")
            focused_crawler_log.write("Depth "+str(depth)+" :\n\n")
            extracted_urls = []

            for frontier_url in frontier_urls:
                response = requests.get(frontier_url, verify = False)
                soup = BeautifulSoup(response.text, "html.parser")

                for link in soup.find_all('a', href=wiki_link):
                    if crawled_count >= max_urls:
                        crawling = False
                        print(f"Limit of {max_urls} URLs reached")
                        max_depth = depth
                        break

                    href_url = link.get('href')
                    # href_url[6:] is the text after the leading '/wiki/'.
                    if not keyword_matches(keyword, link.text, href_url[6:]):
                        continue
                    # Ignore all the Administrative URLs
                    if ':' in href_url:
                        continue

                    # Strip any '#fragment' so sections map to their page.
                    url = wiki_root + href_url.split('#', 1)[0]
                    if url in visited or url == main_page:
                        continue

                    # Respecting the Politeness Policy
                    time.sleep(0.2)
                    _download_article(url, crawled_count+1)

                    visited.add(url)
                    extracted_urls.append(url)
                    crawled_count += 1
                    focused_crawler_log.write(str(crawled_count)+") "+url+"\n")
                    print(str(crawled_count)+") "+url)

                if not crawling:
                    break

            # Case when no URLs found on the depth
            if not extracted_urls:
                print("No matching URLs at Depth "+str(depth)+"\n")
                focused_crawler_log.write("No matching URLs at Depth "+str(depth)+"\n\n")
                crawling = False
                max_depth = depth
                break

            # The pages found at this depth are scanned at the next one.
            frontier_urls = extracted_urls
            focused_crawler_log.write("\n")
            if not crawling:
                break

        # Neither the URL limit nor an empty depth stopped the crawl.
        if crawling:
            print(f"Searched till max depth {crawl_depth}")
            max_depth = crawl_depth

        focused_crawler_log.write("------------------------------------------------------------------------------------\n")
        focused_crawler_log.write("Logistics :\n\n")
        focused_crawler_log.write("Number of matching searches : "+str(crawled_count)+"\n")
        focused_crawler_log.write("Maximum depth reached : Depth "+str(max_depth)+"\n")

# Crawl configuration: start page, how many levels to follow (the seed is
# depth 1) and the keyword links must match.
seed_url = 'https://en.wikipedia.org/wiki/Health'
crawl_depth = 10
keyword = 'health'

# Start the crawl only when run as a script or notebook cell, so importing
# this module does not trigger network and file I/O.
if __name__ == "__main__":
    spider(seed_url, keyword, crawl_depth)


----------------------------------------- At depth 1--------------------------------------------------------
1) https://en.wikipedia.org/wiki/Health

----------------------------------------- At depth 2--------------------------------------------------------
2) https://en.wikipedia.org/wiki/Mental_health
3) https://en.wikipedia.org/wiki/Health_care_provider
4) https://en.wikipedia.org/wiki/Healthy_community_design
5) https://en.wikipedia.org/wiki/Healthy_city
6) https://en.wikipedia.org/wiki/Healthy_environment
7) https://en.wikipedia.org/wiki/Health_care
8) https://en.wikipedia.org/wiki/Social_determinants_of_health
9) https://en.wikipedia.org/wiki/Health_science
10) https://en.wikipedia.org/wiki/Health_care_system
11) https://en.wikipedia.org/wiki/Health_policy
12) https://en.wikipedia.org/wiki/Global_health
13) https://en.wikipedia.org/wiki/Public_health
14) https://en.wikipedia.org/wiki/Health_education
15) https://en.wikipedia.org/wiki/School_health_services
16) https://en.wikipe

136) https://en.wikipedia.org/wiki/Health_data
137) https://en.wikipedia.org/wiki/Health_Human_Resources
138) https://en.wikipedia.org/wiki/List_of_countries_by_quality_of_health_care
139) https://en.wikipedia.org/wiki/List_of_countries_by_health_expenditure_covered_by_government
140) https://en.wikipedia.org/wiki/World_Health_Organization_ranking_of_health_systems
141) https://en.wikipedia.org/wiki/Euro_health_consumer_index
142) https://en.wikipedia.org/wiki/List_of_countries_by_total_health_expenditure_per_capita
143) https://en.wikipedia.org/wiki/Acronyms_in_healthcare
144) https://en.wikipedia.org/wiki/Catholic_Church_and_health_care
145) https://en.wikipedia.org/wiki/Comparison_of_the_health_care_systems_in_Canada_and_the_United_States
146) https://en.wikipedia.org/wiki/Consumer-driven_health_care
147) https://en.wikipedia.org/wiki/Publicly_funded_health_care
148) https://en.wikipedia.org/wiki/Timeline_of_global_health
149) https://en.wikipedia.org/wiki/Two-tier_health_care
150) 

266) https://en.wikipedia.org/wiki/Mental_health_and_socioeconomic_status
267) https://en.wikipedia.org/wiki/Mental_health_in_Canada
268) https://en.wikipedia.org/wiki/Mental_health_in_the_Middle_East
269) https://en.wikipedia.org/wiki/Mental_health_in_Israel
270) https://en.wikipedia.org/wiki/Mental_health_in_the_United_States
271) https://en.wikipedia.org/wiki/Mental_health_in_China
272) https://en.wikipedia.org/wiki/Impact_of_health_on_intelligence
273) https://en.wikipedia.org/wiki/Refugee_health_in_the_United_States
274) https://en.wikipedia.org/wiki/Refugee_health_care_in_Canada
275) https://en.wikipedia.org/wiki/Refugee_health
276) https://en.wikipedia.org/wiki/Mental_health_counseling
277) https://en.wikipedia.org/wiki/Mental_health_counselor
278) https://en.wikipedia.org/wiki/Clinical_psychology
279) https://en.wikipedia.org/wiki/Psychiatric_and_mental_health_nurse_practitioner
280) https://en.wikipedia.org/wiki/Global_Mental_Health
281) https://en.wikipedia.org/wiki/Mental_he

394) https://en.wikipedia.org/wiki/Health_regions_of_Canada
395) https://en.wikipedia.org/wiki/Case_management_(USA_health_system)
396) https://en.wikipedia.org/wiki/Behaviour_change_(public_health)
397) https://en.wikipedia.org/wiki/Planetary_diet
398) https://en.wikipedia.org/wiki/Maternal_and_child_health_in_Tanzania
399) https://en.wikipedia.org/wiki/Grossman_model_of_health_demand
400) https://en.wikipedia.org/wiki/Health_care_policy
401) https://en.wikipedia.org/wiki/Health_problem
402) https://en.wikipedia.org/wiki/Environmental_impact_of_meat_production
403) https://en.wikipedia.org/wiki/Planetary_health_diet
404) https://en.wikipedia.org/wiki/Copper_in_health
405) https://en.wikipedia.org/wiki/Health_risk_assessment
406) https://en.wikipedia.org/wiki/Occupational_health_and_safety
407) https://en.wikipedia.org/wiki/Healthcare_in_China
408) https://en.wikipedia.org/wiki/Women%27s_healthcare_in_the_People%27s_Republic_of_China
409) https://en.wikipedia.org/wiki/Child_health
410)

522) https://en.wikipedia.org/wiki/Comparison_of_Canadian_and_American_health_care_systems
523) https://en.wikipedia.org/wiki/Health_informatics_tools
524) https://en.wikipedia.org/wiki/Local_health_departments_in_the_United_States
525) https://en.wikipedia.org/wiki/Secretary_of_health
526) https://en.wikipedia.org/wiki/Health_inspector
527) https://en.wikipedia.org/wiki/Health_care_professional
528) https://en.wikipedia.org/wiki/State_health_agency
529) https://en.wikipedia.org/wiki/Health_record
530) https://en.wikipedia.org/wiki/Psychiatric_hospital
531) https://en.wikipedia.org/wiki/Federal_Compulsory_Medical_Insurance_Fund_(Russia)
532) https://en.wikipedia.org/wiki/Barangay_health_volunteer
533) https://en.wikipedia.org/wiki/Physical_Health
534) https://en.wikipedia.org/wiki/Mental_health_in_education
535) https://en.wikipedia.org/wiki/Comorbidity
536) https://en.wikipedia.org/wiki/Mental_health_among_female_offenders_in_the_United_States
537) https://en.wikipedia.org/wiki/Mental

644) https://en.wikipedia.org/wiki/Alcohol_consumption_and_health

----------------------------------------- At depth 10--------------------------------------------------------
No matching URLs at Depth 10

