<a href="https://colab.research.google.com/github/pelumiogunremu/Pelumi-Ogunremu/blob/main/canadian_business_extraction_by_location.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Project Outline

    - https://www.yellowpages.ca/locations/ is the website to be scraped
    - Request the webpage content using the requests library
    - Parse the webpage content using BeautifulSoup from the bs4 library
    - Locate the tags using the tag name and class selector
    - Extract the needed information from the tags and store each piece in a separate list
    - Load the lists into a dataframe and export it as a CSV file
    - Repeat the steps for the provinces, locations, business categories, sub business categories,
    sub-sub business categories and the businesses


In [45]:
import concurrent.futures
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [48]:
# Functional Programs

def get_tag_contents(webpage_url, tag_name, find_tag, class_selector, timeout=30):
    """
    fetch one web page and extract the text and link of each matching tag.

    the page is requested, parsed with the "html.parser" parser, and every
    `tag_name` element carrying `class_selector` is searched for its first
    nested `find_tag` element that has an href attribute.

    parameters:
    webpage_url: address of the page to download
    tag_name: name of the outer (container) tags to look for
    find_tag: name of the nested tag that holds the text and the href
    class_selector: class value the outer tags must carry
    timeout: seconds to wait for the server before requests gives up
             (without it a stalled connection would block forever)

    returns:
    a tuple of two equally long lists: the text of each nested tag, and
    its href prefixed with "https://yellowpages.ca"
    """

    response = requests.get(webpage_url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    tag_texts = []
    tag_links = []
    for container in soup.find_all(tag_name, class_=class_selector):
        tag = container.find(find_tag, href=True)
        if tag is None:
            # find() yields None when the container has no nested link;
            # skipping it keeps both lists aligned and avoids an
            # AttributeError on `.text`.
            continue
        tag_texts.append(tag.text)
        tag_links.append(f"https://yellowpages.ca{tag['href']}")

    return tag_texts, tag_links

def get_tags_contents(webpage_urls, tag_name, find_tag, class_selector, timeout=30):
    """
    fetch several web pages and extract the texts and links of the
    matching tags, page by page.

    each page is requested, parsed with the "html.parser" parser, and
    every `tag_name` element carrying `class_selector` is searched for its
    first nested `find_tag` element that has an href attribute.

    parameters:
    webpage_urls: iterable of page addresses to download, in order
    tag_name: name of the outer (container) tags to look for
    find_tag: name of the nested tag that holds the text and the href
    class_selector: class value the outer tags must carry
    timeout: seconds to wait for each response before requests gives up

    returns:
    a tuple of two lists with one inner list per page (same order as
    `webpage_urls`): the texts of the nested tags, and their hrefs
    prefixed with "https://yellowpages.ca"
    """

    tags_texts = []
    tags_links = []

    for url in webpage_urls:
        page = requests.get(url, timeout=timeout).text
        parsed_content = BeautifulSoup(page, "html.parser")

        # Look the nested tags up once per page and reuse them for both the
        # texts and the links. Containers without a nested link make find()
        # return None, so those are dropped instead of crashing on `.text`.
        nested_tags = [
            nested
            for nested in (
                container.find(find_tag, href=True)
                for container in parsed_content.find_all(
                    tag_name, class_=class_selector
                )
            )
            if nested is not None
        ]

        tags_texts.append([tag.text for tag in nested_tags])
        tags_links.append(
            [f"https://yellowpages.ca{tag['href']}" for tag in nested_tags]
        )

    return tags_texts, tags_links

def for_df(contents):
    """
    adjust nested scrape results for the dataframe.

    flattens a list of lists into one flat list and drops duplicate
    entries, keeping the first occurrence of each so the resulting order
    is the same on every run.

    parameters:
    contents: iterable of iterables of hashable items (e.g. one list of
              names per scraped page)

    returns:
    a flat list of the unique items, in first-seen order
    """
    # dict keys are unique and keep insertion order, which gives a
    # de-duplicated list without the arbitrary ordering of a set.
    return list(dict.fromkeys(item for group in contents for item in group))
  
def _nested_texts(parsed_content, tag_name, class_selector, find_tag):
    """
    return the text of the first `find_tag` element with an href inside
    each `tag_name` element carrying `class_selector`.

    containers where no such nested element exists are skipped, because
    find() returns None for them and None has no `.text`.
    """
    nested_tags = (
        container.find(find_tag, href=True)
        for container in parsed_content.find_all(tag_name, class_=class_selector)
    )
    return [tag.text for tag in nested_tags if tag is not None]


def get_tags_texts(webpage_urls, timeout=30):
    """
    fetch several web pages and extract three groups of texts from each.

    each page is requested and parsed with the "html.parser" parser, then
    three lookups are run against it (container tag / class -> nested tag):
      1. "h3"   / "listing__name, jsMapBubbleName"      -> "a"
      2. "div"  / "listing__headings"                   -> "a"
      3. "span" / "listing__address address mainLocal"  -> "div"

    parameters:
    webpage_urls: iterable of page addresses to download, in order
    timeout: seconds to wait for each response before requests gives up

    returns:
    a tuple of three lists, each holding one inner list of texts per page
    (same order as `webpage_urls`), for lookups 1, 2 and 3 respectively
    """

    tags_texts_1 = []
    tags_texts_2 = []
    tags_texts_3 = []

    for url in webpage_urls:
        page = requests.get(url, timeout=timeout).text
        parsed_content = BeautifulSoup(page, "html.parser")

        # NOTE(review): the comma is part of the class string searched for;
        # it looks like two class names were intended - confirm against the
        # page markup before changing it.
        tags_texts_1.append(
            _nested_texts(
                parsed_content, "h3", "listing__name, jsMapBubbleName", "a"
            )
        )

        tags_texts_2.append(
            _nested_texts(parsed_content, "div", "listing__headings", "a")
        )

        # NOTE(review): this asks for a <div> that has an href attribute,
        # which presumably never matches, so this list is likely to stay
        # empty - verify which element actually holds the address text.
        tags_texts_3.append(
            _nested_texts(
                parsed_content,
                "span",
                "listing__address address mainLocal",
                "div",
            )
        )

    return tags_texts_1, tags_texts_2, tags_texts_3
     
#    clean the name
# [name.replace('"', '') for name in content_names]

### Canadian Provinces Extraction

In [49]:
ypca_url = "https://www.yellowpages.ca/locations/"
# Scrape step, currently disabled. In the visible file it is the only place
# where ypca_prov (used below) and ypca_prov_links are assigned.
# ypca_prov, ypca_prov_links = get_tag_contents(ypca_url, "h3", "a", "categories-title catTitle")

# Label the rows 1..N instead of pandas' default 0-based index.
ypca_prov_df_index = [*range(1, len(ypca_prov) + 1)]
ypca_prov_df = pd.DataFrame(
    data={"Provinces": ypca_prov},
    index=ypca_prov_df_index,
)

# ypca_prov_df.to_csv("canada_provinces.csv", index=False)

### Canadian Locations Extraction

In [52]:
# Scrape step, currently disabled. In the visible file it is the only place
# where ypca_loc (used below) and ypca_loc_links are assigned.
# ypca_loc, ypca_loc_links = get_tags_contents(ypca_prov_links, "li", "a", "resp-list")

# One column per location. The number of columns follows the longest inner
# list in ypca_loc instead of a fixed count, so the frame can still be built
# when the number of scraped locations changes.
ypca_max_locs = max((len(locs) for locs in ypca_loc), default=0)
ypca_prov_columns = [f"Location {num}" for num in range(1, ypca_max_locs + 1)]
ypca_prov_locs_df = pd.DataFrame(ypca_loc, index=ypca_prov, columns=ypca_prov_columns)

# ypca_prov_locs_df.to_csv("canada_locations.csv")

### Canadian Business Categories Extraction

In [55]:
# ypca_loc_urls  = [i for link in ypca_loc_links for i in link]
# ypca_bus_cat, ypca_bus_cat_links = get_tags_contents(ypca_loc_urls, "h3", "a", "categories-title catTitle")

# ypca_bus_cat0 = for_df(ypca_bus_cat)
# ypca_bus_cat_df_index = list(range(1, len(ypca_bus_cat0) + 1))
# ypca_bus_cat_df = pd.DataFrame({"Business Categories": ypca_bus_cat0}, index=ypca_bus_cat_df_index)

# ypca_bus_cat_df.to_csv("canada_business_categories.csv", index=False)

### Canadian Sub Business Categories Extraction

In [32]:
# Scrape step, currently disabled. In the visible file it is the only place
# where ypca_subbus_cat (used below) and ypca_subbus_cat_links are assigned.
# ypca_bus_cat_urls  = [i for link in ypca_bus_cat_links for i in link]
# ypca_subbus_cat, ypca_subbus_cat_links = get_tags_contents(ypca_bus_cat_urls, "h3", "a", "categories-title catTitle")

ypca_subbus_cat0 = for_df(ypca_subbus_cat)
# Label the rows 1..N instead of pandas' default 0-based index.
ypca_subbus_cat_df_index = [*range(1, len(ypca_subbus_cat0) + 1)]
ypca_subbus_cat_df = pd.DataFrame(
    data={"Business Sub Categories": ypca_subbus_cat0},
    index=ypca_subbus_cat_df_index,
)

# ypca_subbus_cat_df.to_csv("canada_subbusiness_categories.csv", index=False)

### Canadian Sub Sub Business Categories Extraction

In [None]:
# Scrape step, currently disabled. In the visible file it is the only place
# where ypca_subsubbus_cat (used below) and ypca_subsubbus_cat_links are
# assigned.
# ypca_subbus_cat_urls = [i for link in ypca_subbus_cat_links for i in link]
# ypca_subsubbus_cat, ypca_subsubbus_cat_links = get_tags_contents(ypca_subbus_cat_urls, "li", "a", "resp-list")

ypca_subsubbus_cat0 = for_df(ypca_subsubbus_cat)
# Label the rows 1..N instead of pandas' default 0-based index.
ypca_subsubbus_cat_df_index = [*range(1, len(ypca_subsubbus_cat0) + 1)]
ypca_subsubbus_cat_df = pd.DataFrame(
    data={"Business Sub Sub Categories": ypca_subsubbus_cat0},
    index=ypca_subsubbus_cat_df_index,
)

# ypca_subsubbus_cat_df.to_csv("canada_sub_subbusiness_categories.csv", index=False)

### Canadian Businesses Extraction

In [None]:
# Scrape step, currently disabled. In the visible file it is the only place
# where ypca_bus, ypca_bus_type and ypca_bus_ad (used below) are assigned.
# The helper defined in this file is get_tags_texts (there is no get_all_texts).
# ypca_bus_urls  = [i for link in ypca_subsubbus_cat_links for i in link]
# ypca_bus, ypca_bus_type, ypca_bus_ad = get_tags_texts(ypca_bus_urls)

# NOTE(review): get_tags_texts returns one inner list per page, so each cell
# below presumably holds a list rather than a single value - confirm whether
# the three results should be flattened first.
ypca_bus_df_index = list(range(1, len(ypca_bus) + 1))
ypca_bus_df = pd.DataFrame(
                             {"Business name": ypca_bus,
                              "Business type": ypca_bus_type,
                              "Business address": ypca_bus_ad}, index=ypca_bus_df_index
                           )

# Export the businesses frame built above (not the business categories frame).
# ypca_bus_df.to_csv("canada_businesses.csv", index=False)