In [98]:
# IMPORTATION OF NECESSARY LIBRARIES

In [99]:
# -- imports the Beautiful Soup Library for Parsing HTML Code.

from bs4 import BeautifulSoup

# -- imports the requests library for making HTTP requests to a website.

import requests

# -- imports the regex library for parsing of data
import re

# -- imports pandas library for creating and working with dataframe (s)
import pandas as pd



In [100]:
class WebsiteAccess:
    """Fetch a web page over HTTP and pick individual ``<table>`` elements out of parsed HTML."""

    def __init__(self):
        """Nothing to set up: the methods only work on the arguments they are given."""
        pass

    # Example URLs:
    # url1 = "https://en.wikipedia.org/wiki/List_of_universities_in_Nigeria"
    # url2 = "https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue"

    # --- method for taking in the URL from the user, and returning the website response back to the user
    def website_response(self, url=None, timeout=30):
        """Request a page and return the ``requests`` response object.

        Args:
            url: Address to fetch. When omitted (the default) the user is
                prompted for it, which is the original interactive behaviour.
            timeout: Seconds passed to ``requests.get`` as ``timeout`` so that an
                unresponsive server cannot block the notebook forever.

        Returns:
            The response returned by ``requests.get`` once ``raise_for_status()``
            has accepted its status code.

        Raises:
            requests.exceptions.RequestException: If the request fails or the
                status code is rejected by ``raise_for_status()``. The failure is
                printed first and then re-raised, because there is no usable
                response to hand back to the caller.
        """
        if url is None:
            url = input("Kindly enter the url of the website you want to scrape: ") # requests website link from the user
        url = url.strip() # pasted URLs often carry stray surrounding whitespace

        try:
            response = requests.get(url, timeout=timeout) # sends a request to the website and gets a response, 200 means success
            response.raise_for_status() # raises an HTTPError if the response code was unsuccessful
        except requests.exceptions.RequestException as e:
            print( f"Failed to get response from the website: {e}") # prints back the error that occurs
            # Re-raise: falling through to `return response` would either hit an
            # unbound local (connection failure) or return an error page.
            raise

        return response

    # PLEASE NOTE: check - https://developer.mozilla.org/en-US/docs/Web/HTTP/Status for meaning of other potential responses

    # --- method to get the selected table using its index
    def table_html_code(self, table_index : int, soup):
        """Return the ``table_index``-th ``<table>`` element found in ``soup``.

        Args:
            table_index: Position of the wanted table in ``soup.find_all("table")``.
            soup: Parsed document; only its ``find_all`` method is used.

        Raises:
            IndexError: If the document has no table at ``table_index``.
        """
        all_tables = soup.find_all("table") # every table in the document, in the order find_all returns them
        return all_tables[table_index] # html code for the selected table


In [101]:
# *** Definition of WebsiteScraping Class with Parent Class: WebsiteAccess

class WebsiteScraping(WebsiteAccess):
    """Extract table names and cell text from a parsed HTML document."""

    def __init__(self):
        """Nothing to set up beyond what the parent class does."""
        super().__init__()

    #  --- method to get the names of the tables

    def get_table_name(self, total_no_of_tables, soup, name_tag = "h2"):
        """Return cleaned heading texts to be used as table names.

        Args:
            total_no_of_tables: Maximum number of names to return.
            soup: Parsed document; only its ``find_all`` method is used.
            name_tag: Tag name of the headings to read (``"h2"`` by default).

        Returns:
            At most ``total_no_of_tables`` strings, in document order. Each is the
            text of a ``name_tag`` element that has an ``id`` attribute, with
            every character other than ASCII letters and whitespace removed and
            surrounding whitespace stripped.
        """
        heading_tags = soup.find_all(name_tag, id=True) # "id = True" keeps only headings that carry an id attribute
        regex_pattern = r'[^a-zA-Z\s]' # regex pattern matching everything except letters and whitespace

        # Strip after the substitution: removing e.g. a leading digit can expose
        # whitespace that would otherwise end up at the edge of a file name.
        table_name_lst = [
            re.sub(regex_pattern, '', title.text).strip()
            for title in heading_tags
        ]
        return table_name_lst[:total_no_of_tables] # at most one name per table

    # --- method to define the generator function

    def index_generator(self, table_valid_names_lst, total_no_of_tables, soup):
        """Yield ``0, 1, 2, ...`` for every entry of ``table_valid_names_lst``.

        Args:
            table_valid_names_lst: The list of table names to index. If ``None``
                is passed, the names are obtained from ``get_table_name``.
            total_no_of_tables: Forwarded to ``get_table_name`` in the fallback case.
            soup: Forwarded to ``get_table_name`` in the fallback case.
        """
        if table_valid_names_lst is None:
            table_valid_names_lst = self.get_table_name(total_no_of_tables, soup)

        # Use the list the caller already computed instead of scanning the
        # document for headings a second time.
        yield from range(len(table_valid_names_lst))

    # NOTE: Difference between a yield and a return statement: after a return the function's state is discarded
    # and the next call starts from the beginning, whereas after a yield the function resumes where it left off,
    # so it can produce a new value on every call. tag_index() relies on this to hand out successive indices.

    # --- method to define the function that will use the generator

    def tag_index(self,gen, total_no_of_tables, soup):
        """Return the next index from ``gen``, or ``None`` once it is exhausted.

        Args:
            gen: Iterator to advance with ``next``.
            total_no_of_tables: Unused; kept so existing calls keep working.
            soup: Unused; kept so existing calls keep working.
        """
        try:
            return next(gen) # advances the generator to its next yield statement
        except StopIteration:
            print("No more items to iterate.")
            return None

    # --- method to get all rows in the table

    def table_data(self, table_index : int, soup, header_title_tag : str = "th" ):
        """Return the text of one table as a list of rows.

        Args:
            table_index: Position of the table, as understood by ``table_html_code``.
            soup: Parsed document the table is looked up in.
            header_title_tag: Tag name of header cells (``"th"`` by default).

        Returns:
            A list of lists of strings. The first inner list holds the text of
            every ``header_title_tag`` cell in the table; each following inner
            list holds the text of the ``<td>`` cells of one ``<tr>``. All texts
            have surrounding whitespace stripped.
        """
        table = self.table_html_code(table_index, soup) # the table selected by index
        header_cells = table.find_all(header_title_tag) # all header cells of the table

        header_title_names = [title.text.strip() for title in header_cells]
        all_row_data = [header_title_names] # header names always come first

        table_rows = table.find_all("tr")
        # Only treat the first row as a header row when it has no <td> cells.
        # Dropping it unconditionally loses a data row for tables whose first
        # row already contains data.
        if table_rows and not table_rows[0].find_all("td"):
            table_rows = table_rows[1:]

        for row in table_rows:
            each_row_data = [data.text.strip() for data in row.find_all("td")] # text of every td cell in this row
            all_row_data.append(each_row_data)

        return all_row_data
    
    
    

In [102]:
# *** Definition of WebsiteDataExtraction Class with Parent Class: WebsiteScraping

class WebsiteDataExtraction(WebsiteScraping):
    """Turn scraped tables into pandas DataFrames and write them to CSV files."""

    def __init__(self):
        """Nothing to set up beyond what the parent class does."""
        super().__init__()

    # --- method to create dataframe

    def get_into_df(self, table_index : int, soup):
        """Build a DataFrame from the rows returned by ``table_data``.

        The rows are passed to ``pd.DataFrame`` unchanged, so the scraped header
        names form the first row of the frame and the column labels are the
        default integers.
        """
        return pd.DataFrame(self.table_data(table_index, soup))

    # --- method to write dataframe to a csv file

    def convert_df2_csv(self, total_no_of_tables, table_index, tg_index: int, soup):
        """Write one table to ``<table name>.csv``.

        Args:
            total_no_of_tables: Forwarded to ``get_table_name``.
            table_index: Position of the table to write.
            tg_index: Index into the list returned by ``get_table_name`` that
                selects the file name. If it is ``None``, out of range, or
                selects a blank name, ``table_<table_index>`` is used instead.
            soup: Parsed document to read the table and names from.

        Returns:
            Whatever ``DataFrame.to_csv`` returns for a file path, or ``None``
            when writing failed and the error was printed.
        """
        table_names = self.get_table_name(total_no_of_tables, soup)
        try:
            table_name = table_names[tg_index]
        except (TypeError, IndexError):
            # tg_index is None when the index generator ran out of headings.
            table_name = ""
        if not table_name.strip():
            table_name = f"table_{table_index}" # avoids a crash or a file literally called ".csv"

        try:
            # header=False: the scraped header names are already the first row
            # of the frame, so writing the integer column labels as well would
            # put a meaningless "0,1,2,..." line at the top of the file.
            return self.get_into_df(table_index, soup).to_csv(f"{table_name}.csv", index = False, header = False)

        except PermissionError:
            print("Permission Error: The file you are trying to modify is currently in use. Kindly close it or use another file") # prints this out incase a Permission Error is raised

        except Exception as e:
            print(f"An Error Occured: {e}") # prints this incase any other error is raised.

    # --- method to write all available tables in the website to csv files.

    def get_all_tables(self):
        """Fetch a page and write every non-empty table on it to its own CSV file.

        The page comes from ``website_response``. Tables whose DataFrame is empty
        are skipped; every other table takes the next unused heading name via
        ``tag_index``.
        """
        response = self.website_response()
        # Name the parser explicitly: the generic "html" feature lets
        # BeautifulSoup pick whichever parser library happens to be installed,
        # so the same page could be parsed differently on another machine.
        soup = BeautifulSoup(response.text, "html.parser")
        total_no_of_tables = len(soup.find_all("table")) # total number of tables in the page -- works for any table
        gen = self.index_generator(self.get_table_name(total_no_of_tables, soup), total_no_of_tables, soup) # hands out the next unused name index

        for table_index in range(total_no_of_tables):
            if not self.get_into_df(table_index, soup).empty:
                tg_index = self.tag_index(gen,total_no_of_tables, soup) # next name index, or None when names have run out
                self.convert_df2_csv(total_no_of_tables, table_index, tg_index, soup)
                         

In [103]:
# Build the extractor; its constructor takes no arguments.
extraction = WebsiteDataExtraction()

# Prompts for a URL, then writes one CSV file per non-empty table found on that page.
extraction.get_all_tables()

Kindly enter the url of the website you want to scrape:  https://en.wikipedia.org/wiki/List_of_universities_in_Nigeria
