In [108]:
import requests                  # Request library making HTTP requests to web servers
from bs4 import BeautifulSoup    # Library used for web scraping and parsing HTML and XML documents
import json                      # Library used for reading and writing JSON files

In [109]:
# Page to scrape: a web.archive.org snapshot of Wikipedia's
# "List of largest banks" article, which contains the ranking table.
url = (
    'https://web.archive.org/web/20200318083015/'
    'https://en.wikipedia.org/wiki/List_of_largest_banks'
)

In [110]:
# Send a GET request to the URL to fetch the webpage contents.
# A timeout is set explicitly: without one, requests waits forever on a
# stalled connection and the notebook cell never finishes.
response = requests.get(url, timeout=30)

In [111]:
# Base folder shared by the files this project reads and writes.
# The trailing backslash is kept because file paths are built from this
# value by plain string concatenation (a raw string cannot end in a
# backslash, hence the separate "\\" piece).
c_path = r"C:\Users\ASUS\OneDrive\Máy tính\KHTN_HK2\Webscraping_DE2" + "\\"

In [112]:
# Scrape the bank ranking table from the fetched page and save it as JSON.
# Only proceed if the request was successful (status code 200).
if response.status_code == 200:
    # Parse the HTML content of the response
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the data (the first table with class 'wikitable')
    table = soup.find('table', class_='wikitable')
    if table is None:
        # Fail with a clear message instead of an AttributeError on None below
        raise ValueError("No table with class 'wikitable' was found in the page.")

    # One dict per bank, in table order
    bank_data = []

    # Start from the second row to skip the header
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) < 3:
            # Not a data row (e.g. a row made only of <th> cells); skip it
            # rather than crash with an IndexError.
            continue

        # strip=True removes leading and trailing whitespace from the cell text
        rank = cells[0].get_text(strip=True)

        # Find the 'a' tag that doesn't have a 'span' parent
        a_tags = cells[1].find_all('a', string=lambda text: text and not text.find_parents('span'))
        bank_name = a_tags[0]['title'] if a_tags else ""
        total_assets = cells[2].get_text(strip=True)

        bank_data.append({
            'Rank': rank,
            'Bank Name': bank_name,
            'Total Assets (US$ Billion)': total_assets,
        })

    # Save the data next to the other project files, i.e. at the same path
    # the later cells read from and that the message below reports.
    output_path = c_path + 'bank_data.json'
    with open(output_path, 'w', encoding='utf-8') as json_file:
        # json.dump() writes the serialized data straight into the file and
        # returns None, so its return value must not be written out again:
        # doing so appends "[None]" after the real data and makes the file
        # unparseable.
        json.dump(bank_data, json_file, separators=(",", ":"))

    print("Data scraped and saved in: " + output_path + " file.")

else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

Data scraped and saved in: C:\Users\ASUS\OneDrive\Máy tính\KHTN_HK2\Webscraping_DE2\bank_data.json file.


In [113]:
import glob                         # this module helps in selecting files 
import pandas as pd                 # this module helps in processing CSV files
from datetime import datetime       # this module provides the current date and time

In [114]:
# Output locations, both built on the shared project folder
logfile = f"{c_path}logfile.txt"        # all event logs will be stored in this file
targetfile = f"{c_path}targetfile.csv"  # file where transformed data is stored

In [116]:
# Read the saved JSON file back from the project folder and parse it
with open(c_path + "bank_data.json") as saved_file:
    data = json.loads(saved_file.read())

# Print the loaded JSON data
print(data)


[{'Rank': '1', 'Bank Name': 'Industrial and Commercial Bank of China', 'Total Assets (US$ Billion)': '4,027.44'}, {'Rank': '2', 'Bank Name': 'China Construction Bank', 'Total Assets (US$ Billion)': '3,376.52'}, {'Rank': '3', 'Bank Name': 'Agricultural Bank of China', 'Total Assets (US$ Billion)': '3,287.36'}, {'Rank': '4', 'Bank Name': 'Bank of China', 'Total Assets (US$ Billion)': '3,092.21'}, {'Rank': '5', 'Bank Name': 'Mitsubishi UFJ Financial Group', 'Total Assets (US$ Billion)': '3,069.20'}, {'Rank': '6', 'Bank Name': 'HSBC', 'Total Assets (US$ Billion)': '2,715.15'}, {'Rank': '7', 'Bank Name': 'JPMorgan Chase', 'Total Assets (US$ Billion)': '2,687.38'}, {'Rank': '8', 'Bank Name': 'Bank of America', 'Total Assets (US$ Billion)': '2,354.51'}, {'Rank': '9', 'Bank Name': 'BNP Paribas', 'Total Assets (US$ Billion)': '2,336.66'}, {'Rank': '10', 'Bank Name': 'Crédit Agricole', 'Total Assets (US$ Billion)': '2,123.61'}, {'Rank': '11', 'Bank Name': 'Citigroup', 'Total Assets (US$ Billion)

In [117]:
# Function to extract json file
def extract_data_from_json(load_file):
    """Read a JSON document into a pandas DataFrame.

    Parameters
    ----------
    load_file : str
        Path (or any other input accepted by ``pd.read_json``) of the JSON
        document. It is read as one whole document, not as JSON Lines.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pd.read_json``.
    """
    return pd.read_json(load_file, lines=False)

In [118]:
# Build the location of the scraped JSON file and load it into a DataFrame
json_file = f"{c_path}bank_data.json"
dataframe = extract_data_from_json(json_file)

In [119]:
def get_CSV_file(data_frame, csv_filename=None):
    """Write a DataFrame to a CSV file without the index column.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        The data to write.
    csv_filename : str, optional
        Destination path. When omitted, the file is written to
        ``c_path + 'bank_data.csv'``, as before.

    Returns
    -------
    None
    """
    if csv_filename is None:
        # Default location: the shared project folder
        csv_filename = c_path + 'bank_data.csv'
    data_frame.to_csv(csv_filename, index=False)

In [120]:
def log(message, log_path=None):
    """Append a timestamped message to the event log.

    Each call adds one line of the form ``<timestamp>,<message>``.

    Parameters
    ----------
    message : str
        Text to record.
    log_path : str, optional
        File to append to. When omitted, the notebook-level ``logfile``
        path is used, so the log lands in the file declared for it rather
        than in whatever the current working directory happens to be.

    Returns
    -------
    None
    """
    # Hour:Minute:Second-MonthName-Day-Year. '%b' is the documented code for
    # the abbreviated month name; '%h' is a platform-specific alias for it.
    timestamp_format = '%H:%M:%S-%b-%d-%Y'
    timestamp = datetime.now().strftime(timestamp_format)

    destination = logfile if log_path is None else log_path
    with open(destination, "a", encoding="utf-8") as f:
        f.write(timestamp + ',' + message + '\n')

In [121]:
# --- Extract step: read the scraped JSON file into a DataFrame ---
log("Extract data from json file")
json_file = f"{c_path}bank_data.json"
extracted_data = extract_data_from_json(json_file)
log("Extract data Ended")

# --- Load step: write the extracted data out as CSV, logging start and end ---
log("Load phase Started")
get_CSV_file(extracted_data)
log("Load phase Ended")