In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import os

In [2]:
# Helpers and entry point for extracting the title and trackData from HTML files.

# Matches the body of a `var trackData = {...};` assignment inside a <script> tag.
# Compiled once here instead of once per processed file.
_TRACKDATA_PATTERN = re.compile(r"var trackData = {(.*?)};", re.DOTALL)


def _split_outside_quotes(text: str, delimiter: str, maxsplit: int = -1) -> list:
    """Split ``text`` on ``delimiter``, ignoring delimiters inside double quotes.

    Args:
        text: The string to split.
        delimiter: A single character to split on.
        maxsplit: Maximum number of splits to perform; a negative value
            means no limit.

    Returns:
        The list of pieces, quotes and escape characters left untouched.
    """
    parts = []
    current = []
    in_quotes = False
    escaped = False
    for ch in text:
        if escaped:
            # Character following a backslash inside a quoted string is literal
            current.append(ch)
            escaped = False
        elif in_quotes:
            current.append(ch)
            if ch == "\\":
                escaped = True
            elif ch == '"':
                in_quotes = False
        elif ch == '"':
            current.append(ch)
            in_quotes = True
        elif ch == delimiter and (maxsplit < 0 or len(parts) < maxsplit):
            parts.append("".join(current))
            current = []
        else:
            current.append(ch)
    parts.append("".join(current))
    return parts


def _parse_track_data(track_data: str) -> dict:
    """Turn the text between the braces of ``trackData`` into a flat dict.

    Entries are separated by commas and each entry is split on its FIRST
    colon only, so values that themselves contain colons (URLs, times) are
    kept instead of being discarded. Commas and colons inside double-quoted
    strings are not treated as separators. Surrounding whitespace and double
    quotes are stripped from keys and values; values stay strings.

    Args:
        track_data: Raw text captured from inside ``{ ... }``; may be empty.

    Returns:
        Mapping of key to string value; empty when nothing could be parsed.
    """
    parsed = {}
    for item in _split_outside_quotes(track_data, ","):
        key_value = _split_outside_quotes(item, ":", maxsplit=1)
        # Entries without a colon (e.g. after a trailing comma) carry no data
        if len(key_value) == 2:
            key = key_value[0].strip().strip('"')
            value = key_value[1].strip().strip('"').strip()
            parsed[key] = value
    return parsed


def html_parser(input: str, output: str) -> None:
    """Extract the title and trackData of every HTML file in a directory.

    Each regular file in ``input`` whose name ends with ``.html`` is parsed.
    One row is produced per file, holding a ``Title`` column plus one column
    per key found in the first ``var trackData = {...};`` assignment located
    in a ``<script>`` tag. All rows are written to ``output`` as an Excel
    sheet without the index column.

    Args:
        input: Path of the directory containing the HTML files.
        output: Path of the Excel file to write.

    Returns:
        None. A message is printed on success; if saving fails the exception
        is printed rather than raised.

    Raises:
        OSError: If ``input`` cannot be listed or a file cannot be read
            (these calls are not wrapped in the save error handling).
    """
    # Collect one dict per processed file
    all_data = []

    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the output is reproducible between runs
    for file_name in sorted(os.listdir(input)):
        file_path = os.path.join(input, file_name)

        # Process only regular files with an .html extension; a directory
        # named "*.html" would otherwise make open() fail
        if not file_name.endswith(".html") or not os.path.isfile(file_path):
            continue

        with open(file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")

        # get_text() also covers <title> tags with nested markup, where
        # .string would be None and the title would be lost
        title = soup.title.get_text() if soup.title else "No title"

        # Take trackData from the first <script> tag that defines it
        track_data = ""
        for script in soup.find_all("script"):
            match = _TRACKDATA_PATTERN.search(script.string or "")
            if match:
                track_data = match.group(1)
                break

        data = {"Title": title}
        data.update(_parse_track_data(track_data))
        all_data.append(data)

    # Create a DataFrame from the collected data
    df = pd.DataFrame(all_data)

    # Save the DataFrame to an Excel file; failures are reported, not raised
    try:
        df.to_excel(output, index=False)
    except Exception as ex:
        print(f"Error saving file: {ex}")
    else:
        print(f"Data extracted and saved to {output}")

In [None]:
# Folder that holds the HTML files to be parsed
input_dir = "/home/raghava/Projects/Bigdata_class/Input_data/"

# Destination path of the generated .xlsx file
output_file = "/home/raghava/Projects/Bigdata_class/Output_data/output.xlsx"

# Run the extraction over the folder and write the result
html_parser(input=input_dir, output=output_file)