In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
import re

In [2]:
# Function to extract text from the column with the highest word count in an HTML table
def extract_column_with_highest_word_count(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
            table = soup.find('table')  # Find the first table in the HTML file
            if not table:
                print(f"No table found in the HTML file: {file_path}")
                return None

            rows = table.find_all('tr')  # Find all rows in the table
            if not rows:
                print(f"No rows found in the table: {file_path}")
                return None

            # Initialize a list to store columns' data
            columns_data = []
            for row in rows:
                cells = row.find_all('td')  # Find all cells in the row
                for i, cell in enumerate(cells):
                    if len(columns_data) <= i:
                        columns_data.append([])
                    columns_data[i].append(cell.get_text(separator=' ').strip())  # Store the text of each cell

            # Count words in each column
            word_counts = [sum(len(re.findall(r'\b\w+\b', cell)) for cell in column) for column in columns_data]

            # Find the column with the highest word count
            max_word_count_index = word_counts.index(max(word_counts))
            highest_word_count_column = columns_data[max_word_count_index]

            # Filter out entries that contain numbers
            filtered_column = [cell for cell in highest_word_count_column if not re.search(r'\d', cell)]

            return ' '.join(filtered_column)  # Return the text of the column as a single string

    except Exception as e:
        print(f"An error occurred while processing the file {file_path}: {e}")
        return None


In [3]:
# Function to convert the extracted text into a bag-of-words summary
def extract_and_summarize_bow(extracted_text):
    try:
        vectorizer = CountVectorizer()
        bow_matrix = vectorizer.fit_transform([extracted_text])  # Convert text to BoW matrix

        # Method 1: Total Word Count
        total_word_count = bow_matrix.sum()  # Sum of word counts

        return total_word_count

    except Exception as e:
        print(f"An error occurred during BoW summarization: {e}")
        return None

In [4]:
# Function to process all HTML files in a given folder
def process_html_files_in_folder(folder_path):
    results = []
    for subdir, _, files in os.walk(folder_path):  # Iterate through the folder and its subdirectories
        for file in files:
            if file.endswith('.html'):  # Process only HTML files
                file_path = os.path.join(subdir, file)
                extracted_text = extract_column_with_highest_word_count(file_path)  # Extract text from the HTML file
                if extracted_text:
                    bow_summary = extract_and_summarize_bow(extracted_text)  # Summarize the text to a single value
                    folder_name = os.path.basename(subdir)  # Get the name of the folder
                    results.append({
                        'file_name': file,
                        'bag_of_words': bow_summary,
                        'folder_name': folder_name
                    })

    return results

In [5]:
# Function to save the results to a CSV file
def save_results_to_csv(results, output_csv):
    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)  # Save the results to a CSV file without the index


In [7]:
# Example usage
folder_paths = [
    r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\data\Balance Sheets',
    r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\data\Cash Flow',
    r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\data\Notes',
    r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\data\Income Statement',
    r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\data\Others'
]
output_csv = r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\output.csv'

all_results = []
for folder_path in folder_paths:
    results = process_html_files_in_folder(folder_path)  # Process each folder
    all_results.extend(results)  # Collect all results

save_results_to_csv(all_results, output_csv)  # Save all results to a single CSV file

An error occurred during BoW summarization: empty vocabulary; perhaps the documents only contain stop words
An error occurred during BoW summarization: empty vocabulary; perhaps the documents only contain stop words
An error occurred during BoW summarization: empty vocabulary; perhaps the documents only contain stop words
An error occurred during BoW summarization: empty vocabulary; perhaps the documents only contain stop words
An error occurred during BoW summarization: empty vocabulary; perhaps the documents only contain stop words
An error occurred during BoW summarization: empty vocabulary; perhaps the documents only contain stop words
An error occurred during BoW summarization: empty vocabulary; perhaps the documents only contain stop words
An error occurred during BoW summarization: empty vocabulary; perhaps the documents only contain stop words
An error occurred during BoW summarization: empty vocabulary; perhaps the documents only contain stop words
An error occurred during BoW

In [9]:
import os
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Function to extract text from the column with the highest word count in an HTML table
def extract_column_with_highest_word_count(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
            table = soup.find('table')
            if not table:
                print(f"No table found in the HTML file: {file_path}")
                return None

            rows = table.find_all('tr')
            if not rows:
                print(f"No rows found in the table: {file_path}")
                return None

            columns_data = []
            for row in rows:
                cells = row.find_all('td')
                for i, cell in enumerate(cells):
                    if len(columns_data) <= i:
                        columns_data.append([])
                    columns_data[i].append(cell.get_text(separator=' ').strip())

            word_counts = [sum(len(re.findall(r'\b\w+\b', cell)) for cell in column) for column in columns_data]
            max_word_count_index = word_counts.index(max(word_counts))
            highest_word_count_column = columns_data[max_word_count_index]
            filtered_column = [cell for cell in highest_word_count_column if not re.search(r'\d', cell)]

            return ' '.join(filtered_column)
    except Exception as e:
        print(f"An error occurred while processing the file {file_path}: {e}")
        return None

# Function to process all HTML files in a given folder
def process_html_files_in_folder(folder_path):
    results = []
    for subdir, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.html'):
                file_path = os.path.join(subdir, file)
                extracted_text = extract_column_with_highest_word_count(file_path)
                if extracted_text:
                    folder_name = os.path.basename(subdir)
                    results.append({
                        'file_name': file,
                        'extracted_text': extracted_text,
                        'folder_name': folder_name
                    })
    return results

# Function to save the results to a CSV file
def save_results_to_csv(results, output_csv):
    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)

# Example usage
folder_paths = [
    r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\data\Balance Sheets',
    r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\data\Cash Flow',
    r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\data\Notes',
    r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\data\Income Statement',
    r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\data\Others'
]
output_csv = r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\output.csv'

all_results = []
for folder_path in folder_paths:
    results = process_html_files_in_folder(folder_path)
    all_results.extend(results)

# Save the extracted texts to a CSV
temp_csv = r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\temp_extracted_texts.csv'
save_results_to_csv(all_results, temp_csv)

# Load the CSV and create TF-IDF features
df = pd.read_csv(temp_csv)

# Fill NaN values in the extracted_text column with an empty string
df['extracted_text'].fillna('', inplace=True)

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['extracted_text'])

# Add TF-IDF features to the DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
df = pd.concat([df, tfidf_df], axis=1)
df.drop(columns=['extracted_text'], inplace=True)  # Drop the text column if not needed

# Save the final DataFrame with TF-IDF features
final_output_csv = r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\final_output.csv'
df.to_csv(final_output_csv, index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['extracted_text'].fillna('', inplace=True)
