In [1]:
from bs4 import BeautifulSoup

def extract_text_from_html(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
            return soup.get_text(separator=' ')
    except Exception as e:
        print(f"An error occurred: {e}")
        return None



In [3]:
from bs4 import BeautifulSoup
import re

def extract_text_from_column_with_words(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
            table = soup.find('table')
            if not table:
                print("No table found in the HTML file.")
                return None

            # Find all rows in the table
            rows = table.find_all('tr')
            if not rows:
                print("No rows found in the table.")
                return None

            # Initialize a list to store columns' data
            columns_data = []
            for row in rows:
                cells = row.find_all(['td', 'th'])
                for i, cell in enumerate(cells):
                    if len(columns_data) <= i:
                        columns_data.append([])
                    columns_data[i].append(cell.get_text(separator=' ').strip())

            # Function to check if a list of strings contains only words (no numbers)
            def contains_only_words(column):
                for cell in column:
                    if re.search(r'\d', cell):
                        return False
                return True

            # Find the first column that contains only words
            for column in columns_data:
                if contains_only_words(column):
                    return ' '.join(column)

            print("No column found that contains only words.")
            return None

    except Exception as e:
        print(f"An error occurred: {e}")
        return None


In [5]:
import re

def preprocess_text(text):
    # Remove non-alphanumeric characters (except spaces)
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def extract_text_from_html(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
            raw_text = soup.get_text(separator=' ')
            return preprocess_text(raw_text)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [7]:
from bs4 import BeautifulSoup
import re

def extract_text_from_html(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
            text = soup.get_text(separator=' ')
            
            # Filter out segments containing numbers
            words_only_text = ' '.join(word for word in text.split() if not re.search(r'\d', word))
            
            return words_only_text
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


In [10]:
from bs4 import BeautifulSoup
import re

def extract_first_column_text_without_numbers(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
            table = soup.find('table')
            if not table:
                print("No table found in the HTML file.")
                return None

            # Extract text from the first column
            first_column_text = []
            rows = table.find_all('tr')
            for row in rows:
                cells = row.find_all('td')
                if cells:
                    first_cell_text = cells[0].get_text(separator=' ').strip()
                    if not re.search(r'\d', first_cell_text):  # Check if the cell text contains numbers
                        first_column_text.append(first_cell_text)

            return ' '.join(first_column_text)

    except Exception as e:
        print(f"An error occurred: {e}")
        return None


In [12]:
from bs4 import BeautifulSoup
import re

def extract_column_with_highest_word_count(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
            table = soup.find('table')
            if not table:
                print("No table found in the HTML file.")
                return None

            rows = table.find_all('tr')
            if not rows:
                print("No rows found in the table.")
                return None

            # Initialize a list to store columns' data
            columns_data = []
            for row in rows:
                cells = row.find_all('td')
                for i, cell in enumerate(cells):
                    if len(columns_data) <= i:
                        columns_data.append([])
                    columns_data[i].append(cell.get_text(separator=' ').strip())

            # Count words in each column
            word_counts = [sum(len(re.findall(r'\b\w+\b', cell)) for cell in column) for column in columns_data]

            # Find the column with the highest word count
            max_word_count_index = word_counts.index(max(word_counts))
            highest_word_count_column = columns_data[max_word_count_index]

            # Filter out entries that contain numbers
            filtered_column = [cell for cell in highest_word_count_column if not re.search(r'\d', cell)]

            return ' '.join(filtered_column)

    except Exception as e:
        print(f"An error occurred: {e}")
        return None


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

def convert_to_bag_of_words(text):
    vectorizer = CountVectorizer()
    text_corpus = [text]
    bow_matrix = vectorizer.fit_transform(text_corpus)
    feature_names = vectorizer.get_feature_names_out()
    bow_array = bow_matrix.toarray()

    return bow_array, feature_names



In [15]:
# Example usage
file_path = r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\data\Balance Sheets\18320959_3.html'
extracted_text = extract_column_with_highest_word_count(file_path)
if extracted_text:
    bow_array, feature_names = convert_to_bag_of_words(extracted_text)
    print("Bag-of-Words Array:\n", bow_array)
    print("Feature Names:\n", feature_names)

Bag-of-Words Array:
 [[ 1  3 10  1  1  1  2  2  2 13  1  1  5  1  8  5  3  1  1  1  1  1 12  1
   2  2  2  6  9  1  1  1  1  1  2  1  1  2  1  2  4  1  6  2  1]]
Feature Names:
 ['above' 'and' 'assets' 'balances' 'bank' 'bj' 'borrowings' 'capital'
 'cash' 'current' 'deferred' 'equipment' 'equity' 'equivalents'
 'financial' 'ii' 'iii' 'in' 'intangible' 'inventories' 'investments'
 'ivj' 'liabilities' 'liability' 'loans' 'long' 'net' 'non' 'other'
 'particulars' 'payables' 'plant' 'progress' 'property' 'provisions'
 'receivables' 'share' 'short' 'standalone' 'tax' 'term' 'than' 'total'
 'trade' 'work']


In [17]:
from sklearn.feature_extraction.text import CountVectorizer

def extract_and_summarize_bow(extracted_text):
    try:
        vectorizer = CountVectorizer()
        bow_matrix = vectorizer.fit_transform([extracted_text])

        # Method 1: Total Word Count
        total_word_count = bow_matrix.sum()

        # Method 2: Average Word Frequency
        average_word_frequency = bow_matrix.mean()

        return total_word_count, average_word_frequency

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Example usage
file_path = r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\data\Balance Sheets\18320959_3.html'
extracted_text = extract_column_with_highest_word_count(file_path)
if extracted_text:
    total_word_count, average_word_frequency = extract_and_summarize_bow(extracted_text)
    print("Total Word Count:", total_word_count)
    print("Average Word Frequency:", average_word_frequency)


Total Word Count: 127
Average Word Frequency: 2.8222222222222206


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def calculate_single_value_bow(texts):
    vectorizer = CountVectorizer()
    bow_matrix = vectorizer.fit_transform(texts)
    
    # Method 1: Total Word Count
    total_word_count = bow_matrix.sum()

    # Method 2: Average Word Frequency
    average_word_frequency = bow_matrix.mean()

    return total_word_count, average_word_frequency

# Example usage
texts = extracted_text
total_word_count, average_word_frequency = calculate_single_value_bow(texts)
print("Total Word Count:", total_word_count)
print("Average Word Frequency:", average_word_frequency)


ValueError: Iterable over raw text documents expected, string object received.

In [13]:
# Example usage
file_path = r'C:\Users\elcot\Desktop\VS Code\Projects\Table Classification\data\Balance Sheets\18320959_3.html'  # Replace with the actual file path
extracted_text = extract_column_with_highest_word_count(file_path)
print(extracted_text)

Standalone Particulars A Assets (a) Property, Plant and Equipment (b) Capital work in progress (c) Intangible assets (d) Financial assets (i) Non-current investments (ii) Loans (iii) Other financial assets (e) Other non-current assets  Total Non-Current Assets (A)  (a) Inventories (b) Financial assets (i) Trade Receivables (ii) Cash and cash equivalents (iii) Bank balances other than (ii) above (ivj Loans (v) Other Financial assets (c) Other current assets  Total Current Assets (B)  B Equity & Liabilities  (a) Equity share capital (b) Other Equity  Total Equity (A)  Non-Current Liabilities (a) Financial Liabilities (i) Long term borrowings (ii) Other Non Current Financial Liability (bj Long term provisions (c) Deferred tax liabilities (Net)  Total Non-Current Liabilities (B)  Current Liabilities (a) Financial Liabilities (i) Short term borrowings (ii) Trade payables (iii) Other Current Financial liabilities (b) Other current liabilities (c)Short term provisions (d) Current tax liabilit