# Extracting table header as list

In [1]:
import collections
# Compatibility shim: re-create the ``collections.Callable`` alias by pointing
# it at ``collections.abc.Callable``.
# NOTE(review): presumably needed because one of the third-party packages
# imported in the next cell still references the old alias, which newer
# Python versions no longer provide -- confirm which package requires it.
collections.Callable = collections.abc.Callable

In [2]:
from bs4 import BeautifulSoup, NavigableString
from html_table_extractor.extractor import Extractor
import pandas as pd
import os
import re

In [3]:
# Every location is declared relative to the notebook's working directory and
# immediately anchored to it, so each relative/absolute pair sits together.

# Directory that is scanned for the per-page HTML files.
relative_path_html = "../HTML/Output_html_Chapter15/kaggle/working/out/output/"
absolute_path_html = os.path.join(os.getcwd(), relative_path_html)

# Sibling "pages" directory (presumably the source page images/PDFs -- verify).
relative_path_pdf = "../HTML/Output_html_Chapter15/kaggle/working/out/pages/"
absolute_path_pdf = os.path.join(os.getcwd(), relative_path_pdf)

# Destination of the attribute dictionary CSV.
relative_dictionary_path = "../Dictionary/Chapter15_Attributes.csv"
absolute_dictionary_path = os.path.join(os.getcwd(), relative_dictionary_path)

# Second copy of the same CSV, kept next to the HTML output.
relative_result_path = "../HTML/Output_html_Chapter15/Chapter15_Attributes.csv"
absolute_result_path = os.path.join(os.getcwd(), relative_result_path)

In [4]:
absolute_path_html

'/home/riya/Downloads/ADT/2015-16/Processing_Files/../HTML/Output_html_Chapter15/kaggle/working/out/output/'

In [5]:
os.listdir(absolute_path_html)

['output_page_1.html',
 'output_page_2.html',
 'output_page_5.html',
 'output_page_8.html',
 'output_page_3.html',
 'output_page_4.html',
 'output_page_7.html',
 'output_page_6.html']

In [6]:
# Accumulator for the parsed tables: the file loop further down appends one
# extractor result for every HTML page in which a <table> is found.
output_list = []

In [7]:
output_list

[]

In [8]:
# Two-letter tokens that survive the short-word filter in replace_td_content.
allowed_two_letter_words = {'sl','sc', 'st', 'cc', 'tp', 'tb', 'ts', 'tg', 'km', 'gm', 'an', 'at', 'by', 'do', 'go', 'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'on', 'or', 'so', 'to', 'up', 'us'}

def replace_td_content(tag):
    """Recursively clean the text of every ``<td>`` reachable from ``tag``.

    A ``<td>`` has its whole content replaced by a single cleaned string:
    characters other than ASCII letters, digits, ``-``, ``.`` and ``%`` become
    spaces, and a token is kept only if it has at least three characters, is
    all digits, is listed in ``allowed_two_letter_words`` (case-insensitive)
    or is exactly ``%``. Any other tag is only traversed. The tree is
    modified in place and nothing is returned.
    """
    # Bare text nodes carry no tag name or children, so there is nothing to do.
    if isinstance(tag, NavigableString):
        return

    if tag.name != 'td':
        # Not a cell: descend into the children, if there are any.
        if tag.contents:
            for node in tag.contents:
                replace_td_content(node)
        return

    keep = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-.%")
    raw_text = tag.get_text()
    spaced = ''.join(ch if ch in keep else ' ' for ch in raw_text)

    # Splitting on whitespace also collapses the runs of spaces created above.
    surviving = []
    for token in spaced.split():
        if (len(token) >= 3 or token.isdigit()
                or token.lower() in allowed_two_letter_words or token == '%'):
            surviving.append(token)

    # Assigning .string replaces everything the cell previously contained.
    tag.string = ' '.join(surviving)


In [9]:
def process_table_cells(target_table):
    """Remove every table row that follows the cells ``'1'`` and ``'2'``.

    Rows are visited in document order. Until a cell whose stripped text is
    ``'1'`` and a cell whose stripped text is ``'2'`` have both been seen
    (in the same row or in different rows), rows are only inspected. Every
    row visited after that point is removed from the tree with
    ``decompose()``. A falsy ``target_table`` prints a message and returns.
    """
    if not target_table:
        print("Table not found in the HTML content.")
        return

    markers = ('1', '2')
    seen = set()

    for row in target_table.find_all('tr'):
        if len(seen) == len(markers):
            # Both markers were found in earlier rows: drop this row.
            row.decompose()
            continue

        for cell in row.find_all(['td', 'th']):
            text = cell.get_text(strip=True)
            if text in markers:
                seen.add(text)
            

In [10]:
# Parse every HTML page in the input directory and collect its first table.
for file in os.listdir(absolute_path_html):
    # Anything that is not an .html file is ignored.
    if not file.endswith(".html"):
        continue

    file_path = os.path.join(absolute_path_html, file)
    with open(file_path) as fp:
        soup = BeautifulSoup(fp, "html.parser")

        # Strip the inline style from every tag that carries one.
        for tag in soup.find_all(attrs={'style': True}):
            del tag.attrs['style']

        table_tag = soup.find('table')

        if table_tag is None:
            # No table on this page: report the file and move on.
            print(file_path)
            continue

        # Clean the cell text, then drop the rows after the '1'/'2' cells.
        replace_td_content(table_tag)
        process_table_cells(table_tag)

        extractor1 = Extractor(table_tag)
        extractor1.parse()
        output_list.append(extractor1.return_list())

In [11]:
output_list

[[['15.1 twAi wAi Census Houses and their uses to which they put per 2011 Census',
   '15.1 twAi wAi Census Houses and their uses to which they put per 2011 Census',
   '15.1 twAi wAi Census Houses and their uses to which they put per 2011 Census',
   '15.1 twAi wAi Census Houses and their uses to which they put per 2011 Census',
   '15.1 twAi wAi Census Houses and their uses to which they put per 2011 Census',
   '15.1 twAi wAi Census Houses and their uses to which they put per 2011 Census',
   '15.1 twAi wAi Census Houses and their uses to which they put per 2011 Census',
   '15.1 twAi wAi Census Houses and their uses to which they put per 2011 Census',
   '15.1 twAi wAi Census Houses and their uses to which they put per 2011 Census'],
  ['',
   '.ffl',
   'No.of Census Houses',
   'No.of Census Houses',
   'No.of Census Houses',
   'Occupied Census Houses used',
   'Occupied Census Houses used',
   'Occupied Census Houses used',
   'Occupied Census Houses used'],
  ['',
   '.ffl',
 

In [12]:
def add_space_before_year(input_string):
    """Insert a space between a letter and four digits that directly follow it.

    For example ``'Census2011'`` becomes ``'Census 2011'``. Four-digit runs
    that are not preceded by an ASCII letter are left as they are.
    """
    year_pattern = re.compile(r'([a-zA-Z])?(\d{4})')

    def _separate(match):
        # Group 1 is None when no letter sits right before the digits.
        letter, digits = match.group(1), match.group(2)
        return f'{letter} {digits}' if letter else digits

    return year_pattern.sub(_separate, input_string)

In [13]:
def remove_before_all_occurrence(input_string):
    """Drop everything that precedes the last ``d.d``-style number in the text.

    A number here is ``d.d``, ``dd.d``, ``d.dd`` or ``dd.dd`` delimited by
    word boundaries (for example a table id such as ``15.1``). The text from
    the start of the last such number onwards is kept; when no number is
    present the whole string is kept.

    Args:
        input_string: Text to trim.

    Returns:
        The kept text with surrounding whitespace stripped.
    """
    pattern = re.compile(r'\b(\d\.\d|\d{2}\.\d|\d\.\d{2}|\d{2}\.\d{2})\b')

    matches = list(pattern.finditer(input_string))

    # Match offsets refer to the original string, so slice exactly once.
    # Re-slicing the shortened string with each original offset in turn (as
    # the previous loop did) cuts at the sum of the offsets and lands at the
    # wrong place as soon as there is more than one match.
    if matches:
        input_string = input_string[matches[-1].start():]

    return input_string.strip()

In [14]:
def convert_uppercase_to_camel_case(input_string):
    """Capitalize every all-uppercase word and leave the other words alone.

    Words are split on whitespace and re-joined with single spaces, so
    ``'TOTAL houses'`` becomes ``'Total houses'``.
    """
    converted = []
    for token in input_string.split():
        if token.isupper():
            token = token.capitalize()
        converted.append(token)
    return ' '.join(converted)

In [15]:
def add_space_before_camelcase(input_string):
    """Insert a space wherever a lowercase letter is directly followed by an
    uppercase letter, e.g. ``'camelCase'`` -> ``'camel Case'``."""
    # The pattern is zero-width: it matches the gap between the two letters.
    return re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', input_string)

In [16]:
def remove_extra_spaces(input_string):
    """Collapse each run of whitespace to one space and trim both ends."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', input_string).strip()

In [17]:
def add_space_before_word_with_condition(input_string):
    """Put a space between a ``%`` sign and a word that immediately follows it.

    ``'50%of'`` becomes ``'50% of'``; a ``%`` that is already followed by a
    space or by nothing is left unchanged.
    """
    # Group 1 is the word glued to the '%'; re-emit it behind a single space.
    return re.sub(r'(?<=[%])\b(\w+)\b', r' \1', input_string, flags=re.IGNORECASE)

In [18]:
def replace_words_with_blank(input_string, words_to_replace):
    """Delete every whitespace-delimited occurrence of the given phrases.

    Matching ignores case. A phrase is removed only when it is bounded by
    whitespace or the string edges on both sides, so ``'ffl'`` does not touch
    ``'Affl'``. The spaces around a removed phrase are kept; only the two
    ends of the result are stripped.
    """
    alternatives = '|'.join(map(re.escape, words_to_replace))
    # (?<!\S) / (?!\S): no non-space character directly before / after.
    token_pattern = re.compile(
        r'(?<!\S)(?:' + alternatives + r')(?!\S)',
        flags=re.IGNORECASE,
    )
    return token_pattern.sub('', input_string).strip()


In [19]:
# Tokens and phrases that process_data blanks out of every cell through
# replace_words_with_blank (matched case-insensitively, whitespace-delimited).
# NOTE(review): these look like mis-decoded glyph residue plus stray dates and
# numbers from the source pages -- confirm. 'Raing' is listed twice; the
# duplicate is harmless but redundant.
words_to_replace = ['UAS', 'llsAi', 'ASN', 'mez', '3 jAz 6', 'AASU', 'tAvP', 'uka', 'QjAi', 'flAi', '30.9.2015', 'iaxled', '2005Dy', 'zPr', 'AUq', 'MAz', 'sfi', 'Raing', 'ema', '.CA', '.Dg', '.G.S', '.Jfl', '.Q.', '.ffl', '1 2', '1-10', '2 01', '2005 jAz 2015', '2006 jAz 2016', '2011 twAi rAi AaP fiU %', '2011 twAi wAi EAzs %', '20112.2', '20112.6', '2011g', '24 7', '31.03.2016', '31.3.201', '31.3.2016', '31.3.2016.', '33 jAz 6', '6 jAz 1', '6 jAz 14', '6 wAU', 'A.ffl', 'A.g', 'AdP', 'Adg', 'Affl', 'Afi', 'Afli', 'Aid', 'Arg', 'Azs', 'CAU', 'CAZ', 'CAv', 'CAw', 'CAz', 'Cfi', 'CrP', 'Dfi', 'Dfl', 'E.J .n. .Dg .Jfl .JA.', 'EAf', 'Efi', 'Efl', 'JfiP', 'Jfl', 'KfiQ', 'MlM', 'Pfi', 'QAi', 'Raing', 'Rfl', 'SQA', 'Three', 'VAi', 'dfl', 'eng cy.U', 'ffl', 'fiAi', 'fiP', 'fiQ', 'fiQAi', 'fiU', 'fig', 'fl.', 'gLi', 'gMl', 'jAU', 'jAi', 'lion nit', 'ofD', 'rAi', 'tAwAi', 'tP', 'twAi', 'tzx', 'wAi', 'tzs']

In [20]:
def replace_words(input_string, word_replacements):
    """Swap whitespace-delimited tokens for their mapped replacement.

    ``word_replacements`` maps the exact (case-sensitive) token to the text
    that should take its place. A token is replaced only when it is bounded
    by whitespace or the string edges on both sides.
    """
    alternatives = '|'.join(re.escape(key) for key in word_replacements)
    token_pattern = re.compile(r'(?<!\S)(?:' + alternatives + r')(?!\S)')

    def _lookup(match):
        found = match.group()
        # Fall back to the matched text itself if it is not a key.
        return word_replacements.get(found, found)

    return token_pattern.sub(_lookup, input_string)

In [21]:
# Exact, case-sensitive token -> replacement text, applied to every cell by
# process_data through replace_words.
word_replacements = { "used" : "used as", "jShop" : "Shop", ".Distribution" : "Distribution", "wScheduled" : "Scheduled", "B.R..AMBEDKAR" : "B.R.AMBEDKAR"}

In [22]:
def process_data(input_data):
    """Run the text clean-up pipeline over every cell, in place.

    ``input_data`` is a list of tables, each a list of rows, each a list of
    cell strings. Every cell is replaced by its cleaned version; the steps
    run in the fixed order below. Nothing is returned.
    """
    for table in input_data:
        for row in table:
            for position, text in enumerate(row):
                text = add_space_before_year(text)
                text = replace_words_with_blank(text, words_to_replace)
                text = replace_words(text, word_replacements)
                text = remove_before_all_occurrence(text)
                text = add_space_before_camelcase(text)
                text = convert_uppercase_to_camel_case(text)
                text = remove_extra_spaces(text)
                text = add_space_before_word_with_condition(text)
                row[position] = text

In [23]:
process_data(output_list)
output_list

[[['15.1 Census Houses and their uses to which they put per 2011 Census',
   '15.1 Census Houses and their uses to which they put per 2011 Census',
   '15.1 Census Houses and their uses to which they put per 2011 Census',
   '15.1 Census Houses and their uses to which they put per 2011 Census',
   '15.1 Census Houses and their uses to which they put per 2011 Census',
   '15.1 Census Houses and their uses to which they put per 2011 Census',
   '15.1 Census Houses and their uses to which they put per 2011 Census',
   '15.1 Census Houses and their uses to which they put per 2011 Census',
   '15.1 Census Houses and their uses to which they put per 2011 Census'],
  ['',
   '',
   'No.of Census Houses',
   'No.of Census Houses',
   'No.of Census Houses',
   'Occupied Census Houses used as',
   'Occupied Census Houses used as',
   'Occupied Census Houses used as',
   'Occupied Census Houses used as'],
  ['',
   '',
   'Total no. of Census houses',
   'No.of vacant houses %',
   'No .of Occupi

In [24]:
# Transpose every table: each inner list then holds one column (top cell
# first) instead of one row. zip() stops at the shortest row of a table.
transp_list = [list(map(list, zip(*table))) for table in output_list]
transp_list

[[['15.1 Census Houses and their uses to which they put per 2011 Census',
   '',
   '',
   '1'],
  ['15.1 Census Houses and their uses to which they put per 2011 Census',
   '',
   '',
   '2'],
  ['15.1 Census Houses and their uses to which they put per 2011 Census',
   'No.of Census Houses',
   'Total no. of Census houses',
   '895'],
  ['15.1 Census Houses and their uses to which they put per 2011 Census',
   'No.of Census Houses',
   'No.of vacant houses %',
   '896'],
  ['15.1 Census Houses and their uses to which they put per 2011 Census',
   'No.of Census Houses',
   'No .of Occupied houses %',
   '897'],
  ['15.1 Census Houses and their uses to which they put per 2011 Census',
   'Occupied Census Houses used as',
   'Residence %',
   '898'],
  ['15.1 Census Houses and their uses to which they put per 2011 Census',
   'Occupied Census Houses used as',
   'Residence cum other use %',
   '899'],
  ['15.1 Census Houses and their uses to which they put per 2011 Census',
   'Occupied 

In [25]:
def find_table_details(input_string):
    """Split a header cell into its table number and table title.

    The table number is located with the first ``d.d``, ``dd.d``, ``d.dd`` or
    ``dd.dd`` match (delimited by word boundaries). The id is the text from
    that match up to the next space; the title is everything after that
    space.

    Args:
        input_string: Header text such as ``'15.1 Census Houses'``.

    Returns:
        ``(table_id, table_name)``, both stripped. ``table_name`` is ``''``
        when nothing follows the id. ``(None, None)`` when the text contains
        no such number.
    """
    pattern = re.compile(r'\b(\d\.\d|\d{2}\.\d|\d\.\d{2}|\d{2}\.\d{2})\b')

    first_match = pattern.search(input_string)
    if first_match is None:
        return None, None

    # Continue on the text from the number onwards. Inside this slice the id
    # starts at offset 0, so the match offset must not be applied again.
    remainder = input_string[first_match.start():]

    # partition() also covers the "no space after the id" case, where
    # str.find would return -1 and slicing with it would chop the last
    # character off the id.
    table_id, _, table_name = remainder.partition(' ')
    return table_id.strip(), table_name.strip()


In [26]:
def convert_to_number(string):
    """Parse ``string`` as an ``int`` if possible, otherwise as a ``float``.

    Raises:
        ValueError: if neither conversion accepts the input.
    """
    # Try the stricter conversion first so '895' stays an int.
    for caster in (int, float):
        try:
            return caster(string)
        except ValueError:
            continue
    raise ValueError("Input string is not a valid number")

In [28]:
# Chapter metadata copied into every row written by process_dictionary.
chapter_id = 15
chapter_name = "Housing"
# Build one dictionary row per table column.
def process_dictionary(input_data):
    """Build the attribute dictionary from transposed tables and write it as CSV.

    ``input_data`` is iterated as a list of tables, each table as a list of
    columns, and each column as a list of cell strings. The first cell of
    the first column of a table is passed to ``find_table_details`` to get
    the table id and name. Every column is then walked from its last cell
    to its first:

    * the last cell, if non-empty, is converted with ``convert_to_number``
      and becomes the attribute id, unless it equals the id accepted most
      recently in the same table;
    * the cells between the last and the first one are joined with ``','``
      (in that bottom-to-top order) to form the description, skipping empty
      cells and cells equal to the one kept just before;
    * the walk stops at the first cell equal to ``'1'`` or ``'2'``.

    A row is recorded only when both an attribute id and a non-empty
    description were found. The rows are sorted by ``Attr_id`` and written,
    ``;``-separated and without the index, to ``absolute_result_path`` and to
    ``absolute_dictionary_path``. Nothing is returned.

    Raises:
        ValueError: propagated from ``convert_to_number`` when a non-empty
            last cell is not numeric.
    """
    df = pd.DataFrame(columns = ['Attr_id', 'Chapter_id', 'Chapter_name', 'Table_id', 'Table_name', 'Description'])
    for inner_list in input_data:
        table_id, table_name = find_table_details(inner_list[0][0])
        # Most recently accepted attribute id in this table (0 = none yet).
        previd=0
        for data_list in inner_list:
            # i counts the cells visited so far, starting from the bottom.
            i=0
            # 0 doubles as the "no id found" marker checked after the walk.
            attr_id=0
            description = ''
            # Last cell appended to the description, to skip repeats.
            prev = ''
            for data in reversed(data_list):
                # A '1' or '2' cell ends the walk for this column.
                if data == '1' or data == '2':
                    break
                i = i+1;
                # Bottom cell: take it as the id unless it repeats previd.
                if i == 1 and data!='' and (previd==0 or convert_to_number(data)!=previd):
                    attr_id = convert_to_number(data)
                    previd = convert_to_number(data)
                # Cells strictly between the bottom cell and the top cell.
                if i > 1 and i < len(data_list):
                    if description == '':
                        description = data
                        prev = data
                    else : 
                        if prev != data and data!='':
                            description = ','.join([description, data]) #description + ' ' + data
                            prev = data
            if attr_id != 0 and description != "" :
                row = {'Attr_id' : attr_id, 'Chapter_id' : chapter_id, 'Chapter_name' : chapter_name, 'Table_id' : table_id, 'Table_name' : table_name, 'Description' : remove_extra_spaces(description)}
                # Append at the next integer label.
                df.loc[len(df)] = row
    df = df.sort_values(by=['Attr_id'])
    # The same frame is written to both destinations.
    df.to_csv(absolute_result_path, sep=';', index=False)
    df.to_csv(absolute_dictionary_path, sep=';', index=False)
                                            

In [29]:
process_dictionary(transp_list)