# Extracting table header as list

In [43]:
import collections
collections.Callable = collections.abc.Callable
from collections.abc import Iterable  # Import Iterable from collections.abc

In [44]:
from bs4 import BeautifulSoup, NavigableString
from html_table_extractor.extractor import Extractor
import pandas as pd
import os
import re
import difflib
from PyPDF2 import PdfReader 
from nltk.tokenize import word_tokenize
from nltk.corpus import words as nltk_words

In [45]:
# Input folders, relative to the directory the notebook is run from.
# The first is scanned for ".html" files and the second for ".pdf" files by the loops further down.
relative_path_html = "HTML/Output_html_Chapter1/kaggle/working/out/output/"
relative_path_pdf = "HTML/Output_html_Chapter1/kaggle/working/out/pages/"
# Get the absolute path by joining the current working directory with the relative path
absolute_path_html = os.path.join(os.getcwd(), relative_path_html)
absolute_path_pdf = os.path.join(os.getcwd(), relative_path_pdf)

In [46]:
# Filled by later cells: output_list receives one extracted table per HTML file,
# vocab receives the words read from the PDF pages.
output_list = []
vocab = []

In [47]:
output_list

[]

In [48]:
vocab

[]

In [49]:
allowed_two_letter_words = {'sl','sc', 'st', 'cc', 'tp', 'tb', 'ts', 'tg', 'km', 'gm', 'an', 'at', 'by', 'do', 'go', 'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'on', 'or', 'so', 'to', 'up', 'us'}

def replace_td_content(tag):
    """Replace the content of every ``<td>`` under *tag* with a cleaned text string.

    The tree is walked recursively.  For a ``td`` element the full text is
    taken, every character outside ASCII letters, digits, ``-``, ``.`` and
    ``%`` is turned into a space, and only the following tokens are kept:
    tokens of three or more characters, all-digit tokens, tokens listed in
    ``allowed_two_letter_words`` (compared lower-cased) and a lone ``%``.
    The surviving tokens, joined by single spaces, become the cell's only
    content.  Nothing is returned; the tree is modified in place.
    """
    # Plain text nodes have no children to descend into.
    if isinstance(tag, NavigableString):
        return

    if tag.name != 'td':
        # Not a data cell: keep looking further down the tree.
        if tag.contents:
            for node in tag.contents:
                replace_td_content(node)
        return

    permitted = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-.%")
    blanked = ''.join(ch if ch in permitted else ' ' for ch in tag.get_text())

    kept = []
    for token in blanked.split():
        if (len(token) >= 3
                or token.isdigit()
                or token.lower() in allowed_two_letter_words
                or token == '%'):
            kept.append(token)

    # Assigning .string discards whatever the cell contained before.
    tag.string = ' '.join(kept)


In [50]:
def process_table_cells(target_table):
    """Drop every table row that follows the cells '1' and '2'.

    Rows are visited in order.  Until a cell whose stripped text is exactly
    ``'1'`` and a cell whose stripped text is exactly ``'2'`` have both been
    seen, rows are only inspected; every row visited after that point is
    removed from the tree with ``decompose()``.  The row in which the second
    marker is found is itself kept.

    Prints a message and returns when *target_table* is falsy.
    """
    if not target_table:
        print("Table not found in the HTML content.")
        return

    # Which of the two marker cells have been encountered so far.
    marker_seen = {'1': False, '2': False}

    for row in target_table.find_all('tr'):
        if marker_seen['1'] and marker_seen['2']:
            row.decompose()
            continue

        for cell in row.find_all(['td', 'th']):
            text = cell.get_text(strip=True)
            if text in marker_seen:
                marker_seen[text] = True
            

In [51]:
def filter_english_characters(word_list):
    """Return the words from *word_list* that begin with an ASCII letter.

    Only the start of each word is tested (``re.match`` without an end
    anchor), so a word such as ``'uàz'`` is kept while ``'àbc'`` and ``'1a'``
    are dropped.  Input order is preserved.
    """
    begins_with_letter = re.compile("[a-zA-Z]+").match
    return [candidate for candidate in word_list if begins_with_letter(candidate)]

In [52]:
# Collect vocabulary words from the PDF pages found in absolute_path_pdf.
for file in os.listdir(absolute_path_pdf):
    # Only files whose name ends in ".pdf" are read
    if file.endswith(".pdf"):
        file_path = os.path.join(absolute_path_pdf, file)  # Construct the full file path
        with open(file_path, 'rb') as fp:
            reader = PdfReader(fp)  
            # Only the first page of each file is used
            page = reader.pages[0] 
            # Extract text from the page 
            text = page.extract_text()
            tokens = word_tokenize(text)
            # Keep purely alphabetic tokens, lower-cased
            clean_text = [word.lower() for word in tokens if word.isalpha()]
            filtered = filter_english_characters(clean_text)
            # vocab may contain duplicates at this point
            vocab.extend(filtered)

In [53]:
# De-duplicate the collected words; going through a set means the resulting order is arbitrary.
full_vocab = list(set(vocab))
full_vocab

['temporary',
 'sourace',
 'tmc',
 'uàzàuà',
 'council',
 'gddp',
 'statutory',
 'towns',
 'yet',
 'hassan',
 'note',
 'gross',
 'avàæ',
 'card',
 'aiàiázàvj',
 'zàqët',
 'nacs',
 'cc',
 'kannada',
 'inhabited',
 'govt',
 'tp',
 'gà',
 'monitoring',
 'yadgiri',
 'gáaiàäzàægàä',
 'davanagere',
 'apl',
 'above',
 'station',
 'permanent',
 'cylindercpàëaiàä',
 'female',
 'village',
 'value',
 'anthyodayaration',
 'as',
 'mysuru',
 'includes',
 'rs',
 'uttara',
 'water',
 'notified',
 'at',
 'kolar',
 'shivamogga',
 'state',
 'bodies',
 'ballari',
 'muncipal',
 'property',
 'civil',
 'r',
 'census',
 'working',
 'tumakuru',
 'without',
 'shops',
 'corporation',
 'current',
 'film',
 'rural',
 'revenue',
 'va',
 'bengaluru',
 'gvàûgà',
 'administration',
 'nada',
 'directorate',
 'fire',
 'nddp',
 'railway',
 'stations',
 'from',
 'dharawad',
 'chikkamagaluru',
 'capita',
 'lakhs',
 'municipal',
 'with',
 'primary',
 'urban',
 'villages',
 'mläö',
 'per',
 'police',
 'emergency',
 'of',
 'm

In [54]:
# Iterate through all files in absolute_path_html and extract one table per HTML file
for file in os.listdir(absolute_path_html): 
    # Only files whose name ends in ".html" are processed
    if file.endswith(".html"): 
        file_path = os.path.join(absolute_path_html, file)
        # NOTE(review): no encoding is passed to open(), so the platform default is used —
        # confirm it matches the encoding of the HTML files.
        with open(file_path) as fp:
            soup = BeautifulSoup(fp, "html.parser")
            
            # Drop every inline style attribute
            for tag in soup.find_all(attrs={'style': True}):
                tag.attrs.pop('style')
            
            # Only the first <table> of the document is used
            table_tag = soup.find('table')
            
            if table_tag is not None:
                # Clean the cell text, then drop the rows after the column-number markers
                replace_td_content(table_tag)

                process_table_cells(table_tag)

                extractor1 = Extractor(table_tag)
                extractor1.parse()
                output_list.append(extractor1.return_list())
            else:
                # No table found: report the file path
                print(file_path)
            

In [55]:
output_list

[[['1.10 ffl DISTRICT INCOME OF KARNATAKA',
   '1.10 ffl DISTRICT INCOME OF KARNATAKA',
   '1.10 ffl DISTRICT INCOME OF KARNATAKA',
   '1.10 ffl DISTRICT INCOME OF KARNATAKA',
   '1.10 ffl DISTRICT INCOME OF KARNATAKA',
   '1.10 ffl DISTRICT INCOME OF KARNATAKA',
   '1.10 ffl DISTRICT INCOME OF KARNATAKA',
   '1.10 ffl DISTRICT INCOME OF KARNATAKA'],
  ['Sl.No',
   'District',
   'At Current Prices 2013-14',
   'At Current Prices 2013-14',
   'At Constant 2011-12 Prices',
   'At Constant 2011-12 Prices',
   'Per capita NDDP In Rs.',
   'Per capita NDDP In Rs.'],
  ['Sl.No',
   'District',
   'Gross District Domestic Product GDDP Rs. inLakhs',
   'Net District Domestic Product NDDP Rs. inLakhs',
   'Gross District Domestic Product GDDP Rs. inLakhs',
   'Net District Domestic Product NDDP Rs. inLakhs',
   'At Current Prices 2013-14',
   'At Constant 2011-12 Prices'],
  ['1', '2', '52', '53', '54', '55', '56', '57']],
 [['General Information1.1 Nada Offices Village Accountant Circles Hobl

In [56]:
def add_space_before_year(input_string):
    """Insert a space between a letter and a run of four digits that follows it.

    ``'inLakhs2013'`` becomes ``'inLakhs 2013'``.  Four-digit runs that are
    not directly preceded by an ASCII letter are left as they are.
    """
    letter_then_digits = re.compile(r'([a-zA-Z])?(\d{4})')

    def _separate(found):
        letter = found.group(1)
        digits = found.group(2)
        # The letter group is optional; without it the text is returned unchanged.
        if not letter:
            return digits
        return f'{letter} {digits}'

    return letter_then_digits.sub(_separate, input_string)

In [57]:
def remove_before_all_occurrence(input_string):
    """Drop everything that precedes the last ``d.d``-style number in the string.

    A number here is one or two digits, a dot, and one or two digits, delimited
    by word boundaries (for example ``1.1``, ``1.10``, ``12.3``).  The text
    from the start of the last such number onwards is returned, stripped of
    surrounding whitespace.  When no number is present the stripped input is
    returned unchanged.

    The earlier implementation sliced the string once per match, but used
    offsets measured on the original string against the already shortened
    one, so with two or more matches the cut landed too far to the right.
    Cutting once at the last match gives the result the successive cuts were
    aiming for.
    """
    # Regular expression to match numbers with dot similar to dd.dd, d.d, d.dd, or dd.d
    pattern = re.compile(r'\b(\d\.\d|\d{2}\.\d|\d\.\d{2}|\d{2}\.\d{2})\b')

    last_start = None
    for match in pattern.finditer(input_string):
        last_start = match.start()

    if last_start is not None:
        input_string = input_string[last_start:]

    return input_string.strip()

In [58]:
def remove_after_all_occurrence(input_string):
    """Truncate the string right after its first ``d.d``-style number.

    A number here is one or two digits, a dot, and one or two digits, delimited
    by word boundaries.  Everything after the end of the first such number is
    removed and the result is stripped.  Without a match the stripped input is
    returned.
    """
    # Regular expression to match numbers with dot similar to dd.dd, d.d, d.dd, or dd.d
    first_hit = re.search(r'\b(\d\.\d|\d{2}\.\d|\d\.\d{2}|\d{2}\.\d{2})\b', input_string)

    # Cutting at later matches cannot shorten the text any further, so the first one decides.
    if first_hit is not None:
        input_string = input_string[:first_hit.end()]

    return input_string.strip()

In [59]:
def convert_uppercase_to_camel_case(input_string):
    """Capitalise every all-uppercase word and leave the other words untouched.

    ``'DISTRICT INCOME OF Karnataka'`` becomes ``'District Income Of Karnataka'``.
    Words are separated on whitespace and re-joined with single spaces, so
    runs of whitespace collapse as a side effect.
    """
    rebuilt = []
    for token in input_string.split():
        if token.isupper():
            token = token.capitalize()
        rebuilt.append(token)
    return ' '.join(rebuilt)

In [60]:
def add_space_before_camelcase(input_string):
    """Insert a space at every lowercase-to-uppercase letter boundary.

    ``'inLakhs'`` becomes ``'in Lakhs'``; only ASCII letters are considered.
    """
    # Zero-width match: the position between an [a-z] and an [A-Z].
    return re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', input_string)

In [61]:
def remove_extra_spaces(input_string):
    """Collapse each run of whitespace to one space and trim both ends."""
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', input_string)
    return collapsed.strip()

In [62]:
def add_space_before_word_with_condition(input_string):
    """Insert a space between a ``%`` sign and a word that directly follows it.

    ``'%to State total'`` becomes ``'% to State total'``.  Words that do not
    directly follow ``%`` are not touched.
    """
    word_after_percent = re.compile(r'(?<=[%])\b(\w+)\b', flags=re.IGNORECASE)

    # Re-emit the captured word with one leading space.
    return word_after_percent.sub(r' \1', input_string)

In [63]:
def process_one_data(input_data):
    """Clean every cell string of a three-level nested list in place.

    *input_data* is indexed as ``input_data[table][row][cell]``.  Each cell
    is passed, in this order, through ``add_space_before_year``,
    ``remove_before_all_occurrence``, ``add_space_before_camelcase``,
    ``convert_uppercase_to_camel_case``, ``remove_extra_spaces`` and
    ``add_space_before_word_with_condition``; the result of each step is
    written back to the same position.  Nothing is returned.
    """
    cleaning_steps = (
        add_space_before_year,
        remove_before_all_occurrence,
        add_space_before_camelcase,
        convert_uppercase_to_camel_case,
        remove_extra_spaces,
        add_space_before_word_with_condition,
    )

    for table in input_data:
        for row in table:
            for position in range(len(row)):
                for step in cleaning_steps:
                    row[position] = step(row[position])

In [64]:
process_one_data(output_list)
output_list

[[['1.10 ffl District Income Of Karnataka',
   '1.10 ffl District Income Of Karnataka',
   '1.10 ffl District Income Of Karnataka',
   '1.10 ffl District Income Of Karnataka',
   '1.10 ffl District Income Of Karnataka',
   '1.10 ffl District Income Of Karnataka',
   '1.10 ffl District Income Of Karnataka',
   '1.10 ffl District Income Of Karnataka'],
  ['Sl.No',
   'District',
   'At Current Prices 2013-14',
   'At Current Prices 2013-14',
   'At Constant 2011-12 Prices',
   'At Constant 2011-12 Prices',
   'Per capita Nddp In Rs.',
   'Per capita Nddp In Rs.'],
  ['Sl.No',
   'District',
   'Gross District Domestic Product Gddp Rs. in Lakhs',
   'Net District Domestic Product Nddp Rs. in Lakhs',
   'Gross District Domestic Product Gddp Rs. in Lakhs',
   'Net District Domestic Product Nddp Rs. in Lakhs',
   'At Current Prices 2013-14',
   'At Constant 2011-12 Prices'],
  ['1', '2', '52', '53', '54', '55', '56', '57']],
 [['31.03.2016.',
   '31.03.2016.',
   '31.03.2016.',
   '31.03.201

In [65]:
def extract_unmatched_words(out_list, full_vocab):
    """Return the lower-cased words of *out_list* that are missing from *full_vocab*.

    Parameters:
        out_list: arbitrarily nested iterables whose leaves are strings.
        full_vocab: iterable of known words (compared as given, so it is
            expected to hold lower-case entries).

    Returns:
        A set with every whitespace-separated token of the flattened strings
        that begins with an ASCII letter and, once lower-cased, does not
        appear in *full_vocab*.
    """
    # Helper function to flatten nested lists
    def flatten(items):
        for x in items:
            if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
                yield from flatten(x)
            else:
                yield x

    # Build the lookup once: testing membership against a list would rescan
    # the whole vocabulary for every single token.
    known_words = set(full_vocab)
    begins_with_letter = re.compile("[a-zA-Z]+").match

    # Flatten the out_list and tokenize words
    tokens = ' '.join(flatten(out_list)).split()

    return {
        token.lower()
        for token in tokens
        if begins_with_letter(token) and token.lower() not in known_words
    }

# Extract unmatched words
unmatched_words = extract_unmatched_words(output_list, full_vocab)
print("Unmatched English words:", unmatched_words)

Unmatched English words: {'g', 'efi', 'sl.no.', 'anthyodaya', 'cylinder', 'received', 'cal', 'cities', 'tankers', 'protected', 'ema', 'sl.no', 'jfl', 'akshayawithcylinder', 'cav', 'tw', 'urba', 'prisoner', 'cau', 'emergencyfirecall', 'un-inhabited', 't', 'ffl', 'male', 'bplcard', 'no.of', 'valueofthes', 'cin', 'rs.', 'nos.'}


In [66]:
# Maps each unmatched word to its single best vocabulary candidate, when difflib finds one.
closest_matches = {}

# Find closest match for each unmatched word
for word in unmatched_words:
    # n=1 asks for the best candidate only; the result is an empty list when nothing is close enough
    closest_match = difflib.get_close_matches(word, full_vocab, n=1)
    if closest_match:
        closest_matches[word] = closest_match[0]

# Normalise whitespace inside each matched word (split and re-join with single spaces)
for key in closest_matches:
    closest_matches[key] = ' '.join(closest_matches[key].split())

print("Closest matches:", closest_matches)

Closest matches: {'g': 'gà', 'anthyodaya': 'anthyodayaration', 'cylinder': 'cylinderbpl', 'cal': 'local', 'cities': 'circles', 'tankers': 'taluks', 'protected': 'product', 'ema': 'female', 'tw': 'town', 'urba': 'urban', 'prisoner': 'prisoners', 'emergencyfirecall': 'emergency', 'un-inhabited': 'inhabited', 't': 'tp', 'male': 'female', 'bplcard': 'card', 'valueofthes': 'value', 'cin': 'in', 'rs.': 'rs', 'nos.': 'nos'}


In [67]:
def replace_words_with_blank(input_string, words_to_replace):
    """Delete every whitespace-delimited token that equals one of *words_to_replace*.

    Matching ignores case.  A token only counts when it is bounded by
    whitespace or by the string ends, so ``'ffl.'`` is not removed for the
    word ``'ffl'``.  The spaces around a removed token are kept (which can
    leave a double space); the final result is stripped at both ends.
    """
    alternatives = '|'.join(re.escape(word) for word in words_to_replace)
    # (?<!\S) / (?!\S): not preceded / followed by a non-space character.
    token_pattern = re.compile(r'(?<!\S)(?:' + alternatives + r')(?!\S)', flags=re.IGNORECASE)

    stripped_of_words = token_pattern.sub('', input_string)
    return stripped_of_words.strip()


In [68]:
# Tokens that process_data() removes from every cell via replace_words_with_blank().
words_to_replace = ['g', 'efi', 'ema', 'jfl', 'cav', 'cau', 't', 'ffl']

In [69]:
def replace_words(input_string, word_replacements):
    """Swap whitespace-delimited tokens for their entry in *word_replacements*.

    Matching is case-sensitive and a token only counts when it is bounded by
    whitespace or by the string ends.  Tokens without an entry are left
    unchanged.
    """
    alternatives = '|'.join(re.escape(word) for word in word_replacements.keys())
    token_pattern = re.compile(r'(?<!\S)(?:' + alternatives + r')(?!\S)')

    def _lookup(found):
        token = found.group()
        # Fall back to the token itself when no replacement is registered.
        return word_replacements.get(token, token)

    return token_pattern.sub(_lookup, input_string)

In [70]:
# Token -> replacement pairs that process_data() applies with replace_words() (case-sensitive, whole tokens).
# 'ema' is also listed in words_to_replace, and process_data() blanks those tokens before
# replace_words() runs, so the 'ema' entry never takes effect.
# NOTE(review): 'male' -> 'female' rewrites every lower-case token "male" — confirm this is intended.
word_replacements = { 'cal': 'local', 'ema': 'female', 'tw': 'town', 'urba': 'urban', 'prisoner': 'prisoners', 'emergencyfirecall': 'emergency firecall', 'male': 'female', 'bplcard': 'bpl card', 'valueofthes': 'value of the'}

In [71]:
def process_data(input_data):
    """Apply the word removals and word replacements to every cell in place.

    *input_data* is indexed as ``input_data[table][row][cell]``.  Each cell
    first loses the tokens listed in the module-level ``words_to_replace``
    and is then rewritten using the module-level ``word_replacements``.
    Nothing is returned.
    """
    for table in input_data:
        for row in table:
            for position in range(len(row)):
                # Removal has to run first: it uses a case-insensitive match.
                row[position] = replace_words_with_blank(row[position], words_to_replace)
                row[position] = replace_words(row[position], word_replacements)
                

In [72]:
process_data(output_list)
output_list

[[['1.10  District Income Of Karnataka',
   '1.10  District Income Of Karnataka',
   '1.10  District Income Of Karnataka',
   '1.10  District Income Of Karnataka',
   '1.10  District Income Of Karnataka',
   '1.10  District Income Of Karnataka',
   '1.10  District Income Of Karnataka',
   '1.10  District Income Of Karnataka'],
  ['Sl.No',
   'District',
   'At Current Prices 2013-14',
   'At Current Prices 2013-14',
   'At Constant 2011-12 Prices',
   'At Constant 2011-12 Prices',
   'Per capita Nddp In Rs.',
   'Per capita Nddp In Rs.'],
  ['Sl.No',
   'District',
   'Gross District Domestic Product Gddp Rs. in Lakhs',
   'Net District Domestic Product Nddp Rs. in Lakhs',
   'Gross District Domestic Product Gddp Rs. in Lakhs',
   'Net District Domestic Product Nddp Rs. in Lakhs',
   'At Current Prices 2013-14',
   'At Constant 2011-12 Prices'],
  ['1', '2', '52', '53', '54', '55', '56', '57']],
 [['31.03.2016.',
   '31.03.2016.',
   '31.03.2016.',
   '31.03.2016.',
   '31.03.2016.',
 

In [88]:
# Transpose each table in output_list: rows become columns, so every inner list holds
# one column's cells from top to bottom. zip() stops at the shortest row.
transp_list = [[list(pair) for pair in zip(*sublist)] for sublist in output_list]
transp_list

[[['1.9',
   'Non-Priority Card Holders with and without gas cylinder on 31-3-2020 Cumulative',
   'Si.n',
   'Si.n',
   '1'],
  ['1.9',
   'Non-Priority Card Holders with and without gas cylinder on 31-3-2020 Cumulative',
   'Districts',
   'Districts',
   '2'],
  ['1.9',
   'Non-Priority Card Holders with and without gas cylinder on 31-3-2020 Cumulative',
   'with cylinder',
   'Urban',
   '55'],
  ['1.9',
   'Non-Priority Card Holders with and without gas cylinder on 31-3-2020 Cumulative',
   'with cylinder',
   'Rural',
   '56'],
  ['1.9',
   'Non-Priority Card Holders with and without gas cylinder on 31-3-2020 Cumulative',
   'with cylinder',
   'Total',
   '57'],
  ['1.9',
   'Non-Priority Card Holders with and without gas cylinder on 31-3-2020 Cumulative',
   'with cylinder',
   '% to State total',
   '58'],
  ['1.9',
   'Non-Priority Card Holders with and without gas cylinder on 31-3-2020 Cumulative',
   'without cylinder',
   'Urban',
   '59'],
  ['1.9',
   'Non-Priority Card 

In [89]:
def find_table_details(input_string):
    """Split a heading into its table id and table name.

    The table id starts at the first ``d.d``-style number (one or two digits,
    a dot, one or two digits, delimited by word boundaries) and runs up to
    the next space; the table name is whatever follows that space.

    Returns:
        ``(table_id, table_name)``, both stripped.  ``table_name`` is ``''``
        when nothing follows the id.  ``(None, None)`` when the string holds
        no such number.

    The previous version cut the string at the match and then indexed the
    shortened string with the old offset again, which produced an empty or
    wrong id whenever the number was not at the very start, and it mangled
    the result when no space followed the id.
    """
    # Regular expression to match numbers with dot similar to dd.dd, d.d, d.dd, or dd.d
    pattern = re.compile(r'\b(\d\.\d|\d{2}\.\d|\d\.\d{2}|\d{2}\.\d{2})\b')

    first_match = pattern.search(input_string)
    if first_match is None:
        return None, None

    # Everything from the id onwards; offsets are now relative to this remainder.
    remainder = input_string[first_match.start():]
    table_id, _, table_name = remainder.partition(' ')
    return table_id.strip(), table_name.strip()


In [90]:
def convert_to_number(string):
    """Parse *string* as an ``int`` if possible, otherwise as a ``float``.

    Raises:
        ValueError: when the text is neither a valid integer nor a valid float.
    """
    try:
        return int(string)
    except ValueError:
        pass  # not an integer literal; try the float form next

    try:
        return float(string)
    except ValueError:
        raise ValueError("Input string is not a valid number")

In [91]:
chapter_id = 1
chapter_name = "General Information"
#processing each column
def process_dictionary(input_data, output_paths=(
        '/home/riya/Downloads/ADT_2019-20/Output_html_Chapter1/Chapter1_Attributes.csv',
        '/home/riya/Downloads/ADT_2019-20/Dictionary/Chapter1_Attributes.csv')):
    """Build the attribute dictionary from transposed header tables and write it as CSV.

    Parameters:
        input_data: list of tables, each a list of columns, each column a list
            of cell strings ordered top to bottom.  The first two cells of a
            table's first column are taken as table id and table name.
        output_paths: one path or an iterable of paths; the same
            ';'-separated CSV is written to each.  Defaults to the two
            locations that were previously hard-coded.

    For every column the cells are read bottom-up:
      * reading stops as soon as a cell equals '1' or '2';
      * the bottom cell, when not empty, is converted to a number and used as
        the attribute id unless it repeats the id of the previous accepted
        column of the same table;
      * the cells above it, excluding the top two, are joined with ','
        into the description, skipping empty cells and immediate repeats.
    A row is recorded only when both an attribute id and a description were
    found.  Rows are sorted by ``Attr_id`` before writing.

    Tables that have no column, or whose first column has fewer than two
    cells, are skipped.  Returns None.
    """
    # Accept a single path as well as a collection of paths.
    if isinstance(output_paths, (str, os.PathLike)):
        output_paths = [output_paths]

    df = pd.DataFrame(columns = ['Attr_id', 'Chapter_id', 'Chapter_name', 'Table_id', 'Table_name', 'Description'])
    for inner_list in input_data:
        # Without two header cells there is no table id / name to record.
        if not inner_list or len(inner_list[0]) < 2:
            continue
        table_id = remove_extra_spaces(inner_list[0][0])
        table_name = remove_extra_spaces(inner_list[0][1])
        previd = 0
        for data_list in inner_list:
            attr_id = 0
            description = ''
            prev = ''
            # i counts cells from the bottom of the column, starting at 1.
            for i, data in enumerate(reversed(data_list), start=1):
                if data == '1' or data == '2':
                    break
                if i == 1 and data != '':
                    number = convert_to_number(data)
                    # A repeated number means the same attribute as the previous column.
                    if previd == 0 or number != previd:
                        attr_id = number
                        previd = number
                # Skip the bottom cell (the number) and the top two cells (table id and name).
                if 1 < i < len(data_list) - 1:
                    if description == '':
                        description = data
                        prev = data
                    elif prev != data and data != '':
                        description = ','.join([description, data])
                        prev = data
            if attr_id != 0 and description != "":
                row = {'Attr_id' : attr_id, 'Chapter_id' : chapter_id, 'Chapter_name' : chapter_name, 'Table_id' : table_id, 'Table_name' : table_name, 'Description' : remove_extra_spaces(description)}
                df.loc[len(df)] = row
    df = df.sort_values(by=['Attr_id'])
    for path in output_paths:
        df.to_csv(path, sep=';', index=False)


In [92]:
process_dictionary(transp_list)

In [23]:
#Not used
def remove_before_second_occurrence(input_string):
    """Drop everything that precedes the second ``d.d``-style number.

    A number here is one or two digits, a dot, and one or two digits, delimited
    by word boundaries.  With fewer than two such numbers the text is left as
    it is.  The result is stripped of surrounding whitespace in either case.
    """
    # Regular expression to match numbers with dot similar to dd.dd, d.d, d.dd, or dd.d
    hits = re.compile(r'\b(\d\.\d|\d{2}\.\d|\d\.\d{2}|\d{2}\.\d{2})\b').finditer(input_string)

    next(hits, None)            # skip the first occurrence
    second = next(hits, None)   # None when there is no second one

    if second is not None:
        input_string = input_string[second.start():]

    return input_string.strip()