In [11]:
import json
import pandas as pd
import os

In [12]:
def load_excel_files(folder_path):
    excel_files = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.xlsx'):
            excel_files.append(pd.read_excel(
                os.path.join(folder_path, filename)))
    return excel_files

In [13]:
# Function to check if column names are the same
def check_column_lengths(df_list):
    column_lengths = [set(df.columns) for df in df_list]
    return column_lengths

In [14]:
from collections import defaultdict

# Function to merge DataFrames with the same number of columns


def concatenate_dfs(df_list):
    # Group DataFrames by number of columns
    grouped_dfs = defaultdict(list)
    for df in df_list:
        grouped_dfs[len(df.columns)].append(df)

    # List to hold merged DataFrames
    merged_dfs = []

    # Merge DataFrames in each group
    for key in grouped_dfs:
        # Use the column names from the first DataFrame in the group for all DataFrames
        standard_columns = grouped_dfs[key][0].columns
        adjusted_dfs = []

        for df in grouped_dfs[key]:
            df_new = df.copy()
            df_new.columns = standard_columns

            # Append the adjusted DataFrame to the list
            adjusted_dfs.append(df_new)

        # Concatenate DataFrames in the same group
        merged_df = pd.concat(adjusted_dfs, ignore_index=True)
        merged_dfs.append(merged_df)

    return merged_dfs

In [15]:
def get_column_headers(table, col_count, table_index):
    col_headers = []
    cells = table.get('cells')
    if table_index == 0:
        for i in range(col_count):
            col_cells = [cell for cell in cells if cell.get(
                'column_index') == i and cell.get('kind') == 'columnHeader']
            if col_cells:
                max_row_in_column = max([cell.get('row_index')
                                        for cell in col_cells])
                for cell in col_cells:
                    if cell.get('row_index') == max_row_in_column:
                        header_content = cell.get('content')
                        if header_content == '':
                            col_headers.append(f"Unnamed: {i}")
                        else:
                            col_headers.append(cell.get('content'))
            else:
                col_headers.append(f"Unnamed: {i}")
    else:
        for i in range(col_count):
            col_headers.append(f"Unnamed: {i}")
    return col_headers

In [16]:
def get_table_data(table):
    table_data = []
    cells = table.get('cells')
    for row in range(table.get('row_count')):
        row_content = []
        for col in range(table.get('column_count')):
            cell = next((cell for cell in cells if cell.get(
                'row_index') == row and cell.get('column_index') == col), None)
            if cell and cell.get('kind') == 'content':
                row_content.append(cell.get('content'))
            else:
                row_content.append(None)
        # Check if all values in the row are None
        if not all(value is None for value in row_content):
            table_data.append(row_content)
    return table_data

In [17]:
# Function to append text to a file
def append_text_to_file(filename, text):
    # Open the file in append mode ('a') and text mode ('t'), hence 'at'
    with open(filename, 'at') as file:
        # Write the text followed by a newline character
        file.write(text + "\n")

In [18]:
import pandas as pd
import re

def clean_df(df):
    # Step 1: Replace ":unselected:" and ":selected:" with an empty string
    df.replace(to_replace=r":unselected:|:selected:", value="", regex=True, inplace=True)
    
    # Step 2: Extract last word if spaces are present, then remove non-alphanumeric characters
    def clean_cell(x):
        if isinstance(x, str):
            # Check if there are spaces and take the last part
            parts = x.strip().split()
            last_part = parts[-1] if parts else ""
            # Remove non-alphanumeric characters from the last part
            return re.sub(r'[^a-zA-Z0-9]', '', last_part)
        return x

    # Apply the cleaning function to each cell in the DataFrame
    cleaned_df = df.applymap(clean_cell)
    
    return cleaned_df

In [26]:

def final_run():
    json_dir = 'results/Parsed_Pdfs/Chhattisgarh/AE_2018'
    # json_dir = 'results/Parsed_Pdfs/Maharastra/Assembly Election 2019/JSON_Maharastra_2019_AC_230.json'
    # output_dir = 'Parsed_Pdfs/Maharastra/Assembly Election 2019/Excel_Files'
    # os.makedirs(output_dir, exist_ok=True)

    log_file_name = "llogs/CH_json_to_excel_AE_2018.txt"

    for filename in os.listdir(json_dir):
        if filename.endswith('.json'):
            # if filename.endswith('AC_038.json'):

            with open(os.path.join(json_dir, filename)) as f:
                data = json.load(f)
            try:
                tables = data['tables']

                tables_df = []
                for table_index, table in enumerate(tables):
                    table_data = get_table_data(table)
                    initial_df = pd.DataFrame(table_data)
                    current_headers = get_column_headers(
                        table, col_count=initial_df.shape[1], table_index=table_index)
                    # col_count = table.get('column_count', len(current_headers))
                    # if len(current_headers) < col_count:
                    #     current_headers.extend([f"Unnamed: {i}" for i in range(len(current_headers), col_count)])
                    # elif len(current_headers) > col_count:
                    #     current_headers = current_headers[:col_count]
                    df = pd.DataFrame(table_data, columns=current_headers)

                    # # Construct output filename and path
                    # output_filename = f"table_{table_index + 1}_from_{filename.replace('.json', '')}.xlsx"
                    # output_filepath = os.path.join(output_dir, output_filename)

                    # # df.replace({":unselected:": "", ":selected:": ""}, inplace=True)
                    # with pd.ExcelWriter(output_filepath, engine='xlsxwriter') as writer:
                    #     df.to_excel(writer, sheet_name='Sheet1', index=False)
                    tables_df.append(df)

                if len(tables_df) > 0:
                    merged_dfs = concatenate_dfs(tables_df)
                else:
                    print(f"No tables found in {filename}")
                    continue

                # excel_files = load_excel_files(output_dir)
                # if len(excel_files) < 2:
                #     print("There are not enough Excel files to combine.")
                #     return

                folder_path = 'results/Parsed_Excel/Chhattisgarh/AE_2018'
                os.makedirs(folder_path, exist_ok=True)

                output_file = os.path.join(
                    folder_path, f"combined_{filename}.xlsx")
                with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:

                    for index, df in enumerate(merged_dfs):
                        sheet_name = f'Sheet_{index+1}'

                        cleaned_df = clean_df(df)

                        cleaned_df.to_excel(
                            writer, index=False, sheet_name=sheet_name)

                print("Excel files combined and saved successfully.")
            except Exception as exc:
                print(f"Error processing {filename}: {exc}")
                append_text_to_file(log_file_name, f"{filename} \n")



In [27]:
final_run()

  cleaned_df = df.applymap(clean_cell)


Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel files combined and saved successfully.
Excel file

In [None]:

# def final_run():
#     # json_dir = 'results/Parsed_Pdfs/Maharastra/'
#     json_dir = 'results/Parsed_Pdfs/Maharastra/Assembly Election 2019/'
#     # output_dir = 'Parsed_Pdfs/Maharastra/Assembly Election 2019/Excel_Files'
#     # os.makedirs(output_dir, exist_ok=True)

#     log_file_name = "logs/maharastra_2019_excelfile_log.txt"

#     for filename in os.listdir(json_dir):
#         if filename.endswith('.json'):
#             # if filename.endswith('AC_038.json'):

#             with open(os.path.join(json_dir, filename)) as f:
#                 data = json.load(f)
#             try:
#                 tables = data['tables']

#                 tables_df = []
#                 for table_index, table in enumerate(tables):
#                     table_data = get_table_data(table)
#                     initial_df = pd.DataFrame(table_data)
#                     current_headers = get_column_headers(
#                         table, col_count=initial_df.shape[1], table_index=table_index)
#                     # col_count = table.get('column_count', len(current_headers))
#                     # if len(current_headers) < col_count:
#                     #     current_headers.extend([f"Unnamed: {i}" for i in range(len(current_headers), col_count)])
#                     # elif len(current_headers) > col_count:
#                     #     current_headers = current_headers[:col_count]
#                     df = pd.DataFrame(table_data, columns=current_headers)

#                     # # Construct output filename and path
#                     # output_filename = f"table_{table_index + 1}_from_{filename.replace('.json', '')}.xlsx"
#                     # output_filepath = os.path.join(output_dir, output_filename)

#                     # # df.replace({":unselected:": "", ":selected:": ""}, inplace=True)
#                     # with pd.ExcelWriter(output_filepath, engine='xlsxwriter') as writer:
#                     #     df.to_excel(writer, sheet_name='Sheet1', index=False)
#                     tables_df.append(df)

#                 if len(tables_df) > 0:
#                     merged_dfs = concatenate_dfs(tables_df)
#                 else:
#                     print(f"No tables found in {filename}")
#                     continue

#                 # excel_files = load_excel_files(output_dir)
#                 # if len(excel_files) < 2:
#                 #     print("There are not enough Excel files to combine.")
#                 #     return

#                 folder_path = 'results/Parsed_Excel/Rajasthan/2018'
#                 os.makedirs(folder_path, exist_ok=True)

#                 output_file = os.path.join(
#                     folder_path, f"combined_{filename}.xlsx")
#                 with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:

#                     for index, df in enumerate(merged_dfs):
#                         sheet_name = f'Sheet_{index+1}'

#                         cleaned_df = clean_df(df)

#                         cleaned_df.to_excel(
#                             writer, index=False, sheet_name=sheet_name)

#                 print("Excel files combined and saved successfully.")
#             except Exception as exc:
#                 print(f"Error processing {filename}: {exc}")
#                 append_text_to_file(log_file_name, f"{filename} \n")

# final_run()

In [None]:
# # check for the excel files have rowcount ress than 10

# import logging
# import os

# # Define the folder path containing the Excel files
# folder_path = 'results/Parsed_Excel/Maharastra/Assembly Election 2019'
# log_file_name = "logs/maharstra_excelfile_log_for_rowcount.txt"
# logging.basicConfig(filename=log_file_name, level=logging.ERROR,
#                     format='%(asctime)s - %(levelname)s - %(message)s')

# # Get a list of all files in the folder
# files = os.listdir(folder_path)


# for file in files:
#     if file.endswith('.xlsx') or file.endswith('.xls'):
#         file_path = os.path.join(folder_path, file)
#         df = pd.read_excel(file_path)
#         num_rows = df.shape[0]
#         if num_rows < 75:
#             logging.error(
#                 f"File '{file}' has less than 75 rows ({num_rows} rows).")