Version 1.0, 1-10-2025

# **1. Skript: Einlesen und Aggregierung der Worddateien (Antworten der VNL-Teilnehmenden)**

**Annahmen und Voraussetzungen in den docx files:**
- table name is in the first cell of the first row of each table to be imported
- 1 Spalte für Tabellen mit allg. Bemerkungen (nur "Bemerkungen")
- 5 Spalten für Tabellen mit Bemerkungen zu Artikeln (Artikel, Absatz, Buchstabe, Bemerkung, Textvorschlag)
- a table is read only if it has more than 2 rows and its third row does not have exactly 4 cells (to suppress intro tables)
- the script imports the content of all columns starting at row 4 of each table

**Skript läuft in Azure Machine Learning Studio**
- Empfohlene Compute-Umgebung: 16 Kerne, 64 GB RAM, 400 GB Festplatte (CPU)
- Kernel: Python 3.10 SDK v2

**1. Zelle: Installationen von zusätzlichen Libraries**

In [None]:
%pip install python-docx==1.0.1 pandas==2.1.1 matplotlib==3.7.2 numpy==1.26.0 natsort openpyxl

**2. Zelle: Imports, define parameters, set hardcoded information**

In [None]:
import os
import docx
import pandas as pd
import matplotlib.pyplot as plt
from natsort import natsorted

# Folder in the datastore that holds the .docx files to read
input_folder_path = "/YOUR/INPUT/FOLDER/PATH/HERE"

# Folder that receives the output documents (replace with your own path)
output_folder_path = "/YOUR/OUTPUT/FOLDER/PATH/HERE"

# File name of the Excel workbook written to the output folder
OUTPUT_FILE_NAME = "output.xlsx"

# Custom sheet names, one for every table read from the docx files.
# The "Rest" entries are spare names that guard against further, unexpected tables.
custom_sheet_names = [
    "Sheet name 1",
    "Sheet name 2",
    "Rest1",
    "Rest2",
    "Rest 3",
]

# Labels that precede the name of the consultation participant in the docx form
# (do not change if it works; they depend on the wording of the form).
# Add alternative keywords at the end of the list.
KEYWORDS = [
    "Abkürzung der Firma / Organisation",
    "Name / Firma / Organisation",
    "Abréviation de la société / de l’organisation",
    "Abréviation de l’entr. / org.",
    "Nom / entreprise / organisation",
    "Nom / société / organisation",
    "Sigla della ditta / dell’organizzazione",
    "Sigla della Ditta / Organizzazione",
    "Nome / Ditta / Organizzazione",
    "Abbreviation Company / Organisation",
    "Name / Company / Organisation",
    "Abkürzung des Eintr. / der Org.",
    "Name / Unternehmen / Organisation",
    "Abréviation de l'entrée / de l'org",
    "Nom / Entreprise / Organisation",
    "Abkürzung des Eintr. / Org",
]

# Column headers of the aggregated tables (do not change!)
HEADER_ALLG = ["Organisation", "Bemerkung"]
HEADER_ART = [
    "Organisation",
    "Artikel",
    "Absatz",
    "Buchstabe",
    "Bemerkung",
    "Textvorschlag",
]



**3. Zelle: Hauptskript**

In [None]:
# only imports tables with more than 2 rows whose third row does not have exactly 4 columns (specific exclusion of a 4-column table in this VNL)
# optimized xlsx and plotting routines
# strikethrough text handling (struck-through text is marked with the Unicode combining long stroke overlay U+0336)

def striken(text):
    """Return *text* marked as struck through with Unicode combining characters.

    U+0336 (combining long stroke overlay) is appended after every character,
    so the result reads as crossed out in plain-text targets.

    An empty string is returned unchanged: emitting a lone combining mark
    would attach it to whatever text precedes it once results are joined.
    """
    return ''.join(ch + '\u0336' for ch in text)

def process_run(run):
    """Return the text of *run*, struck through when ``run.font.strike`` is truthy."""
    return striken(run.text) if run.font.strike else run.text

def process_cell(cell):
    """Return the text of a table cell with struck-through runs marked.

    The runs of each paragraph are passed through ``process_run`` and
    concatenated. Paragraphs are joined with a newline so that the last word
    of one paragraph is not fused with the first word of the next.
    """
    return '\n'.join(
        ''.join(process_run(run) for run in paragraph.runs)
        for paragraph in cell.paragraphs
    )

# Aggregated result: one DataFrame per table name
table_dataframes = dict()

# Processing statistics: rows added per table name and number of files read
num_rows_added = dict()
num_input_files_processed = 0

# Check whether a table row is empty, i.e. every one of its cells is blank
def is_empty_row(row):
    """Return True when no cell of *row* holds anything but whitespace."""
    return not any(cell.text.strip() for cell in row.cells)
    
def extract_organization_abbr(doc, keywords=None, max_length=30):
    """Extract the participant's organisation name or abbreviation from *doc*.

    The paragraphs are searched for each keyword in turn, so keywords earlier
    in the list take precedence over later ones regardless of paragraph
    order. For the first paragraph that contains the keyword and has text
    after it, that text is returned without the leading colon/whitespace
    separator. Colons inside the value itself are kept.

    Args:
        doc: Object exposing ``paragraphs``, each with a ``text`` attribute.
        keywords: Labels that precede the organisation in the form. Defaults
            to the module-level ``KEYWORDS``.
        max_length: Maximum number of characters of the returned value.

    Returns:
        The extracted text, or '' when no keyword is followed by any text.
    """
    if keywords is None:
        keywords = KEYWORDS

    # Read the paragraph texts once instead of once per keyword.
    paragraph_texts = [para.text for para in doc.paragraphs]

    for keyword in keywords:
        for text in paragraph_texts:
            index = text.find(keyword)
            if index == -1:
                continue

            value = text[index + len(keyword):].strip()
            # Drop only the separator between label and value.
            while value.startswith(':'):
                value = value[1:].lstrip()

            # Cap the length after cleaning so the separator does not count
            # against the limit, and do not leave a dangling space at the cut.
            value = value[:max_length].rstrip()
            if value:
                return value

    return ''

def _read_table_rows(table, org_abbr):
    """Return the non-empty data rows of *table*, each prefixed with *org_abbr*.

    Data starts at the fourth row (index 3); the rows before it are skipped.
    """
    return [
        [org_abbr] + [process_cell(cell).strip() for cell in row.cells]
        for row_index, row in enumerate(table.rows)
        if row_index >= 3 and not is_empty_row(row)
    ]


# Iterate through the .docx files in the input folder. os.listdir returns
# names in arbitrary order, so sort them to make the row order and the
# table-to-sheet assignment reproducible between runs.
for filename in sorted(os.listdir(input_folder_path)):
    # Skip other file types and the "~$..." lock files that Word creates next
    # to an open document (they are not valid .docx packages).
    if not filename.lower().endswith('.docx') or filename.startswith('~$'):
        continue

    # print which file is being read
    print(f"File processed: {filename}")

    # Open the Word document
    doc = docx.Document(os.path.join(input_folder_path, filename))

    # Extract the organization abbreviation from the document
    org_abbr = extract_organization_abbr(doc)

    for table in doc.tables:
        # Read only tables with more than 2 rows whose third row does not
        # have exactly 4 cells (to suppress intro tables)
        if len(table.rows) <= 2:
            continue
        num_columns = len(table.rows[2].cells)
        if num_columns == 4:
            continue

        # The table name is the text of the first cell of the first row,
        # lower-cased so that tables are matched case-insensitively
        table_name = table.cell(0, 0).text.strip().lower()

        # One-column tables hold general remarks, all others article remarks
        headers = HEADER_ALLG if num_columns == 1 else HEADER_ART
        table_data = _read_table_rows(table, org_abbr)

        # pd.DataFrame raises if the row width differs from the header width;
        # skip such a table instead of aborting the whole run.
        if any(len(row_data) != len(headers) for row_data in table_data):
            print(
                f"  Skipped table '{table_name}' in {filename}: "
                f"{num_columns} columns do not match headers {headers}"
            )
            continue

        df = pd.DataFrame(table_data, columns=headers)

        # Append to the data already collected for this table, or start it
        if table_name in table_dataframes:
            table_dataframes[table_name] = pd.concat(
                [table_dataframes[table_name], df], ignore_index=True
            )
        else:
            table_dataframes[table_name] = df

        # Update the number of rows added for this table
        num_rows_added[table_name] = num_rows_added.get(table_name, 0) + df.shape[0]

    num_input_files_processed += 1

# Diagnostics: print the names and the number of the tables that were read
print(f"Eingelesene Tabellennamen: {list(table_dataframes)}")
print(f"Anzahl eingelesene Tabellen: {len(table_dataframes)}")

# A workbook needs at least one sheet; fail early with a clear message
# instead of an obscure error when the writer is closed.
if not table_dataframes:
    raise RuntimeError(
        f"No tables were imported from {input_folder_path!r}; nothing to write."
    )

# Make sure the output folder exists before anything is written to it
os.makedirs(output_folder_path, exist_ok=True)

# Assign one sheet name per table in order of first appearance; fall back to
# a generated name when there are more tables than custom sheet names.
sheet_names = {
    table_name: (
        custom_sheet_names[i] if i < len(custom_sheet_names) else f"Table {i + 1}"
    )
    for i, table_name in enumerate(table_dataframes)
}

# Save the individual dataframes as separate sheets of one xlsx file
output_xlsx_path = os.path.join(output_folder_path, OUTPUT_FILE_NAME)
with pd.ExcelWriter(output_xlsx_path, engine="openpyxl") as excel_writer:
    for table_name, df in table_dataframes.items():
        df.to_excel(excel_writer, sheet_name=sheet_names[table_name], index=False)

# After the Excel file is closed (by the `with` block), generate one bar chart
# per non-empty table with at least 3 columns, counting the values of the
# second column.
for table_name, df in table_dataframes.items():
    if df.empty or len(df.columns) < 3:
        continue

    custom_sheet_name = sheet_names[table_name]
    categories = df.iloc[:, 1].value_counts()
    categories = categories.reindex(natsorted(categories.index))

    plt.figure(figsize=(10, 6))
    categories.plot(kind='bar')
    plt.title(f'Bar Chart for {custom_sheet_name}')
    plt.xlabel('Artikel')
    plt.ylabel('Anzahl Stellungsnahmen')
    plt.xticks(rotation=90)
    plt.tight_layout()

    # The table name comes from document text; replace path separators and
    # line breaks so it cannot point outside the output folder or break
    # the file name.
    safe_name = ''.join('_' if ch in '/\\\r\n\t' else ch for ch in table_name)
    plot_filename = os.path.join(output_folder_path, f'{safe_name}_bar_chart.png')
    plt.savefig(plot_filename)
    # NOTE(review): figures are left open (no plt.close()), presumably so the
    # notebook displays them inline - confirm before closing them here.

# Save processing information to info.txt. The encoding is set explicitly
# because table names come from document text and may contain non-ASCII
# characters, while the default encoding of open() depends on the platform.
info_path = os.path.join(output_folder_path, "info.txt")
with open(info_path, "w", encoding="utf-8") as info_file:
    info_file.write(f"# Number of input files processed: {num_input_files_processed}\n")
    info_file.write("\n# Number of rows added to each table:\n")
    info_file.writelines(
        f"{table_name}: {num_rows}\n" for table_name, num_rows in num_rows_added.items()
    )

print(f"Tables saved in {output_xlsx_path} with custom sheet names.")
print("Processing information saved in info.txt.")
