Install dependencies if they are not already installed in the environment.

In [3]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2
Note: you may need to restart the kernel to use updated packages.


Import dependencies

In [4]:
from docx import Document
import pandas as pd
import os

Load the Capstone Documentation .docx file, read each table in the file into its own pandas DataFrame (one per sample output), and save every DataFrame as a .csv file in the `../samples/table_outputs` folder.

In [12]:
# Open the Word document that contains the sample output tables.
doc = Document("../docs/UChicago Capstone Documentation.docx")

# Make sure the CSV destination folder exists (no error if it already does).
output_dir = "../samples/table_outputs"
os.makedirs(output_dir, exist_ok=True)

# Visit each table in document order; the 0-based index i names the CSV (i+1).
for i, table in enumerate(doc.tables):
    # Collect the whitespace-trimmed text of every cell, one list per row.
    data = []
    for row in table.rows:
        data.append([cell.text.strip() for cell in row.cells])

    # Export only tables that have at least one row with at least one cell.
    if data and data[0]:
        # The first row supplies the column names; the rest are the records.
        header, *records = data
        df = pd.DataFrame(records, columns=header)

        # Preview the first few rows of the table.
        print(df.head(5))

        # Write the table into the output folder without the index column.
        csv_filename = f"sample_table_{i+1}.csv"
        csv_path = os.path.join(output_dir, csv_filename)
        df.to_csv(csv_path, index=False)

        print(f"Exported: {csv_filename}\n")

    Month-Year Volume (Dths)
0   April-2022           115
1     May-2022           100
2    June-2022           100
3    July-2022           100
4  August-2022            90
Exported: sample_table_1.csv

  Month-Year Volume (kWh)
0   Nov-2020         3088
1   Dec-2020         7363
2   Jan-2021        21004
3   Feb-2021        24144
4   Mar-2021        20588
Exported: sample_table_2.csv

   Month-Year Volume (kWh)
0  April-2023         7120
1    May-2023         5500
2    Jun-2023         4250
3    Jul-2023         2500
4    Aug-2023         1000
Exported: sample_table_3.csv

  Month-Year    Volume (kWh)
0   Dec-2023  81+87+53 = 221
1   Jan-2024  79+85+51 = 215
2   Feb-2024  69+74+45 = 188
3   Mar-2024  66+72+43 = 181
4   Apr-2024  56+61+36 = 153
Exported: sample_table_4.csv

             Month-Year         Volume (MMBTU)
0  February (2022-2025)  272+336+120+207 = 935
1     March (2022-2025)  246+257+113+177 = 793
2     April (2022-2025)  224+216+103+128 = 671
3       May (2022-2025)   

----

Trial code

In [5]:
# Open the Word document whose tables are to be inspected.
# NOTE(review): this path differs from the "../docs/..." location that the
# CSV export cell reads from — confirm which one is current.
doc = Document("../ProjectMaterials/UsageExtractionDocuments/UChicago Capstone Documentation.docx")

# Print every table under a 1-based heading, one list of cell texts per row.
for table_index, table in enumerate(doc.tables):
    print(f"\nTable {table_index + 1}")
    rows_text = [[cell.text.strip() for cell in row.cells] for row in table.rows]
    for row_data in rows_text:
        print(row_data)


Table 1
['Month-Year', 'Volume (Dths)']
['April-2022', '115']
['May-2022', '100']
['June-2022', '100']
['July-2022', '100']
['August-2022', '90']
['September-2022', '85']
['October-2022', '100']
['November-2022', '120']
['December-2022', '140']
['January-2023', '150']
['February-2023', '150']
['March-2023', '150']
['April-2023', '115']
['May-2023', '110']
['June-2023', '100']
['July-2023', '100']
['August-2023', '90']
['September-2023', '85']
['October-2023', '100']
['November-2023', '120']
['December-2023', '140']
['January-2024', '150']
['February-2024', '150']
['March-2024', '150']
['April-2024', '115']
['May-2024', '110']
['June-2024', '100']
['July-2024', '100']
['August-2024', '90']
['September-2024', '85']
['October-2024', '100']
['November-2024', '120']
['December-2024', '140']
['January-2025', '150']
['February-2025', '150']
['March-2025', '150']

Table 2
['Month-Year', 'Volume (kWh)']
['Nov-2020', '3088']
['Dec-2020', '7363']
['Jan-2021', '21004']
['Feb-2021', '24144']
['Mar