In [None]:
import os
import pandas as pd
import re
from openpyxl import load_workbook

# Input and output
input_folder = "ExcelFolders"
output_file = os.path.join("cleanExcel", "cleanedBook.xlsx")
log_file = "log.txt"

# Make sure output folder exists
os.makedirs("cleanExcel", exist_ok=True)

# Load log of already processed files
if os.path.exists(log_file):
    with open(log_file, "r") as f:
        processed_files = set(f.read().splitlines())
else:
    processed_files = set()

# Collect all cleaned DataFrames
cleaned_dfs = []

# Loop through all Excel files in input folder
for filename in os.listdir(input_folder):
    if filename.endswith(".xlsx") and filename not in processed_files:
        filepath = os.path.join(input_folder, filename)

        print(f"Processing: {filename}")

        # 1. Load workbook and unmerge cells
        wb = load_workbook(filepath)
        ws = wb.active
        for merged_range in list(ws.merged_cells.ranges):
            ws.unmerge_cells(str(merged_range))
        wb.save(filepath)

        # 2. Read Excel into DataFrame
        df = pd.read_excel(filepath, header=None)

        # Drop completely empty rows
        df = df.dropna(how="all")

        # Drop the first column (index 0)
        if df.shape[1] > 0:
            df = df.drop(df.columns[0], axis=1)
        df.insert(0, "EmptyCol1", "")
        df.insert(0, "EmptyCol2", "")

        print("Number of columns in df:", df.shape[1])
        print("First few rows:")
        print(df.head(5))

        cleaned_dfs.append(df)

        # Add to log file after successful processing
        with open(log_file, "a") as f:
            f.write(filename + "\n")

# Combine all cleaned DataFrames
if cleaned_dfs:
    final_df = pd.concat(cleaned_dfs, ignore_index=True)
    # Trim to 36 columns max
    final_df = final_df.iloc[:, :36]
    final_df.to_excel(output_file, index=False, header=False)
    print("Final row count:", final_df.shape[0])
else:
    print("‚ö† No new Excel files to process.")
    sys.exit()   


‚ö† No new Excel files to process.


In [16]:
final_df = final_df.drop(df.columns[35], axis=1)

final_df.columns = [
    "Month_year", "Consultation_Type", "Case",
    "Under 1 Male", "Under 1 Female",
    "1-4 Male", "1-4 Female",
    "5-9 Male", "5-9 Female",
    "10-14 Male", "10-14 Female",
    "15-18 Male", "15-18 Female",
    "19-24 Male", "19-24 Female",
    "25-29 Male", "25-29 Female",
    "30-34 Male", "30-34 Female",
    "35-39 Male", "35-39 Female",
    "40-44 Male", "40-44 Female",
    "45-49 Male", "45-49 Female",
    "50-54 Male", "50-54 Female",
    "55-59 Male", "55-59 Female",
    "60-64 Male", "60-64 Female",
    "65-69 Male", "65-69 Female",
    "70 Over Male", "70 Over Female"
]
# 4. Save back to cleanedBook.xlsx
final_df.to_excel("cleanExcel/cleanedBook.xlsx", index=False)

KeyError: '[34] not found in axis'

In [None]:
month_year_map = {}
current_month_year = None

for i, row in final_df.iterrows():
    for cell in row.dropna().astype(str):
        if "MONTH AND YEAR:" in cell.upper():  # detect header rows
            # Extract "MONTH YEAR"
            match = re.search(r"MONTH AND YEAR:\s*([A-Z]+)\s+(\d{4})", cell.upper())
            if match:
                current_month_year = " ".join(cell.split(":")[1].strip().split()[:2])
                print(f"Found: {current_month_year} at row {i}")
    month_year_map[i] = current_month_year  # keep filling for all rows

# ‚úÖ Add/Update Month_year column
if "Month_year" in final_df.columns:
    final_df["Month_year"] = pd.Series(month_year_map)
else:
    final_df.insert(0, "Month_year", pd.Series(month_year_map))  # put as first column

# ‚úÖ Save back to Excel
output_file = os.path.join("cleanExcel", "cleanedBook.xlsx")
final_df.to_excel(output_file, index=False, header=True)

print(f"‚úÖ Updated file saved: {output_file}")
print("Number of rows:", final_df.shape[0])
print("Number of columns:", final_df.shape[1])

Found: AUGUST 2023 at row 3
Found: AUGUST 2023 at row 22
Found: AUGUST 2023 at row 41
Found: DECEMBER 2023 at row 60
Found: DECEMBER 2023 at row 79
Found: DECEMBER 2023 at row 98
Found: JULY 2023 at row 115
Found: JULY 2023 at row 134
Found: JULY 2023 at row 153
Found: JUNE 2023 at row 172
Found: JUNE 2023 at row 191
Found: MAY 2023 at row 210
Found: MAY 2023 at row 229
Found: MAY 2023 at row 248
Found: MAY 2023 at row 267
Found: NOVEMBER 2023 at row 284
Found: NOVEMBER 2023 at row 303
Found: NOVEMBER 2023 at row 322
Found: OCTOBER 2023 at row 341
Found: OCTOBER 2023 at row 360
Found: OCTOBER 2023 at row 379
Found: SEPTEMBER 2023 at row 397
Found: SEPTEMBER 2023 at row 416
Found: SEPTEMBER 2023 at row 435
‚úÖ Updated file saved: cleanExcel\cleanedBook.xlsx
Number of rows: 450
Number of columns: 35


In [None]:
category_map = {}
current_category = None
found_categories = []

for i, row in final_df.iterrows():
    for cell in row.dropna().astype(str):
        if "TOP 10" in cell.upper():
            # Get the last word, strip punctuation
            last_word = re.sub(r"[^\w]", "", cell.strip().split()[-1])
            current_category = last_word.capitalize()
            found_categories.append((i, current_category))
            print(f"‚úÖ Found: {current_category} at row {i}")
    category_map[i] = current_category  # Fill forward

# ‚úÖ Update existing Consultation_Type column
final_df["Consultation_Type"] = pd.Series(category_map)

# ‚úÖ Save back to Excel
final_df.to_excel(output_file, index=False, header=True)

print(f"\n‚úÖ File updated and saved to: {output_file}")
print("üîç Unique categories found:", set(cat for _, cat in found_categories))


‚úÖ Found: Consultation at row 4
‚úÖ Found: Diagnosis at row 23
‚úÖ Found: Mortality at row 42
‚úÖ Found: Consultation at row 61
‚úÖ Found: Diagnosis at row 80
‚úÖ Found: Mortality at row 99
‚úÖ Found: Consultation at row 116
‚úÖ Found: Diagnosis at row 135
‚úÖ Found: Mortality at row 154
‚úÖ Found: Consultation at row 173
‚úÖ Found: Diagnosis at row 192
‚úÖ Found: Mortality at row 211
‚úÖ Found: Consultation at row 230
‚úÖ Found: Consultation at row 249
‚úÖ Found: Consultation at row 268
‚úÖ Found: Consultation at row 285
‚úÖ Found: Diagnosis at row 304
‚úÖ Found: Mortality at row 323
‚úÖ Found: Consultation at row 342
‚úÖ Found: Diagnosis at row 361
‚úÖ Found: Mortality at row 380
‚úÖ Found: Consultation at row 398
‚úÖ Found: Diagnosis at row 417
‚úÖ Found: Mortality at row 436

‚úÖ File updated and saved to: cleanExcel\cleanedBook.xlsx
üîç Unique categories found: {'Consultation', 'Diagnosis', 'Mortality'}


In [None]:
file_path = os.path.join("cleanExcel", "cleanedBook.xlsx")

# Load Excel
final_df = pd.read_excel(file_path)

# Find and drop the row + 6 rows under it
drop_indexes = []
for i, row in final_df.iterrows():
    for cell in row.dropna().astype(str):
        if "PASIG CITY CHILDREN'S HOSPITAL/PASIG CITY COVID-19 REFERRAL CENTER" in cell.upper():
            drop_indexes.extend(range(i, i + 8))  # this row + 6 below
            break
        
# --- Remove rows containing "TOTAL" (disregard format) ---
for i, row in final_df.iterrows():
    for cell in row.dropna().astype(str):
        if "TOTAL" in cell.upper().strip():
            drop_indexes.append(i)
            break

# Drop them
final_df = final_df.drop(drop_indexes, errors="ignore").reset_index(drop=True)

# ‚úÖ Save back to the same Excel file
final_df.to_excel(file_path, index=False, header=True)

print(f"‚úÖ Removed {len(drop_indexes)} rows and updated file: {file_path}")

‚úÖ Removed 211 rows and updated file: cleanExcel\cleanedBook.xlsx


In [None]:
# ‚úÖ Replace empty/NaN cells with 0
final_df = final_df.fillna(0)

# ‚úÖ Save back to the same Excel file
final_df.to_excel(file_path, index=False, header=True)


  final_df = final_df.fillna(0)
