Import Our Libraries

In [6]:
import openpyxl
import json
from pathlib import Path
import os
from typing import List
import openpyxl.workbook
import pandas as pd

Define Our Parsing Functions

In [3]:


def load_workbook(filepath: Path) -> openpyxl.workbook.workbook.Workbook:
    if filepath.suffix != ".xlsx":
        raise ValueError(
            f"The inputted file is of type {filepath.suffix}. Please provide a .xlsx file."
        )

    try:
        workbook = openpyxl.load_workbook(filepath)

    except Exception as e:
        raise f"An unexpected error occurred: {e}"

    else:
        return workbook


def load_worksheet(
    workbook: openpyxl.workbook.workbook.Workbook, sheet_name: str
) -> openpyxl.worksheet.worksheet.Worksheet:
    if sheet_name not in workbook.sheetnames:
        raise KeyError(f"Sheet '{sheet_name}' cannot be found in workbook.")

    else:
        return workbook[sheet_name]


def parse_sheet_data(worksheet: openpyxl.worksheet.worksheet.Worksheet) -> List[List]:
    tables = []
    current_table = []
    for row in worksheet.iter_rows(values_only=True):
        if all(cell is None for cell in row):
            if current_table:
                tables.append(current_table)
                current_table = []

        else:
            current_table.append([r for r in row if r is not None])

    if current_table:
        tables.append(current_table)

    return tables


def parse_table_data(table: List[List]) -> dict:
    if len(table) < 2:
        return None

    column_name, column_description = table.pop(0)
    column_type = table.pop(0)[1]
    encodings = {key: value for key, value in table if key is not None}

    return {
        "column_name": column_name,
        "column_description": column_description,
        "column_type": column_type,
        "encodings": encodings,
    }


def load_question_schema(filepath: Path, sheet_name: str) -> List[dict]:
    wb = load_workbook(filepath=filepath)
    sheet = load_worksheet(workbook=wb, sheet_name=sheet_name)
    processed_tables = parse_sheet_data(sheet)

    return [parse_table_data(t) for t in processed_tables]

In [8]:
file_path = Path(os.path.cwd()).parent.parent / "survey.xlsx"
json_output = load_question_schema(filepath=file_path, sheet_name="Questions")
question_df = pd.DataFrame.from_records(json_output).drop(columns=["encodings"])
question_df.head(10)

AttributeError: module 'ntpath' has no attribute 'pwd'