In [1]:
import sys
import os
import importlib
import pandas as pd
import numpy as np
sys.path.insert(0, os.path.abspath("../data_model/"))

In [2]:
# Folder layout, relative to the directory the notebook is run from.
processed_dir, reports_dir = "../data/processed", "../reports"

# Input: the CSV that the loading cell reads with pandas.
data_model_output_file = os.path.join(processed_dir, "data_model_output.csv")

# Output: the Word document written by generate_summary_document.
summary_document = os.path.join(reports_dir, "variable_summary_for_appendix.docx")

In [3]:
# read_csv's default chunked parsing can infer different dtypes for different
# parts of a wide file (pandas reports this as a DtypeWarning); low_memory=False
# parses the file in one pass so every column gets a single, consistent dtype.
data_model_output_df = pd.read_csv(data_model_output_file, low_memory=False)

# Keep only the rows flagged as valid. The explicit .copy() makes the result an
# independent frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
data_model_output_df = data_model_output_df[data_model_output_df['is_valid_record'] == True].copy()
data_model_output_df.shape

  data_model_output_df = pd.read_csv(data_model_output_file)


(9208, 439)

In [4]:
# Preview the first five rows of the filtered frame.
data_model_output_df.head(5)

Unnamed: 0,unique_id,respondentid,is_completed,is_valid_record,date_completed,time_completed,is_pilot,is_self_administered,record_type_synthetic,access_mode,...,validation_error_trip,validation_num_errors_person,validation_num_errors_trip,validation_severity_person,validation_severity_trip,weight,weight_departing_and_arriving,weight_departing_only,weight_departing_only_with_time_of_day,weight_non_sas_departing_only
0,1,5473,True,True,2024-10-04,08:41:12,False,False,0,,...,[],0,0,,,1.0,1.0,1.0,1.0,1.0
1,2,5476,True,True,2024-10-04,08:40:04,False,False,0,1.0,...,[],0,0,,,1.0,1.0,1.0,1.0,1.0
2,3,5489,True,True,2024-10-04,08:51:36,False,False,0,,...,[],0,0,,,1.0,1.0,1.0,1.0,1.0
3,4,5558,True,True,2024-10-04,10:32:58,False,False,0,1.0,...,[],0,0,,,1.0,1.0,1.0,1.0,1.0
4,5,5593,True,True,2024-10-04,11:09:46,False,False,0,1.0,...,[],0,0,,,1.0,1.0,1.0,1.0,1.0


In [None]:
def create_summary_table(df, col, weight_col=None):
    """
    Build a frequency table for one label column.

    Rows are ordered by the matching code column (the column whose name is
    `col` with "_label" removed). When that column does not exist, or `col`
    has no "_label" part, rows keep the frequency order produced by
    `value_counts()` instead of raising. Labels whose code is missing are
    placed last.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        col (str): Label column to analyze (e.g., 'gender_label').
        weight_col (str, optional): Column containing weights. Defaults to None.

    Returns:
        pd.DataFrame: One row per distinct non-missing label, with columns
            'Response'      the label,
            'Responses'     the unweighted count,
            'Raw %'         the unweighted share, rounded to 2 decimals,
            'Weighted %'    the share of summed weights, rounded to 2 decimals
                            (None for every row when `weight_col` is not given),
            'Cumulative %'  the running total of the unrounded raw shares,
                            rounded to 2 decimals.
    """
    # value_counts() ignores missing labels, so they never appear in the table.
    counts = df[col].value_counts()

    # Build the label -> code lookup used for ordering, if a code column exists.
    code_col = col.replace('_label', '')
    if code_col != col and code_col in df.columns:
        pairs = df[[col, code_col]].dropna(subset=[col]).drop_duplicates()
        # If a label maps to several codes, the last pair wins.
        label_to_code = dict(zip(pairs[col], pairs[code_col]))
    else:
        label_to_code = {}

    def _sort_key(label):
        # The leading flag pushes labels without a usable code to the end and
        # keeps NaN codes out of the comparison (NaN never compares as ordered).
        code = label_to_code.get(label)
        missing = code is None or bool(pd.isna(code))
        return (missing, 0 if missing else code)

    # sorted() is stable, so labels without a code keep their frequency order.
    ordered_labels = sorted(counts.index, key=_sort_key)
    sorted_counts = counts.loc[ordered_labels]
    sorted_percentages = sorted_counts / sorted_counts.sum() * 100

    if weight_col:
        weight_totals = df.groupby(col)[weight_col].sum().loc[ordered_labels]
        weighted_percentages = (weight_totals / weight_totals.sum() * 100).round(2)
    else:
        # Placeholder column: kept as None and deliberately not rounded, because
        # rounding an object-dtype Series is not supported consistently by pandas.
        weighted_percentages = pd.Series(
            [None] * len(sorted_counts), index=sorted_counts.index, dtype=object
        )

    output_df = pd.DataFrame({
        'Responses': sorted_counts,
        'Raw %': sorted_percentages.round(2),
        'Weighted %': weighted_percentages,
        'Cumulative %': sorted_percentages.cumsum().round(2),
    })

    # Name the index explicitly so the label column is always called 'Response',
    # whatever index name pandas carried through the steps above.
    return output_df.rename_axis('Response').reset_index()


In [6]:
# Share of each access mode among rows that have one (missing values are excluded).
data_model_output_df['access_mode_label'].value_counts(normalize=True, dropna=True)

access_mode_label
WALK                                    0.471204
DROVE_ALONE_AND_PARKED                  0.191099
DROPPED_OFF_BY_FAMILY_FRIEND            0.167539
UBER_LYFT                               0.054974
OTHER_PUBLIC_TRANSIT                    0.047120
DROVE_WITH_OTHERS_AND_PARKED            0.023560
OTHER                                   0.015707
CAR_SERVICE_BLACK_LIMO                  0.013089
RODE_WITH_OTHER_TRAVELERS_AND_PARKED    0.007853
TAXI                                    0.005236
BICYCLE_PERSONAL_NON_ELECTRIC           0.002618
Name: proportion, dtype: float64

In [7]:
# Spot-check the summary helper on a single label column.
summary_df = create_summary_table(
    df=data_model_output_df,
    col='marketsegment_label',
    weight_col='weight_departing_only',
)
summary_df

Unnamed: 0,Response,Responses,Raw %,Weighted %,Cumulative %
0,PASSENGER,8549,92.84,92.84,92.84
1,EMPLOYEE,659,7.16,7.16,100.0


In [8]:
# Row counts per passenger type; rows with no passenger type are not counted.
data_model_output_df['passenger_type_label'].value_counts(dropna=True)

passenger_type_label
ARRIVING     4369
DEPARTING    4180
Name: count, dtype: int64

In [9]:
from docx import Document

def _add_summary_table(doc, title, summary_table):
    """Append a level-2 heading, one formatted table and a spacer paragraph to `doc`."""
    doc.add_heading(title, level=2)

    table = doc.add_table(rows=1, cols=summary_table.shape[1])
    table.style = 'Light Grid Accent 1'

    # Header row
    for i, column_name in enumerate(summary_table.columns):
        table.cell(0, i).text = str(column_name)

    # One document row per summary row
    for record in summary_table.itertuples(index=False, name=None):
        cells = table.add_row().cells
        for i, value in enumerate(record):
            cells[i].text = str(value)

    doc.add_paragraph()


def generate_summary_document(df, weight_col=None, segment_cols=None, output_file='summary_tables.docx'):
    """
    Generate a Word document with summary tables for every '*_label' column of a
    dataframe, optionally repeated for each combination of segment values.

    The document contains, in order:
      1. (only when `segment_cols` is given) a "Segment Columns Summary" section
         with one table per segment column, computed over the whole dataframe;
      2. one "Segment: ..." section per distinct combination of segment values
         (or a single "Segment: All Data" section), each holding one table per
         remaining '*_label' column.
    Columns whose summary is empty are skipped, heading included.

    Segment names are the segment column values converted to text and joined
    with " | ", so a missing value shows up as "nan" in the section heading.

    The input dataframe is not modified.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        weight_col (str, optional): Column containing weights. Defaults to None.
        segment_cols (list of str, optional): Columns to segment by. Defaults to None.
        output_file (str): Path to save the generated Word document. Missing
            parent directories are created.
    """
    doc = Document()
    segment_cols = list(segment_cols or [])

    def pretty(col):
        # 'access_mode_label' -> 'Access Mode'
        return col.replace("_label", "").replace("_", " ").title()

    if segment_cols:
        doc.add_heading("Segment Columns Summary", level=1)
        for col in segment_cols:
            summary_table = create_summary_table(df, col, weight_col)
            if summary_table.empty:
                continue
            _add_summary_table(doc, pretty(col), summary_table)

        # The segmentation key lives in a local Series rather than a new column
        # on `df`, so the caller's dataframe is left untouched.
        segment_key = df[segment_cols].astype(str).agg(" | ".join, axis=1)
        segments = segment_key.unique()
    else:
        segment_key = pd.Series('All Data', index=df.index)
        segments = ['All Data']

    # Every label column except the ones used to define the segments.
    label_columns = [
        col for col in df.columns
        if isinstance(col, str) and col.endswith('_label') and col not in segment_cols
    ]

    for segment in segments:
        # Positional mask: independent of whether df's index labels are unique.
        subset_df = df[(segment_key == segment).to_numpy()]
        doc.add_heading(f"Segment: {segment}", level=1)

        for col in label_columns:
            summary_table = create_summary_table(subset_df, col, weight_col)
            if summary_table.empty:
                continue
            _add_summary_table(doc, pretty(col), summary_table)

    # Create the target folder on first use instead of failing inside doc.save().
    if isinstance(output_file, (str, os.PathLike)):
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    doc.save(output_file)
    print(f"Word document saved as {output_file}")


In [10]:
# Write the appendix document, segmented by market segment and passenger type.
generate_summary_document(
    data_model_output_df,
    weight_col='weight_departing_and_arriving',
    segment_cols=['marketsegment_label', 'passenger_type_label'],
    output_file=summary_document,
)

Word document saved as ../reports\variable_summary_for_appendix.docx
