In [1]:
import sys
import os
import importlib
import pandas as pd
import numpy as np
# Put the sibling ../data_model directory first on the import path, so modules
# located there are found before same-named modules elsewhere on sys.path.
sys.path.insert(0, os.path.abspath("../data_model/"))

In [2]:
# Directories, given relative to the notebook's working directory.
processed_dir, reports_dir = "../data/processed", "../reports"

# Input: CSV that the load cell reads into data_model_output_df.
data_model_output_file = os.path.join(processed_dir, "data_model_output.csv")
# Output: Word document that the report-generation call writes.
summary_document = os.path.join(reports_dir, "variable_summary_for_appendix.docx")

In [3]:
# Load the data-model output. low_memory=False makes pandas infer every column's
# dtype from the whole file rather than chunk by chunk, so a column cannot end up
# holding a mix of types. The recorded output of this cell shows a warning raised
# on the read_csv line.
# NOTE(review): the warning text itself is not preserved; it is presumed to be the
# mixed-type DtypeWarning -- confirm on the next run.
data_model_output_df = pd.read_csv(data_model_output_file, low_memory=False)

# Keep valid records only. `== True` is kept (rather than using the column as a
# mask directly) because the column's dtype is not guaranteed to be bool.
# .copy() makes the result an independent frame, since later cells add columns to it.
data_model_output_df = data_model_output_df[data_model_output_df['is_valid_record'] == True].copy()
data_model_output_df.shape

  data_model_output_df = pd.read_csv(data_model_output_file)


(9098, 439)

In [4]:
# Preview the first rows of the valid-record frame.
data_model_output_df.head()

Unnamed: 0,unique_id,respondentid,is_completed,is_valid_record,date_completed,time_completed,is_pilot,is_self_administered,record_type_synthetic,access_mode,...,validation_error_person,validation_error_trip,validation_num_errors_person,validation_num_errors_trip,validation_severity_person,validation_severity_trip,weight_departing_and_arriving,weight_departing_only,weight_non_sas_departing_only,weight_departing_only_with_time_of_day
0,1,5473,True,True,2024-10-04,08:41:12,False,False,0,,...,[],[],0,0,,,10.840259,10.854572,19.179428,10.874504
1,2,5476,True,True,2024-10-04,08:40:04,False,False,0,1.0,...,[],[],0,0,,,6.487856,6.53419,8.687559,6.502862
2,3,5489,True,True,2024-10-04,08:51:36,False,False,0,,...,[],[],0,0,,,10.840259,10.854572,19.179428,10.874504
3,4,5558,True,True,2024-10-04,10:32:58,False,False,0,1.0,...,[],[],0,0,,,6.487856,6.53419,8.687559,6.502862
4,5,5593,True,True,2024-10-04,11:09:46,False,False,0,1.0,...,[],[],0,0,,,6.487856,6.53419,8.687559,6.502862


In [5]:
def create_summary_table(df, col, weight_col=None):
    """
    Summarise the distribution of a label column.

    Produces one row per distinct non-null value of ``col`` with its count, raw
    percentage, weighted percentage and cumulative raw percentage. Rows are
    ordered by the matching code column when one exists.

    Parameters:
        df (pd.DataFrame): Input dataframe. It is not modified.
        col (str): Label column to analyze (e.g., 'gender_label').
        weight_col (str, optional): Column containing weights. Defaults to None,
            in which case the 'Weighted %' column is filled with None.

    Returns:
        pd.DataFrame: Columns 'Response', 'Responses', 'Raw %', 'Weighted %' and
            'Cumulative %' (percentages rounded to 2 decimals). Rows are ordered
            by the code column (``col`` without its '_label' suffix); labels with
            no usable code come last. If no code column exists in ``df``, rows
            keep the ``value_counts`` order (most frequent first).
    """
    label_suffix = '_label'

    # Work out which column holds the codes. Stripping only the trailing suffix is
    # tried first; the replace-everywhere form is kept as a fallback for
    # compatibility with the previous behaviour.
    candidates = []
    if col.endswith(label_suffix):
        candidates.append(col[:-len(label_suffix)])
    candidates.append(col.replace(label_suffix, ''))
    code_col = next(
        (name for name in candidates if name != col and name in df.columns),
        None,
    )

    # Counts and raw percentages (value_counts ignores missing labels).
    value_counts = df[col].value_counts()
    percentages = df[col].value_counts(normalize=True) * 100

    if code_col is None:
        # No code column to order by: keep the frequency order instead of failing.
        sorted_index = list(value_counts.index)
    else:
        # Label -> code lookup. Rows with a missing label or code are dropped so a
        # NaN can never become a sort key; if a label maps to several codes the
        # last distinct pair wins.
        pairs = df[[col, code_col]].dropna().drop_duplicates()
        label_to_code = dict(zip(pairs[col], pairs[code_col]))
        # The leading boolean pushes labels without a code to the end without
        # having to compare their (absent) code against real ones.
        sorted_index = sorted(
            value_counts.index,
            key=lambda label: (label not in label_to_code, label_to_code.get(label, 0)),
        )

    sorted_value_counts = value_counts.loc[sorted_index]
    sorted_percentages = percentages.loc[sorted_index]

    if weight_col:
        weights = df.groupby(col)[weight_col].sum()
        # reindex (not .loc) so a label absent from the grouped result counts as
        # zero weight instead of raising KeyError.
        sorted_weights = weights.reindex(sorted_value_counts.index, fill_value=0)
        weighted_percentages = ((sorted_weights / sorted_weights.sum()) * 100).round(2)
    else:
        # Placeholder column; it is deliberately not rounded because it holds no numbers.
        weighted_percentages = pd.Series(
            [None] * len(sorted_value_counts),
            index=sorted_value_counts.index,
            dtype=object,
        )

    # Cumulative share is based on the raw (unweighted) percentages.
    cumulative_percentages = sorted_percentages.cumsum()

    output_df = pd.DataFrame({
        'Responses': sorted_value_counts,
        'Raw %': sorted_percentages.round(2),
        'Weighted %': weighted_percentages,
        'Cumulative %': cumulative_percentages.round(2),
    })

    # Name the index explicitly: when the input Series carry different index names
    # the combined index is unnamed, and reset_index would emit a column called
    # 'index' instead of 'Response'.
    return output_df.rename_axis('Response').reset_index()


In [6]:
# Unweighted share of each access-mode label (records with a missing label are excluded).
data_model_output_df['access_mode_label'].value_counts(normalize = True)

access_mode_label
WALK                                    0.459459
DROVE_ALONE_AND_PARKED                  0.197297
DROPPED_OFF_BY_FAMILY_FRIEND            0.170270
UBER_LYFT                               0.056757
OTHER_PUBLIC_TRANSIT                    0.045946
DROVE_WITH_OTHERS_AND_PARKED            0.024324
OTHER                                   0.016216
CAR_SERVICE_BLACK_LIMO                  0.013514
RODE_WITH_OTHER_TRAVELERS_AND_PARKED    0.008108
TAXI                                    0.005405
BICYCLE_PERSONAL_NON_ELECTRIC           0.002703
Name: proportion, dtype: float64

In [7]:
# Count records per (code, label) pair to see how passenger_segment codes line up with their labels.
data_model_output_df[['passenger_segment', 'passenger_segment_label']].value_counts()

passenger_segment  passenger_segment_label
3.0                VISITOR_ARRIVING           2549
4.0                VISITOR_DEPARTING          2428
1.0                RESIDENT_ARRIVING          1763
2.0                RESIDENT_DEPARTING         1699
Name: count, dtype: int64

In [8]:
# Numeric summary-segment code. Rules are checked in order and the first match
# wins (so an EMPLOYEE record is 1 whatever its passenger segment); records
# matching no rule get NaN.
data_model_output_df['summary_segment'] = np.select(
    [
        data_model_output_df['marketsegment_label'] == 'EMPLOYEE',
        data_model_output_df['passenger_segment_label'] == 'RESIDENT_DEPARTING',
        data_model_output_df['passenger_segment_label'] == 'VISITOR_DEPARTING',
    ],
    [1, 2, 3],
    default=np.nan,
)

In [9]:
# Text label for the summary segment. Rules are checked in order and the first
# match wins; records matching no rule get None.
data_model_output_df['summary_segment_label'] = np.select(
    [
        data_model_output_df['marketsegment_label'] == 'EMPLOYEE',
        data_model_output_df['passenger_segment_label'] == 'RESIDENT_DEPARTING',
        data_model_output_df['passenger_segment_label'] == 'VISITOR_DEPARTING',
    ],
    ['EMPLOYEE', 'RESIDENT_DEPARTING', 'VISITOR_DEPARTING'],
    default=None,
)

In [10]:
# Number of records in each summary segment (records with no segment are not counted).
data_model_output_df['summary_segment_label'].value_counts()

summary_segment_label
VISITOR_DEPARTING     2428
RESIDENT_DEPARTING    1699
EMPLOYEE               659
Name: count, dtype: int64

In [11]:
# Keep only the records that belong to one of the three summary segments.
working_df = data_model_output_df.loc[
    data_model_output_df['summary_segment_label'].isin(
        ['VISITOR_DEPARTING', 'RESIDENT_DEPARTING', 'EMPLOYEE']
    )
]
working_df.shape

(4786, 441)

In [12]:
# Segment distribution, weighted with the departing-only weights.
summary_df = create_summary_table(
    working_df,
    col='summary_segment_label',
    weight_col='weight_departing_only',
)
summary_df

Unnamed: 0,Response,Responses,Raw %,Weighted %,Cumulative %
0,EMPLOYEE,659,13.77,32.72,13.77
1,RESIDENT_DEPARTING,1699,35.5,25.45,49.27
2,VISITOR_DEPARTING,2428,50.73,41.83,100.0


In [13]:
from docx import Document
from docx.enum.section import WD_ORIENT
from docx.shared import Inches

def set_column_widths(table, widths):
    """
    Apply a width, given in inches, to each column of a python-docx table.

    The width is written both to every cell currently in the column and to the
    column object itself.

    Parameters:
        table: Table whose ``rows`` and ``columns`` are updated in place.
        widths: Iterable of widths in inches; position i applies to column i.
    """
    for position, inches in enumerate(widths):
        column_width = Inches(inches)
        for table_row in table.rows:
            table_row.cells[position].width = column_width
        table.columns[position].width = column_width

def _add_summary_table(doc, summary_table, column_widths, title_case_header=False):
    """Append ``summary_table`` to ``doc`` as a fixed-width Word table followed by an empty paragraph."""
    table = doc.add_table(rows=1, cols=summary_table.shape[1])
    table.style = 'Light Grid Accent 1'
    table.autofit = False
    set_column_widths(table, column_widths)

    # Header row.
    for i, column_name in enumerate(summary_table.columns):
        if title_case_header and column_name != 'Response':
            table.cell(0, i).text = column_name.replace("_", " ").title()
        else:
            table.cell(0, i).text = column_name

    # One row per response. Every value goes through the same clean-up, which
    # turns labels such as 'UBER_LYFT' into 'Uber Lyft'.
    for _, row in summary_table.iterrows():
        cells = table.add_row().cells
        for i, value in enumerate(row):
            cells[i].text = f'{str(value).replace("_label", "").replace("_", " ").title()}'

    doc.add_paragraph()


def generate_summary_document(df, weight_col=None, segment_col=None, output_file='summary_tables.docx'):
    """
    Write a landscape Word document with one summary table per label column.

    If ``segment_col`` is given, the document opens with a summary of that column
    and then has one section per distinct non-null segment value (each after the
    first on a new page). Otherwise it has a single "All Data" section. Within a
    section, every column of ``df`` whose name ends in '_label' (other than
    ``segment_col``) is summarised with ``create_summary_table``; columns whose
    summary is empty are skipped.

    Parameters:
        df (pd.DataFrame): Data to summarise. It is not modified.
        weight_col (str, optional): Weight column passed on to ``create_summary_table``.
        segment_col (str, optional): Column used to split the data into sections.
        output_file (str): Path of the .docx file to write.

    Returns:
        None. The document is saved to ``output_file`` and a confirmation is printed.
    """
    doc = Document()
    section = doc.sections[-1]
    section.orientation = WD_ORIENT.LANDSCAPE
    # Setting the orientation alone does not resize the page, so swap the dimensions too.
    section.page_width, section.page_height = section.page_height, section.page_width

    # Table layout, in inches: a wide first column for the response text and the
    # remaining width shared equally by the other columns.
    usable_width = 8.5
    first_col_width = 3.0

    def get_widths(n_cols):
        if n_cols == 1:
            return [usable_width]
        remaining = usable_width - first_col_width
        other_width = remaining / (n_cols - 1)
        return [first_col_width] + [other_width] * (n_cols - 1)

    if segment_col:
        pretty_col = segment_col.replace("_label", "").replace("_", " ").title()
        doc.add_heading("Segment Columns Summary", level=1)
        doc.add_heading(pretty_col, level=2)

        summary_table = create_summary_table(df, segment_col, weight_col)
        if not summary_table.empty:
            _add_summary_table(
                doc,
                summary_table,
                get_widths(summary_table.shape[1]),
                title_case_header=True,
            )

        segments = df[segment_col].dropna().unique()
    else:
        # A single pseudo-segment covering every row. The caller's frame is left
        # untouched (no helper column is written into it).
        segments = ['All Data']

    label_columns = [col for col in df.columns if col.endswith('_label') and col != segment_col]

    for idx, segment in enumerate(segments):
        if idx > 0:
            doc.add_page_break()

        if segment_col:
            subset_df = df[df[segment_col] == segment]
            doc.add_heading(f"Segment: {segment}", level=1)
        else:
            subset_df = df
            doc.add_heading("All Data", level=1)

        for col in label_columns:
            summary_table = create_summary_table(subset_df, col, weight_col)
            if summary_table.empty:
                continue

            doc.add_heading(col.replace("_label", "").replace("_", " ").title(), level=2)
            _add_summary_table(doc, summary_table, get_widths(summary_table.shape[1]))

    doc.save(output_file)
    print(f"Word document saved as {output_file}")


In [15]:
# Build the appendix document: one section per summary segment, weighted with
# the departing-only weights.
generate_summary_document(
    data_model_output_df,
    weight_col='weight_departing_only',
    segment_col='summary_segment_label',
    output_file=summary_document,
)

Word document saved as ../reports\variable_summary_for_appendix.docx
