In [25]:
from utils.read_write import read_csv_v2, to_csv

In [26]:
# Shared filename suffix of the processed CSV exports.
processedFileString = '_Processed.csv'

# Load the three processed tables (People, Vehicles, Crashes) from ../data.
people_df = read_csv_v2(f'../data/People{processedFileString}')
vehicles_df = read_csv_v2(f'../data/Vehicles{processedFileString}')
crashes_df = read_csv_v2(f'../data/Crashes{processedFileString}')

Reading csv People_Processed.csv


In [7]:
from typing import List, Dict, Any
from collections import OrderedDict
import copy

def create_dict_for_table(
    dfs: List[List[Dict[str, Any]]],
    columns_to_keep: List[List[str]],
    new_column_names: List[List[str]],
    use_index: List[bool],
    index_names: List[str]
) -> List[List[Dict[str, Any]]]:
    """
    Project, rename and optionally index several "dataframes" in one call.

    Each dataframe is a list of row dictionaries. For the i-th dataframe, only the
    columns in ``columns_to_keep[i]`` are retained; each is stored under the name at
    the same position in ``new_column_names[i]``. Values are deep-copied, so the
    returned rows share no mutable state with the input rows.

    Args:
        dfs: List of dataframes (each dataframe is a list of dictionaries)
        columns_to_keep: List of lists containing column names to retain for each
            dataframe. A source column may be listed more than once in order to
            emit it under several new names.
        new_column_names: List of lists containing new names for retained columns,
            positionally aligned with ``columns_to_keep``. If a new name is repeated,
            the value from the last pair using it is the one kept.
        use_index: List of booleans indicating whether to add index column for each dataframe
        index_names: List of names for index columns. The index value is the row's
            0-based position in its dataframe; it is written after the renamed
            columns, so it replaces a renamed column with the same name.

    Returns:
        List of processed dataframes with copied and transformed data

    Raises:
        ValueError: If the five input lists differ in length, or if a dataframe's
            ``columns_to_keep`` and ``new_column_names`` entries differ in length
        KeyError: If a dict row does not contain one of the requested columns
    """
    # Validate input lengths
    input_lengths = [len(x) for x in (dfs, columns_to_keep, new_column_names, use_index, index_names)]
    if len(set(input_lengths)) > 1:
        raise ValueError(
            f"Inconsistent input lengths:\n"
            f"dfs: {input_lengths[0]}, columns_to_keep: {input_lengths[1]}, "
            f"new_column_names: {input_lengths[2]}, use_index: {input_lengths[3]}, "
            f"index_names: {input_lengths[4]}"
        )

    # Accumulates one processed dataframe per input dataframe
    processed_dfs = []

    # Process each dataframe
    for df_idx, (df, keep_cols, new_cols, add_index, idx_name) in enumerate(
        zip(dfs, columns_to_keep, new_column_names, use_index, index_names)
    ):
        # Validate column name mappings
        if len(keep_cols) != len(new_cols):
            raise ValueError(
                f"Mismatched column counts in dataframe at index {df_idx}: "
                f"columns_to_keep: {len(keep_cols)}, new_column_names: {len(new_cols)}"
            )

        # Keep the (old, new) pairs positional. A dict keyed by the old name would
        # collapse repeated source columns and silently lose requested outputs.
        col_pairs = list(zip(keep_cols, new_cols))

        # Create new dataframe for processed rows
        processed_df = []

        # Process each row in the dataframe
        for row_idx, row in enumerate(df):
            # Create new row with only desired columns and renamed
            new_row = {
                new_name: copy.deepcopy(row[old_name])
                for old_name, new_name in col_pairs
            }

            # Add index if requested
            if add_index:
                new_row[idx_name] = row_idx

            # Add processed row to new dataframe
            processed_df.append(new_row)

        # Add processed dataframe to result list
        processed_dfs.append(processed_df)

    return processed_dfs

In [9]:
from collections import defaultdict

def outer_join_datasets(dataset1, dataset2, dataset3, key='RD_NO', dataset1Name='Crashes', dataset2Name='Vehicles', dataset3Name='People'):
    """
    Merge three lists of row dicts into one record per distinct join-key value.

    Every column of a row is stored in the record for that row's key value under
    the name '<datasetName>_<column>'; the bare `key` entry holds the key value.
    A key value found in any of the datasets produces a record, and columns from
    datasets that do not contain that key value are simply absent from it.

    Note: when a dataset contains several rows with the same key value, each later
    row overwrites the prefixed columns written by the earlier one, so only the
    last such row per dataset is visible in the result.

    Parameters:
    dataset1 (list of dicts): First dataset
    dataset2 (list of dicts): Second dataset
    dataset3 (list of dicts): Third dataset
    key (str): Key to use for joining the datasets, defaults to 'RD_NO'
    dataset1Name (str): Column prefix for dataset1, defaults to 'Crashes'
    dataset2Name (str): Column prefix for dataset2, defaults to 'Vehicles'
    dataset3Name (str): Column prefix for dataset3, defaults to 'People'

    Returns:
    list of dicts: One dict per distinct key value, in order of first appearance
    """
    # Records are created on first access, pre-seeded with the join column
    merged = defaultdict(lambda: {key: None})

    # Pair every dataset with the prefix used for its columns
    sources = (
        (dataset1, dataset1Name),
        (dataset2, dataset2Name),
        (dataset3, dataset3Name),
    )

    for rows, label in sources:
        for row in rows:
            join_value = row.get(key)
            # Rows with a missing (or None) join key cannot be matched; ignore them
            if join_value is not None:
                record = merged[join_value]
                record[key] = join_value
                record.update(
                    (f'{label}_{column}', value) for column, value in row.items()
                )

    # Hand back plain dicts, dropping the key -> record index
    return list(merged.values())

In [11]:
# Join the three tables on the default 'RD_NO' key, using the default name prefixes.
x = outer_join_datasets(dataset1=crashes_df, dataset2=vehicles_df, dataset3=people_df)

In [15]:
# Number of joined records returned by outer_join_datasets.
len(x)

257927

In [18]:
# Collect the join key of every joined record so its uniqueness can be checked.
keys = [record['RD_NO'] for record in x]

In [20]:
# Count of distinct RD_NO values among the joined records; equal to len(x) when no key repeats.
len(set(keys))

257927

In [24]:
# Length of the crashes table, for comparison with the number of joined records.
# NOTE(review): the recorded outputs show 257927 joined keys vs 257925 here —
# presumably a couple of RD_NO values occur only in Vehicles/People; verify.
len(crashes_df)

257925