In [2]:
import pandas as pd
import numpy as np
import os


In [None]:

# Read the source workbook into memory and report its dimensions.
print("Loading data...")
df = pd.read_excel('Coordinates_Imputed_Final.xlsx')
row_count, column_count = df.shape
print(f"Loaded data with {row_count} rows and {column_count} columns")


Loading data...
Loaded data with 51284 rows and 25 columns


In [4]:

# Create output directory
output_dir = 'dimension_tables'
# Remember whether the directory was already present so the message below is
# printed only when this cell actually creates it.
_dir_preexisting = os.path.isdir(output_dir)
# exist_ok=True keeps the cell safe to re-run and avoids an error if the
# directory appears between the check above and the creation.
os.makedirs(output_dir, exist_ok=True)
if not _dir_preexisting:
    print(f"Created directory: {output_dir}")


# Function to save dataframe to CSV
def save_to_csv(df, filename):
    """Write a DataFrame as a CSV file inside ``output_dir``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to write; the index is not written.
    filename : str
        File name, joined onto the module-level ``output_dir``.

    Prints the destination path and the number of rows written.
    """
    file_path = os.path.join(output_dir, filename)
    df.to_csv(file_path, index=False)
    print(f"Saved: {file_path} with {len(df)} rows")


In [None]:

# ========= 1. Victim Dimension =========
# Source column -> dimension column, in output order.
victim_source_to_dim = {
    'Road User': 'road_user_type',
    'Gender': 'gender',
    'Age': 'age',
    'Age Group': 'age_group'
}
# One row per distinct combination of the victim attributes.
victim_dim = (
    df[list(victim_source_to_dim)]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns=victim_source_to_dim)
)
# Surrogate key: 1-based row position, placed as the first column.
victim_dim.insert(0, 'victim_key', victim_dim.index + 1)
save_to_csv(victim_dim, 'victim_dimension.csv')



Generating Victim Dimension...
Saved: dimension_tables/victim_dimension.csv with 1071 rows


In [None]:
# ========= 2. Crash Event Dimension =========
# Source column -> dimension column, in output order.
# The source 'Crash ID' is kept as 'crash_event'; 'crash_id' below is the surrogate key.
crash_source_to_dim = {
    'Crash ID': 'crash_event',
    'Crash Type': 'crash_type',
    'Number Fatalities': 'number_fatalities'
}
# One row per distinct combination of the crash attributes.
crash_dim = (
    df[list(crash_source_to_dim)]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns=crash_source_to_dim)
)
# Sequential surrogate key: 1-based row position, placed as the first column.
crash_dim.insert(0, 'crash_id', crash_dim.index + 1)
save_to_csv(crash_dim, 'crash_event_dimension.csv')


Generating Crash Event Dimension...
Saved: dimension_tables/crash_event_dimension.csv with 51284 rows


In [None]:
# ========= 3. Date Dimension =========
# Distinct (Year, Month, Dayweek) combinations, numbered from 1 as the surrogate key.
date_dim = (
    df[['Year', 'Month', 'Dayweek']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(date_key=lambda frame: frame.index + 1)
)

# Add season and quarter
def get_season(month):
    """Return the season name for a calendar month number.

    December-February map to 'Summer', March-May to 'Autumn', June-August to
    'Winter' and September-November to 'Spring' (southern-hemisphere seasons).

    Any other value (0, 13, NaN, None, ...) returns 'Unknown' instead of being
    labelled 'Spring' by default.
    """
    months_by_season = (
        ('Summer', (12, 1, 2)),
        ('Autumn', (3, 4, 5)),
        ('Winter', (6, 7, 8)),
        ('Spring', (9, 10, 11)),
    )
    for season, months in months_by_season:
        # Membership uses ==, so integer-valued floats such as 9.0 still match.
        if month in months:
            return season
    return 'Unknown'

def get_quarter(month):
    """Return the calendar quarter (1-4) that a month number 1-12 falls in."""
    completed_quarters, _ = divmod(month - 1, 3)
    return completed_quarters + 1
    
# Derive season and quarter from the month number.
date_dim['season'] = date_dim['Month'].map(get_season)
date_dim['quarter'] = date_dim['Month'].map(get_quarter)

# Add month name (month number -> English name)
month_names = dict(enumerate(
    ['January', 'February', 'March', 'April',
     'May', 'June', 'July', 'August',
     'September', 'October', 'November', 'December'],
    start=1
))
date_dim['month_name'] = date_dim['Month'].map(month_names)

# Rename the source columns and put the columns in their output order.
date_dim = date_dim.rename(columns={
    'Year': 'year',
    'Month': 'month',
    'Dayweek': 'day_of_week'
})[['date_key', 'year', 'month', 'month_name', 'day_of_week', 'quarter', 'season']]
save_to_csv(date_dim, 'date_dimension.csv')


Generating Date Dimension...
Saved: dimension_tables/date_dimension.csv with 3024 rows


In [None]:
# ========= 4. Time of Day Dimension =========
# Parse the Time column once. Values that do not match HH:MM:SS are coerced to
# NaT, so their hour and minute come out as missing.
parsed_time = pd.to_datetime(df['Time'], format='%H:%M:%S', errors='coerce')

# Extract hour and minute from the Time column
df_time = df.copy()
df_time['Hour'] = parsed_time.dt.hour
df_time['Minute'] = parsed_time.dt.minute

# Extract unique combinations of time attributes
time_dim = df_time[['Hour', 'Minute', 'Time of Day']].drop_duplicates()

# Create sequential surrogate key
time_dim = time_dim.reset_index(drop=True)
time_dim['time_key'] = time_dim.index + 1  # Sequential surrogate key

# Add time category
def get_time_category(hour):
    """Return the part-of-day label for an hour value.

    Missing hours give 'Unknown'. Otherwise: 5 <= hour < 12 is 'Morning',
    12 <= hour < 17 is 'Afternoon', 17 <= hour < 21 is 'Evening', and every
    other hour is 'Night'.
    """
    if pd.isna(hour):
        return 'Unknown'
    # Exclusive upper bounds in ascending order; the first bound above the hour wins.
    upper_bounds = ((5, 'Night'), (12, 'Morning'), (17, 'Afternoon'), (21, 'Evening'))
    for upper_bound, label in upper_bounds:
        if hour < upper_bound:
            return label
    return 'Night'
        
# Categorise while missing hours are still NaN so they are labelled 'Unknown'.
time_dim['time_category'] = time_dim['Hour'].map(get_time_category)

# Clean up NaN values: missing hour/minute are stored as -1
for time_part in ('Hour', 'Minute'):
    time_dim[time_part] = time_dim[time_part].fillna(-1)

# Rename the source columns and put the columns in their output order.
time_dim = time_dim.rename(columns={
    'Hour': 'hour',
    'Minute': 'minute',
    'Time of Day': 'daylight_indicator'
})[['time_key', 'hour', 'minute', 'daylight_indicator', 'time_category']]
save_to_csv(time_dim, 'time_of_day_dimension.csv')


Generating Time of Day Dimension...
Saved: dimension_tables/time_of_day_dimension.csv with 1530 rows


In [None]:

# ========= 5. Location Dimension =========
# Source column -> dimension column, in output order.
location_source_to_dim = {
    'State': 'state',
    'SA4 Name 2021': 'sa4_region',
    'National LGA Name 2021': 'lga_name'
}
# One row per distinct combination of the location attributes.
location_dim = (
    df[list(location_source_to_dim)]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns=location_source_to_dim)
)
# Surrogate key: 1-based row position, placed as the first column.
location_dim.insert(0, 'location_key', location_dim.index + 1)
save_to_csv(location_dim, 'location_dimension.csv')



Generating Location Dimension...
Saved: dimension_tables/location_dimension.csv with 585 rows


In [None]:

# ========= 6. Remoteness Dimension =========
# One row per distinct remoteness area.
remoteness_dim = (
    df[['National Remoteness Areas']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'National Remoteness Areas': 'remoteness_area'})
)
# Surrogate key: 1-based row position, placed as the first column.
remoteness_dim.insert(0, 'remoteness_key', remoteness_dim.index + 1)
save_to_csv(remoteness_dim, 'remoteness_dimension.csv')



Generating Remoteness Dimension...
Saved: dimension_tables/remoteness_dimension.csv with 5 rows


In [None]:

# ========= 7. Road Characteristics Dimension =========
# One row per distinct (road type, speed limit) pair.
road_dim = df.loc[:, ['National Road Type', 'Speed Limit']].drop_duplicates()
# Add speed category
def get_speed_category(speed):
    """Bucket a speed limit into 'Low', 'Medium' or 'High'.

    Missing or negative values give 'Unknown'; up to 50 is 'Low', above 50 and
    up to 80 is 'Medium', and anything above 80 is 'High'.
    """
    if pd.isna(speed) or speed < 0:
        return 'Unknown'
    if speed > 80:
        return 'High'
    return 'Medium' if speed > 50 else 'Low'
# Renumber the rows, then derive the speed bucket and the 1-based surrogate key.
road_dim = road_dim.reset_index(drop=True)
road_dim['speed_category'] = road_dim['Speed Limit'].map(get_speed_category)
road_dim['road_key'] = road_dim.index + 1
# Rename the source columns and put the columns in their output order.
road_dim = road_dim.rename(columns={
    'National Road Type': 'road_type',
    'Speed Limit': 'speed_limit'
})[['road_key', 'road_type', 'speed_limit', 'speed_category']]
save_to_csv(road_dim, 'road_characteristics_dimension.csv')



Generating Road Characteristics Dimension...
Saved: dimension_tables/road_characteristics_dimension.csv with 95 rows


In [None]:

# ========= 8. Vehicle Involvement Dimension =========
involvement_cols = ['Bus Involvement', 'Heavy Rigid Truck Involvement', 'Articulated Truck Involvement']
# One row per distinct combination of the three involvement flags.
vehicle_dim = df[involvement_cols].drop_duplicates().reset_index(drop=True)

# True where a flag equals 'Yes'; any other value (including missing) counts as not involved.
flagged_yes = vehicle_dim[involvement_cols].eq('Yes')
# 'Yes' when at least one of the three flags is 'Yes', otherwise 'No'.
vehicle_dim['any_heavy_vehicle'] = flagged_yes.any(axis=1).map({True: 'Yes', False: 'No'})
# Number of the three flags that are 'Yes'.
vehicle_dim['vehicle_type_count'] = flagged_yes.sum(axis=1)

# Surrogate key: 1-based row position, placed as the first column.
vehicle_dim.insert(0, 'vehicle_key', vehicle_dim.index + 1)
vehicle_dim = vehicle_dim.rename(columns={
    'Bus Involvement': 'bus_involvement',
    'Heavy Rigid Truck Involvement': 'heavy_rigid_truck_involvement',
    'Articulated Truck Involvement': 'articulated_truck_involvement'
})
save_to_csv(vehicle_dim, 'vehicle_involvement_dimension.csv')



Generating Vehicle Involvement Dimension...
Saved: dimension_tables/vehicle_involvement_dimension.csv with 7 rows


In [None]:
# ========= Create the Fact Table =========
def _keys_for(lookup, *columns):
    """Look up one surrogate key per source row.

    Parameters
    ----------
    lookup : dict
        Maps a tuple of natural-key values to a surrogate key.
    *columns : pandas.Series
        Equal-length columns; their values are zipped row by row into tuples,
        the same way the composite lookup dictionaries below are keyed.

    Returns
    -------
    list
        The matching surrogate key for each row, or None where the tuple is
        not present in ``lookup``.
    """
    return [lookup.get(natural_key) for natural_key in zip(*columns)]


# Create lookup dictionaries (natural key -> surrogate key)
victim_lookup = dict(zip(
    zip(victim_dim['road_user_type'], victim_dim['gender'], victim_dim['age'], victim_dim['age_group']),
    victim_dim['victim_key']
))

crash_lookup = dict(zip(
    crash_dim['crash_event'],
    crash_dim['crash_id']
))

date_lookup = dict(zip(
    zip(date_dim['year'], date_dim['month'], date_dim['day_of_week']),
    date_dim['date_key']
))

# Extract time data; parse the Time column once and reuse it for both parts.
parsed_time = pd.to_datetime(df['Time'], format='%H:%M:%S', errors='coerce')
df['Hour'] = parsed_time.dt.hour
df['Minute'] = parsed_time.dt.minute

# time_dim holds one row per (hour, minute, daylight_indicator), so all three
# values must be part of the key. Keying on (hour, minute) alone lets rows that
# differ only in daylight_indicator overwrite each other in the dictionary.
time_lookup = dict(zip(
    zip(time_dim['hour'], time_dim['minute'], time_dim['daylight_indicator']),
    time_dim['time_key']
))

location_lookup = dict(zip(
    zip(location_dim['state'], location_dim['sa4_region'], location_dim['lga_name']),
    location_dim['location_key']
))

remoteness_lookup = dict(zip(
    remoteness_dim['remoteness_area'],
    remoteness_dim['remoteness_key']
))

road_lookup = dict(zip(
    zip(road_dim['road_type'], road_dim['speed_limit']),
    road_dim['road_key']
))

vehicle_lookup = dict(zip(
    zip(vehicle_dim['bus_involvement'], vehicle_dim['heavy_rigid_truck_involvement'],
        vehicle_dim['articulated_truck_involvement']),
    vehicle_dim['vehicle_key']
))

# Create fact table
fact_table = pd.DataFrame()
fact_table['crash_event'] = df['Crash ID']  # Use original Crash ID as crash_event
fact_table['fact_id'] = range(1, len(df) + 1)  # Unique fact table ID

# Add foreign keys
fact_table['victim_key'] = _keys_for(
    victim_lookup, df['Road User'], df['Gender'], df['Age'], df['Age Group']
)

fact_table['crash_id'] = df['Crash ID'].map(crash_lookup)

fact_table['date_key'] = _keys_for(
    date_lookup, df['Year'], df['Month'], df['Dayweek']
)

# time_dim stores a missing hour/minute as -1, so apply the same fill here;
# otherwise rows whose Time could not be parsed never match a dimension row.
fact_table['time_key'] = _keys_for(
    time_lookup, df['Hour'].fillna(-1), df['Minute'].fillna(-1), df['Time of Day']
)

fact_table['location_key'] = _keys_for(
    location_lookup, df['State'], df['SA4 Name 2021'], df['National LGA Name 2021']
)

fact_table['remoteness_key'] = df['National Remoteness Areas'].map(remoteness_lookup)

fact_table['road_key'] = _keys_for(
    road_lookup, df['National Road Type'], df['Speed Limit']
)

fact_table['vehicle_key'] = _keys_for(
    vehicle_lookup,
    df['Bus Involvement'],
    df['Heavy Rigid Truck Involvement'],
    df['Articulated Truck Involvement']
)

# Add basic measures
fact_table['fatalities'] = df['Number Fatalities']  # Original fatality count

# Add enhanced numerical measures for crash-level analysis


# 1. Normalized Fatality Rate - fatalities per 100,000 dwellings
def _fatality_rate_per_100k(row):
    """Return fatalities per 100,000 dwellings rounded to 4 places, or None
    when the row's dwelling count is not greater than zero."""
    dwellings = row['Dwelling_Count']
    if dwellings > 0:
        return round((row['Number Fatalities'] / dwellings) * 100000, 4)
    return None


fact_table['fatality_rate_per_100k_dwellings'] = df.apply(_fatality_rate_per_100k, axis=1)

# 2. Risk Factor Score - composite score based on speed, time and road factors
# Time factor: 1.5 for rows whose 'Time of Day' is 'Night', 1.0 for everything else
time_factor = df['Time of Day'].eq('Night').map({True: 1.5, False: 1.0})

# Calculate speed factor (higher speeds = higher risk)
def get_speed_factor(speed):
    """Return the risk multiplier for a speed limit.

    Missing values and limits up to 50 give 1.0, limits above 50 and up to 80
    give 1.5, and anything above 80 gives 2.0.
    """
    if pd.isna(speed) or speed <= 50:
        return 1.0
    return 1.5 if speed <= 80 else 2.0

speed_factor = df['Speed Limit'].map(get_speed_factor)

# Road type factor: multiplier per 'National Road Type' value
road_type_factors = {
    'National or State Highway': 1.6,
    'Arterial Road': 1.4,
    'Sub-arterial Road': 1.3,
    'Local Road': 1.0,
    'Collector Road': 1.2,
    'Access road': 0.9,
    'Undetermined': 1.0,
    'Pedestrian Thoroughfare': 0.8,
    'Busway': 0.7
}
# Road types missing from the table (including missing values) use the neutral 1.0.
road_factor = df['National Road Type'].map(
    lambda road_type: road_type_factors.get(road_type, 1.0)
)

# Weekend factor: 1.2 for rows whose 'Day of week' is 'Weekend', 1.0 otherwise
weekend_factor = df['Day of week'].eq('Weekend').map({True: 1.2, False: 1.0})

# Combined risk factor score: product of the four factors, rounded to 4 places
combined_risk = time_factor * speed_factor * road_factor * weekend_factor
fact_table['risk_factor_score'] = combined_risk.round(4)


# Save fact table
save_to_csv(fact_table, 'fact_crashes.csv')


Generating Fact Table...
Saved: dimension_tables/fact_crashes.csv with 51284 rows
