## Automating Regional Briefings
Author: Smitha Mahesh, Eric Englin

Purpose: To complete the analyses and visualizations required for the region-level briefings. 

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import docx

In [2]:
myworkingdirectory = r"C:\Users\Eric.Englin\DOT OST\volpe-proj-VU16A100 - Transportation Safety Program\Region Briefing"
os.chdir(myworkingdirectory)

In [3]:
analysis_2022 = pd.read_csv(".//Data//Output Data//analysis_database_2022.csv")


In [110]:
park_info = pd.read_csv(".//Data//Reference Data/Park_Info_Table.csv")
park_info = park_info.rename(columns={'UNIT_CODE':'Park','REGION':'RGN'})


In [4]:
analysis_2022.columns

Index(['INCID_NO', 'NUM_OCC', '5_mph', '10_mph', '15_mph', '20_mph', '25_mph',
       '30_mph', '35_mph', '40_mph', '45_mph', '50_mph', '55_mph', '60_mph',
       '65_mph', '70_mph', '75_mph', 'no_posted_speed', 'Crosswalk',
       'Outside a Crosswalk', 'No Injury', 'Possible Injury',
       'Non-incapacitating Injury', 'Incapacitating Injury', 'Fatality',
       'Unknown Injury', 'Num_Fatalities', 'Num_Injuries', 'Injury or Fatal',
       'Pedestrian', 'Bicycle', 'Pedacycle', 'VRU', 'HorseLlama', 'Cow',
       'Deer', 'Elk', 'Moose', 'Buffalo', 'Bear', 'Antelope', 'SheepGoats',
       'OtherWild', 'OtherDomestic', 'Involving Animal', 'LATITUDE',
       'LONGITUDE', 'Park', 'RGN', 'CRASH_DATE', 'CRASH_TIME', 'CRASH_YEAR',
       'database'],
      dtype='object')

## Available Data Fields

In [5]:
analysis_2022.database.value_counts()

CDS      194307
IMARS     11804
Name: database, dtype: int64

In [68]:
safety_studies = pd.read_excel(".//Safety Studies//NPS_TSP_Safety_Study_Catalog_2010-2020.xlsx",  "Safety Studies 2010-2020 ")
safety_studies['RGN'] = safety_studies['Legacy Region Abbreviation']
safety_studies['INCID_NO']=1

In [11]:
safety_analyst_parks = pd.read_excel(".//Data//Safety Analyst//Safety Analyst Parks.xlsx")
safety_analyst_parks['Safety Analyst Park']=1

In [16]:
analysis_2022_merge = analysis_2022.merge(safety_analyst_parks, on="Park", how = "left")

In [17]:
analysis_2022_merge.shape, analysis_2022.shape

((206111, 55), (206111, 53))

In [78]:
def pull_source_table_data(df, region, field, filter_value):
    df = df.loc[df['RGN']==region]
    df = df.loc[df[field]==filter_value]
    parks, crashes = df.groupby("Park").agg({
        'INCID_NO':'count',
    }).reset_index().agg({
        "Park":"count",
        'INCID_NO':sum,
    })
    
    parks = str(parks) + " Park Units"
    crashes = "{:,}".format(crashes) + " Crash Reports"
    
    return parks, crashes


def make_source_table(region):

    CDS_parks, CDS_crash_reports = pull_source_table_data(analysis_2022_merge, "IMR", 'database', 'CDS')
    IMARS_parks, IMARS_crash_reports = pull_source_table_data(analysis_2022_merge, "IMR", 'database', 'IMARS')
    SA_parks, SA_crash_reports = pull_source_table_data(analysis_2022_merge, "IMR", 'Safety Analyst Park', 1)
    SS_parks, SS_crash_reports = pull_source_table_data(safety_studies, "IMR", 'INCID_NO', 1)

    source_list = []
    parks_list = [CDS_parks, IMARS_parks, SA_parks, SS_parks]
    crash_reports_list = [CDS_crash_reports, IMARS_crash_reports, SA_crash_reports, SS_crash_reports]

    final_df = pd.DataFrame({
        "Source":["Crash Data System (CDS)", 
                  "Incident Management Analysis and Reporting System (IMARS)",
                  'Safety Analyst', "Safety Studies"],
        "Locations": parks_list,
        'Years': ["1989-2013", "2012-present", "2005-2014", "2010-present"],
        "Quantity of Data": crash_reports_list
    })
    
    return final_df


## Data Quality Tables

### CDS

In [111]:
cds_crash = pd.read_excel('./Data/New CDS/New CDS Excel Files/ALL_CRASH.xlsx')
cds_crash = cds_crash.rename(columns={'PARK_ALPHA' : 'Park'})

In [112]:
# add RGN column from lookup table to CDS crash database, joining the two datasets based on park name
cds_crash = pd.merge(cds_crash, park_info[['RGN','Park']], how='left', on='Park')
# resulting dataframe after join should have one additional column and no additional rows
cds_crash = cds_crash.drop_duplicates() 
cds_crash.shape

(204687, 57)

In [126]:
cds_crash.columns

Index(['OBJECTID', 'INCID_NO', 'CASE_NUM', 'Park', 'STATE_CODE', 'CRASH_DATE',
       'CRASH_TIME', 'RTE_NO', 'RTE_NAME', 'NODE_DIST_FT', 'NODE_DIST_MI',
       'NODE_DIR', 'NODE_NUM', 'LIGHT', 'WEATHER', 'CRASH_LOCATION',
       'SURF_COND', 'CRASH_CLASS', 'VEH_COLL', 'OBJ_STRUCK', 'ROAD_CHAR',
       'CON_FACT1', 'CON_FACT2', 'CON_FACT3', 'CON_FACT4', 'CON_FACT5',
       'CON_FACT6', 'HIT_RUN', 'CATEGORY', 'FATALS', 'INJURED', 'PED_FAT',
       'PED_INJ', 'BIKE_FAT', 'BIKE_INJ', 'PED', 'CRASH_YEAR', 'COMMENTS',
       'ZIPFILE', 'LOCATION', 'PHOTOS_TAKEN', 'USPP_NPS_VEH_INV',
       'PARK_PTY_DEST', 'LOCKED_UPDATE', 'LOCKED_BY_USER', 'DATA_SRC',
       'LATITUDE', 'LONGITUDE', 'MILEPOST', 'IMPORT_DATE', 'FILE_NAME',
       'SAVE_DATE', 'ROUTE_IDENT', 'RIP_CYCLE', 'MP_NODE', 'SPTL_LOC', 'RGN'],
      dtype='object')

In [124]:
cds_unit = pd.read_excel('./Data/New CDS/New CDS Excel Files/ALL_UNIT.xlsx')

In [125]:
cds_unit.columns

Index(['OBJECTID', 'INCID_NO', 'UNIT_NO', 'VEH_YEAR', 'MAKE_MOD', 'MODEL',
       'NUM_OCC', 'REG_STATE', 'REG_YEAR', 'PLATE_NUM', 'DIR_TRAVEL',
       'SPEED_LIMIT', 'BODY_TYPE', 'VEH_MANVR', 'VEH_DAMAGE', 'DAM_LOCATION',
       'LIC_NUM', 'LIC_STATE', 'PED', 'BRTH_DATE', 'DRIVER_SEX', 'DRIVER_BELT',
       'DRIVER_EJECT', 'DRIVER_INJ', 'DRIVER_VIOLTN', 'VIOL_CHG1', 'VIOL_CHG2',
       'PED_TYPE', 'PED_LOC', 'PED_ACTN', 'REPAIR'],
      dtype='object')

In [129]:
cds_unit = cds_unit.merge(cds_crash[['INCID_NO','Park','RGN']], on = 'INCID_NO', how = 'left')


In [None]:
#cds_df_crash to find lat/long
#cds_df_unit to find speed

In [137]:
def make_data_quality_table_cds(region):
    not_linked_table = pd.DataFrame({
        "Area":["National", "NCR", "IMR",'SER','PWR','NER','MWR','AKR'],
        'Total': [203909, 97838, 33076,27262, 24419,18642, 2435, 237], 
        "Unlinked": [9602, 7988, 183, 1221, 117, 73,19, 1]
    })
    not_linked_table["Unlinked Percent"]=not_linked_table["Unlinked"]/not_linked_table['Total']
    
    df = not_linked_table.loc[(not_linked_table['Area']=="National")|(not_linked_table['Area']==region)]
    
    cds_crash_region = cds_crash.loc[cds_crash['RGN']==region[:2]]
    
    df['Missing coordinates'] = (len(cds_crash.loc[cds_crash['LATITUDE'].isnull()==True]),
                                len(cds_crash_region.loc[cds_crash_region['LATITUDE'].isnull()==True]))
    
    cds_unit_region = cds_unit.loc[cds_unit['RGN']==region[:2]]
    df['Missing speed'] = (len(cds_unit.loc[cds_unit['SPEED_LIMIT'].isnull()==True]),
                                len(cds_unit_region.loc[cds_unit_region['SPEED_LIMIT'].isnull()==True]))
    
    no_ConFact1 = cds_crash.loc[cds_crash['CON_FACT1'].isnull()==True]
    no_ConFact1_2 = no_ConFact1.loc[no_ConFact1['CON_FACT2'].isnull()==True]
    no_ConFact1_3 = no_ConFact1_2.loc[no_ConFact1_2['CON_FACT3'].isnull()==True]
    no_ConFact1_4 = no_ConFact1_3.loc[no_ConFact1_3['CON_FACT4'].isnull()==True]
    no_ConFact1_5 = no_ConFact1_4.loc[no_ConFact1_4['CON_FACT5'].isnull()==True]
    no_ConFact_any = no_ConFact1_5.loc[no_ConFact1_5['CON_FACT6'].isnull()==True]
    
    no_ConFact_any_region = no_ConFact_any.loc[no_ConFact_any['RGN']==region[:2]]
    df['Missing contributing factors']= (len(no_ConFact_any),
                                len(no_ConFact_any_region))
    
    return df

### IMARS

In [None]:
def make_data_quality_table_imars(region):
    not_linked_table = pd.DataFrame({
        "Area":["National", "NCR", "IMR",'SER','PWR','NER','MWR','AKR'],
        'Total': [12790,3541,2656,2642,2293,1206,372,80], 
        "Unlinked": [986,213,267,210,201,72,19,4]
    })
    not_linked_table["Unlinked Percent"]=not_linked_table["Unlinked"]/not_linked_table['Total']


# Create final document

In [142]:
def add_regional_information(region):
    df = make_source_table(region)
    doc_name = './22-23/Drafted Deliverables/NPS TSP Regional Briefing Draft 22-23 - Testing.docx'
    doc = docx.Document(doc_name)
    doc.add_heading("Overview of Region")
    doc.add_heading("Available Data", 2)
    doc.add_paragraph(" ")
    doc.add_paragraph("Table 2. Overview of Data Sources", style="Caption")
    t = doc.add_table(df.shape[0]+1, df.shape[1], style="Table Grid")

    for j in range(df.shape[-1]):
        t.cell(0,j).text = df.columns[j]

    for i in range(df.shape[0]):
        for j in range(df.shape[-1]):
            t.cell(i+1,j).text = str(df.values[i,j])
    
    doc.add_heading("Data Quality", 2)
    doc.add_paragraph("Below is a table of CDS and IMARS crash report completeness, broken down by region. Reports are labeled as incomplete if they only contain a crash table with core crash elements (e.g. coordinates, crash classification, etc.). For CDS crashes, this means vehicle and passenger data tables are not populated. The NCR was an outlier as compared to other regions, with around 8% of reports failing to include vehicle or passenger data tables. For IMARS crashes, this means crash details, vehicle, and passenger data tables are not populated. IMARS reports have generally been less complete across each region. ")
    doc.add_paragraph("Table 3. Regional CDS Crash Reports", style="Caption")
    
    df = make_data_quality_table_cds(region)
    
    t = doc.add_table(df.shape[0]+1, df.shape[1], style="Table Grid")

    for j in range(df.shape[-1]):
        t.cell(0,j).text = df.columns[j]

    for i in range(df.shape[0]):
        for j in range(df.shape[-1]):
            t.cell(i+1,j).text = str(df.values[i,j])
    doc.add_paragraph(" ")
    doc.add_paragraph("Table 4. Regional IMARS Crash Reports", style="Caption")

    doc_name_new = './22-23/Drafted Deliverables/NPS TSP Regional Briefing Draft 22-23 - Testing_' + region + ".docx"

    doc.save(doc_name_new)

In [143]:
add_regional_information("NCR")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Missing coordinates'] = (len(cds_crash.loc[cds_crash['LATITUDE'].isnull()==True]),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Missing speed'] = (len(cds_unit.loc[cds_unit['SPEED_LIMIT'].isnull()==True]),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Missing contributing factors']= 

In [100]:

for x in analysis_2022.RGN.unique():
    print(x)


SER
NER
MWR
IMR
NCR
PWR
AKR
