# Fatal Accidents Analysis

## Data Preparation

1. Gather data sources

2. Load and clean up

3. Confirm data quality

## Setup

In [6]:
# importing required modules 
import os
import arcgis
from arcgis.features import GeoAccessor, GeoSeriesAccessor
import arcpy
import pandas as pd
import glob
import zipfile
from simpledbf import Dbf5
from fuzzywuzzy import process

In [7]:
# Set and Change directory to workspace
root_dir = os.path.join(os.getcwd(), r"../..")
workspace_dir = os.path.join(root_dir, "workspace")
data_dir = os.path.join(root_dir, "data")

# 1. Gather data sources

Need two types of data: 

- A. Fatal accident location data 
- B. US Interstate with traffic volumes. 

## A. Fatal accident location data

Zipped FARS datasets for 2008-2017 have already been downloaded from ftp://ftp.nhtsa.dot.gov/fars/

In [8]:
os.listdir(data_dir)

['FARS2008.zip',
 'FARS2009.zip',
 'FARS2010.zip',
 'FARS2011.zip',
 'FARS2012.zip',
 'FARS2013NationalDBF.zip',
 'FARS2014NationalDBF.zip',
 'FARS2015NationalCSV.zip',
 'FARS2016NationalCSV.zip',
 'FARS2017NationalCSV.zip']

Let's unzip each dataset

In [9]:
unzipped_files_list = []
for file in os.listdir(data_dir):
    print("Unzipping file {0}...".format(file))
    target_name = os.path.join(data_dir, file.split('.')[0])
    zip_ref = zipfile.ZipFile(os.path.join(data_dir, file), 'r')
    zip_ref.extractall(target_name)
    zip_ref.close()
    unzipped_files_list.append(target_name)
print("Final list of unzipped files:")    
print(unzipped_files_list)

Unzipping file FARS2008.zip...
Unzipping file FARS2009.zip...
Unzipping file FARS2010.zip...
Unzipping file FARS2011.zip...
Unzipping file FARS2012.zip...
Unzipping file FARS2013NationalDBF.zip...
Unzipping file FARS2014NationalDBF.zip...
Unzipping file FARS2015NationalCSV.zip...
Unzipping file FARS2016NationalCSV.zip...
Unzipping file FARS2017NationalCSV.zip...
Final list of unzipped files:
['C:\\Users\\albe9057\\Documents\\GitHub\\fatal_accidents_spatial_analysis\\src\\NHTSA_Analysis_2018\\../..\\data\\FARS2008', 'C:\\Users\\albe9057\\Documents\\GitHub\\fatal_accidents_spatial_analysis\\src\\NHTSA_Analysis_2018\\../..\\data\\FARS2009', 'C:\\Users\\albe9057\\Documents\\GitHub\\fatal_accidents_spatial_analysis\\src\\NHTSA_Analysis_2018\\../..\\data\\FARS2010', 'C:\\Users\\albe9057\\Documents\\GitHub\\fatal_accidents_spatial_analysis\\src\\NHTSA_Analysis_2018\\../..\\data\\FARS2011', 'C:\\Users\\albe9057\\Documents\\GitHub\\fatal_accidents_spatial_analysis\\src\\NHTSA_Analysis_2018\\../

For each year (in our unzipped_files_list), we need to create a feature class of accidents. Let's start by creating a file geodatabase where our yearly files will reside

In [10]:
fars_fgdb = arcpy.CreateFileGDB_management(data_dir, "FARSData.gdb").getOutput(0)
fars_fgdb

'C:\\Users\\albe9057\\Documents\\GitHub\\fatal_accidents_spatial_analysis\\src\\NHTSA_Analysis_2018\\../..\\data\\FARSData.gdb'

We know each year has an accidents table, but some are in CSV format and others are in DBF format. Let's iterate and try to handle that logic for both cases. We also need to use logic to find the right Latitude and Longitude fields when spatially enabling each dataset.

In [29]:
for unzipped_file in unzipped_files_list:
    try:
        print("\nProcessing {0}...".format(unzipped_file))
        print("Retrieving accidents table...")
        os.chdir(unzipped_file)

        accident_table = os.path.join(os.getcwd(), glob.glob("accident*")[0])
        print(accident_table)

        file_extension = accident_table.split(".")[-1]
        print(file_extension)

        if file_extension.lower() == "csv":
            accident_df = pd.read_csv(accident_table)

        elif file_extension.lower() == "dbf":
            accident_df = Dbf5(accident_table).to_dataframe()

        else:
            print("WARNING: the unzipped file '{0}' did not contain an accident table with a recognizable file format!")

        # Find the appropriate latitude and longitude columns
        columns = accident_df.columns.tolist()
        latitude = process.extractOne('LATITUDE', columns)[0]
        longitude = process.extractOne('LONGITUDE', columns)[0]
        
        # Filter for dummy lat/lon values
        print(accident_df.shape[0])
        accident_df = accident_df.loc[accident_df[longitude] < 360]
        print(accident_df.shape[0])

        # Convert the accident DF to a spatially-enabled dataframe using the lat/lon columns
        accident_sedf = accident_df.spatial.from_xy(accident_df, x_column=longitude, y_column=latitude)
        year = accident_sedf.iloc[0]['YEAR'].astype('str').split(".")[0]

        # Convert the SEDF to a feature class
        accident_fc = accident_sedf.spatial.to_featureclass(os.path.join(fars_fgdb, "accident_{0}".format(year)))
    except:
        print("Unexpected error:")
        print(sys.exc_info())
#         raise


Processing C:\Users\albe9057\Documents\GitHub\fatal_accidents_spatial_analysis\src\NHTSA_Analysis_2018\../..\data\FARS2008...
Retrieving accidents table...
C:\Users\albe9057\Documents\GitHub\fatal_accidents_spatial_analysis\data\FARS2008\ACCIDENT.DBF
DBF
34172
33691

Processing C:\Users\albe9057\Documents\GitHub\fatal_accidents_spatial_analysis\src\NHTSA_Analysis_2018\../..\data\FARS2009...
Retrieving accidents table...
C:\Users\albe9057\Documents\GitHub\fatal_accidents_spatial_analysis\data\FARS2009\accident.DBF
DBF
30862
30499

Processing C:\Users\albe9057\Documents\GitHub\fatal_accidents_spatial_analysis\src\NHTSA_Analysis_2018\../..\data\FARS2010...
Retrieving accidents table...
C:\Users\albe9057\Documents\GitHub\fatal_accidents_spatial_analysis\data\FARS2010\accident.dbf
dbf
0
0
Unexpected error:
(<class 'TypeError'>, TypeError('Cannot index by location index with a non-integer key',), <traceback object at 0x000001F8D4D33648>)

Processing C:\Users\albe9057\Documents\GitHub\fatal_

## >>>>> Construction Zone <<<<<

In [None]:
folder = os.listdir(data_dir)[1]
folder

In [None]:
os.chdir(unzipped_files_list[1])
glob.glob("accident*")

In [None]:
os.getcwd()

In [None]:
os.path.join(os.getcwd(), glob.glob("accident*")[0])

In [None]:
accident_table = os.path.join(os.getcwd(), glob.glob("accident*")[0])

file_extension = accident_table.split(".")[-1]

In [None]:
file_extension

In [None]:
accident_df = pd.DataFrame.from_csv(accident_table)
accident_df

In [None]:
accident_df = Dbf5(accident_table).to_dataframe()
accident_df

In [None]:
columns = accident_df.columns.tolist()
columns

In [None]:
latitude = fuzzywuzzy.process.extractOne('LATITUDE', columns)[0]
longitude = fuzzywuzzy.process.extractOne('LONGITUDE', columns)[0]

In [None]:
accident_sedf = accident_df.spatial.from_xy(accident_df, x_column=longitude, y_column=latitude)

In [None]:
year = accident_sedf.iloc[0]['YEAR'].astype('str').split(".")[0]
year

In [None]:
fc = accident_sedf.spatial.to_featureclass(os.path.join(fars_fgdb, "accident_{0}".format(year)))

In [None]:
fars_fgdb