# ANALYZING RTC SEVERITY DATASET
In this project, we analyze road accident data in order to answer the following questions:
1. When do most road accidents occur? What hour?
2. Which roads lead to major road accidents?
3. Which vehicles lead to fatal or serious accidents?

In [1]:
import csv
import datetime as dt

In [2]:
# Read the whole CSV file into memory as a list of rows (each row is a list
# of strings). newline="" hands line-ending handling to the csv module, so
# quoted fields that contain line breaks are parsed as a single field.
with open("C://Users//user//Downloads//accident_data.csv", newline="") as file:
    rows = list(csv.reader(file))

In [3]:
# Peek at the header row plus the first few records.
preview_rows = rows[:5]
print(preview_rows)

[['Accident_Index', '1st_Road_Class', '1st_Road_Number', '2nd_Road_Class', '2nd_Road_Number', 'Accident_Severity', 'Carriageway_Hazards', 'Date', 'Day_of_Week', 'Did_Police_Officer_Attend_Scene_of_Accident', 'Junction_Control', 'Junction_Detail', 'Latitude', 'Light_Conditions', 'Local_Authority_(District)', 'Local_Authority_(Highway)', 'Location_Easting_OSGR', 'Location_Northing_OSGR', 'Longitude', 'LSOA_of_Accident_Location', 'Number_of_Casualties', 'Number_of_Vehicles', 'Pedestrian_Crossing-Human_Control', 'Pedestrian_Crossing-Physical_Facilities', 'Police_Force', 'Road_Surface_Conditions', 'Road_Type', 'Special_Conditions_at_Site', 'Speed_limit', 'Time', 'Urban_or_Rural_Area', 'Weather_Conditions', 'Year', 'InScotland'], ['200501BS00001', 'A', '3218', '', '0', 'Serious', 'None', '04/01/2005', 'Tuesday', '1', 'Data missing or out of range', 'Not at junction or within 20 metres', '51.489096', '', 'Kensington and Chelsea', 'Kensington and Chelsea', '525680', '178240', '-0.19117', 'E010

In [4]:
# The first CSV row holds the column names; show how many there are.
header = rows[0]
header_column_count = len(header)
print(header_column_count)

34


In [5]:
# Everything after the header row is a record; show the width of the first one.
data = rows[1:]
first_record = data[0]
print(len(first_record))

34


In [6]:
def explore_dataset(dataset, start, end, rows_columns = False):
    """Print a slice of the dataset and, optionally, its dimensions.

    Args:
        dataset: List of rows, each row being a list of values.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_columns: When true, also print the total number of rows and the
            number of columns (measured on the first row).
    """
    for row in dataset[start:end]:
        print(row)
        print("\n")
    if rows_columns:
        row_count = len(dataset)
        # An empty dataset has no first row to measure, so report 0 columns
        # rather than failing with IndexError.
        column_count = len(dataset[0]) if row_count else 0
        print("Number of rows", row_count)
        print("Number of columns", column_count)

In [7]:
# Show the first three records along with the dataset's dimensions.
explore_dataset(data, 0, 3, rows_columns=True)

['200501BS00001', 'A', '3218', '', '0', 'Serious', 'None', '04/01/2005', 'Tuesday', '1', 'Data missing or out of range', 'Not at junction or within 20 metres', '51.489096', '', 'Kensington and Chelsea', 'Kensington and Chelsea', '525680', '178240', '-0.19117', 'E01002849', '1', '1', '0', '1', 'Metropolitan Police', 'Wet or damp', 'Single carriageway', 'None', '30', '17:42', 'Urban', 'Raining no high winds', '2005', 'No']


['200501BS00002', 'B', '450', 'C', '0', 'Slight', 'None', '05/01/2005', 'Wednesday', '1', 'Auto traffic signal', 'Crossroads', '51.520075', 'Darkness - lights lit', 'Kensington and Chelsea', 'Kensington and Chelsea', '524170', '181650', '-0.211708', 'E01002909', '1', '1', '0', '5', 'Metropolitan Police', 'Dry', 'Dual carriageway', 'None', '30', '17:36', 'Urban', 'Fine no high winds', '2005', 'No']


['200501BS00003', 'C', '0', '', '0', 'Slight', 'None', '06/01/2005', 'Thursday', '1', 'Data missing or out of range', 'Not at junction or within 20 metres', '51.525301', 

In [8]:
def check_wrong_data(mylist, expected_length=None):
    """Print every row whose column count differs from the expected one.

    For each offending row, the row itself is printed followed by its
    position within ``mylist``.

    Args:
        mylist: List of rows (each row a list of values) to validate.
        expected_length: Number of columns every row should have. Defaults to
            the length of the module-level ``header`` list.
    """
    if expected_length is None:
        expected_length = len(header)
    # enumerate yields each row's own position in mylist, which stays correct
    # when identical rows repeat and avoids a linear search per mismatch.
    for position, row in enumerate(mylist):
        if len(row) != expected_length:
            print(row)
            print(position)

In [9]:
# Report any record whose column count does not match the header.
check_wrong_data(mylist=data)

In [10]:
# Collect the distinct values found in column 1 (the '1st_Road_Class' field),
# keeping the order in which they first appear.
roadtypes = list(dict.fromkeys(record[1] for record in data))
print(roadtypes)

['A', 'B', 'C', '', 'Motorway', 'A(M)']


In [11]:
def check_duplicates(dataset):
    """Count repeated accident IDs (the first column) in the dataset.

    Args:
        dataset: Iterable of rows whose first element is the accident ID.

    Returns:
        A tuple ``(duplicate_count, unique_count, examples)`` where
        ``duplicate_count`` is the number of rows whose ID was already seen,
        ``unique_count`` is the number of distinct IDs, and ``examples`` is a
        list of up to three duplicated IDs in the order they were found.
    """
    # A set gives constant-time membership tests, so the scan stays linear
    # even on large datasets.
    seen_ids = set()
    duplicate_entries = []
    for row in dataset:
        accident_id = row[0]
        if accident_id in seen_ids:
            duplicate_entries.append(accident_id)
        else:
            seen_ids.add(accident_id)
    # Summaries are computed after the loop so an empty dataset still returns
    # a well-defined result.
    return len(duplicate_entries), len(seen_ids), duplicate_entries[:3]

In [12]:
# Run the duplicate check and print the counts with a few example IDs.
len_duplicate_entries, len_unique_entries, example_duplicate = check_duplicates(data)
duplicate_report = (
    f'Number of duplicate entries : {len_duplicate_entries}\n'
    f'Number of unique entries : {len_unique_entries}\n'
    f'Examples of duplicate entries : {example_duplicate}\n'
)
print(duplicate_report)

KeyboardInterrupt: 

In [36]:
def fill_missing_strings(i, dataset=None):
    """Title-case column ``i`` of every row and fill empty values, in place.

    Each value in the column is converted with ``str.title()``; a value that
    is the empty string is replaced by ``"Unknown Data"``.

    Args:
        i: Index of the column to normalize.
        dataset: List of rows (lists of strings) to modify. Defaults to the
            module-level ``data`` list.
    """
    if dataset is None:
        dataset = data
    for row in dataset:
        value = row[i].title()
        row[i] = value if value else "Unknown Data"

In [37]:
# Normalize each column in turn, then show a few of the updated records.
# NOTE(review): this title-cases every column, including the accident ID in
# column 0 -- confirm that is intended.
total_columns = len(header)
for column_index in range(total_columns):
    fill_missing_strings(column_index)
print(data[2:5])

[['200501Bs00003', 'C', '0', 'Unknown Data', '0', 'Slight', 'None', '06/01/2005', 'Thursday', '1', 'Data Missing Or Out Of Range', 'Not At Junction Or Within 20 Metres', '51.525301', 'Darkness - Lights Lit', 'Kensington And Chelsea', 'Kensington And Chelsea', '524520', '182240', '-0.206458', 'E01002857', '1', '2', '0', '0', 'Metropolitan Police', 'Dry', 'Single Carriageway', 'None', '30', '00:15', 'Urban', 'Fine No High Winds', '2005', 'No'], ['200501Bs00004', 'A', '3220', 'Unknown Data', '0', 'Slight', 'None', '07/01/2005', 'Friday', '1', 'Data Missing Or Out Of Range', 'Not At Junction Or Within 20 Metres', '51.482442', 'Unknown Data', 'Kensington And Chelsea', 'Kensington And Chelsea', '526900', '177530', '-0.173862', 'E01002840', '1', '1', '0', '0', 'Metropolitan Police', 'Dry', 'Single Carriageway', 'None', '30', '10:35', 'Urban', 'Fine No High Winds', '2005', 'No'], ['200501Bs00005', 'Unknown Data', '0', 'Unknown Data', '0', 'Slight', 'None', '10/01/2005', 'Monday', '1', 'Data Mi