# California Collisions Data Cleaning

In [5]:
import pandas as pd
import numpy as np
import sqlite3

In [6]:
# Path to the SWITRS SQLite file; '/content' is presumably the Colab working
# directory — adjust when running elsewhere.
DB_PATH = '/content/switrs.sqlite'
con = sqlite3.connect(DB_PATH)

# Drunk Collisions per County

In [7]:
# One row per at-fault driver (county + sobriety) for collisions dated
# 2010-01-01 up to, but not including, 2020-01-01.
new_data = pd.read_sql_query(
    sql='''
                          SELECT
                          county_location, party_sobriety 
                          FROM collisions
                          INNER JOIN parties
                          ON parties.case_id = collisions.case_id
                          WHERE '2020-01-01' > collision_date
                          AND collision_date >= '2010-01-01'
                          AND parties.party_type = 'driver'
                          AND parties.at_fault = 1
                         ''',
    con=con,
)

In [8]:
def county_groups(dataframe, drunk_label='had been drinking, under influence'):
    """Count sober and drunk collisions per county.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``county_location`` and ``party_sobriety``.
    drunk_label : str, optional
        Value of ``party_sobriety`` that marks a row as drunk. Every other
        value, including a missing one, is counted as sober.

    Returns
    -------
    pandas.DataFrame
        One row per county with the columns ``county_name``,
        ``number_of_sober_collisions`` and ``number_of_drunk_collisions``.
        Counties are listed in descending order of sober collisions; counties
        that only have drunk collisions follow in alphabetical order.
        A county missing from one of the two groups gets a count of 0 instead
        of being dropped from the result.
    """
    counties = dataframe['county_location']
    sober_counts = counties[dataframe['party_sobriety'] != drunk_label].value_counts()
    drunk_counts = counties[dataframe['party_sobriety'] == drunk_label].value_counts()

    # Union of both county sets: an inner join on the two tallies would silently
    # discard every county that appears in only one of them.
    county_order = sober_counts.index.append(
        drunk_counts.index.difference(sober_counts.index)
    )

    return pd.DataFrame({
        'county_name': county_order.to_numpy(),
        'number_of_sober_collisions':
            sober_counts.reindex(county_order, fill_value=0).to_numpy(),
        'number_of_drunk_collisions':
            drunk_counts.reindex(county_order, fill_value=0).to_numpy(),
    })

In [9]:
# Build the per-county table, then turn raw names such as 'alameda' into 'Alameda County'.
collisions = county_groups(new_data)
collisions['county_name'] = collisions['county_name'].str.title() + ' County'

In [10]:
# NOTE(review): the denominator is the sober count, so this is drunk collisions per
# 100 sober collisions rather than their share of all collisions — confirm that this
# is what the column name is meant to express.
collisions['Proportion_of_drunk_accidents (%)'] = (
    collisions['number_of_drunk_collisions']
    .div(collisions['number_of_sober_collisions'])
    .mul(100)
    .round(2)
)

In [12]:
# Alphabetical order by county name.
collisions = collisions.sort_values(by='county_name')

In [15]:
# Only move 'county_name' into the index when it is not there yet, so that
# re-running this cell does not raise KeyError on the already-indexed frame.
if collisions.index.name != 'county_name':
    collisions = collisions.set_index('county_name')

In [16]:
# Show the per-county table.
collisions

Unnamed: 0_level_0,number_of_sober_collisions,number_of_drunk_collisions,Proportion_of_drunk_accidents (%)
county_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alameda County,178014,11564,6.5
Alpine County,607,55,9.06
Amador County,3848,457,11.88
Butte County,16083,2332,14.5
Calaveras County,4515,588,13.02
Colusa County,2723,272,9.99
Contra Costa County,81915,7201,8.79
Del Norte County,2495,262,10.5
El Dorado County,12973,2105,16.23
Fresno County,60144,6786,11.28


# Drunk Collisions per Race

In [17]:
# Number of at-fault drunk drivers per county and race for collisions dated
# 2010-01-01 up to, but not including, 2020-01-01.
race_data = pd.read_sql_query(
    sql='''SELECT collisions.county_location, parties.party_race, count(parties.party_race)
                                 FROM collisions 
                                 INNER JOIN parties ON collisions.case_id = parties.case_id
                                 WHERE parties.party_type = 'driver'
                                 AND parties.party_sobriety = 
                                 'had been drinking, under influence'
                                 AND '2020-01-01' > collisions.collision_date 
                                 AND collisions.collision_date >= '2010-01-01'
                                 AND parties.at_fault = 1
                                 GROUP BY county_location, party_race''',
    con=con,
)

In [18]:
# Discard every group that has a missing value in any column (e.g. no recorded race).
race_data = race_data.dropna()

In [19]:
# Preview the first ten county/race groups.
race_data.head(10)

Unnamed: 0,county_location,party_race,count(parties.party_race)
1,alameda,asian,751
2,alameda,black,2176
3,alameda,hispanic,4125
4,alameda,other,769
5,alameda,white,3426
6,alpine,black,3
7,alpine,hispanic,4
8,alpine,other,4
9,alpine,white,44
11,amador,asian,2


In [20]:
def race_groups_dataframe(county, data=None):
  """Return the drunk-collision counts of one county, one value per race.

  Parameters
  ----------
  county : str
      Value to match against the ``county_location`` column.
  data : pandas.DataFrame, optional
      Frame with the columns ``county_location``, ``party_race`` and
      ``count(parties.party_race)``. Defaults to the notebook-level
      ``race_data`` frame.

  Returns
  -------
  list
      One count per race, in the order of ``data['party_race'].unique()``.
      A race with no row for ``county`` contributes 0.
  """
  if data is None:
    data = race_data

  # Filter the county once instead of re-scanning the whole frame for every race.
  county_rows = data.loc[data['county_location'] == county]

  counts_by_race = {}
  for race, count in zip(county_rows['party_race'],
                         county_rows['count(parties.party_race)']):
    # Keep the first row if a (county, race) pair ever appears more than once.
    counts_by_race.setdefault(race, count)

  return [counts_by_race.get(race, 0) for race in data['party_race'].unique()]

In [24]:
# Row labels for the race table, e.g. 'alameda' -> 'Alameda County'.
index = [f"{county.title()} County" for county in race_data['county_location'].unique()]

In [25]:
# One row per county, one column per race. The row labels are derived here from
# race_data itself rather than read from the notebook-level `index` variable, which
# is reassigned in the age-group section and could be stale if cells run out of order.
drunk_race_groups_split = pd.DataFrame(
    [race_groups_dataframe(county) for county in race_data['county_location'].unique()],
    index=[county.title() + ' County' for county in race_data['county_location'].unique()],
    columns=race_data['party_race'].unique(),
)

In [26]:
# Show the county-by-race table of drunk-collision counts.
drunk_race_groups_split

Unnamed: 0,asian,black,hispanic,other,white
Alameda County,751,2176,4125,769,3426
Alpine County,0,3,4,4,44
Amador County,2,7,46,9,386
Butte County,33,64,356,61,1758
Calaveras County,3,8,62,7,507
Colusa County,2,7,128,11,120
Contra Costa County,227,1082,2314,419,2991
Del Norte County,5,2,24,33,194
El Dorado County,17,28,182,70,1743
Fresno County,281,280,4138,181,1820


# Drunk Collisions per Age Group

In [27]:
# Bucket edges 16, 20, ..., 80; every pair of neighbouring edges becomes one
# SQL "WHEN ... THEN '<low> - <high>'" branch.
age_ranges = list(range(16, 84, 4))
age_strings = [
  f"WHEN parties.party_age >= {lower} AND parties.party_age < {upper}   THEN '{lower} - {upper - 1}'"
  for lower, upper in zip(age_ranges, age_ranges[1:])
]

In [29]:
# Concatenate the WHEN branches into one fragment for the CASE expression of the age-group query.
strings = ' '.join(age_strings)

In [30]:
# Query: number of at-fault drunk drivers per county and age bucket for collisions
# dated 2010-01-01 up to, but not including, 2020-01-01.
# - `Age_Groups` holds the count of each group; `Ages_groups` holds the bucket label.
# - The bucket branches are spliced in from `strings`; the final branch labels ages
#   of 80 and above as 'Over 80'.
# - The CASE has no ELSE, so an age matched by no branch yields a NULL label.
drunk_age_group_query = f'''
                           SELECT collisions.county_location, Count(parties.party_age) AS 'Age_Groups', 
                           CASE {strings} 
                           WHEN parties.party_age >= 80 THEN 'Over 80'
                           END AS Ages_groups
                           FROM parties
                           INNER JOIN collisions 
                           ON collisions.case_id = parties.case_id
                           WHERE parties.party_type = 'driver'
                           AND parties.party_sobriety = 
                           'had been drinking, under influence'
                           AND '2020-01-01' > collisions.collision_date 
                           AND collisions.collision_date >= '2010-01-01'
                           AND parties.at_fault = 1
                           GROUP BY county_location, ages_groups
                       '''

In [31]:
# Run the age-group query against the open SQLite connection.
drunk_age_groups = pd.read_sql_query(sql=drunk_age_group_query, con=con)

In [32]:
# Discard every group that has a missing value in any column (e.g. a NULL age-bucket label).
drunk_age_groups = drunk_age_groups.dropna()

In [34]:
# Row labels for the age-group table, built as '<Title-cased county> County'.
index = [f"{county.title()} County" for county in drunk_age_groups['county_location'].unique()]

In [35]:
def age_groups_dataframe(county, data=None):
  """Return the drunk-collision counts of one county, one value per age group.

  Parameters
  ----------
  county : str
      Value to match against the ``county_location`` column.
  data : pandas.DataFrame, optional
      Frame with the columns ``county_location``, ``Ages_groups`` (bucket
      label) and ``Age_Groups`` (count). Defaults to the notebook-level
      ``drunk_age_groups`` frame.

  Returns
  -------
  list
      One count per age group, in the order of ``data['Ages_groups'].unique()``.
      An age group with no row for ``county`` contributes 0.
  """
  if data is None:
    data = drunk_age_groups

  # Filter the county once instead of re-scanning the whole frame for every age group.
  county_rows = data.loc[data['county_location'] == county]

  counts_by_group = {}
  for group, count in zip(county_rows['Ages_groups'], county_rows['Age_Groups']):
    # Keep the first row if a (county, age group) pair ever appears more than once.
    counts_by_group.setdefault(group, count)

  return [counts_by_group.get(group, 0) for group in data['Ages_groups'].unique()]

In [36]:
# One row per county, one column per age group. The row labels are derived here from
# drunk_age_groups itself rather than read from the notebook-level `index` variable,
# which is also assigned in the race section and could be stale if cells run out of order.
drunk_age_groups_split = pd.DataFrame(
    [age_groups_dataframe(county) for county in drunk_age_groups['county_location'].unique()],
    index=[county.title() + ' County' for county in drunk_age_groups['county_location'].unique()],
    columns=drunk_age_groups['Ages_groups'].unique(),
)

In [37]:
# Show the county-by-age-group table of drunk-collision counts.
drunk_age_groups_split

Unnamed: 0,16 - 19,20 - 23,24 - 27,28 - 31,32 - 35,36 - 39,40 - 43,44 - 47,48 - 51,52 - 55,56 - 59,60 - 63,64 - 67,68 - 71,72 - 75,76 - 79,Over 80
Alameda County,495,1899,2044,1509,1148,819,759,674,595,542,365,290,190,93,45,19,16
Alpine County,2,8,9,7,1,1,5,2,4,7,2,3,2,2,0,0,0
Amador County,13,61,59,42,28,26,23,32,39,42,26,18,30,7,7,1,3
Butte County,147,504,343,249,184,137,136,131,109,94,103,83,40,26,19,11,3
Calaveras County,21,81,70,59,38,30,43,46,59,43,34,26,24,3,7,3,0
Colusa County,17,59,26,32,24,20,18,15,16,18,13,6,3,0,1,1,1
Contra Costa County,398,1170,1185,876,646,539,449,418,438,349,260,182,121,69,26,18,17
Del Norte County,22,32,33,30,23,26,19,14,23,8,14,6,5,2,3,2,0
El Dorado County,119,318,291,220,183,125,123,121,140,142,115,95,43,39,11,13,3
Fresno County,459,1384,1242,840,609,484,384,328,298,251,173,148,84,45,17,9,5
