In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Forward slashes work on Windows and POSIX alike, and avoid the invalid
# '\d' / '\C' escape sequences that the backslash form contained.
data = pd.read_csv('../data/Crash_Data.csv')

# Data Cleaning

In [None]:
%%writefile functions.py

import pandas as pd
import numpy as np

def column_rename(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function normalises the column names: string labels are converted to
    lower case and their spaces are replaced by underscores; non-string labels
    are kept as they are
    Inputs: df of type pandas dataframe (its columns are renamed in place)
    Outputs: returns the same dataframe with the renamed columns
    """
    df.columns = [
        label.lower().replace(' ', '_') if isinstance(label, str) else label
        for label in df.columns
    ]
    return df



def col_replace_dash(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function renames column names by converting them to lower case and
    replacing dashes with underscores; non-string labels are left untouched
    Inputs: df of type pandas dataframe (its columns are renamed in place)
    Outputs: returns the same dataframe with the renamed columns
    """
    def _normalise(label):
        # Only string labels can carry dashes; anything else passes through.
        if not isinstance(label, str):
            return label
        return label.lower().replace('-', '_')

    df.columns = list(map(_normalise, df.columns))
    return df



def clean_agency_name(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function shortens the values of the agency_name column
    It uses a dictionary that maps each long agency name to a short label;
    values that are not in the dictionary are left unchanged
    Inputs: df of type pandas dataframe with an 'agency_name' column
    Outputs: returns the same dataframe with the replaced agency_name values
    """
    short_names = {
        'montgomery county police': 'montgomery',
        'rockville police departme': 'rockville',
        'gaithersburg police depar': 'gaithersburg',
        'takoma park police depart': 'takoma',
        'maryland-national capital': 'maryland',
    }
    agencies = df['agency_name']
    df['agency_name'] = agencies.replace(short_names)
    return df


def clean_collision_type(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function expands the abbreviations used in the collision_type column
    It uses a dictionary of whole-word regex patterns and their replacements
    Inputs: df of type pandas dataframe with a 'collision_type' column
    Outputs: returns the same dataframe with the expanded collision_type values
    """
    abbreviations = {
        '\\bdir\\b': 'direction',
        '\\brend\\b': 'rear end',
    }
    expanded = df['collision_type'].replace(abbreviations, regex=True)
    df['collision_type'] = expanded
    return df
    
    
def df_to_lower(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function converts all the string values in all the columns to lower case
    Non-string values (numbers, NaN, ...) are passed through unchanged
    Inputs: df of type pandas dataframe
    Outputs: returns a new dataframe with the string values in lower case
    """
    def _lower_if_str(value):
        return value.lower() if isinstance(value, str) else value

    # DataFrame.map only exists from pandas 2.1 on; older releases expose the
    # same element-wise operation under the name DataFrame.applymap.
    elementwise = df.map if hasattr(df, 'map') else df.applymap
    return elementwise(_lower_if_str)


def replace_invalids_of_column(df: pd.DataFrame, num_replacements: float, replacement_value:str, col_to_clean: str, random_state=None) -> pd.DataFrame:
    """
    This function overwrites randomly chosen 'invalid' entries of the given column
    Inputs: df of type pandas dataframe (modified in place),
            num_replacements: how many 'invalid' entries to overwrite; it is
            truncated to an integer and clamped to the range
            [0, number of 'invalid' entries], so asking for too many never raises,
            replacement_value: the value written into the chosen rows,
            col_to_clean: name of the column to clean,
            random_state: optional seed for a private random generator; when it
            is None the global np.random state is used, so np.random.seed()
            still controls the draw
    Outputs: returns the same dataframe with the replaced invalid values
    """
    invalid_indices = df.index[df[col_to_clean] == 'invalid']

    # np.random.choice needs an integer size and raises when asked for more
    # rows than are available (replace=False), so normalise the request first.
    wanted = min(max(int(num_replacements), 0), len(invalid_indices))
    if wanted == 0:
        return df

    rng = np.random if random_state is None else np.random.default_rng(random_state)
    selected_indices = rng.choice(np.asarray(invalid_indices), size=wanted, replace=False)
    df.loc[selected_indices, col_to_clean] = replacement_value
    return df


In [None]:
# Work on a copy so the raw `data` frame stays available for comparison.
df = data.copy()

# The imputation cells further down pick rows through np.random; seeding the
# global generator here makes a Restart & Run All produce the same result.
np.random.seed(42)

In [None]:
# Execute the helper module written by the %%writefile cell above so that its
# cleaning functions become available in this notebook's namespace.
%run functions.py

In [None]:
# Normalise the column labels (dashes and spaces -> underscores, lower case)
# and lower-case every string value so the lower-case literals used in the
# later cells match the data.
df = col_replace_dash(df)
df = column_rename(df)
df = df_to_lower(df)

# Only the last expression of a cell is rendered, so the preview needs an
# explicit display() to show up next to the shape.
display(df.head())
df.shape

In [None]:
# Replace the long agency names with short labels, then check the resulting
# distribution (NaN counted as its own category).
df= clean_agency_name(df)
df['agency_name'].value_counts(dropna=False)

In [None]:
# After checking the datatypes of the columns, we decided to cast the
# crash_date/time column to the datetime datatype.
df['crash_date/time'] = pd.to_datetime(df['crash_date/time'], format="%m/%d/%Y %I:%M:%S %p")

# Confirm the conversion: show the resulting dtype plus a few parsed values.
# (type(...) on a column only ever reports `Series`, and as a non-final
# expression its result was never rendered anyway.)
display(df['crash_date/time'].dtype)
df['crash_date/time'].head()

In [None]:
# Column dtypes and non-null counts of the working dataframe.
df.info()

In [None]:
# Share of missing values per column, in percent, before any column is dropped.
null_counts = df.isnull().sum()
original_null_percentages = null_counts / len(df) * 100
display(original_null_percentages)

After checking the percentage of missing values for each column, we dropped the columns with more than 20% missing values, as well as those that were irrelevant to our analysis.
We also made sure to keep the columns that are relevant to our business questions.

In [None]:
# Columns removed because of their share of missing values or because they
# are not needed for the analysis.
# NOTE(review): 'fixed_oject_struck' looks misspelled; presumably it mirrors
# the header in the raw CSV. Confirm before "fixing" it.
columns_to_drop = [
    'lane_type', 'off_road_description', 'municipality', 'related_non_motorist',
    'non_motorist_substance_abuse', 'first_harmful_event', 'second_harmful_event',
    'fixed_oject_struck', 'junction', 'intersection_type', 'intersection_area',
    'route_type', 'mile_point_direction', 'lane_direction', 'direction',
    'distance_unit', 'road_name', 'cross_street_type', 'cross_street_name',
    'surface_condition', 'traffic_control', 'driver_substance_abuse',
    'road_alignment', 'road_division', 'mile_point', 'distance',
]

# Reassign instead of mutating with inplace=True: the statement then reads as
# "new frame from old frame" and `columns=` states the axis explicitly.
df = df.drop(columns=columns_to_drop)

In [None]:
# Re-check the percentage of NaNs per column after dropping the columns with a
# high percentage of missing values.
df.isnull().sum()/len(df)*100

After checking the percentage of NaNs in the remaining columns, we found it to be insignificant for the `hit/run`, `collision_type` and `light` columns.
We therefore decided to drop the rows that contain NaNs in these columns.

In [None]:
# Drop every row that has a NaN in at least one of these three columns.
df=df.dropna(subset=['hit/run', 'collision_type' , 'light'])

In [None]:
# Percentage of NaNs per column once the incomplete rows have been removed.
df.isnull().sum()/len(df)*100

In [None]:
# Frequency of each light category (NaN counted as its own category).
df['light'].value_counts(dropna=False)

In [None]:
# Frequency of each collision_type category (NaN counted as its own category).
df['collision_type'].value_counts(dropna=False)

After checking the value counts for the columns that are significant to our business questions, we decided to remove the NaN values from the road_condition column by reassigning them to the valid categories, and to check the effect on the remaining columns.

In [None]:
# Value counts for road_condition before cleaning (NaN counted as its own category)
display(df['road_condition'].value_counts(dropna=False))

In [None]:
# Replace 'unknown', 'other', and NaN values with 'invalid'
df['road_condition'] = df['road_condition'].replace(['unknown', 'other', np.nan], 'invalid')

# Number of 'invalid' rows that have to be reassigned
sum_invalid_values = (df['road_condition'] == 'invalid').sum()

# Counts of the categories that receive the invalid rows
# ('no defects' is deliberately left out of the redistribution)
unique_values = df['road_condition'][~df['road_condition'].isin(['invalid', 'no defects'])].value_counts()
other_values_count = unique_values.sum()

# Guard: nothing to do when the cell is re-run (no invalids left) or when
# there is no category to distribute to.
if sum_invalid_values > 0 and other_values_count > 0:
    # Proportional share per category, split into whole rows with the
    # largest-remainder method: floor every share, then hand the rows that are
    # still missing to the categories with the largest fractional parts.
    # The amounts therefore always add up to sum_invalid_values exactly, which
    # plain rounding does not guarantee (it needed a hand-tuned correction).
    exact_shares = unique_values / other_values_count * sum_invalid_values
    distribution_amounts = np.floor(exact_shares).astype(int)
    leftover = int(sum_invalid_values - distribution_amounts.sum())
    if leftover > 0:
        fractional_parts = (exact_shares - distribution_amounts).sort_values(ascending=False, kind='mergesort')
        distribution_amounts.loc[fractional_parts.index[:leftover]] += 1

    # Applying the function to clean the road_condition column and replace the invalid values
    for key, value in distribution_amounts.items():
        if value > 0:
            replace_invalids_of_column(df, int(value), key, 'road_condition')

display(df['road_condition'].value_counts(dropna=False))

In [None]:
# Value counts for road_grade before cleaning (NaN counted as its own category)
df['road_grade'].value_counts(dropna=False)

In [None]:
# Replace 'unknown', 'other', and NaN values with 'invalid'
df['road_grade'] = df['road_grade'].replace(['unknown', 'other', np.nan], 'invalid')

# Number of 'invalid' rows that have to be reassigned
sum_invalid_values = (df['road_grade'] == 'invalid').sum()

# Counts of the valid categories that receive the invalid rows
unique_values = df['road_grade'][~df['road_grade'].isin(['invalid'])].value_counts()
other_values_count = unique_values.sum()

# Guard: nothing to do when the cell is re-run (no invalids left) or when
# there is no category to distribute to.
if sum_invalid_values > 0 and other_values_count > 0:
    # Proportional share per category, split into whole rows with the
    # largest-remainder method: floor every share, then hand the rows that are
    # still missing to the categories with the largest fractional parts.
    # The amounts therefore always add up to sum_invalid_values exactly; plain
    # rounding can over-shoot (np.random.choice then raises) or under-shoot
    # (some rows stay 'invalid').
    exact_shares = unique_values / other_values_count * sum_invalid_values
    distribution_amounts = np.floor(exact_shares).astype(int)
    leftover = int(sum_invalid_values - distribution_amounts.sum())
    if leftover > 0:
        fractional_parts = (exact_shares - distribution_amounts).sort_values(ascending=False, kind='mergesort')
        distribution_amounts.loc[fractional_parts.index[:leftover]] += 1

    # Applying the function to clean the road_grade column and replace the invalid values
    for key, value in distribution_amounts.items():
        if value > 0:
            replace_invalids_of_column(df, int(value), key, 'road_grade')

display(df['road_grade'].value_counts(dropna=False))

In [None]:
# Value counts for light (NaN counted as its own category)
df['light'].value_counts(dropna=False)

In [None]:
# Value counts for weather before cleaning (NaN counted as its own category)
df['weather'].value_counts(dropna=False)

In [None]:
# Replace 'unknown', 'other', and NaN values with 'invalid'
df['weather'] = df['weather'].replace(['unknown', 'other', np.nan], 'invalid')

# Number of 'invalid' rows that have to be reassigned
sum_invalid_values = (df['weather'] == 'invalid').sum()

# Counts of the valid categories that receive the invalid rows
unique_values = df['weather'][~df['weather'].isin(['invalid'])].value_counts()
other_values_count = unique_values.sum()

# Guard: nothing to do when the cell is re-run (no invalids left) or when
# there is no category to distribute to.
if sum_invalid_values > 0 and other_values_count > 0:
    # Proportional share per category, split into whole rows with the
    # largest-remainder method: floor every share, then hand the rows that are
    # still missing to the categories with the largest fractional parts.
    # The amounts therefore always add up to sum_invalid_values exactly, which
    # plain rounding does not guarantee (it needed a hand-tuned correction).
    exact_shares = unique_values / other_values_count * sum_invalid_values
    distribution_amounts = np.floor(exact_shares).astype(int)
    leftover = int(sum_invalid_values - distribution_amounts.sum())
    if leftover > 0:
        fractional_parts = (exact_shares - distribution_amounts).sort_values(ascending=False, kind='mergesort')
        distribution_amounts.loc[fractional_parts.index[:leftover]] += 1

    # Applying the function to clean the weather column and replace the invalid values
    for key, value in distribution_amounts.items():
        if value > 0:
            replace_invalids_of_column(df, int(value), key, 'weather')

display(df['weather'].value_counts(dropna=False))

In [None]:
# Expand the abbreviations in collision_type ('dir' -> 'direction',
# 'rend' -> 'rear end') and inspect the resulting categories.
df= clean_collision_type(df)
display(df['collision_type'].value_counts(dropna=False))

In [None]:
# Replace 'unknown' and 'other' with 'invalid'
df['collision_type'] = df['collision_type'].replace(['unknown', 'other'], 'invalid')

# Number of 'invalid' rows that have to be reassigned
sum_invalid_values = (df['collision_type'] == 'invalid').sum()

# Counts of the valid categories that receive the invalid rows
unique_values = df['collision_type'][~df['collision_type'].isin(['invalid'])].value_counts()
other_values_count = unique_values.sum()

# Guard: nothing to do when the cell is re-run (no invalids left) or when
# there is no category to distribute to.
if sum_invalid_values > 0 and other_values_count > 0:
    # Proportional share per category, split into whole rows with the
    # largest-remainder method: floor every share, then hand the rows that are
    # still missing to the categories with the largest fractional parts.
    # The amounts therefore always add up to sum_invalid_values exactly, which
    # plain rounding does not guarantee (it needed a hand-tuned correction).
    exact_shares = unique_values / other_values_count * sum_invalid_values
    distribution_amounts = np.floor(exact_shares).astype(int)
    leftover = int(sum_invalid_values - distribution_amounts.sum())
    if leftover > 0:
        fractional_parts = (exact_shares - distribution_amounts).sort_values(ascending=False, kind='mergesort')
        distribution_amounts.loc[fractional_parts.index[:leftover]] += 1

    # Applying the function to clean the collision_type column and replace the invalid values
    for key, value in distribution_amounts.items():
        if value > 0:
            replace_invalids_of_column(df, int(value), key, 'collision_type')

display(df['collision_type'].value_counts(dropna=False))

In [None]:
# Final check: percentage of NaNs remaining in each column
df.isnull().sum()/len(df)*100

In [None]:
# Forward slashes work on Windows and POSIX alike, and avoid the invalid
# '\d' / '\c' escape sequences that the backslash form contained.
df.to_csv('../data/clean_crash_data.csv', index=False)

In [None]:
!dir

In [None]:
# Read the exported file back to confirm the round trip. A CSV stores the
# timestamps as text, so parse_dates restores the datetime dtype on load.
df_clean = pd.read_csv('../data/clean_crash_data.csv', parse_dates=['crash_date/time'])

# A short preview plus the shape is enough to verify the export.
display(df_clean.head())
df_clean.shape

In [None]:
# Summary statistics of the cleaned working dataframe.
df.describe()