## Data Cleansing and Transformation in Python

In [None]:
import pandas as pd 
import logging

### Writing Transformation Functions

In [None]:
def get_transformed_data(crash_file, vehcile_file):
    """Load, clean, merge and rename the crash and vehicle data in one pass.

    Steps, in order:
      1. Read both CSV files from the ``data/`` directory.
      2. Drop crash rows that have fewer than two non-null values.
      3. Fill missing ``report_type`` values with ``'ON SCENE'``.
      4. Left-merge the vehicles onto the crashes on ``crash_record_id``.
      5. Log how many merged rows have vehicle type ``'PASSENGER'``.
      6. Rename ``vehicle_type`` to ``vehicletypes``.

    Args:
        crash_file: File name of the crash CSV inside ``data/``.
        vehcile_file: File name of the vehicle CSV inside ``data/``.

    Returns:
        pandas.DataFrame: The merged frame with the renamed vehicle column.
    """
    # import data
    df_crashes = pd.read_csv(f"data/{crash_file}")
    df_vehicles = pd.read_csv(f"data/{vehcile_file}")

    # Keep only rows with at least two non-null values, then fill the
    # remaining gaps in report_type. Both calls return new frames, so the
    # result has to be assigned back for the cleaning to take effect.
    df_crashes = df_crashes.dropna(axis='index', thresh=2)
    df_crashes = df_crashes.fillna(value={'report_type': 'ON SCENE'})

    df = df_crashes.merge(
        df_vehicles,
        how='left',
        on='crash_record_id',
        suffixes=('_left', '_right'),
    )

    # Count crash records per vehicle type. The aggregation happens before
    # the rename below, so the grouping column is still 'vehicle_type'.
    df_agg = df.groupby('vehicle_type').agg({'crash_record_id': 'count'}).reset_index()
    passenger_counts = df_agg.loc[df_agg['vehicle_type'] == 'PASSENGER', 'crash_record_id']
    # No PASSENGER rows means an empty selection; report 0 instead of failing.
    number_of_passenger_cars_involved = (
        int(passenger_counts.iloc[0]) if not passenger_counts.empty else 0
    )
    logging.info("Passenger cars involved: %d", number_of_passenger_cars_involved)

    # The mapping must exist before it is used; one rename is sufficient.
    vehicle_mapping = {'vehicle_type': 'vehicletypes'}
    df = df.rename(columns=vehicle_mapping)

    return df

The preceding code can be split into reusable functions that are easy to manage as follows:  

In [None]:
# Read data from data source  
def read_datasources(source_name):
    """Load a CSV file from the ``data/`` directory.

    Args:
        source_name: File name of the CSV, relative to ``data/``.

    Returns:
        pandas.DataFrame: The parsed contents of the file.
    """
    csv_path = f"data/{source_name}"
    return pd.read_csv(csv_path)

# Drop rows with null values 
def drop_rows_with_null_values(df, thresh=2):
    """Return ``df`` without the rows that have too few non-null values.

    Args:
        df: The DataFrame to clean. It is not modified.
        thresh: Minimum number of non-null values a row needs to be kept.
            Defaults to 2.

    Returns:
        pandas.DataFrame: A new frame holding only the rows that meet the
        threshold.
    """
    # dropna already returns the rows to keep. Filtering the input by the
    # inverse of that index would hand back exactly the rows that were
    # supposed to be dropped.
    return df.dropna(axis='index', thresh=thresh)

# Fill missing values 
def fill_missing_values(df):
    """Return a copy of ``df`` with missing ``report_type`` values filled.

    Args:
        df: The DataFrame to fill. It is not modified.

    Returns:
        pandas.DataFrame: A new frame in which null ``report_type`` entries
        are replaced by ``'ON SCENE'``.
    """
    replacements = {'report_type': 'ON SCENE'}
    return df.fillna(value=replacements)

# Merge Dataframes 
def merge_dataframes(df_vehicles, df_crashes):
    """Left-join the vehicle rows onto the crash rows.

    Args:
        df_vehicles: Right-hand frame of the join.
        df_crashes: Left-hand frame of the join; all of its rows are kept.

    Returns:
        pandas.DataFrame: The join on ``crash_record_id``. Overlapping column
        names get the suffixes ``_left`` (crashes) and ``_right`` (vehicles).
    """
    return pd.merge(
        df_crashes,
        df_vehicles,
        how='left',
        on='crash_record_id',
        suffixes=('_left', '_right'),
    )

### Running the Workflow

Define the Pipeline Functions to run the Cleansing and Transformation Functions

In [None]:
def read_data_pipeline(crash_file, vehicle_crash_file):
    """Read the crash and vehicle-crash data, tolerating read failures.

    Args:
        crash_file: File name handed to ``read_datasources`` for crash data.
        vehicle_crash_file: File name handed to ``read_datasources`` for
            vehicle data.

    Returns:
        tuple: ``(df_crash, df_vehicle_crash)``. A frame that could not be
        read is returned as an empty DataFrame.
    """
    df_crash = pd.DataFrame()
    df_vehicle_crash = pd.DataFrame()
    try:
        df_crash = read_datasources(crash_file)
        df_vehicle_crash = read_datasources(vehicle_crash_file)
    except Exception:
        # logging.exception logs at ERROR level with the traceback; an INFO
        # message is hidden by the default logging configuration.
        logging.exception("Exception in reading data pipeline")
    # Return after the try block, not from a `finally` clause: returning from
    # `finally` would also swallow KeyboardInterrupt and SystemExit.
    return df_crash, df_vehicle_crash
 
 
def drop_rows_with_null_values_pipeline(df_crash, df_vehicle_crash):
    """Apply ``drop_rows_with_null_values`` to both frames, tolerating failures.

    Args:
        df_crash: The crash DataFrame.
        df_vehicle_crash: The vehicle-crash DataFrame.

    Returns:
        tuple: ``(df_crash, df_vehicle_crash)``. A frame whose processing did
        not complete is returned as it was passed in.
    """
    try:
        df_crash = drop_rows_with_null_values(df_crash)
        df_vehicle_crash = drop_rows_with_null_values(df_vehicle_crash)
    except Exception:
        # logging.exception logs at ERROR level with the traceback; an INFO
        # message is hidden by the default logging configuration.
        logging.exception("Exception in dropping rows with null value data pipeline")
    # Return after the try block, not from a `finally` clause: returning from
    # `finally` would also swallow KeyboardInterrupt and SystemExit.
    return df_crash, df_vehicle_crash
 
 
def fill_missing_values_pipeline(df_crash, df_vehicle_crash):
    """Apply ``fill_missing_values`` to both frames, tolerating failures.

    Args:
        df_crash: The crash DataFrame.
        df_vehicle_crash: The vehicle-crash DataFrame.

    Returns:
        tuple: ``(df_crash, df_vehicle_crash)``. A frame whose processing did
        not complete is returned as it was passed in.
    """
    try:
        df_crash = fill_missing_values(df_crash)
        df_vehicle_crash = fill_missing_values(df_vehicle_crash)
    except Exception:
        # logging.exception logs at ERROR level with the traceback; an INFO
        # message is hidden by the default logging configuration.
        logging.exception("Exception in filling missing value pipeline")
    # Return after the try block, not from a `finally` clause: returning from
    # `finally` would also swallow KeyboardInterrupt and SystemExit.
    return df_crash, df_vehicle_crash
 
 
def merge_dataframes_pipeline(df_crash, df_vehicle_crash):
    """Merge the vehicle data onto the crash data, tolerating failures.

    Args:
        df_crash: The crash DataFrame (left side of the merge).
        df_vehicle_crash: The vehicle-crash DataFrame (right side).

    Returns:
        tuple: A two-element tuple, so existing callers that unpack two
        values keep working. The first element is the merged frame produced
        by ``merge_dataframes``; the second is ``df_vehicle_crash`` unchanged.
        If the merge fails, both inputs are returned as they were passed in.
    """
    try:
        # Keyword arguments guard against the (vehicles, crashes) parameter
        # order of merge_dataframes, which is the reverse of this function's.
        df_crash = merge_dataframes(df_vehicles=df_vehicle_crash, df_crashes=df_crash)
    except Exception:
        # logging.exception logs at ERROR level with the traceback; an INFO
        # message is hidden by the default logging configuration.
        logging.exception("Exception in merge dataframes pipeline")
    # Return after the try block, not from a `finally` clause: returning from
    # `finally` would also swallow KeyboardInterrupt and SystemExit.
    return df_crash, df_vehicle_crash

Use the Chicago Traffic Data and Run the Pipeline Workflow

In [None]:
# Define input data
crash_data_file = "traffic_crashes.csv"
vehicle_crash_data_file = "traffic_crash_vehicle.csv"

# Read Data Pipeline (uses the file names defined above, so the inputs can be
# changed in one place)
df_crash, df_vehicle_crash = read_data_pipeline(crash_data_file, vehicle_crash_data_file)

# Drop Nulls
df_crash, df_vehicle_crash = drop_rows_with_null_values_pipeline(df_crash, df_vehicle_crash)

# Fill in Missing Values
df_crash, df_vehicle_crash = fill_missing_values_pipeline(df_crash, df_vehicle_crash)

# Merge Dataframes
df_crash, df_vehicle_crash = merge_dataframes_pipeline(df_crash, df_vehicle_crash)

### Transformation Activities in Python 

In [None]:
# One status flag per pipeline stage; every stage starts out as not executed.
(
    READING_CRASH_DATA_PIPELINE,
    DROPPING_ROW_WITH_NULL_PIPELINE,
    FILLING_MISSING_VALUE_PIPELINE,
    MERGE_DATAFRAME_PIPELINE,
) = ("<NOT_EXECUTED>",) * 4

In [None]:
# Reset every flag first so that re-running this cell never acts on a status
# left over from a previous run.
READING_CRASH_DATA_PIPELINE = "<NOT_EXECUTED>"
DROPPING_ROW_WITH_NULL_PIPELINE = "<NOT_EXECUTED>"
FILLING_MISSING_VALUE_PIPELINE = "<NOT_EXECUTED>"
MERGE_DATAFRAME_PIPELINE = "<NOT_EXECUTED>"

df_crash, df_vehicle_crash = read_data_pipeline("traffic_crashes.csv", "traffic_crash_vehicle.csv")

# read_data_pipeline returns empty frames when reading fails, so two non-empty
# frames are taken as the sign of a successful read. Without this update the
# flag never becomes "<OK>" and none of the stages below would ever run.
if not (df_crash.empty or df_vehicle_crash.empty):
    READING_CRASH_DATA_PIPELINE = "<OK>"

# The remaining pipeline helpers handle their own exceptions, so a stage is
# marked "<OK>" as soon as its call has returned.
if READING_CRASH_DATA_PIPELINE == "<OK>":
    df_crash, df_vehicle_crash = drop_rows_with_null_values_pipeline(df_crash, df_vehicle_crash)
    DROPPING_ROW_WITH_NULL_PIPELINE = "<OK>"

if DROPPING_ROW_WITH_NULL_PIPELINE == "<OK>":
    df_crash, df_vehicle_crash = fill_missing_values_pipeline(df_crash, df_vehicle_crash)
    FILLING_MISSING_VALUE_PIPELINE = "<OK>"

if FILLING_MISSING_VALUE_PIPELINE == "<OK>":
    df_crash, df_vehicle_crash = merge_dataframes_pipeline(df_crash, df_vehicle_crash)
    MERGE_DATAFRAME_PIPELINE = "<OK>"