This script will clean the raw downloaded KSP incident vehicle datasets and exports the cleaned and combined dataset as a csv file and then uploads to the previously created SQLite database.

In [1]:
import os
import pandas as pd
import sqlite3

In [2]:
# Define the path for the SQLite database
database_path = r'/Users/terid/Git/CodeYou_Capstone/data/crash_data.db'
os.makedirs(os.path.dirname(database_path), exist_ok=True)

# Define the path for the raw crash data files downloaded
directory_path = '/Users/terid/Git/CodeYou_Capstone/data/raw_crash_data'

In [3]:
# Function to check if tables created in the database exist
def check_table_exists(db_path, table_name):
    # Connect to the SQLite database
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Query to check if the table exists
    cursor.execute('''
        SELECT name
        FROM sqlite_master
        WHERE type='table' AND name=?
    ''', (table_name,))

    # Fetch one record
    table_exists = cursor.fetchone() is not None

    # Close the connection
    conn.close()

    return table_exists

In [4]:
# Function to check if tables created in the database have data
def check_table_has_data(db_path, table_name):
    # Connect to the SQLite database
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Query to count the number of rows in the table
    cursor.execute(f'SELECT COUNT(*) FROM {table_name}')

    # Fetch the count
    row_count = cursor.fetchone()[0]

    # Close the connection
    conn.close()

    return row_count > 0


In [5]:
# Create/Connect to SQLite database
conn = sqlite3.connect(database_path)
cursor = conn.cursor()

In [39]:
# Create table incident_vehicles in database
cursor.execute('''CREATE TABLE IF NOT EXISTS incident_vehicles (
        IncidentID INT,
        UnitNumber INT,
        UnitType TEXT,
        AirbagSwitchCde TEXT,
        IsCommercialVeh TEXT,
        CrashAvoidCde TEXT,
        DriverIdentifiedCde TEXT,
        EventCollWithFirstCde TEXT,
        EventCollWithSecondCde TEXT,
        HasFire TEXT,
        PreCollActionCde TEXT,
        UnderOverrideCde TEXT,
        VehicleIsInsured TEXT,
        MakeCde TEXT,
        ModelCde TEXT,
        VehicleType TEXT,
        MakeDescription TEXT,
        ModelDescription TEXT);''')

<sqlite3.Cursor at 0x124ccb0c0>

In [6]:
# temp section to read the column names to create table columns

def read_column_names(csv_file_path):

    # Read only the first row of the CSV to get the column names
    df = pd.read_csv(csv_file_path, nrows=0)
    column_names = df.columns.tolist()
    return column_names

# Read the column names from the CSV file
csv_file_path = '/Users/terid/Git/CodeYou_Capstone/data/raw_crash_data/Vehicles_2024.csv'

columns = read_column_names(csv_file_path)
print(columns)


['IncidentID', 'UnitNumber', 'UnitType', 'AirbagSwitchCde', 'IsCommercialVeh', 'CrashAvoidCde', 'DriverIdentifiedCde', 'EventCollWithFirstCde', 'EventCollWithSecondCde', 'HasFire', 'PreCollActionCde', 'UnderOverrideCde', 'VehicleIsInsured', 'MakeCde', 'ModelCde', 'VehicleType', 'MakeDescription', 'ModelDescription', 'Unnamed: 18']


In [9]:
# Check to see if tables created exist
table_name = 'incident_vehicles'

if check_table_exists(database_path, table_name):
    print(f"The table '{table_name}' exists.")
else:
    print(f"The table '{table_name}' does not exist.")


The table 'incident_vehicles' exists.


In [10]:
# Check of tables created are empty

table_name = 'incident_vehicles'

if check_table_has_data(database_path, table_name):
    print(f"The table '{table_name}' contains data.")
else:
    print(f"The table '{table_name}' is empty.")


The table 'incident_vehicles' contains data.


In [76]:

# Combine all CSV files in the directory that begin with "IncidentTrafficControl_"
csv_files = [f for f in os.listdir(directory_path)  if f.startswith("Vehicles_") and f.endswith(".csv")]

# Initialize an empty list to hold dataframes
dataframes = []

# Iterate through the CSV files and load them into dataframes
for csv_file in csv_files:
    file_path = os.path.join(directory_path, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Concatenate all dataframes into a single dataframe
combined_vehicles_df = pd.concat(dataframes, ignore_index=True)

# Drop any column with "Unnamed" in its name
unnamed_columns = [col for col in combined_vehicles_df.columns if col.startswith('Unnamed')]
combined_vehicles_df = combined_vehicles_df.drop(columns=unnamed_columns)

# Set the option to display all columns
pd.set_option('display.max_columns', None)

print("All CSV files have been successfully loaded into a single DataFrame.")
print(combined_vehicles_df)

All CSV files have been successfully loaded into a single DataFrame.
      IncidentId  TrafficControlNo       TrafficControl
0       26133281                 1     STOP & GO SIGNAL
1       26146419                 1  ADVISORY SPEED SIGN
2       26146419                 2          CENTER LINE
3       26146419                 3                OTHER
...          ...               ...                  ...
6741    33431439                 3                OTHER
6743    33433119                 1               MEDIAN
6744    33433119                 2  ADVISORY SPEED SIGN

[6746 rows x 3 columns]


In [78]:

# Specify the path where you want to save the CSV file
output_path = '/Users/terid/Git/CodeYou_Capstone/data/clean_crash_data/incident_vehicles.csv'

# Export the dataframe to a CSV file
combined_vehicles_df.to_csv(output_path, index=False)

print(f"Dataframe exported successfully to {output_path}")


Dataframe exported successfully to /Users/terid/Git/CodeYou_Capstone/data/clean_crash_data/incident_traffic_controls.csv


In [80]:

# Write the DataFrame to the SQLite table
combined_vehicles_df.to_sql('incident_vehicles', conn, if_exists='append', index=False)

# Commit the changes and close the connection
conn.commit()
conn.close()

print(f"Data from {df} has been successfully inserted into the collision_incidents table.")


Data from      IncidentId  TrafficControlNo       TrafficControl  Unnamed: 3
0      32658708                 1               MEDIAN         NaN
1      32659228                 1               MEDIAN         NaN
2      32672935                 1               MEDIAN         NaN
3      32676655                 1  ADVISORY SPEED SIGN         NaN
4      32676655                 2          CENTER LINE         NaN
..          ...               ...                  ...         ...
695    33431439                 3                OTHER         NaN
697    33433119                 1               MEDIAN         NaN
698    33433119                 2  ADVISORY SPEED SIGN         NaN

[700 rows x 4 columns] has been successfully inserted into the collision_incidents table.
