# Bike Data Processing
This notebook processes the bike data Excel file to:
- Extract specific columns (HSMV_Report_Number, Bike Type, Narrative)
- Filter rows that have a value in Bike Type
- Display basic statistics

In [3]:
# Import required libraries
import pandas as pd
import numpy as np
from pathlib import Path

In [4]:
# Read the Excel file
file_path = r"nolabeluse.xlsx"
df = pd.read_excel(file_path)

print(f"Original data shape: {df.shape}")
print(f"\nColumns in original file: {list(df.columns)}")

Original data shape: (9031, 45)

Columns in original file: ['HSMV_Report_Number', 'Reporting_Agency', 'Form_Type', 'Year', 'Crash_Date', 'Crash_Time', 'City', 'County', 'Crash_Street', 'Intersecting_Street', 'Vehicles', 'Non_Motorists', 'Fatalities', 'Injuries', 'Alcohol_Related', 'Distraction_Related', 'Drug_Related', 'Weather_Condition', 'Light_Condition', 'Crash_Severity', 'Type_of_Intersection', 'Road_Sys_Identifier', 'Type_of_Shoulder', 'Road_Surf_Cond', 'Bicyclists', 'Possible_Injuries', 'Non_Incapacitating_Injuries', 'Incapacitating_Injuries', 'Fatalities_30_Days', 'Non_Traffic_Fatalities', 'S4_Mapping', 'S4_Decimal_Degree_Longitude', 'S4_Decimal_Degree_Latitude', 'S4_Albers_X', 'S4_Albers_Y', 'S4_Mapping_Date', 'Bike_Crash_Group_Number', 'Bike_Crash_Group', 'Bike_Crash_Type_Number', 'Bike_Crash_Type', 'Bike_Crash_Location', 'Bike_Bicyclist_Direction', 'Bike_Bicyclist_Position', 'Bike_Typing_Notes', 'Bike Type']


In [5]:
# Select the columns we need
# Start with the existing columns
existing_columns = ['HSMV_Report_Number', 'Bike Type']

# Create a new dataframe with existing columns
df_filtered = df[existing_columns].copy()

# Add the Narrative column as empty
df_filtered['Narrative'] = ''

print(f"Created dataframe with columns: {list(df_filtered.columns)}")
print(f"Shape: {df_filtered.shape}")

Created dataframe with columns: ['HSMV_Report_Number', 'Bike Type', 'Narrative']
Shape: (9031, 3)


In [6]:
# Filter: Keep only rows where 'Bike Type' has a value (not null/empty)
df_filtered = df_filtered[df_filtered['Bike Type'].notna()]
df_filtered = df_filtered[df_filtered['Bike Type'].astype(str).str.strip() != '']

print(f"Filtered data shape: {df_filtered.shape}")
print(f"Rows removed: {df.shape[0] - df_filtered.shape[0]}")

Filtered data shape: (111, 3)
Rows removed: 8920


In [7]:
# Display first few rows
print("\nFirst 10 rows of filtered data:")
df_filtered.head(10)


First 10 rows of filtered data:


Unnamed: 0,HSMV_Report_Number,Bike Type,Narrative
403,82089001,Motorized bicycle,
530,81994356,Motorized bicycle,
745,82877472,E-trike,
927,83350430,Pedicab,
1031,83353372,Motorized bicycle,
1205,83353605,Police bike,
1487,83360086,Bicycle with trailer,
1556,80823780,E-trike,
2079,83968397,Adult tricycle,
2100,83671693,Recumbent trike,


In [8]:
# Save to new Excel file
output_path = r"C:\Users\rmathew1\Desktop\bike\bike_data_filtered.xlsx"
df_filtered.to_excel(output_path, index=False)
print(f"Filtered data saved to: {output_path}")

Filtered data saved to: C:\Users\rmathew1\Desktop\bike\bike_data_filtered.xlsx


## Basic Statistics

In [9]:
# Basic statistics
print("=" * 60)
print("BASIC STATISTICS")
print("=" * 60)

print(f"\nTotal number of records: {len(df_filtered)}")
print(f"\nNumber of unique HSMV Report Numbers: {df_filtered['HSMV_Report_Number'].nunique()}")
print(f"Number of unique Bike Types: {df_filtered['Bike Type'].nunique()}")

BASIC STATISTICS

Total number of records: 111

Number of unique HSMV Report Numbers: 111
Number of unique Bike Types: 10


In [10]:
# Bike Type distribution
print("\n" + "=" * 60)
print("BIKE TYPE DISTRIBUTION")
print("=" * 60)
bike_type_counts = df_filtered['Bike Type'].value_counts()
print(bike_type_counts)

print("\nPercentage distribution:")
print(df_filtered['Bike Type'].value_counts(normalize=True) * 100)


BIKE TYPE DISTRIBUTION
Bike Type
E-bike                  42
Motorized bicycle       37
Pedicab                 16
E-trike                  4
Bicycle with trailer     3
Police bike              2
Adult tricycle           2
Bike share               2
Tandem                   2
Recumbent trike          1
Name: count, dtype: int64

Percentage distribution:
Bike Type
E-bike                  37.837838
Motorized bicycle       33.333333
Pedicab                 14.414414
E-trike                  3.603604
Bicycle with trailer     2.702703
Police bike              1.801802
Adult tricycle           1.801802
Bike share               1.801802
Tandem                   1.801802
Recumbent trike          0.900901
Name: proportion, dtype: float64


In [11]:
# Narrative column statistics
print("\n" + "=" * 60)
print("NARRATIVE COLUMN STATISTICS")
print("=" * 60)
print(f"\nRows with Narrative filled: {df_filtered['Narrative'].notna().sum()}")
print(f"Rows with Narrative empty: {df_filtered['Narrative'].isna().sum()}")
print(f"Percentage filled: {(df_filtered['Narrative'].notna().sum() / len(df_filtered) * 100):.2f}%")


NARRATIVE COLUMN STATISTICS

Rows with Narrative filled: 111
Rows with Narrative empty: 0
Percentage filled: 100.00%


In [12]:
# Missing values summary
print("\n" + "=" * 60)
print("MISSING VALUES SUMMARY")
print("=" * 60)
print(df_filtered.isnull().sum())


MISSING VALUES SUMMARY
HSMV_Report_Number    0
Bike Type             0
Narrative             0
dtype: int64


In [13]:
# Data info
print("\n" + "=" * 60)
print("DATA INFO")
print("=" * 60)
df_filtered.info()


DATA INFO
<class 'pandas.core.frame.DataFrame'>
Index: 111 entries, 403 to 9012
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   HSMV_Report_Number  111 non-null    int64 
 1   Bike Type           111 non-null    object
 2   Narrative           111 non-null    object
dtypes: int64(1), object(2)
memory usage: 7.5+ KB
