In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw traffic records and preview the first rows before any cleaning.
raw_data = pd.read_csv('5_vehicle_traffic_data.csv')
print(raw_data.head())

# 1. Data cleaning and preprocessing
# Accepted inclusive (lower, upper) range for each numeric column. A row is
# kept only if every one of these columns falls inside its range.
VALID_RANGES = {
    'Age': (18, 70),
    'Speed': (0, 200),
    'TravelDistance': (1, 1000),
    'TravelTime': (1, 1440),
}

# Drop rows with a missing value in any column first (an integer cast cannot
# represent NaN), then set all numeric dtypes in a single astype call.
typed_data = raw_data.dropna().astype({
    'Age': int,
    'Speed': float,
    'TravelDistance': float,
    'TravelTime': float,
})

# Combine the per-column range checks into one mask that is True only where
# every column is within its accepted range.
in_range = pd.Series(True, index=typed_data.index)
for column, (lower, upper) in VALID_RANGES.items():
    in_range &= typed_data[column].between(lower, upper)

# .copy() makes `data` an independent frame instead of a selection taken from
# `typed_data`, so columns added to it later are assigned unambiguously.
data = typed_data[in_range].copy()

# Persist the cleaned records without the row index.
data.to_csv('5_vehicle_traffic_data_cleaned.csv', index=False)
print("Data cleaning complete. Saved as '5_vehicle_traffic_data_cleaned.csv'")

   VehicleID DriverName  Age Gender  Speed  TravelDistance   TravelTime  \
0          1   Driver_1   62   Male     17             242   854.117647   
1          2   Driver_2   65   Male    128             438   205.312500   
2          3   Driver_3   18   Male    167             792   284.550898   
3          4   Driver_4   21   Male     38             999  1577.368421   
4          5   Driver_5   21   Male    193             364   113.160622   

  TrafficEvent  
0       Normal  
1     Accident  
2    Breakdown  
3  Traffic Jam  
4    Breakdown  
Data cleaning complete. Saved as '5_vehicle_traffic_data_cleaned.csv'


In [2]:
# 2. Sanity check: collect every row that breaks at least one accepted range.
# Each mask below is True where that column's value is inside its inclusive
# range; a row is flagged as soon as any single mask is False.
age_ok = data['Age'].between(18, 70)
speed_ok = data['Speed'].between(0, 200)
distance_ok = data['TravelDistance'].between(1, 1000)
time_ok = data['TravelTime'].between(1, 1440)

violates_any_range = ~age_ok | ~speed_ok | ~distance_ok | ~time_ok
unreasonable_data = data[violates_any_range]
print('Unreasonable Data:\n', unreasonable_data)

Unreasonable Data:
 Empty DataFrame
Columns: [VehicleID, DriverName, Age, Gender, Speed, TravelDistance, TravelTime, TrafficEvent]
Index: []


In [3]:
# 3. Data statistics
# Number of records for each traffic event type.
traffic_event_counts = data['TrafficEvent'].value_counts()
print('Occurrence count by traffic event type:\n', traffic_event_counts)

# Mean speed, travel distance and travel time for each gender.
gender_stats = data.groupby('Gender', observed=False).agg({
    'Speed': 'mean',
    'TravelDistance': 'mean',
    'TravelTime': 'mean',
})
print('Average Speed, Travel Distance, and Travel Time by Gender:\n', gender_stats)

# Age groups are left-closed intervals because right=False:
# [18, 26), [26, 36), [36, 46), [46, 56), [56, 66), [66, inf).
# The last label is '66+' (not '65+') because age 65 already belongs to the
# '56-65' group; every label now matches the interval it names.
age_bins = [18, 26, 36, 46, 56, 66, np.inf]
age_labels = ['18-25', '26-35', '36-45', '46-55', '56-65', '66+']
data['AgeGroup'] = pd.cut(data['Age'], bins=age_bins, labels=age_labels, right=False)

# value_counts() lists the groups by descending frequency, not by age order.
age_group_counts = data['AgeGroup'].value_counts()
print("Driver count by age group:\n", age_group_counts)

Occurrence count by traffic event type:
 TrafficEvent
Accident       229
Normal         227
Traffic Jam    218
Breakdown      204
Name: count, dtype: int64
Average Speed, Travel Distance, and Travel Time by Gender:
              Speed  TravelDistance  TravelTime
Gender                                        
Female  105.130820      487.476718  359.628355
Male    108.918033      477.932084  359.134097
Driver count by age group:
 AgeGroup
46-55    172
36-45    169
56-65    159
26-35    154
18-25    151
65+       73
Name: count, dtype: int64
