In [2]:
#Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the raw Vehicles table; the exploratory cells below read it as `data`.
vehicles_csv_path = '../data/Vehicles.csv'
data = pd.read_csv(vehicles_csv_path, encoding='utf-8')

# General info

In [None]:
# Print a summary of the vehicles table (columns, non-null counts, dtypes)
# to get a first impression of which attributes contain missing values.
data.info()

# Missing and Unique values

In [None]:
# Number of distinct (non-null) values per column.
for column_name, distinct_count in data.nunique().items():
    print(f"{column_name}: {distinct_count}")

print("------------------------")

# Number of missing values per column.
for column_name, missing_count in data.isna().sum().items():
    print(f"{column_name}: {missing_count}")

# Maximum character length for string attributes of Vehicles csv

In [None]:
# Longest value (in characters) of every string-typed column of vehicles.
# Only columns stored with the generic `object` dtype are inspected.
string_columns = [name for name in data.columns if data[name].dtype == "object"]
for name in string_columns:
    longest = data[name].str.len().max()
    print(f"{name}: " + str(longest))

# Changes to make in the attribute values

In [None]:
# Print the distinct values of each attribute under review. The trailing
# comment on every entry records the clean-up planned for that column.
#
# MAKE and MODEL are not printed here. According to the Chicago PD website,
# if MAKE and/or MODEL are NA they are not considered relevant or important,
# so they can be left as is. The MAKE column has some typos to fix, and the
# MODEL column has two different UNKNOWNs, which is probably a typo.
columns_to_review = [
    "UNIT_NO",              # number of units implicated in the rd_no case (no change)
    "UNIT_TYPE",            # change the NaNs to UNKNOWN
    "LIC_PLATE_STATE",      # rename XX and NaN as UNKNOWN
    "VEHICLE_YEAR",         # years from 2024 up -> UNKNOWN, NA -> UNKNOWN too
    "VEHICLE_DEFECT",       # NaN -> UNKNOWN
    "VEHICLE_TYPE",         # NaN -> UNKNOWN/NA
    "VEHICLE_USE",          # NaN -> UNKNOWN/NA
    "TRAVEL_DIRECTION",     # NaN -> UNKNOWN
    "MANEUVER",             # NaN -> UNKNOWN
    "OCCUPANT_CNT",         # numbers appear right
    "FIRST_CONTACT_POINT",  # NaN -> UNKNOWN
]

for column_name in columns_to_review:
    print(column_name + ":" + str(data[column_name].unique()))
    print("---------------")



#### UNIT_TYPE Analysis:
While we could determine the vehicle type from MAKE and MODEL when UNIT_TYPE is NaN, this field also represents the vehicle's status during the crash (parked, driven, etc.). Therefore, we cannot accurately determine the UNIT_TYPE from other fields alone.

#### LIC_PLATE_STATE Analysis:
The column contains both a placeholder value (XX) and NaN values. Since this is a string column, we should standardize by replacing both with 'UNKNOWN'.

#### VEHICLE_YEAR Analysis:
The column contains NaN values, some values greater than 2024, and a value (9999) that appears to be a typo for 1999; this value has many occurrences (257). We could fix the typo by replacing 9999 with 1999, and set the remaining years greater than 2024 to 'UNKNOWN'.

In [None]:
# Vehicles whose model year lies after the analysis year.
# NaN never satisfies `>`, so rows with a missing year are excluded by the
# comparison itself and no extra notnull() mask is needed.
MAX_VALID_VEHICLE_YEAR = 2024
invalid_year_rows = data[data['VEHICLE_YEAR'] > MAX_VALID_VEHICLE_YEAR]
print("total entries with wrong year date: " + str(len(invalid_year_rows)))

# Distinct out-of-range years (numpy array).
invalid_years = invalid_year_rows["VEHICLE_YEAR"].unique()
print("unique entries with wrong year date: " + str(invalid_years.size))

# Optional: number of rows per out-of-range year (uncomment to inspect).
# Kept as a `#` comment instead of a triple-quoted string, because a bare
# string left as the last expression of a cell is echoed as the cell output.
# print(invalid_year_rows["VEHICLE_YEAR"].value_counts())

#### VEHICLE_DEFECT Analysis:
For vehicle defect we already have a label for UNKNOWN thus we can replace NAN's with this label.

#### OCCUPANT_CNT Analysis:
Very few vehicles are carrying more than 6 persons during the accident.

In [None]:
# Vehicles reporting more than 6 occupants, the threshold this analysis uses
# for an unusually full vehicle.
# NaN never satisfies `>`, so rows with a missing count are excluded by the
# comparison itself and no extra notnull() mask is needed.
USUAL_MAX_OCCUPANTS = 6
crowded_rows = data[data['OCCUPANT_CNT'] > USUAL_MAX_OCCUPANTS]
print("total entries with more than usual ppl: " + str(len(crowded_rows)))

# Distinct occupant counts above the threshold (numpy array).
crowded_counts = crowded_rows["OCCUPANT_CNT"].unique()
print("unique entries with more than usual ppl: " + str(crowded_counts.size))

# Optional: number of rows per occupant count (uncomment to inspect).
# Kept as a `#` comment instead of a triple-quoted string, because a bare
# string left as the last expression of a cell is echoed as the cell output.
# print(crowded_rows["OCCUPANT_CNT"].value_counts())


# Check for semantic inconsistencies on vehicle make and model
There are some typos in the names of makes and models.

In [None]:
# Keep only the rows where both MAKE and MODEL are present; the typo search
# below compares the values as strings, so missing entries are dropped first.
df_cleaned = data.dropna(subset=['MAKE','MODEL'])
# NOTE(review): third-party dependency imported mid-notebook; consider moving
# it to the imports cell at the top so a fresh kernel fails early if missing.
import Levenshtein

def find_similar_words(word_list, threshold=2):
    """Return every pair of entries whose edit distance is within `threshold`.

    Each unordered pair of positions in `word_list` is compared exactly once
    using `Levenshtein.distance`, so the cost grows quadratically with the
    number of entries.

    Parameters
    ----------
    word_list : sequence of str
        Words to compare; must support `len`, iteration and slicing
        (e.g. a list or the array returned by `Series.unique()`).
    threshold : int, default 2
        Largest edit distance for which two words still count as similar.

    Returns
    -------
    list of tuple
        `(earlier_word, later_word)` pairs, ordered by the position of the
        earlier word and then by the position of the later word.
    """
    return [
        (first, second)
        for position, first in enumerate(word_list)
        for second in word_list[position + 1:]
        if Levenshtein.distance(first, second) <= threshold
    ]

# Pass 1: every pair of distinct MAKE values within the default edit distance.
make_values = df_cleaned["MAKE"].unique()
make_pairs = find_similar_words(make_values)
print(make_pairs)

# Pass 2: the same search restricted to makes longer than MIN_MAKE_LENGTH
# characters (presumably because a distance of 2 is only a meaningful typo
# signal on longer names -- TODO confirm the choice of threshold).
# `.unique()` already de-duplicates, so no whole-frame drop_duplicates() is
# needed before it.
MIN_MAKE_LENGTH = 8
is_long_make = df_cleaned['MAKE'].str.len() > MIN_MAKE_LENGTH
long_make_values = df_cleaned.loc[is_long_make, 'MAKE'].unique()
long_make_pairs = find_similar_words(long_make_values)
print(long_make_pairs)

# Pairs noted from the output that look like the same make spelled twice:
#('NEW HOLLAND, DIV. OF SPERRY NEW HOLLAND', 'NEW HOLLAND, (DIV. OF SPERRY NEW HOLLAND)')
#('AMC (LAWN & GARDEN TRACTORS BY AMERICAN MOTORS)', 'AMC (LAWN & GARDEN TRACTORS BY AMMERICAN MOTORS)')
#('ROLLS ROYCE', 'ROLLS-ROYCE')

In [None]:
# Pass 1: every pair of distinct MODEL values within the default edit distance.
model_values = df_cleaned["MODEL"].unique()
model_pairs = find_similar_words(model_values)
print(model_pairs)

# Pass 2: the same search restricted to models longer than MIN_MODEL_LENGTH
# characters (presumably because a distance of 2 is only a meaningful typo
# signal on longer names -- TODO confirm the choice of threshold).
# `.unique()` already de-duplicates, so no whole-frame drop_duplicates() is
# needed before it.
MIN_MODEL_LENGTH = 9
is_long_model = df_cleaned['MODEL'].str.len() > MIN_MODEL_LENGTH
long_model_values = df_cleaned.loc[is_long_model, 'MODEL'].unique()
long_model_pairs = find_similar_words(long_model_values)
print(long_model_pairs)

# Pair noted from the output that looks like a typo:
#('UNKNOWN', 'UNKOWN')


# Check for crash dates in the future 
There are no dates in the future in regards to the crash date.

In [None]:
# Convert the 'Date' column to datetime format
data['CRASH_DATE'] = pd.to_datetime(data['CRASH_DATE'])

# Filter rows where the date is after 01/01/2024 12:00 AM
filtered_df = data[data['CRASH_DATE'] >= pd.to_datetime('2020-01-01 00:00:00')]['CRASH_DATE']

print(filtered_df)

# Visualizations

#### Showing how the vehicle type and the unit type stack up
#### Most of the accidents happened during the driving of the vehicles

In [None]:
# From here on `data` refers to the processed vehicles table, which replaces
# the raw table loaded at the top of the notebook.
data = pd.read_csv('../data/Vehicles_Processed.csv', encoding='utf-8')

# One bar per UNIT_TYPE category, bar height = number of rows.
unit_type_ax = sns.countplot(data=data, x="UNIT_TYPE")
# Shrink the tick labels on both axes.
unit_type_ax.tick_params(axis='both', which='major', labelsize=5)
plt.show()

#### The main directions are the most prevalent in the data of the vehicles

In [None]:
# One bar per TRAVEL_DIRECTION category, bar height = number of rows.
direction_ax = sns.countplot(data=data, x="TRAVEL_DIRECTION")
# Shrink the tick labels on both axes.
direction_ax.tick_params(axis='both', which='major', labelsize=5)
plt.show()

#### Total, roof and others appear to be the least reported first contact points; this can be expected from a crashed car, while the most common are the front parts of the vehicle.

In [None]:
# One bar per FIRST_CONTACT_POINT category, bar height = number of rows.
contact_point_ax = sns.countplot(data=data, x="FIRST_CONTACT_POINT")
# Shrink the tick labels on both axes (smaller than in the other plots).
contact_point_ax.tick_params(axis='both', which='major', labelsize=4)
plt.show()

# Changes to make in the attribute values
1. UNIT_TYPE -> change nan’s to UNKNOWN
2. LIC_PLATE_STATE -> change both nan’s and XX to UNKNOWN
3. MAKE -> has duplicate makes caused by typos in their names; fix them by replacing each misspelled name with the correct one
4. MODEL -> has both UNKNOWN and the misspelling UNKOWN; fix the typo
5. VEHICLE_YEAR -> years above 2024 rename to UNKNOWN, nan’s also to UNKNOWN (BUT the year 9999 appears as a typo for 1999, what to do? there’s nothing on their website about it)
6. VEHICLE_DEFECT -> nan’s to UNKNOWN
7. VEHICLE_TYPE -> nan’s to UNKNOWN/NA
8. VEHICLE_USE -> nan’s to UNKNOWN/NA
9. TRAVEL_DIRECTION -> nan’s to UNKNOWN
10. MANEUVER -> nan’s to UNKNOWN
11. FIRST_CONTACT_POINT -> nan’s to UNKNOWN
