# 1: Load data

In [14]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score

In [2]:
# Both raw files are parsed with the same options: the first column
# (position 0) is read as text and the files are decoded as latin1.
read_options = {'dtype': {0: str}, 'encoding': 'latin1'}

acc_data = pd.read_csv('../Data/traffic_data/Accident_Information.csv', **read_options)
veh_data = pd.read_csv('../Data/traffic_data/Vehicle_Information.csv', **read_options)

# 2: Preprocess data

In [3]:
%%writefile wrangle_data.py
import pandas as pd
import numpy as np

# Display setting: show every column when a DataFrame is rendered.
# NOTE: this runs when the module is imported, so the option is changed for
# the importing session as a side effect, not just for this module.
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)


def wrangle_data(acc_data, veh_data, output_path='../Data/traffic_data/cleaned_data.csv'):
    """Merge accident and vehicle records and keep only complete rows of the model columns.

    The two frames are inner-joined on 'Accident_Index', reduced to the columns
    used for modelling, and every row that contains a NaN or the placeholder
    string 'Data missing or out of range' in any of those columns is dropped.

    Parameters
    ----------
    acc_data : pandas.DataFrame
        Accident-level records; must contain 'Accident_Index' and the
        accident-level columns listed in ``model_columns`` below.
    veh_data : pandas.DataFrame
        Vehicle-level records; must contain 'Accident_Index' and
        'Vehicle_Manoeuvre'.
    output_path : str or None, optional
        Where the cleaned frame is written as CSV (without the index).
        Defaults to the original hard-coded location. Pass ``None`` to skip
        writing, e.g. when the data directory does not exist.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows, keeping the row labels produced by the merge.

    Raises
    ------
    KeyError
        If a required column is missing from the merged frame.
    """
    model_columns = [
        'Accident_Index', 'Accident_Severity', 'Speed_limit', 'Junction_Detail', 'Junction_Control',
        'Weather_Conditions', 'Road_Surface_Conditions', 'Carriageway_Hazards', 'Vehicle_Manoeuvre',
    ]

    merged_data = pd.merge(acc_data, veh_data, on='Accident_Index')

    # Non-mutating chain: avoids an in-place replace on a frame derived from a
    # column selection, which can trigger pandas' chained-assignment warnings.
    data_cut_cleaned = (
        merged_data[model_columns]
        .replace('Data missing or out of range', np.nan)
        .dropna(how='any')
    )

    if output_path is not None:
        data_cut_cleaned.to_csv(output_path, index=False)

    return data_cut_cleaned


Overwriting wrangle_data.py


In [4]:
%%writefile preprocess_data.py
import pandas as pd

# Display setting: show every column when a DataFrame is rendered.
# NOTE: applied when this module is imported, so it affects the importing session too.
pd.set_option('display.max_columns', None)


def preprocess(cleaned_data, output_path='../Data/traffic_data/encoded_data.csv'):
    """One-hot encode the categorical columns and rescale the speed limit to [-1, 1].

    Parameters
    ----------
    cleaned_data : pandas.DataFrame
        Frame containing 'Accident_Index', 'Speed_limit' and the six
        categorical columns listed in ``categorical_columns`` below. Any other
        column (e.g. 'Accident_Severity') is passed through unchanged.
        The input frame is not modified.
    output_path : str or None, optional
        Where the encoded frame is written as CSV (without the index).
        Defaults to the original hard-coded location. Pass ``None`` to skip
        writing.

    Returns
    -------
    pandas.DataFrame
        The encoded frame: one integer 0/1 column per category value, plus
        'column_normalized' holding the min-max scaled speed limit.
        'Speed_limit' and 'Accident_Index' are removed.

    Raises
    ------
    KeyError
        If a required column is missing from ``cleaned_data``.
    """
    categorical_columns = [
        'Junction_Detail', 'Junction_Control', 'Weather_Conditions',
        'Road_Surface_Conditions', 'Carriageway_Hazards', 'Vehicle_Manoeuvre',
    ]
    data_df_encoded = pd.get_dummies(cleaned_data, dtype=int, columns=categorical_columns)

    # Min-max scale Speed_limit onto [-1, 1].
    speed_limit = data_df_encoded['Speed_limit']
    lowest = speed_limit.min()
    spread = speed_limit.max() - lowest
    if spread == 0:
        # Every value equals the minimum. The plain formula would compute 0/0
        # and fill the column with NaN; map the constant to the lower bound
        # instead, which is where the formula places the minimum.
        normalized = pd.Series(-1.0, index=speed_limit.index)
    else:
        normalized = -1 + ((speed_limit - lowest) * 2) / spread
    data_df_encoded['column_normalized'] = normalized

    # The raw speed limit is replaced by its scaled version, and the
    # accident identifier is not a feature.
    data_df_encoded = data_df_encoded.drop(['Speed_limit', 'Accident_Index'], axis=1)

    # save the encoded data
    if output_path is not None:
        data_df_encoded.to_csv(output_path, index=False)

    return data_df_encoded


Overwriting preprocess_data.py


In [5]:
import wrangle_data as wad
import preprocess_data as ppd

# Stage 1: merge the two raw tables and drop incomplete rows.
cleaned_data = wad.wrangle_data(acc_data=acc_data, veh_data=veh_data)

# Stage 2: encode and scale the cleaned rows for model training.
preprocessed_data = ppd.preprocess(cleaned_data=cleaned_data)

In [6]:
# Inspect the result of the wrangling step (merged, filtered, incomplete rows removed).
cleaned_data

Unnamed: 0,Accident_Index,Accident_Severity,Speed_limit,Junction_Detail,Junction_Control,Weather_Conditions,Road_Surface_Conditions,Carriageway_Hazards,Vehicle_Manoeuvre
375,200501BS70652,Slight,30.0,T or staggered junction,Give way or uncontrolled,Fine + high winds,Wet or damp,Other object on road,Turning right
376,200501BS70652,Slight,30.0,T or staggered junction,Give way or uncontrolled,Fine + high winds,Wet or damp,Other object on road,Going ahead other
377,200501BS70653,Slight,30.0,T or staggered junction,Give way or uncontrolled,Fine no high winds,Dry,Other object on road,Going ahead other
588,200501CP00220,Slight,30.0,Crossroads,Auto traffic signal,Fine no high winds,Dry,Pedestrian in carriageway - not injured,Going ahead other
589,200501CP00220,Slight,30.0,Crossroads,Auto traffic signal,Fine no high winds,Dry,Pedestrian in carriageway - not injured,Turning left
...,...,...,...,...,...,...,...,...,...
2057967,2016981109316,Slight,30.0,T or staggered junction,Stop sign,Fine no high winds,Dry,Other object on road,Parked
2058005,2016981117916,Slight,20.0,T or staggered junction,Give way or uncontrolled,Fine no high winds,Dry,Pedestrian in carriageway - not injured,Turning right
2058052,2016982106316,Slight,60.0,T or staggered junction,Give way or uncontrolled,Fine no high winds,Dry,Other object on road,Going ahead right-hand bend
2058053,2016982106316,Slight,60.0,T or staggered junction,Give way or uncontrolled,Fine no high winds,Dry,Other object on road,Going ahead other


In [7]:
# Inspect the encoded frame: one 0/1 column per category value plus the scaled speed limit.
preprocessed_data

Unnamed: 0,Accident_Severity,Junction_Detail_Crossroads,Junction_Detail_Mini-roundabout,Junction_Detail_More than 4 arms (not roundabout),Junction_Detail_Not at junction or within 20 metres,Junction_Detail_Other junction,Junction_Detail_Private drive or entrance,Junction_Detail_Roundabout,Junction_Detail_Slip road,Junction_Detail_T or staggered junction,Junction_Control_Authorised person,Junction_Control_Auto traffic signal,Junction_Control_Give way or uncontrolled,Junction_Control_Not at junction or within 20 metres,Junction_Control_Stop sign,Weather_Conditions_Fine + high winds,Weather_Conditions_Fine no high winds,Weather_Conditions_Fog or mist,Weather_Conditions_Other,Weather_Conditions_Raining + high winds,Weather_Conditions_Raining no high winds,Weather_Conditions_Snowing + high winds,Weather_Conditions_Snowing no high winds,Weather_Conditions_Unknown,Road_Surface_Conditions_Dry,Road_Surface_Conditions_Flood over 3cm. deep,Road_Surface_Conditions_Frost or ice,Road_Surface_Conditions_Snow,Road_Surface_Conditions_Wet or damp,Carriageway_Hazards_Any animal in carriageway (except ridden horse),Carriageway_Hazards_Other object on road,Carriageway_Hazards_Pedestrian in carriageway - not injured,Carriageway_Hazards_Previous accident,Carriageway_Hazards_Vehicle load on road,Vehicle_Manoeuvre_Changing lane to left,Vehicle_Manoeuvre_Changing lane to right,Vehicle_Manoeuvre_Going ahead left-hand bend,Vehicle_Manoeuvre_Going ahead other,Vehicle_Manoeuvre_Going ahead right-hand bend,Vehicle_Manoeuvre_Moving off,Vehicle_Manoeuvre_Overtaking - nearside,Vehicle_Manoeuvre_Overtaking moving vehicle - offside,Vehicle_Manoeuvre_Overtaking static vehicle - offside,Vehicle_Manoeuvre_Parked,Vehicle_Manoeuvre_Reversing,Vehicle_Manoeuvre_Slowing or stopping,Vehicle_Manoeuvre_Turning left,Vehicle_Manoeuvre_Turning right,Vehicle_Manoeuvre_U-turn,Vehicle_Manoeuvre_Waiting to go - held up,Vehicle_Manoeuvre_Waiting to turn left,Vehicle_Manoeuvre_Waiting to turn right,column_normalized
375,Slight,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,-0.6
376,Slight,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.6
377,Slight,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.6
588,Slight,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.6
589,Slight,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,-0.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2057967,Slight,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.6
2058005,Slight,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,-1.0
2058052,Slight,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.6
2058053,Slight,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.6


# 3: Training

In [8]:
%%writefile train.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Read the encoded dataset written by the preprocessing step.
# This runs at import time, so the CSV must exist before `import train`.
data = pd.read_csv('../Data/traffic_data/encoded_data.csv')

# Target is the severity label; every remaining column is a feature.
y = data['Accident_Severity']
X = data.drop(columns=['Accident_Severity'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


# Function for comparing different algorithms to find the best fit
def train_all():
    """Train and evaluate each candidate classifier in turn.

    Runs the random forest, decision tree, KNN and SVC trainers in that
    order. Each trainer prints its own metrics; nothing is returned.
    """
    trainers = (train_random_forest, train_decision_tree, train_knn, train_svc)
    for run_trainer in trainers:
        run_trainer()


def train_random_forest():
    """Fit a 50-tree random forest on the training split and print its test metrics.

    Reads the module-level X_train / y_train / X_test / y_test split.
    Prints the accuracy and the weighted F1 score on the test split;
    returns nothing.
    """
    # f1_score is not among this module's top-level imports (only
    # accuracy_score is), so calling it raised NameError. Import it here.
    from sklearn.metrics import f1_score

    # Train a random forest classifier
    clf = RandomForestClassifier(n_estimators=50, random_state=42)
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f'Random Forest Accuracy: {accuracy:.2f}')
    print(f'Random Forest F1: {f1:.2f}')


def train_decision_tree():
    """Fit a decision tree on the training split and print its test metrics.

    Reads the module-level X_train / y_train / X_test / y_test split.
    Prints the accuracy and the weighted F1 score on the test split;
    returns nothing.
    """
    # f1_score is not among this module's top-level imports (only
    # accuracy_score is), so calling it raised NameError. Import it here.
    from sklearn.metrics import f1_score

    # Train a decision tree classifier
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f'Decision Tree Accuracy: {accuracy:.2f}')
    print(f'Decision Tree F1: {f1:.2f}')


def train_knn():
    """Fit a 15-nearest-neighbours classifier on the training split and print its test metrics.

    Reads the module-level X_train / y_train / X_test / y_test split.
    Prints the accuracy and the weighted F1 score on the test split;
    returns nothing.
    """
    # f1_score is not among this module's top-level imports (only
    # accuracy_score is), so calling it raised NameError. Import it here.
    from sklearn.metrics import f1_score

    # Train a KNN classifier
    clf = KNeighborsClassifier(n_neighbors=15)
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f'KNN Accuracy: {accuracy:.2f}')
    print(f'KNN F1: {f1:.2f}')


def train_svc():
    """Fit a linear-kernel SVC on the training split and print its test metrics.

    Reads the module-level X_train / y_train / X_test / y_test split.
    Prints the accuracy and the weighted F1 score on the test split;
    returns nothing.
    """
    # f1_score is not among this module's top-level imports (only
    # accuracy_score is), so calling it raised NameError. Import it here.
    from sklearn.metrics import f1_score

    # Train an SVC classifier
    clf = SVC(kernel='linear')
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f'SVC Accuracy: {accuracy:.2f}')
    print(f'SVC F1: {f1:.2f}')


Overwriting train.py


In [12]:
import train

train.train_all()

Random Forest Accuracy: 0.84
Decision Tree Accuracy: 0.82
KNN Accuracy: 0.84
SVC Accuracy: 0.85


# Choice and selection of the best fit model

We have chosen to compare the following algorithms to find the best model:
- Random Forest
- Decision Tree
- KNN
- SVC

When comparing the above accuracies, we can conclude that they all have relatively high scores. Even though SVC achieved the highest accuracy (0.85), its training time grows between O(n^2) and O(n^3), with n being the size of the training dataset. This makes it impractical for our large dataset.

Since the Random Forest algorithm is exceptionally good at handling large data sets with higher dimensionality, as well as being able to manage missing values and maintain accuracy even when a large proportion of the data are missing, we have made the Random Forest our choice of algorithm.
In our tests, the Random Forest scored an accuracy of 0.84 when removing rows containing NaN or 'Data missing or out of range' values, and scored an accuracy of 0.86 when leaving those rows in the data set.

# Overfitting and how you can spot it

- Poor Performance on New/Unseen Data

- Complex Models with High Variance: Overfitting is more likely with very complex models

- Learning Curves: Plotting learning curves can help identify overfitting

- Highly Fluctuating Metrics: If evaluation metrics on the validation set fluctuate significantly with minor changes to the data or the model, this can indicate overfitting

# Hyperparameters

We have adjusted the following hyperparameters:
- Neighbors in the KNN algorithm
- Number of estimators on the Random Forest algorithm

The Neighbors hyperparameter was adjusted because it often has an effect on the accuracy of the model. In our case, we tried values between 1 and 20, and concluded that the accuracy plateaued after around 15, which became the number of neighbors we chose.

Increasing the number of estimators in a Random Forest model can improve the model's accuracy up to a certain point, as the ensemble method can capture more aspects of the data and make more robust predictions. In our tests, the accuracy plateaued after around 10 estimators, but we chose 50 for good measure.

# Measuring quality using F1 score. Accuracy, precision and recall.