In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import numpy as np

In [2]:
!pip install kmodes --quiet

In [3]:
from kmodes.kmodes import KModes

In [14]:
ksi_df = pd.read_csv('KSI.csv')

In [15]:
df = ksi_df.iloc[ksi_df['ACCNUM'].drop_duplicates().index]

In [16]:
df = df[df['INVTYPE'].str.contains('Driver')]

In [17]:
def actc(x):
    if x in ('Speed Too Fast For Condition', 'Exceeding Speed Limit'):
        return 'Speeding'
    return x

def condc(x):
    if x in ('Ability Impaired, Alcohol', 'Ability Impaired, Alcohol Over .08', 'Had Been Drinking', 'Ability Impaired, Drugs', 'Ability Impaired, Alcohol Over .80'):
        return 'Ability Impaired (Drugs/Alcohol)'
    return x

def tc(x):
    if x in ('Traffic Controller', 'Police Control'):
        return 'Controller present'
    if x in ('Stop Sign', 'Yield Sign'):
        return 'Stop/Yield Sign'
    return x
def agec(x):
    if x in ('0 to 4', '5 to 9', '10 to 14'):
        return 'Children'
    if x in ('15 to 19', '20 to 24'):
        return 'Young Adult'
    if x in ('25 to 29', '30 to 34', '35 to 39', '40 to 44', '45 to 49', '50 to 54', '55 to 59'):
        return 'Adult'
    if x in ('60 to 64', '65 to 69', '70 to 74'):
        return 'Senior'
    if x != 'Other':
        return 'Super Senior'
    return x

In [18]:
df['TRAFFCTL'] = df['TRAFFCTL'].apply(tc)
df['DRIVACT'] = df['DRIVACT'].apply(actc)
df['DRIVCOND'] = df['DRIVCOND'].apply(condc)
df['INVAGE'] = df['INVAGE'].apply(agec)

In [19]:
cols = ['MANOEUVER', 'DRIVACT', 'DRIVCOND', 'INVAGE','TRAFFCTL', 'SPEEDING', 'ALCOHOL', 'REDLIGHT', 'AG_DRIV']

In [20]:
cluster_cols = df[cols]

In [21]:
kmode = KModes(n_clusters= 10, init = "random", n_init = 5, verbose=0, random_state = 1)
clusters = kmode.fit_predict(cluster_cols)

In [22]:
cluster_cols.insert(0, "Cluster", clusters, True)

In [23]:
#Number of accidents in each cluster
k = range(0,10)
for x in k:
  cluster_cols[cluster_cols['Cluster'] == x]
  print(cluster_cols[cluster_cols['Cluster'] == x].shape)

(841, 10)
(275, 10)
(1186, 10)
(578, 10)
(339, 10)
(259, 10)
(172, 10)
(100, 10)
(47, 10)
(269, 10)


In [24]:
cluster_cols[cluster_cols['Cluster'] == 0].apply(lambda x: x.mode())

Unnamed: 0,Cluster,MANOEUVER,DRIVACT,DRIVCOND,INVAGE,TRAFFCTL,SPEEDING,ALCOHOL,REDLIGHT,AG_DRIV
0,0,Going Ahead,Driving Properly,Normal,Adult,Traffic Signal,<Null>,<Null>,<Null>,<Null>


In [25]:
cluster_cols[cluster_cols['Cluster'] == 1].apply(lambda x: x.mode())

Unnamed: 0,Cluster,MANOEUVER,DRIVACT,DRIVCOND,INVAGE,TRAFFCTL,SPEEDING,ALCOHOL,REDLIGHT,AG_DRIV
0,1,Going Ahead,Disobeyed Traffic Control,Normal,Adult,Traffic Signal,<Null>,<Null>,Yes,Yes


In [26]:
cluster_cols[cluster_cols['Cluster'] == 2].apply(lambda x: x.mode())

Unnamed: 0,Cluster,MANOEUVER,DRIVACT,DRIVCOND,INVAGE,TRAFFCTL,SPEEDING,ALCOHOL,REDLIGHT,AG_DRIV
0,2,Going Ahead,Driving Properly,Normal,Adult,No Control,<Null>,<Null>,<Null>,<Null>


In [27]:
cluster_cols[cluster_cols['Cluster'] == 3].apply(lambda x: x.mode())

Unnamed: 0,Cluster,MANOEUVER,DRIVACT,DRIVCOND,INVAGE,TRAFFCTL,SPEEDING,ALCOHOL,REDLIGHT,AG_DRIV
0,3,Turning Left,Failed to Yield Right of Way,Inattentive,Adult,Traffic Signal,<Null>,<Null>,<Null>,Yes


In [28]:
cluster_cols[cluster_cols['Cluster'] == 4].apply(lambda x: x.mode())

Unnamed: 0,Cluster,MANOEUVER,DRIVACT,DRIVCOND,INVAGE,TRAFFCTL,SPEEDING,ALCOHOL,REDLIGHT,AG_DRIV
0,4,Going Ahead,Failed to Yield Right of Way,Normal,Adult,No Control,<Null>,<Null>,<Null>,Yes


In [29]:
cluster_cols[cluster_cols['Cluster'] == 5].apply(lambda x: x.mode())

Unnamed: 0,Cluster,MANOEUVER,DRIVACT,DRIVCOND,INVAGE,TRAFFCTL,SPEEDING,ALCOHOL,REDLIGHT,AG_DRIV
0,5,Going Ahead,Speed too Fast For Condition,Normal,Adult,No Control,Yes,<Null>,<Null>,Yes


In [30]:
cluster_cols[cluster_cols['Cluster'] == 6].apply(lambda x: x.mode())

Unnamed: 0,Cluster,MANOEUVER,DRIVACT,DRIVCOND,INVAGE,TRAFFCTL,SPEEDING,ALCOHOL,REDLIGHT,AG_DRIV
0,6,Turning Left,Improper Turn,Inattentive,Super Senior,No Control,<Null>,<Null>,<Null>,<Null>


In [31]:
cluster_cols[cluster_cols['Cluster'] == 7].apply(lambda x: x.mode())

Unnamed: 0,Cluster,MANOEUVER,DRIVACT,DRIVCOND,INVAGE,TRAFFCTL,SPEEDING,ALCOHOL,REDLIGHT,AG_DRIV
0,7,Going Ahead,Speeding,Unknown,Young Adult,No Control,Yes,<Null>,<Null>,Yes


In [32]:
cluster_cols[cluster_cols['Cluster'] == 8].apply(lambda x: x.mode())

Unnamed: 0,Cluster,MANOEUVER,DRIVACT,DRIVCOND,INVAGE,TRAFFCTL,SPEEDING,ALCOHOL,REDLIGHT,AG_DRIV
0,8,Going Ahead,Speeding,Ability Impaired (Drugs/Alcohol),Adult,No Control,Yes,<Null>,<Null>,Yes


In [33]:
cluster_cols[cluster_cols['Cluster'] == 9].apply(lambda x: x.mode())

Unnamed: 0,Cluster,MANOEUVER,DRIVACT,DRIVCOND,INVAGE,TRAFFCTL,SPEEDING,ALCOHOL,REDLIGHT,AG_DRIV
0,9,Turning Left,Failed to Yield Right of Way,Normal,Adult,Traffic Signal,<Null>,<Null>,<Null>,Yes
