In [None]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset under
# /content/drive can be read. force_remount=True asks for the mount to be
# redone even when a previous mount is still present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Read the crash dataset from the mounted Drive folder into a DataFrame.
csv_path = '/content/drive/MyDrive/DaSci/Dataset/non_accident.csv'
data = pd.read_csv(csv_path)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a saved index column — TODO confirm).
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Integer-encode the three categorical condition columns.
# Series.replace only rewrites values that appear as keys in the mapping;
# any category not listed keeps its original value.
condition_codes = {
    'WEATHER_CONDITION': {'CLEAR':1,'RAIN':2,'CLOUDY/OVERCAST':3,'SNOW':4},
    'LIGHTING_CONDITION': {'DAYLIGHT':1,'DARKNESS, LIGHTED ROAD':2,'DARKNESS':3,'DUSK':4,'DAWN':5},
    'ROADWAY_SURFACE_COND': {'DRY':1,'WET':2,'SNOW OR SLUSH':3},
}
for column, codes in condition_codes.items():
    data[column] = data[column].replace(codes)

In [None]:
# Preview the first rows of the encoded frame.
data.head()

Unnamed: 0,WEATHER_CONDITION,LIGHTING_CONDITION,CRASH_WEEKDAY,CRASH_HOUR,CRASH_Month,x,y,ROADWAY_SURFACE_COND,accident
0,1,1,3,7,7,43.0,43.0,1,1
1,1,1,4,16,5,38.0,16.0,1,1
2,1,1,1,8,5,36.0,35.0,1,1
3,1,2,6,1,6,14.0,49.0,1,1
4,3,1,1,16,8,8.0,47.0,1,1


In [None]:
# Reduce the size of the dataset.
both_obs = data

# For convenience, split the frame by label and down-sample each class
# separately (2.3% of the non-accident rows, 20% of the accident rows).
# A fixed random_state makes the samples reproducible after a kernel restart,
# and both masks are built from the same frame (both_obs).
nor_obs = both_obs.loc[both_obs['accident'] == 0].sample(frac=0.023, random_state=42)  # normal observations
acc_obs = both_obs.loc[both_obs['accident'] == 1].sample(frac=0.20, random_state=42)   # accident observations


In [None]:
# Define train: roughly 90% accident rows and 10% non-accident rows.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
# samples the same way (original row labels are kept).
X_train = pd.concat([nor_obs, acc_obs])

# Count the labels once and look both classes up from the same result.
label_counts = X_train['accident'].value_counts()
nb_acc = label_counts.loc[1]
nb_noacc = label_counts.loc[0]
print(nb_acc)
print(nb_noacc)
# Share of accident rows in the training frame.
print(nb_acc/(nb_noacc+nb_acc))

83712
8409
0.9087178819161754


In [None]:
# Feature matrix for the one-class SVM: X_train without the 'accident' label.
# NOTE(review): X_train holds both classes (nor_obs plus acc_obs, mostly
# accidents), so this is not "non-accident only" data — confirm which class
# is meant to be the inlier class.
train_feature = X_train.drop(['accident'], axis=1)

In [None]:
# Display the training features (long frames are rendered truncated).
train_feature

Unnamed: 0,WEATHER_CONDITION,LIGHTING_CONDITION,CRASH_WEEKDAY,CRASH_HOUR,CRASH_Month,x,y,ROADWAY_SURFACE_COND
541250,1,1,5,14,7,19.0,41.0,3
616990,2,1,3,16,8,29.0,59.0,1
511236,4,1,5,10,1,14.0,55.0,1
598011,1,2,3,17,1,27.0,33.0,3
703173,1,2,4,22,8,40.0,58.0,3
...,...,...,...,...,...,...,...,...
392003,3,1,4,11,3,40.0,22.0,1
214123,1,2,4,6,1,37.0,27.0,1
251519,1,5,4,18,12,54.0,19.0,1
197715,1,2,5,20,3,44.0,46.0,1


In [None]:
# Hyperparameters for the One-Class SVM: polynomial kernel, nu=0.1, gamma=0.01.
svm_params = dict(nu=0.1, kernel="poly", gamma=0.01)
oneclass = svm.OneClassSVM(**svm_params)

In [None]:
# Fit the one-class SVM on the feature columns only; the 'accident' label is
# not part of train_feature, so fitting is unsupervised.
oneclass.fit(train_feature)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma=0.01, kernel='poly',
            max_iter=-1, nu=0.1, shrinking=True, tol=0.001, verbose=False)

In [None]:
# Define test.
# Rows already sampled into X_train are removed first, so the model is not
# evaluated on observations it was fitted on (assumes both_obs has unique row
# labels). The split is seeded so it is reproducible.
unseen_obs = both_obs.drop(index=X_train.index)
noUse, test = train_test_split(unseen_obs, test_size=0.1, random_state=42)

# Test observations contain both accidents and non-accidents:
# X_test holds the features, Y_test the true 'accident' labels.
X_test = test.drop(['accident'], axis=1)
Y_test = test['accident']


In [None]:
# Run the fitted model on the test features. OneClassSVM.predict labels each
# row +1 (inlier) or -1 (outlier).
accident_pred = oneclass.predict(X_test)

In [None]:
# Tally how many test rows received each predicted label and print the
# result as (label, count) rows.
unique, counts = np.unique(accident_pred, return_counts=True)
print (np.column_stack((unique, counts)))

[[   -1 14014]
 [    1 64402]]


In [None]:
# Turn the labels and the predictions into DataFrames that share a fresh
# 0..n-1 index, so they can be compared position by position.
# reset_index() keeps the former row labels as an extra column.
Y_test = Y_test.to_frame().reset_index()
accident_pred = pd.DataFrame({'prediction': accident_pred})

In [None]:
## Performance check of the model

# Count the four outcome cells with vectorized boolean masks instead of a
# Python loop that does a chained lookup per row.
# Convention used here: "positive" = non-accident (label 0) predicted +1.
# NOTE(review): the model was fitted on mostly accident rows, so +1 may in
# fact correspond to accidents — confirm the intended mapping.
actual = Y_test['accident'].to_numpy()
predicted = accident_pred['prediction'].to_numpy()

TP = int(((actual == 0) & (predicted == 1)).sum())
FN = int(((actual == 0) & (predicted == -1)).sum())
FP = int(((actual == 1) & (predicted == 1)).sum())
# Every remaining row (any other label/prediction combination) counts as TN.
TN = len(actual) - TP - FN - FP
print (TP,  FN,  FP,  TN)

25818 10807 38584 3207


In [None]:
# Performance metrics derived from the four confusion counts.
n_scored = TP + FN + FP + TN
accuracy = (TN + TP) / n_scored       # share of rows counted as TP or TN
print (accuracy)
sensitivity = TP / (FN + TP)          # TP share among the TP + FN rows
print (sensitivity)
specificity = TN / (FP + TN)          # TN share among the TN + FP rows
print (specificity)

0.3701412976943481
0.7049283276450512
0.07673901079179728


In [None]:
# Define train and test dataset: split the sampled non-accident rows in half.
# The split is seeded so a restart-and-run-all reproduces the same partition.
train, test = train_test_split(nor_obs, test_size=0.5, random_state=42)

In [None]:
# Feature matrix: every column of nor_obs except the 'accident' label.
# NOTE(review): this uses all of nor_obs rather than the `train` half from
# the preceding split — confirm that is intended.
train_feature = nor_obs.drop(columns=['accident'])

In [None]:
# One-class SVM with a polynomial kernel, nu=0.1 and gamma=0.1.
poly_params = {"nu": 0.1, "kernel": "poly", "gamma": 0.1}
clf = svm.OneClassSVM(**poly_params)

In [None]:
# Fit the estimator on the current train_feature frame (features only).
clf.fit(train_feature)

In [None]:
# predict() needs a feature matrix; calling it with no argument raises
# TypeError. Score the held-out `test` half with the label column removed so
# the columns match what clf was fitted on.
y_pred = clf.predict(test.drop(['accident'], axis=1))

In [None]:
# A one-class SVM is trained with observations of a single class only.
# The features are taken from `train` with the 'accident' label removed;
# presumably `train` is the non-accident half from the split above — TODO confirm.

train_feature = train.drop('accident', axis=1)
# Label columns of the non-accident sample (Y_1) and the accident sample (Y_2).
Y_1 = nor_obs['accident']
Y_2 = acc_obs['accident']

In [None]:
# Display the rebuilt training features.
train_feature

In [None]:
# Same display as the previous cell (duplicate; train_feature has not changed in between).
train_feature

In [None]:
# Define train and test dataset: 70/30 split of the full frame.
# Seeded so the partition is the same on every run.
train, test = train_test_split(data, test_size=0.3, random_state=42)

In [None]:
# Reduce the size of the dataset (complexity too high): keep 30% of each part.
# Seeded so the same rows are kept on every run.
train = train.sample(frac=0.3, random_state=42)
test = test.sample(frac=0.3, random_state=42)

In [None]:
# Show how many training rows carry each 'accident' label.
train['accident'].value_counts()

In [None]:
# Split the training frame by label and print the ratio of accident rows
# (label 1) to non-accident rows (label 0).
labels = train['accident']
train_normal = train.loc[labels == 0]
train_outliers = train.loc[labels == 1]
outlier_prop = train_outliers.shape[0] / train_normal.shape[0]
print(outlier_prop)

In [None]:
# Frame composed of accident and non-accident rows.
# `non_accident` is never assigned anywhere in this notebook, so a fresh
# kernel raised NameError here and in the cells that sample from it.
# NOTE(review): aliasing it to `data` (the frame read from non_accident.csv)
# is an assumption — confirm against the original source of this variable.
non_accident = data
non_accident.head()

In [None]:
# Take 25% of the rows; seeded so the same rows are drawn on every run.
OneClass = non_accident.sample(frac=0.25, random_state=42)

In [None]:
# Setting the hyperparameters for the One-Class SVM: only the kernel is
# chosen explicitly; every other parameter keeps its library default.
oneclass = svm.OneClassSVM(kernel='poly')

In [None]:
# Fit on the raw array of every column in OneClass; fit() returns the
# estimator itself, so `model` and `oneclass` refer to the same object.
model = oneclass.fit(OneClass.values)

In [None]:
# Score the training sample with the estimator that was just fitted on it.
# The earlier `clf` was fitted on a different feature frame, so it was the
# wrong model to use here; the input is passed as .values to match the fit call.
y_pred_train = model.predict(OneClass.values)


In [None]:
# Draw two disjoint 25% samples of non_accident. Two independent .sample()
# calls can pick the same rows for both sets; train_test_split guarantees
# no overlap, and the seed makes the draw reproducible.
X_train, X_test = train_test_split(
    non_accident, train_size=0.25, test_size=0.25, random_state=42
)

In [None]:
# Build a fresh polynomial-kernel one-class SVM (nu=0.1, gamma=0.1) and fit
# it on the X_train sample.
svm_settings = dict(nu=0.1, kernel="poly", gamma=0.1)
clf = svm.OneClassSVM(**svm_settings)
clf.fit(X_train)

In [None]:
# Label both samples with the fitted model (+1 inlier / -1 outlier per row).
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)