In [7]:
#imports
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score,mean_squared_error,mean_absolute_error
from sklearn.utils import class_weight
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTENC


In [2]:
df = pd.read_csv("Transformed_Accidents.csv")
df = df.dropna()
arr = df.to_numpy(dtype= int)
labels = arr.transpose()[0] #getting the classes
features = arr.transpose()[1:] #getting the features and excluding the classes
features = features.transpose()

In [3]:
print(np.bincount(labels))

[     0   6826  83194 599129]


In [5]:
#this is for creating synthethic samples for minority classes
smote_nc = SMOTENC(categorical_features= [1,2,3], random_state=0)
x_resampled , y_resampled = smote_nc.fit_resample(features,labels)

In [None]:
print(x_resampled.shape)
print(y_resampled.shape)

In [13]:
#his is cross validating train and test data
kf = GroupKFold(n_splits= 10)
for train_index, test_index in kf.split(x_resampled,labels):
    x_train,x_test = x_resampled[train_index], x_resampled[test_index]
    y_train, y_test = y_resampled[train_index], y_resampled[test_index]


ValueError: Found input variables with inconsistent numbers of samples: [1797387, 689149]

In [21]:
kf = StratifiedKFold(n_splits= 100)
for train_index, test_index in kf.split(features,labels):
    x_train,x_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

print(np.bincount(y_train))
print(np.bincount(y_test))


[     0   6783  82760 597122]
[   0   69  836 6031]
{1: 33.74441004471964, 2: 2.765687932978895, 3: 0.3833192100330139}


In [None]:
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes = np.unique(y_train), y =y_train)
class_weights = dict(zip(np.unique(y_train), class_weights))
print(class_weights)

In [22]:
modelTree = DecisionTreeClassifier(criterion= "entropy", splitter = "random", max_depth= 57, class_weight = class_weights)
sampler = RandomOverSampler(random_state=0)
model = make_pipeline(sampler, modelTree)
model.fit(X=x_train,y=y_train)
predictions = model.predict(X=x_test)


In [None]:
model = DecisionTreeClassifier(criterion= "entropy", splitter = "random", max_depth= 57, class_weight = class_weights)
model.fit(X=x_train,y=y_train)
predictions = model.predict(X=x_test)


In [23]:
accuracy = accuracy_score(y_test,predictions)
print("---------------------------")
print(f'Accuracy {accuracy:.4}')
precision = precision_score(y_test, predictions, average="weighted")
recall = recall_score(y_test,predictions,average="weighted")
f1 = f1_score(y_test,predictions, average= "weighted")
print(f'Precision: {precision:.4}')
print(f'Recall: {recall:.4}')
print(f'F1 Score: {f1:.4}')
print('Mean squared error = {:5.2f}'.format(mean_squared_error(predictions,y_test)))
print('Mean absolute error =  {:5.2f}'.format(mean_absolute_error(predictions,y_test)))

---------------------------
Accuracy 0.3668
Precision: 0.7987
Recall: 0.3668
F1 Score: 0.4726
Mean squared error =  1.31
Mean absolute error =   0.86
