# Вариант 1.

Пример взят [отсюда](https://www.kaggle.com/code/outofskills/binary-random-forest).

Точность около 55-60%.


In [None]:
# * Разорхивируем данные из архива
import zipfile

with zipfile.ZipFile('/content/drive/MyDrive/Driving Behavior.zip', 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
# * Загружаем данные
import pandas as pd

df_train = pd.read_csv('/content/train_motion_data.csv')
df_test = pd.read_csv('/content/test_motion_data.csv')

In [None]:
# * Глобальные константы
FEATURES = 6

# * Записи, которые удаляются из начала и конца набора данных по причине шума
N_RECORDS = 11

In [None]:
# Разделите образцы по классам
#, чтобы их можно было обрезать до нужной формы
df_train_normal = df_train.loc[df_train['Class'] == 'NORMAL']
df_train_slow = df_train.loc[df_train['Class'] == 'SLOW']
df_train_aggressive = df_train.loc[df_train['Class'] == 'AGGRESSIVE']

df_test_normal = df_test.loc[df_test['Class'] == 'NORMAL']
df_test_slow = df_test.loc[df_test['Class'] == 'SLOW']
df_test_aggressive = df_test.loc[df_test['Class'] == 'AGGRESSIVE']

In [None]:
# Как можно заметить, присутствует некоторый шум
# в начале и конце каждого временного ряда
# Это вызвано переключением классов
# ---------------------------------------------
# Решением было бы просто удалить первую и последнюю N выборок
df_train_slow = df_train_slow.iloc[N_RECORDS:]
df_train_slow = df_train_slow.iloc[:-N_RECORDS]

df_test_slow = df_test_slow.iloc[N_RECORDS:]
df_test_slow = df_test_slow.iloc[:-N_RECORDS]

df_train_normal = df_train_normal.iloc[N_RECORDS:]
df_train_normal = df_train_normal.iloc[:-N_RECORDS]

df_test_normal = df_test_normal.iloc[N_RECORDS:]
df_test_normal = df_test_normal.iloc[:-N_RECORDS]

df_train_aggressive = df_train_aggressive.iloc[N_RECORDS:]
df_train_aggressive = df_train_aggressive.iloc[:-N_RECORDS]

df_test_aggressive = df_test_aggressive.iloc[N_RECORDS:]
df_test_aggressive = df_test_aggressive.iloc[:-N_RECORDS]

In [None]:
# * Проверка обучающих данных на каждой метке
train_slow_samples = df_train_slow.__len__()
train_normal_samples = df_train_normal.__len__()
train_aggressive_samples = df_train_aggressive.__len__()

print('---Train---')
print('Slow samples: ' + str(train_slow_samples))
print('Normal samples: ' + str(train_normal_samples))
print('Aggressive samples: ' + str(train_aggressive_samples))

df_train_slow = df_train_slow.tail(1300)
df_train_normal = df_train_normal.tail(1160)
df_train_aggressive = df_train_aggressive.tail(1080)

---Train---
Slow samples: 1309
Normal samples: 1178
Aggressive samples: 1091


In [None]:
# * Проверка тестовых данных на каждой метке
test_slow_samples = df_test_slow.__len__()
test_normal_samples = df_test_normal.__len__()
test_aggressive_samples = df_test_aggressive.__len__()

print('---Test---')
print('Slow samples: ' + str(test_slow_samples))
print('Normal samples: ' + str(test_normal_samples))
print('Aggressive samples: ' + str(test_aggressive_samples))

df_test_slow = df_test_slow.tail(1240)
df_test_normal = df_test_normal.tail(960)
df_test_aggressive = df_test_aggressive.tail(780)

---Test---
Slow samples: 1251
Normal samples: 975
Aggressive samples: 792


In [None]:
# * Отделение объектов от надписей

# * Особенности
X_train_normal = df_train_normal.iloc[: , :FEATURES]
# X_train_slow = df_train_slow.iloc[: , :FEATURES]
X_train_aggressive = df_train_aggressive.iloc[: , :FEATURES]

X_test_normal = df_test_normal.iloc[: , :FEATURES]
# X_test_slow = df_test_slow.iloc[: , :FEATURES]
X_test_aggressive = df_test_aggressive.iloc[: , :FEATURES]

# * Метки
y_train_normal = df_train_normal.Class
# y_train_slow = df_train_slow.Class
y_train_aggressive = df_train_aggressive.Class

y_test_normal = df_test_normal.Class
# y_test_slow = df_test_slow.Class
y_test_aggressive = df_test_aggressive.Class

In [None]:
# * Объедините все выборки в единый набор данных
X_train = pd.concat([X_train_normal, X_train_aggressive])
y_train = pd.concat([y_train_normal, y_train_aggressive])

X_test = pd.concat([X_test_normal, X_test_aggressive])
y_test = pd.concat([y_test_normal, y_test_aggressive])

In [None]:
# Поскольку алгоритмы ML/DL работают с числами
# метки должны быть сопоставлены с числами
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()
y_train = labelEncoder.fit_transform(y_train)
y_test = labelEncoder.transform(y_test)

In [None]:
# Поскольку набор данных содержит переменные, отличающиеся по масштабу,
# хорошим решением будет его стандартизация
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# * Перетасовываем данные
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train, random_state=42)
X_test, y_test = shuffle(X_test, y_test, random_state=42)

In [None]:
# * Настройка гиперпараметров для классификатора случайных лесов
import numpy as np

n_estimators = [5, 20, 50, 100, 1000] # number of trees in the random forest
max_features = ['auto', 'sqrt'] # number of features in consideration at every split
max_depth = [int(x) for x in np.linspace(10, 120, num = 12)] # maximum number of levels allowed in each decision tree
min_samples_split = [2, 6, 10] # minimum sample number to split a node
min_samples_leaf = [1, 3, 4] # minimum sample number that can be stored in a leaf node
bootstrap = [True, False] # method used to sample data points

random_grid = {'n_estimators': n_estimators,

'max_features': max_features,

'max_depth': max_depth,

'min_samples_split': min_samples_split,

'min_samples_leaf': min_samples_leaf,

'bootstrap': bootstrap}

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [None]:
# * Настройка и запуск поиска наилучших параметров
from sklearn.model_selection import RandomizedSearchCV

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
               n_iter = 100, cv = 5, verbose=2, random_state=35, n_jobs = -1)

In [None]:
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


225 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skl

In [None]:
# * Получаем лучший результат

print ('Random grid: ', random_grid, '\n')
# Best parameters
rf_random.best_params_

Random grid:  {'n_estimators': [5, 20, 50, 100, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120], 'min_samples_split': [2, 6, 10], 'min_samples_leaf': [1, 3, 4], 'bootstrap': [True, False]} 



{'n_estimators': 20,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [None]:
# Перестраиваем модель под лучшие параметры обучения
randmf = RandomForestClassifier(n_estimators = 20, min_samples_split = 10, min_samples_leaf= 4, max_features = 'sqrt', max_depth= 10, bootstrap=True)
randmf.fit( X_train, y_train)

In [None]:
# * Анализируем точность предсказаний
y_pred=randmf.predict(X_test)
# Reverse the label encoding
array_actual = labelEncoder.inverse_transform(y_test)
array_predicted = labelEncoder.inverse_transform(y_pred)
df_predicted = pd.DataFrame( { "Actual": array_actual, "Predicted": array_predicted } )
df_predicted

Unnamed: 0,Actual,Predicted
0,NORMAL,NORMAL
1,AGGRESSIVE,NORMAL
2,NORMAL,NORMAL
3,AGGRESSIVE,NORMAL
4,NORMAL,NORMAL
...,...,...
1735,AGGRESSIVE,AGGRESSIVE
1736,AGGRESSIVE,AGGRESSIVE
1737,NORMAL,NORMAL
1738,AGGRESSIVE,AGGRESSIVE


In [None]:
# * Собираем метрики
from sklearn import metrics

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5948275862068966


In [None]:
# * Пример загрузки данных для предсказания
import numpy as np
# -4.854163, -2.2235918, -0.46277046, 0.040317107, -0.009544769, 0.21120666 - AGRESSITIV
# -0.11302543, -0.038478702, 0.14680958, 0.0018325958, 0.0045051314, -0.01786781 - NORMAL
# 0.69761944,	0.1059494, 0.33160496, 0.08368854, 8.399397E-4,	0.050549097 - NORMAL
# 0.36734557,	0.3778225, 0.020035744,	0.0012217305,	-0.07368562, 0.011453724 - NORMAL
# 0.75819385, -0.2177906, 0.45726347, 0.0, 0.0, 0.0 - AGRESSITIV
# 0.5129715, 0.1584896, 0.2572117, 0.040927973, -0.005879578, 0.33215797 - AGRESSITIV
# 0.27051997, -0.07373035, 0.0030345917, 0.0036651916, -0.0077121737, 0.2576324 - AGRESSITIV
test_data = np.array( [[-0.9, -0.4, 0.0, -0.9, -0.5, -0.5]] )
test_predict=randmf.predict(test_data)
print(labelEncoder.inverse_transform(test_predict))

['NORMAL']


# Вариант 2.

Пример взят [отсюда](https://www.kaggle.com/code/outofskills/binary-conv-lstm).

Точность около 70-75%.

In [None]:
# * Разорхивируем данные из архива
import zipfile

with zipfile.ZipFile('/content/drive/MyDrive/Driving Behavior.zip', 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
# * Загружаем данные
import pandas as pd

df_train = pd.read_csv('/content/train_motion_data.csv')
df_test = pd.read_csv('/content/test_motion_data.csv')

In [None]:
# Global constants
TIMESTEPS = 20 # the number of sample to be fed to the NN
FEATURES = 6
LABELS = 2
N_RECORDS = 10
# CNN constantS
N_COLUMNS = 4
N_LENGTH = TIMESTEPS // N_COLUMNS
N_ROWS = 1 # 1 for 1D data

In [None]:
# Split the samples by class
# so it can be trimmed to a desired shape
df_train_normal = df_train.loc[df_train['Class'] == 'NORMAL']
df_train_slow = df_train.loc[df_train['Class'] == 'SLOW']
df_train_aggressive = df_train.loc[df_train['Class'] == 'AGGRESSIVE']

df_test_normal = df_test.loc[df_test['Class'] == 'NORMAL']
df_test_slow = df_test.loc[df_test['Class'] == 'SLOW']
df_test_aggressive = df_test.loc[df_test['Class'] == 'AGGRESSIVE']

In [None]:
# As it may be noticed, there is some noice
# at the beggin and end of every time series
# It is caused because of the class switching
# ---------------------------------------------
# A solution would be to just remove the first and last N samples
df_train_slow = df_train_slow.iloc[N_RECORDS:]
df_train_slow = df_train_slow.iloc[:-N_RECORDS]

df_test_slow = df_test_slow.iloc[N_RECORDS:]
df_test_slow = df_test_slow.iloc[:-N_RECORDS]

df_train_normal = df_train_normal.iloc[N_RECORDS:]
df_train_normal = df_train_normal.iloc[:-N_RECORDS]

df_test_normal = df_test_normal.iloc[N_RECORDS:]
df_test_normal = df_test_normal.iloc[:-N_RECORDS]

df_train_aggressive = df_train_aggressive.iloc[N_RECORDS:]
df_train_aggressive = df_train_aggressive.iloc[:-N_RECORDS]

df_test_aggressive = df_test_aggressive.iloc[N_RECORDS:]
df_test_aggressive = df_test_aggressive.iloc[:-N_RECORDS]

In [None]:
# Check train data per label
train_slow_samples = df_train_slow.__len__()
train_normal_samples = df_train_normal.__len__()
train_aggressive_samples = df_train_aggressive.__len__()

print('---Train---')
print('Slow samples: ' + str(train_slow_samples))
print('Normal samples: ' + str(train_normal_samples))
print('Aggressive samples: ' + str(train_aggressive_samples))

df_train_slow = df_train_slow.tail(1300)
df_train_normal = df_train_normal.tail(1160)
df_train_aggressive = df_train_aggressive.tail(1080)

---Train---
Slow samples: 1311
Normal samples: 1180
Aggressive samples: 1093


In [None]:
# Check test data per label
test_slow_samples = df_test_slow.__len__()
test_normal_samples = df_test_normal.__len__()
test_aggressive_samples = df_test_aggressive.__len__()

print('---Test---')
print('Slow samples: ' + str(test_slow_samples))
print('Normal samples: ' + str(test_normal_samples))
print('Aggressive samples: ' + str(test_aggressive_samples))

df_test_slow = df_test_slow.tail(1250)
df_test_normal = df_test_normal.tail(970)
df_test_aggressive = df_test_aggressive.tail(790)

---Test---
Slow samples: 1253
Normal samples: 977
Aggressive samples: 794


In [None]:
# Separating features from labels

# Features
X_train_normal = df_train_normal.iloc[: , :FEATURES]
X_train_slow = df_train_slow.iloc[: , :FEATURES]
X_train_aggressive = df_train_aggressive.iloc[: , :FEATURES]

X_test_normal = df_test_normal.iloc[: , :FEATURES]
X_test_slow = df_test_slow.iloc[: , :FEATURES]
X_test_aggressive = df_test_aggressive.iloc[: , :FEATURES]

# Labels
y_train_normal = df_train_normal.Class
y_train_slow = df_train_slow.Class
y_train_aggressive = df_train_aggressive.Class

y_test_normal = df_test_normal.Class
y_test_slow = df_test_slow.Class
y_test_aggressive = df_test_aggressive.Class

In [None]:
# Combine all the samples intro a train dataset
X_train = pd.concat([X_train_normal, X_train_aggressive])
y_train = pd.concat([y_train_normal, y_train_aggressive])

X_test = pd.concat([X_test_normal, X_test_aggressive])
y_test = pd.concat([y_test_normal, y_test_aggressive])

In [None]:
# As ML/DL algorithms are working with numbers
# the labels should be mapped to numbers
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()
y_train = labelEncoder.fit_transform(y_train)
y_test = labelEncoder.transform(y_test)

In [None]:
# Because I'm using CategoricalCrossentropy loss function
# I have to convert my labels to one-hot encoded vectors
from tensorflow.keras.utils import to_categorical

y_train = to_categorical(y_train, num_classes=LABELS)
y_test = to_categorical(y_test, num_classes=LABELS)

In [None]:
# As the dataset contains variables that are different in scale,
# a good solution will be to standardize it
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Reshaping the input for the Conv-LSTM network (Features)
train_samples = X_train.shape[0]//TIMESTEPS
X_train = X_train.reshape(train_samples, N_COLUMNS, N_ROWS, N_LENGTH, FEATURES)

test_samples = X_test.shape[0]//TIMESTEPS
X_test = X_test.reshape(test_samples, N_COLUMNS, N_ROWS, N_LENGTH, FEATURES)

print('Train features shape: ' + str(X_train.shape[0]))
print('Test features shape: ' + str(X_test.shape[0]))

Train features shape: 112
Test features shape: 88


In [None]:
y_train = y_train[::TIMESTEPS]
y_test = y_test[::TIMESTEPS]

print('Train labels shape: ' + str(y_train.shape[0]))
print('Test labels shape: ' + str(y_test.shape[0]))

Train labels shape: 112
Test labels shape: 88


In [None]:
!pip install keras-tuner --upgrade

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
import tensorflow as tf
from tensorflow import keras, lite
from tensorflow.keras.layers import LSTM, Dense, Dropout, Flatten, ConvLSTM2D, RepeatVector

import keras_tuner as kt

# Set a global random seed for a better random result
tf.random.set_seed(42)

# Build the model by assigning
# the number of layers and number of neurons for each layer
# the learning rate
# the number of epochs
# ------------------------------------------------------------------------
# To get these optimal parameters I will use the Keras Tuner library
# so it will get the optimal parameters for the NN
# and pass it to the model_builder function
def model_builder(hp):
    model = tf.keras.Sequential()

    model.add(
        ConvLSTM2D(
            filters=hp.Int('filter_units', min_value=16, max_value=64, step=16),
            kernel_size=(1,3),
            activation='relu',
            input_shape=(N_COLUMNS, N_ROWS, N_LENGTH, FEATURES)
            )
        )

    model.add(Dropout(hp.Float('dropout_0_rate', min_value=0, max_value=0.5, step=0.1)))

    model.add(Flatten())

    model.add(Dense(hp.Int('dense_units', min_value=32, max_value=512, step=32), activation='relu'))

    model.add(Dense(LABELS, activation='softmax'))


    # Tune the learning rate for the optimizer
    # Choose an optimal value from 0.1, 0.01, 0.001, or 0.0001
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3, 1e-4])

    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
        metrics=[
            tf.keras.metrics.BinaryAccuracy(name='accuracy'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall')
        ]
    )

    return model

In [None]:
# To be able to get the optimal parameters for our network
# we have to create a Tuner with the following setup
# 'objective' -> will use to create test models
# 'max_epochs' -> the maximum number of epochs to train one model
# 'factor' -> the reduction factor for the number of epochs and number of models for each bracket
# 'directory' -> will save the training history there
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=50,
                     factor=3,
                     directory='bin_conv_lstm_dir',
                     project_name='driving_behavior')

Reloading Tuner from bin_conv_lstm_dir/driving_behavior/tuner0.json


In [None]:
# Define an event that will stop the training
# depending of the value of "val_loss"
# ------------------------------------------------------------------
# Will stop training if the "val_loss" hasn't improved in 5 epochs.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [None]:
# The algorithm trains a large number of models for a few epochs and
# carries forward only the top-performing half of models to the next round.
tuner.search(
        x=X_train,
        y=y_train,
        epochs=50,
        validation_data=(X_test, y_test),
        callbacks=[stop_early],
        shuffle=True
)

In [None]:
# Get best hyperparameters that were found during the search
best_model = tuner.get_best_models()[0]

In [None]:
# Get the metrics for our trained model
eval_result = best_model.evaluate(X_test, y_test)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 391ms/step - accuracy: 0.5820 - loss: 0.7078 - precision: 0.5820 - recall: 0.5820


In [None]:
# * Пример загрузки данных для предсказания
import numpy as np
# -3.5900722, -1.1844693, -1.7470059, 0.12339478, -0.07857254, 0.018784106 - NORMAL
# -4.854163, -2.2235918, -0.46277046, 0.040317107, -0.009544769, 0.21120666 - AGRESSITIV
# -2.5856998, 0.2698185, -0.3722973, -0.0036651916, 0.030161472, 0.21670444 - NORMAL
# -2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.025503624 - AGRESSITIV
test_data = np.array( [
    [
      [[
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624]
      ]],
      [[
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624]
      ]],
      [[
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624]
      ]],
      [[
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624],
        [-2.6275148, -1.7161267, -0.022824287, -0.064751714, -0.13660474, 0.925503624]
      ]]
    ],
    ] )

y_pred=best_model.predict(x=test_data)
y_pred = y_pred.tolist()[0]
result = {'AGGRESSIVE': y_pred[0], 'NORMAL': y_pred[1]}
print(result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
{'AGGRESSIVE': 0.8636014461517334, 'NORMAL': 0.1363985538482666}


# Вариант 3.

Пример взят [отсюда](https://www.kaggle.com/datasets/veeralakrishna/aggressive-driving-data).

Точность около 89%.

In [14]:
# * Разорхивируем данные из архива
import zipfile

with zipfile.ZipFile('/content/drive/MyDrive/Aggressive Driving Data.zip', 'r') as zip_ref:
    zip_ref.extractall()

In [15]:
# * Загружаем данные
import pandas as pd
data1 = pd.read_csv('/content/Train_Data/Train.csv')
data2 = pd.read_csv('/content/Train_Data/Train_Vehicletravellingdata.csv')
data3 = pd.read_csv('/content/Train_Data/Train_WeatherData.csv')

In [16]:
# * Переименовываем имена столбцов на нормальные названия
data1 = data1.rename(columns = {
                                'V2':'veincle length' ,
                                'V5':'veincle weight' ,
                                'V6':'axles number'})
data2 = data2.rename(columns = {
                                'V1':'time of collection' ,
                                'V3':'Lane of the road' ,
                                'V4':'veincle speed',
                                'V7':'perceding Veincle ID',
                                'V8':'perceding veincle speed',
                                'V9':'perceding veincle weight',
                                'V10':'perceding veincle length',
                                'V11':'perceding veincle time-gap',
                                'V18':'road condition'})
data3 = data3.rename(columns = {
                                'V1':'time of collection' ,
                                'V12':'Air temprture' ,
                                'V13':'perciption type',
                                'V14':'perciption intensity',
                                'V15':'relatve humadity',
                                'V16':'wind direction',
                                'V17':'wind speed',
                                'V19':'Lighting condition'})

In [None]:
# * Заменяем пустые ячейки на средние показатели
# df[col] = df[col].method(value)
data2['perceding veincle time-gap'] = data2['perceding veincle time-gap'].fillna(
    value = int(data2['perceding veincle time-gap'].median()) )
data3['Air temprture'] = data3['Air temprture'].fillna(
    value = int(data3['Air temprture'].median()) )

In [None]:
# * Назначение переменной как категориальной
data2['road condition'] = pd.factorize(data2['road condition'])[0]

In [None]:
data2_gropued = data2.groupby('ID')[
    ['perceding veincle time-gap',
     'Lane of the road',
     'veincle speed',
     'perceding veincle speed',
     'perceding veincle weight',
     'perceding veincle length',
     'road condition']
    ].mean()
data2_gropued = data2_gropued.astype({'Lane of the road':int})

TypeError: agg function failed [how->mean,dtype->object]

In [None]:
# * Замена пустых ячеек средним показателем

data3['relatve humadity'] = data3['relatve humadity'].fillna( value = int(data3['relatve humadity'].median()) )
data3['wind direction'] = data3['wind direction'].fillna( value = int(data3['wind direction'].median()) )
data3['wind speed'] = data3['wind speed'].fillna( value = int(data3['wind speed'].median()) )

In [None]:
# * Назначение категориальных переменных
data3['perciption type'] = pd.factorize(data3['perciption type'])[0]
data3['perciption intensity'] = pd.factorize(data3['perciption intensity'])[0]
data3['Lighting condition'] = pd.factorize(data3['Lighting condition'])[0]

In [None]:
data3_gropued = data3.groupby('ID')[
    ['Air temprture',
     'perciption type',
     'perciption intensity',
     'relatve humadity',
     'wind direction',
     'wind speed',
     'Lighting condition']
    ].mean()

In [17]:
data1['ID'].drop_duplicates

In [18]:
merged_data = pd.merge(data2_gropued,data3_gropued, on ='ID')
merged_data2 = pd.merge(data1,merged_data, on='ID')
merged_data2.head()
""" Для FastAI """
# merged_data = pd.merge(data2,data3, on ='ID')
# merged_data2 = pd.merge(data1,merged_data, on='ID')
# merged_data2 = merged_data2.astype({'DrivingStyle':int})
# merged_data2 = merged_data2.dropna(subset=['relatve humadity'])
# merged_data2 = merged_data2.drop('ID',axis = 1)

In [None]:
merged_data2.isnull().sum()

Unnamed: 0,0
ID,0
veincle length,0
veincle weight,0
axles number,0
DrivingStyle,0
perceding veincle time-gap,0
Lane of the road,0
veincle speed,0
perceding veincle speed,0
perceding veincle weight,0


In [None]:
merged_data2 = merged_data2.dropna(subset=['relatve humadity'])
merged_data2 = merged_data2.drop('ID',axis = 1)

In [19]:
# Сохраняем данные
merged_data2.to_csv('cleared_data.csv', index=False)

Сборка

In [None]:
X = merged_data2.drop(['DrivingStyle'] , axis = 1)
y = merged_data2['DrivingStyle']

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))
print(sorted(Counter(X_resampled).items()))

[(1, 6422), (2, 6422), (3, 6422)]
[('Air temprture', 1), ('Lane of the road', 1), ('Lighting condition', 1), ('axles number', 1), ('perceding veincle length', 1), ('perceding veincle speed', 1), ('perceding veincle time-gap', 1), ('perceding veincle weight', 1), ('perciption intensity', 1), ('perciption type', 1), ('relatve humadity', 1), ('road condition', 1), ('veincle length', 1), ('veincle speed', 1), ('veincle weight', 1), ('wind direction', 1), ('wind speed', 1)]


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as met
x_train,x_test,y_train,y_test=train_test_split(X_resampled,y_resampled,test_size=0.1,random_state=123)
avgList = []
for x in range(10):
  clf2=RandomForestClassifier(n_estimators=150,max_depth=100,min_samples_split=3)
  clf2.fit(x_train,y_train)
  y_pred = clf2.predict(x_test)
  print("---------------------------------")
  print("---------------------------------")
  print(y_pred)
  print("accuracy =" ,met.accuracy_score(y_test,y_pred))
  print("f1 score =" ,met.f1_score(y_test,y_pred,average='macro'))
  print("precision =",met.precision_score(y_test,y_pred,average='macro'))
  print("recall = ",met.recall_score(y_test,y_pred,average='macro'))
  avgList.append(met.accuracy_score(y_test,y_pred))
print("---------------------------------")
print("---------------------------------")
print("---------------------------------")
print("---------------------------------")
print("average =" ,sum(avgList)/len(avgList))

---------------------------------
---------------------------------
[2 3 1 ... 1 1 3]
accuracy = 0.8801245459263103
f1 score = 0.8791457500501002
precision = 0.884926078457993
recall =  0.8822078758003286
---------------------------------
---------------------------------
[2 3 1 ... 1 1 3]
accuracy = 0.8889465490399585
f1 score = 0.8880927835251345
precision = 0.8923104650649561
recall =  0.8908528755170265
---------------------------------
---------------------------------
[2 3 1 ... 1 1 3]
accuracy = 0.8837571354436948
f1 score = 0.8829024049657154
precision = 0.8877829360182301
recall =  0.885744212136665
---------------------------------
---------------------------------
[2 3 1 ... 1 1 3]
accuracy = 0.8879086663207058
f1 score = 0.8871232575704225
precision = 0.8909356765433641
recall =  0.889753376017527
---------------------------------
---------------------------------
[2 3 1 ... 1 1 3]
accuracy = 0.8863518422418267
f1 score = 0.8856078806688353
precision = 0.8903413360918115
re

In [None]:
def normalazed(input_data):
  """
  Функция нормализации данных
  input:  input_data - dict[]
  output: list[float]
  """
  normalized_data = list()
  for name_parameter, value_parameter in input_data.items():
    if name_parameter not in columns_cannot_normalized:
      min_value = merged_data2[name_parameter].min()
      max_value = merged_data2[name_parameter].max()
      normalized_parameter = (value_parameter - min_value) / (max_value - min_value)
    normalized_data.append(value_parameter)
  return normalized_data

# * Столбцы не подлежащие нормализации
columns_cannot_normalized = ("axles number", "Lane of the road", "perciption type", "perciption intensity", "road condition", "Lighting condition")
examples_driving_styles = \
 {
     "агрессивный": {
         "veincle length": 450,
         "veincle weight": 2500,
         "axles number": 2,
         "perceding veincle time-gap": 0.5,
         "Lane of the road": 2,
         "veincle speed": 200,
         "perceding veincle speed": 150,
         "perceding veincle weight": 1400,
         "perceding veincle length": 440,
         "road condition": 2,
         "Air temprture": -5,
         "perciption type": 2,
         "perciption intensity": 2,
         "relatve humadity": 40,
         "wind direction": 90,
         "wind speed": 25,
         "Lighting condition": 2
         },
     "нормальный": {
         "veincle length": 460,
         "veincle weight": 1600,
         "axles number": 2,
         "perceding veincle time-gap": 2.5,
         "Lane of the road": 2,
         "veincle speed": 90,
         "perceding veincle speed": 85,
         "perceding veincle weight": 1500,
         "perceding veincle length": 450,
         "road condition": 1,
         "Air temprture": 15,
         "perciption type": 1,
         "perciption intensity": 1,
         "relatve humadity": 70,
         "wind direction": 180,
         "wind speed": 3,
         "Lighting condition": 1
         },
     "неопределенный": {
         "veincle length": 470,
         "veincle weight": 1700,
         "axles number": 2,
         "perceding veincle time-gap": 3.5,
         "Lane of the road": 1,
         "veincle speed": 70,
         "perceding veincle speed": 65,
         "perceding veincle weight": 1600,
         "perceding veincle length": 460,
         "road condition": 2,
         "Air temprture": 0,
         "perciption type": 2,
         "perciption intensity": 2,
         "relatve humadity": 85,
         "wind direction": 270,
         "wind speed": 4,
         "Lighting condition": 2
         }
     }

dict_result = {1: 'Агрессивный', 2: 'Нормальный', 3: 'Неопределенный'}

for value in examples_driving_styles.values():
  result = clf2.predict([normalazed(value)])
  result = result.tolist()[0]
  print('========')
  print(f'Предсказание: {dict_result.get(result)}')

Предсказание: Агрессивный
Предсказание: Агрессивный
Предсказание: Нормальный




In [None]:
# * Сохранение модели и загрузка с последующим предсказанием
import pickle
filename = "model.pickle"
pickle.dump(clf2, open(filename, "wb"))
clf2 = pickle.load(open(filename, "rb"))

Предсказание: Агрессивный
Предсказание: Агрессивный
Предсказание: Нормальный




# Вариант 4.

Реализовано с помощью FastAI.
Точность около 91%.

In [20]:
import pandas as pd
data = pd.read_csv('/content/cleared_data.csv')

  data = pd.read_csv('/content/cleared_data.csv')


In [21]:
# * Удалить лишние столбцы
import numpy as np
data = data[ [ "axles number", "Lane of the road", "perciption type", "perciption intensity", "road condition", "Lighting condition", "DrivingStyle",  "veincle length", "veincle weight", "perceding veincle time-gap", "veincle speed", "perceding veincle speed", "perceding veincle weight", "perceding veincle length", "Air temprture", "relatve humadity", "wind direction", "wind speed"] ]
data = data.astype({"DrivingStyle":np.float32})

In [23]:
from fastai.tabular.all import *

target_variable = "DrivingStyle"
categorical_variables = ["axles number", "Lane of the road", "perciption type", "perciption intensity", "road condition", "Lighting condition"]
cont_variables = ["veincle length", "veincle weight", "perceding veincle time-gap", "veincle speed", "perceding veincle speed", "perceding veincle weight", "perceding veincle length", "Air temprture", "relatve humadity", "wind direction", "wind speed"]
splits = RandomSplitter(valid_pct=0.2)(range_of(data))
to = TabularPandas(
    data,
    y_names=target_variable,
    y_block = CategoryBlock(),
    cat_names = categorical_variables,
    cont_names = cont_variables,
    procs = [Categorify, FillMissing, Normalize],
    splits=splits)
dls = to.dataloaders(bs=64)
learn = tabular_learner(dls, metrics=accuracy)
learn.fit_one_cycle(1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

epoch,train_loss,valid_loss,accuracy,time
0,0.42261,0.337408,0.872142,11:26


In [None]:
# * Показать результаты тестовых предсказаний
learn.show_results()

Unnamed: 0,axles number,Lane of the road,perciption type,perciption intensity,road condition,Lighting condition,perceding veincle time-gap_na,wind direction_na,wind speed_na,veincle length,veincle weight,perceding veincle time-gap,veincle speed,perceding veincle speed,perceding veincle weight,perceding veincle length,Air temprture,relatve humadity,wind direction,wind speed,DrivingStyle,DrivingStyle_pred
0,1.0,2.0,1.0,0.0,1.0,2.0,1.0,1.0,1.0,-0.417907,-0.381868,-0.538669,-0.01124,0.326459,0.048294,-0.438925,0.703005,-1.441067,0.245438,1.603445,1.0,1.0
1,3.0,1.0,1.0,0.0,3.0,2.0,1.0,1.0,1.0,0.796507,2.517418,4.121282,-1.812245,-2.599542,-0.273464,-0.458316,-0.858928,1.97691,-0.541105,-1.350526,1.0,1.0
2,1.0,2.0,1.0,0.0,1.0,3.0,1.0,1.0,1.0,-0.330797,-0.366361,-0.538669,0.213885,-0.236233,-0.442603,-0.511642,-1.171314,-0.264386,-1.877151,0.618788,2.0,2.0
3,1.0,1.0,1.0,0.0,1.0,2.0,2.0,1.0,1.0,-0.405097,-0.408887,-0.35649,-0.348929,-0.236233,-0.407831,-0.388023,0.703005,0.071808,-1.693983,-1.022307,1.0,1.0
4,1.0,1.0,1.0,0.0,1.0,2.0,1.0,1.0,1.0,-0.433279,-0.437984,-0.155134,-0.348929,-0.236233,-0.121002,0.673644,1.327778,0.408003,-0.778145,-1.350526,2.0,2.0
5,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,-0.522951,-0.493779,-0.011308,-2.375059,0.664075,-0.460067,-0.291068,0.078232,-1.216937,0.363958,1.275226,1.0,1.0
6,1.0,1.0,1.0,0.0,1.0,2.0,1.0,1.0,1.0,-0.410221,-0.309766,-0.164722,-0.574054,-0.236233,-0.322553,-0.298339,-0.858928,-0.376451,-0.05625,1.275226,2.0,2.0
7,3.0,2.0,1.0,0.0,4.0,3.0,1.0,1.0,1.0,1.201312,-0.203451,0.218813,0.776699,2.239614,-0.327273,-0.240166,-1.171314,1.97691,1.764651,-0.03765,2.0,1.0
8,1.0,2.0,1.0,0.0,4.0,1.0,1.0,1.0,1.0,-0.548572,-0.27955,-0.538669,-0.461491,-1.361618,0.268097,2.389763,-1.171314,1.360553,-0.724273,-1.350526,1.0,1.0


In [37]:
# * Сохранить в файл
# learn.export('model.pkl')

# * Загрузить из файла
learn = load_learner('model.pkl')

If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")


In [38]:
import pandas as pd
examples_driving_styles = \
 {
     "агрессивный": {
         "veincle length": 450,
         "veincle weight": 2500,
         "axles number": 2,
         "perceding veincle time-gap": 0.5,
         "Lane of the road": 2,
         "veincle speed": 200,
         "perceding veincle speed": 150,
         "perceding veincle weight": 1400,
         "perceding veincle length": 440,
         "road condition": 2,
         "Air temprture": -5,
         "perciption type": 2,
         "perciption intensity": 2,
         "relatve humadity": 40,
         "wind direction": 90,
         "wind speed": 25,
         "Lighting condition": 2
         },
     "нормальный": {
         "veincle length": 460,
         "veincle weight": 1600,
         "axles number": 2,
         "perceding veincle time-gap": 2.5,
         "Lane of the road": 2,
         "veincle speed": 90,
         "perceding veincle speed": 85,
         "perceding veincle weight": 1500,
         "perceding veincle length": 450,
         "road condition": 1,
         "Air temprture": 15,
         "perciption type": 1,
         "perciption intensity": 1,
         "relatve humadity": 70,
         "wind direction": 180,
         "wind speed": 3,
         "Lighting condition": 1
         },
     "неопределенный": {
         "veincle length": 470,
         "veincle weight": 1700,
         "axles number": 2,
         "perceding veincle time-gap": 3.5,
         "Lane of the road": 1,
         "veincle speed": 70,
         "perceding veincle speed": 65,
         "perceding veincle weight": 1600,
         "perceding veincle length": 460,
         "road condition": 2,
         "Air temprture": 0,
         "perciption type": 2,
         "perciption intensity": 2,
         "relatve humadity": 85,
         "wind direction": 270,
         "wind speed": 4,
         "Lighting condition": 2
         }
     }

dict_result = {1: 'Агрессивный', 2: 'Нормальный', 3: 'Неопределенный'}

columns = ["veincle length", "veincle weight", "axles number", "perceding veincle time-gap", "Lane of the road", "veincle speed", "perceding veincle speed", "perceding veincle weight", "perceding veincle length", "road condition", "Air temprture", "perciption type", "perciption intensity", "relatve humadity", "wind direction", "wind speed", "Lighting condition"]

for value in examples_driving_styles.values():
  input_data = pd.Series(
      data=value.values(),
      index=columns
  )
  row, clas, probs = learn.predict(input_data)
  number_class = int( row["DrivingStyle"][0] )
  print( dict_result.get(number_class) )

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

Агрессивный


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

Нормальный


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

Нормальный
