In [26]:

# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
    

In [27]:

# Load the training features, the training labels and the test features
X_train, y_train, X_test = (
    pd.read_csv(csv_name)
    for csv_name in ('X_train.csv', 'y_train.csv', 'X_test.csv')
)
    

In [28]:

# Preview the first rows of each loaded table (one table per printed block)
print(X_train.head(), y_train.head(), X_test.head(), sep='\n')
    

  row_id  series_id  measurement_number  orientation_X  orientation_Y  \
0    0_0          0                   0       -0.75853       -0.63435   
1    0_1          0                   1       -0.75853       -0.63434   
2    0_2          0                   2       -0.75853       -0.63435   
3    0_3          0                   3       -0.75852       -0.63436   
4    0_4          0                   4       -0.75852       -0.63435   

   orientation_Z  orientation_W  angular_velocity_X  angular_velocity_Y  \
0       -0.10488       -0.10597            0.107650            0.017561   
1       -0.10490       -0.10600            0.067851            0.029939   
2       -0.10492       -0.10597            0.007275            0.028934   
3       -0.10495       -0.10597           -0.013053            0.019448   
4       -0.10495       -0.10596            0.005135            0.007652   

   angular_velocity_Z  linear_acceleration_X  linear_acceleration_Y  \
0            0.000767               -0.

In [29]:

# Attach the per-series labels from y_train to every measurement row of X_train.
# validate='many_to_one' makes pandas raise if a series_id occurs more than once
# in y_train, which would otherwise silently duplicate measurement rows.
train_data = X_train.merge(y_train, on='series_id', validate='many_to_one')

# The inner join drops measurements whose series has no label; later cells rely on
# train_data having exactly one row per X_train row, so fail early if that is not true.
assert len(train_data) == len(X_train), "some X_train rows have no label in y_train"
    

In [30]:

# Count the missing values in every column of the merged training table
train_data.isna().sum()
    

row_id                   0
series_id                0
measurement_number       0
orientation_X            0
orientation_Y            0
orientation_Z            0
orientation_W            0
angular_velocity_X       0
angular_velocity_Y       0
angular_velocity_Z       0
linear_acceleration_X    0
linear_acceleration_Y    0
linear_acceleration_Z    0
group_id                 0
surface                  0
dtype: int64

In [31]:

# StandardScaler is instantiated in the training cell further down, where the
# feature columns are normalised before the train/validation split.
from sklearn.preprocessing import StandardScaler

In [32]:

# # Разделение данных на обучающую и тестовую выборки
# from sklearn.model_selection import train_test_split

# X_tr, X_te, y_tr, y_te = train_test_split(X_train_scaled, y_train_values, test_size=0.2, random_state=42)
    

In [33]:

# Decision-tree node
class Node:
    """Single node of a binary decision tree.

    The four positional parameters are stored for an internal node
    (``feature_index``, ``threshold``, ``left``, ``right``); ``value`` is
    keyword-only and is stored for a leaf. Every attribute defaults to None.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, *, value=None):
        # Split description: which column is compared and against what.
        self.feature_index, self.threshold = feature_index, threshold
        # Child subtrees.
        self.left, self.right = left, right
        # Stored label; the tree code treats a node with a non-None value as a leaf.
        self.value = value
    

In [34]:

# Decision-tree implementation
class DecisionTree:
    """Binary classification tree that splits on ``feature <= threshold`` by entropy.

    Labels must be non-negative integers because class counting relies on
    ``np.bincount``. The order in which features are examined is shuffled with
    NumPy's global random state at every node, which only affects tie-breaking
    between equally good splits; seed ``np.random`` for repeatable trees.

    Parameters
    ----------
    max_depth : int
        Nodes at this depth (the root is depth 0) are always turned into leaves.
    min_samples_split : int
        A node holding fewer samples than this is turned into a leaf.
    """

    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        """Grow the tree from feature matrix ``X`` and integer labels ``y``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if ``X`` and ``y`` differ in length, or if the
            dataset is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        if len(y) == 0:
            raise ValueError("cannot fit a tree on an empty dataset")
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``X``/``y`` at ``depth``."""
        num_samples, num_features = X.shape
        num_labels = len(np.unique(y))

        if (depth >= self.max_depth or num_labels == 1 or num_samples < self.min_samples_split):
            return Node(value=self._most_common_label(y))

        # All features are candidates; the permutation only changes which of
        # several equally good splits is found first.
        feat_idxs = np.random.choice(num_features, num_features, replace=False)
        best_feat, best_thresh = self._best_criteria(X, y, feat_idxs)

        # No threshold separates the samples (every feature is constant here).
        # Splitting anyway would create an empty child, so stop with a leaf.
        if best_feat is None:
            return Node(value=self._most_common_label(y))

        left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh)

        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)

        return Node(best_feat, best_thresh, left, right)

    def _best_criteria(self, X, y, feat_idxs):
        """Return ``(feature_index, threshold)`` of the split with the highest gain.

        Only thresholds that leave samples on both sides are considered, so the
        result is ``(None, None)`` when no feature in ``feat_idxs`` has two
        distinct values. Each feature is sorted once and class counts left of
        every candidate cut are obtained from a cumulative sum, instead of
        rescanning the column for every unique value.
        """
        n = len(y)
        if n == 0:
            return None, None

        parent_entropy = self._entropy(y)
        # One row per sample with a 1 in the column of its class.
        class_onehot = np.eye(int(y.max()) + 1)[y]
        total_counts = class_onehot.sum(axis=0)

        best_gain = -np.inf
        split_idx, split_thresh = None, None

        for feat_idx in feat_idxs:
            order = np.argsort(X[:, feat_idx], kind="stable")
            column = X[order, feat_idx]

            # Position i is a candidate when the next sorted value is larger:
            # the threshold column[i] then sends rows 0..i left and the rest right.
            cuts = np.flatnonzero(column[:-1] < column[1:])
            if cuts.size == 0:
                continue

            left_counts = np.cumsum(class_onehot[order], axis=0)[cuts]
            right_counts = total_counts - left_counts
            n_left = (cuts + 1).astype(float)
            n_right = n - n_left

            child_entropy = (
                n_left * self._entropy_of_counts(left_counts, n_left)
                + n_right * self._entropy_of_counts(right_counts, n_right)
            ) / n
            gains = parent_entropy - child_entropy

            best_cut = int(np.argmax(gains))
            if gains[best_cut] > best_gain:
                best_gain = gains[best_cut]
                split_idx = feat_idx
                split_thresh = column[cuts[best_cut]]

        return split_idx, split_thresh

    @staticmethod
    def _entropy_of_counts(counts, totals):
        """Entropy in bits of every row of ``counts``; ``totals`` holds the row sums."""
        probs = counts / totals[:, None]
        # Entries with zero probability contribute nothing (0 * log2(0) is taken as 0).
        logs = np.log2(probs, where=probs > 0, out=np.zeros_like(probs))
        return -(probs * logs).sum(axis=1)

    def _information_gain(self, y, X_column, threshold):
        """Entropy reduction of splitting ``y`` by ``X_column <= threshold``.

        Returns 0 when one side of the split would be empty. The tree search
        itself uses the faster scan in ``_best_criteria``; this method is kept
        for evaluating a single threshold.
        """
        left_idxs, right_idxs = self._split(X_column, threshold)

        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        n = len(y)
        n_left, n_right = len(left_idxs), len(right_idxs)
        child_entropy = (
            (n_left / n) * self._entropy(y[left_idxs])
            + (n_right / n) * self._entropy(y[right_idxs])
        )
        return self._entropy(y) - child_entropy

    def _split(self, X_column, threshold):
        """Return the row indices with ``value <= threshold`` and with ``value > threshold``."""
        left_idxs = np.flatnonzero(X_column <= threshold)
        right_idxs = np.flatnonzero(X_column > threshold)
        return left_idxs, right_idxs

    def _entropy(self, y):
        """Shannon entropy in bits of the integer label array ``y``."""
        hist = np.bincount(y)
        ps = hist[hist > 0] / len(y)
        return -np.sum(ps * np.log2(ps))

    def _most_common_label(self, y):
        """Most frequent label in ``y``; ties go to the smallest label."""
        return np.bincount(y).argmax()

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``."""
        return np.array([self._traverse_tree(x, self.root) for x in np.asarray(X)])

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return the leaf value."""
        # A node is a leaf exactly when it stores a value.
        while node.value is None:
            node = node.left if x[node.feature_index] <= node.threshold else node.right
        return node.value
    

In [35]:

# Random-forest implementation
class RandomForest:
    """Ensemble of ``DecisionTree`` models trained on bootstrap samples.

    Each tree is fitted on rows drawn with replacement through NumPy's global
    random state; predictions are the majority vote of the trees.
    """

    def __init__(self, n_estimators=10, max_depth=10, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        """Discard any previous trees and fit ``n_estimators`` new ones."""
        self.trees = []
        n_rows = len(X)
        for _ in range(self.n_estimators):
            # Bootstrap: as many rows as the input, sampled with replacement.
            rows = np.random.choice(n_rows, n_rows, replace=True)
            X_boot = X[rows]
            y_boot = y[rows]
            member = DecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
            )
            member.fit(X_boot, y_boot)
            self.trees.append(member)

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``."""
        # Stack as (n_trees, n_samples), then flip so each row holds one sample's votes.
        per_sample_votes = np.array([member.predict(X) for member in self.trees]).swapaxes(0, 1)
        winners = []
        for votes in per_sample_votes:
            winners.append(np.bincount(votes.astype('int')).argmax())
        return np.array(winners)
    

In [36]:
# Encode the surface names as integer class ids; the tree code counts classes
# with np.bincount and therefore needs non-negative integers.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(train_data['surface'])

# Feature columns: everything in X_train except the identifier columns.
# NOTE(review): this keeps 'measurement_number' as a feature - confirm that is intended.
features = [col for col in X_train.columns if col not in ['row_id', 'series_id', 'surface']]
scaler = StandardScaler()

# Take the feature rows from train_data, the same frame the labels come from,
# so features and labels cannot drift out of row alignment.
X_train_scaled = scaler.fit_transform(train_data[features])
X_test_scaled = scaler.transform(X_test[features])

# Hold out 20% of the rows for validation.
# NOTE(review): rows of one series_id can land on both sides of this split, which
# probably inflates the validation scores - consider a group-wise split.
X_tr, X_te, y_tr, y_te = train_test_split(X_train_scaled, y_train_encoded, test_size=0.2, random_state=42)

# The forest and its trees draw from NumPy's global random state; seed it so the
# bootstrap samples are the same on every run.
np.random.seed(42)

# Train the model
forest = RandomForest(n_estimators=10, max_depth=10)
forest.fit(X_tr, y_tr)

# Predict on the held-out rows
y_pred = forest.predict(X_te)
    

KeyboardInterrupt: 

In [None]:

# Evaluate the model on the held-out rows
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

accuracy = accuracy_score(y_te, y_pred)
# Precision, recall and F1 are macro-averaged over the classes.
precision, recall, f1 = (
    metric(y_te, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1}",
    sep="\n",
)
    

In [None]:

# Plot a one-vs-rest ROC curve for a single encoded class.
# The score of a sample is the fraction of trees that vote for that class.
# Averaging the predicted class ids themselves is not a score: with more than
# two classes a large mean only says the votes went to high-numbered classes.
roc_class = 1
tree_votes = np.array([tree.predict(X_te) for tree in forest.trees])
y_prob = np.mean(tree_votes == roc_class, axis=0)

fpr, tpr, thresholds = roc_curve(y_te, y_prob, pos_label=roc_class)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC кривая (площадь = {roc_auc:0.2f})')
# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('Ложноположительные')
ax.set_ylabel('Истинно положительные')
ax.set_title('ROC-кривая')
ax.legend(loc="lower right")
plt.show()
    

In [None]:

# Ограничение глубины предотвращает переобучение, так как глубокие деревья могут "запоминать" тренировочные данные, плохо обобщая на новые. 
# Ограничение также уменьшает время обучения.
    