In [11]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation

In [12]:
# Закладка семян для воспроизводимости
np.random.seed(1234)  
PYTHONHASHSEED = 0

In [13]:
# Подготовка обучающей даты

# Чтение обучающей даты
train_df = pd.read_csv('../engines_failure/PM_train.txt', sep=" ", header=None)
train_df.drop(train_df.columns[[26, 27]], axis=1, inplace=True)
train_df.columns = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3',
                     's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14',
                     's15', 's16', 's17', 's18', 's19', 's20', 's21']
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [14]:
# Подготовка тестировочной даты

# Чтение тестировочной даты
test_df = pd.read_csv('../engines_failure/PM_test.txt', sep=" ", header=None)
test_df.drop(test_df.columns[[26, 27]], axis=1, inplace=True)
test_df.columns = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3',
                     's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14',
                     's15', 's16', 's17', 's18', 's19', 's20', 's21']
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [15]:
# Подготовка правдивой даты

# Чтение правдивой даты
truth_df = pd.read_csv('../engines_failure/PM_truth.txt', sep=" ", header=None)
truth_df.drop(truth_df.columns[[1]], axis=1, inplace=True)
truth_df.columns = ['ttf']
truth_df.head()

Unnamed: 0,ttf
0,112
1,98
2,69
3,82
4,91


In [16]:
truth_df[truth_df['ttf'] <= 30]

Unnamed: 0,ttf
17,28
19,16
23,20
30,8
33,7
34,11
35,19
36,21
39,28
40,18


In [17]:
# Создание столбца RUL для обучающих данных

# Создание DataFrame с максимальными значениями 'cycle' для каждого 'id'
rul = pd.DataFrame(train_df.groupby('id')['cycle'].max()).reset_index()

# Задание названий столбцов для DataFrame 'rul'
rul.columns = ['id', 'max']

# Слияние DataFrame 'train_df' с DataFrame 'rul' по столбцу 'id'
train_df = train_df.merge(rul, on=['id'], how='left')

# Создание столбца 'RUL', содержащего разницу между 'max' и 'cycle'
train_df['RUL'] = train_df['max'] - train_df['cycle']

# Удаление столбца 'max' из DataFrame 'train_df'
train_df.drop('max', axis=1, inplace=True)

# Вывод первых нескольких строк DataFrame для проверки изменений
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


In [18]:
# Генерация столбцов меток для обучающих данных
w1 = 30  # Задание значения для порога w1
w0 = 15  # Задание значения для порога w0

# Создание столбца 'label1'. Если 'RUL' меньше или равно w1, то значение становится 1, иначе 0
train_df['label1'] = np.where(train_df['RUL'] <= w1, 1, 0)

# Создание столбца 'label2', который копирует значения из 'label1'
train_df['label2'] = train_df['label1']

# Обновление значений столбца 'label2'. Если 'RUL' меньше или равно w0, то значение становится 2
train_df.loc[train_df['RUL'] <= w0, 'label2'] = 2

# Вывод первых нескольких строк DataFrame для проверки изменений
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s15,s16,s17,s18,s19,s20,s21,RUL,label1,label2
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,8.4195,0.03,392,2388,100.0,39.06,23.419,191,0,0
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8.4318,0.03,392,2388,100.0,39.0,23.4236,190,0,0
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,8.4178,0.03,390,2388,100.0,38.95,23.3442,189,0,0
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8.3682,0.03,392,2388,100.0,38.88,23.3739,188,0,0
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8.4294,0.03,393,2388,100.0,38.9,23.4044,187,0,0


In [19]:
train_df[(train_df['id'] == 1) & (train_df['cycle'] >= 186)]

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s15,s16,s17,s18,s19,s20,s21,RUL,label1,label2
185,1,186,0.0027,-0.0003,100.0,518.67,643.51,1595.16,1426.3,14.62,...,8.5227,0.03,397,2388,100.0,38.47,23.0564,6,1,2
186,1,187,-0.0047,-0.0,100.0,518.67,643.32,1592.1,1427.27,14.62,...,8.5218,0.03,396,2388,100.0,38.42,23.0822,5,1,2
187,1,188,-0.0067,0.0003,100.0,518.67,643.75,1602.38,1422.78,14.62,...,8.5207,0.03,396,2388,100.0,38.51,22.9588,4,1,2
188,1,189,-0.0006,0.0002,100.0,518.67,644.18,1596.17,1428.01,14.62,...,8.5183,0.03,395,2388,100.0,38.48,23.1127,3,1,2
189,1,190,-0.0027,0.0001,100.0,518.67,643.64,1599.22,1425.95,14.62,...,8.5223,0.03,398,2388,100.0,38.49,23.0675,2,1,2
190,1,191,-0.0,-0.0004,100.0,518.67,643.34,1602.36,1425.77,14.62,...,8.5174,0.03,394,2388,100.0,38.45,23.1295,1,1,2
191,1,192,0.0009,-0.0,100.0,518.67,643.54,1601.41,1427.2,14.62,...,8.5113,0.03,396,2388,100.0,38.48,22.9649,0,1,2


In [20]:
# MinMax нормализация обучающих данных
train_df['cycle_norm'] = train_df['cycle']  # Создание нового столбца 'cycle_norm', который копирует значения из столбца 'cycle'

# Определение столбцов для нормализации, исключая ненужные столбцы
cols_normalize = train_df.columns.difference(['id','cycle','RUL','label1','label2'])

# Создание экземпляра MinMaxScaler для нормализации данных
min_max_scaler = preprocessing.MinMaxScaler()

# Нормализация выбранных столбцов с помощью метода fit_transform
norm_train_df = pd.DataFrame(min_max_scaler.fit_transform(train_df[cols_normalize]), 
                             columns=cols_normalize, 
                             index=train_df.index)

# Объединение нормализованных данных с исходными данными
join_df = train_df[train_df.columns.difference(cols_normalize)].join(norm_train_df)

# Обновление DataFrame train_df с объединенными данными и переиндексацией столбцов
train_df = join_df.reindex(columns = train_df.columns)

# Вывод первых нескольких строк DataFrame для проверки изменений
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s16,s17,s18,s19,s20,s21,RUL,label1,label2,cycle_norm
0,1,1,0.45977,0.166667,0.0,0.0,0.183735,0.406802,0.309757,0.0,...,0.0,0.333333,0.0,0.0,0.713178,0.724662,191,0,0,0.0
1,1,2,0.609195,0.25,0.0,0.0,0.283133,0.453019,0.352633,0.0,...,0.0,0.333333,0.0,0.0,0.666667,0.731014,190,0,0,0.00277
2,1,3,0.252874,0.75,0.0,0.0,0.343373,0.369523,0.370527,0.0,...,0.0,0.166667,0.0,0.0,0.627907,0.621375,189,0,0,0.00554
3,1,4,0.54023,0.5,0.0,0.0,0.343373,0.256159,0.331195,0.0,...,0.0,0.333333,0.0,0.0,0.573643,0.662386,188,0,0,0.00831
4,1,5,0.390805,0.333333,0.0,0.0,0.349398,0.257467,0.404625,0.0,...,0.0,0.416667,0.0,0.0,0.589147,0.704502,187,0,0,0.01108


In [21]:
# MinMax нормализация тренировочных данных

# Создание нового столбца 'cycle_norm', который копирует значения из столбца 'cycle' в test_df
test_df['cycle_norm'] = test_df['cycle']

# Нормализация тестовых данных test_df с помощью метода transform
norm_test_df = pd.DataFrame(min_max_scaler.transform(test_df[cols_normalize]), 
                            columns=cols_normalize, 
                            index=test_df.index)

# Объединение нормализованных данных с оставшимися данными в test_df
test_join_df = test_df[test_df.columns.difference(cols_normalize)].join(norm_test_df)

# Обновление DataFrame test_df с объединенными данными и переиндексацией столбцов
test_df = test_join_df.reindex(columns=test_df.columns)

# Сброс индексов источника данных для обучения и удаление старых индексов из test_df
test_df = test_df.reset_index(drop=True)

# Вывод первых нескольких строк DataFrame для проверки изменений
test_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,cycle_norm
0,1,1,0.632184,0.75,0.0,0.0,0.545181,0.310661,0.269413,0.0,...,0.220588,0.13216,0.308965,0.0,0.333333,0.0,0.0,0.55814,0.661834,0.0
1,1,2,0.344828,0.25,0.0,0.0,0.150602,0.379551,0.222316,0.0,...,0.264706,0.204768,0.213159,0.0,0.416667,0.0,0.0,0.682171,0.686827,0.00277
2,1,3,0.517241,0.583333,0.0,0.0,0.376506,0.346632,0.322248,0.0,...,0.220588,0.15564,0.458638,0.0,0.416667,0.0,0.0,0.728682,0.721348,0.00554
3,1,4,0.741379,0.5,0.0,0.0,0.370482,0.285154,0.408001,0.0,...,0.25,0.17009,0.257022,0.0,0.25,0.0,0.0,0.666667,0.66211,0.00831
4,1,5,0.58046,0.5,0.0,0.0,0.391566,0.352082,0.332039,0.0,...,0.220588,0.152751,0.300885,0.0,0.166667,0.0,0.0,0.658915,0.716377,0.01108


In [22]:
# Создание столбца RUL и столбцов меток для тестировочных данных

# Создание столбца 'max' для тестовых данных
rul = pd.DataFrame(test_df.groupby('id')['cycle'].max()).reset_index()

# Задание названий столбцов для DataFrame 'rul'
rul.columns = ['id', 'max']

# Изменение названия столбца в DataFrame 'truth_df'
truth_df.columns = ['more']

# Создание столбца 'id' в DataFrame 'truth_df' с помощью индексов и увеличение его значений на 1
truth_df['id'] = truth_df.index + 1

# Создание столбца 'max' в DataFrame 'truth_df', который является суммой 'max' из DataFrame 'rul' и столбца 'more' из DataFrame 'truth_df'
truth_df['max'] = rul['max'] + truth_df['more']

# Удаление столбца 'more' из DataFrame 'truth_df'
truth_df.drop('more', axis=1, inplace=True)

# Генерация столбца 'RUL' в test_df, который представляет разницу между 'max' и 'cycle'
test_df = test_df.merge(truth_df, on=['id'], how='left')
test_df['RUL'] = test_df['max'] - test_df['cycle']

# Удаление столбца 'max' из DataFrame 'test_df'
test_df.drop('max', axis=1, inplace=True)

# Генерация столбцов меток 'label1' и 'label2' для тестовых данных, используя пороговые значения w0 и w1
test_df['label1'] = np.where(test_df['RUL'] <= w1, 1, 0)  # Значение 1, если 'RUL' меньше или равно w1, в противном случае 0
test_df['label2'] = test_df['label1']  # Копирование значений из 'label1' в 'label2'

# Обновление значений в столбце 'label2'. Если 'RUL' меньше или равно w0, то значение становится 2
test_df.loc[test_df['RUL'] <= w0, 'label2'] = 2

# Вывод первых нескольких строк DataFrame для проверки изменений
test_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s16,s17,s18,s19,s20,s21,cycle_norm,RUL,label1,label2
0,1,1,0.632184,0.75,0.0,0.0,0.545181,0.310661,0.269413,0.0,...,0.0,0.333333,0.0,0.0,0.55814,0.661834,0.0,142,0,0
1,1,2,0.344828,0.25,0.0,0.0,0.150602,0.379551,0.222316,0.0,...,0.0,0.416667,0.0,0.0,0.682171,0.686827,0.00277,141,0,0
2,1,3,0.517241,0.583333,0.0,0.0,0.376506,0.346632,0.322248,0.0,...,0.0,0.416667,0.0,0.0,0.728682,0.721348,0.00554,140,0,0
3,1,4,0.741379,0.5,0.0,0.0,0.370482,0.285154,0.408001,0.0,...,0.0,0.25,0.0,0.0,0.666667,0.66211,0.00831,139,0,0
4,1,5,0.58046,0.5,0.0,0.0,0.391566,0.352082,0.332039,0.0,...,0.0,0.166667,0.0,0.0,0.658915,0.716377,0.01108,138,0,0


In [23]:
test_df[(test_df['id'] == 1) & (test_df['cycle'] >= 25)]

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s16,s17,s18,s19,s20,s21,cycle_norm,RUL,label1,label2
24,1,25,0.66092,0.25,0.0,0.0,0.313253,0.24831,0.303511,0.0,...,0.0,0.416667,0.0,0.0,0.635659,0.668738,0.066482,118,0,0
25,1,26,0.770115,0.083333,0.0,0.0,0.38253,0.266841,0.435854,0.0,...,0.0,0.25,0.0,0.0,0.488372,0.637255,0.069252,117,0,0
26,1,27,0.45977,0.583333,0.0,0.0,0.262048,0.34031,0.304862,0.0,...,0.0,0.333333,0.0,0.0,0.565891,0.688898,0.072022,116,0,0
27,1,28,0.626437,0.916667,0.0,0.0,0.216867,0.505995,0.321404,0.0,...,0.0,0.333333,0.0,0.0,0.534884,0.62966,0.074792,115,0,0
28,1,29,0.58046,0.583333,0.0,0.0,0.222892,0.35121,0.267725,0.0,...,0.0,0.333333,0.0,0.0,0.682171,0.646092,0.077562,114,0,0
29,1,30,0.356322,0.833333,0.0,0.0,0.475904,0.320035,0.316003,0.0,...,0.0,0.25,0.0,0.0,0.736434,0.707954,0.080332,113,0,0
30,1,31,0.465517,0.833333,0.0,0.0,0.412651,0.221932,0.281229,0.0,...,0.0,0.416667,0.0,0.0,0.51938,0.636564,0.083102,112,0,0


In [24]:
# Выбираем большой размер окна в 50 циклов
sequence_length = 50

In [25]:
# Функция для преобразования признаков в формат (образцы, временные шаги, признаки)
def gen_sequence(id_df, seq_length, seq_cols):
    """
    Функция gen_sequence принимает следующие аргументы:
    id_df: DataFrame, который содержит данные для конкретного идентификатора
    seq_length: длина последовательности
    seq_cols: список столбцов, используемых для создания последовательности
    """
    """ Рассматриваются только последовательности, которые соответствуют длине окна, без использования заполнения.
    Это означает, что при тестировании нам нужно исключить те, которые находятся ниже длины окна. 
    Альтернативой могло бы быть заполнение последовательностей, чтобы мы могли использовать более короткие """

    data_array = id_df[seq_cols].values  # Извлечение значений из DataFrame в массив
    num_elements = data_array.shape[0]  # Получение количества элементов в массиве

    # Цикл для создания последовательностей
    for start, stop in zip(range(0, num_elements-seq_length), range(seq_length, num_elements)):
        yield data_array[start:stop, :]
        # Генерация последовательности от start до stop из массива данных

In [26]:
# Определение столбцов признаков
sensor_cols = ['s' + str(i) for i in range(1,22)]  # Генерация списка столбцов 's1' до 's21'
sequence_cols = ['setting1', 'setting2', 'setting3', 'cycle_norm']  # Определение списка столбцов 'setting1', 'setting2', 'setting3' и 'cycle_norm'

sequence_cols.extend(sensor_cols)  # Добавление сгенерированных столбцов 's1' до 's21' к списку sequence_cols

In [27]:
# Генератор для последовательностей
seq_gen = (list(gen_sequence(train_df[train_df['id']==id], sequence_length, sequence_cols)) 
           for id in train_df['id'].unique())

In [28]:
# Генерация последовательностей и преобразование их в массив numpy
seq_array = np.concatenate(list(seq_gen)).astype(np.float32)  # Объединение списка последовательностей в один массив numpy и преобразование в тип данных np.float32
seq_array.shape  # Вывод формы массива

(15631, 50, 25)

In [29]:
train_df[train_df['id']==1][sequence_cols]

Unnamed: 0,setting1,setting2,setting3,cycle_norm,s1,s2,s3,s4,s5,s6,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,0.459770,0.166667,0.0,0.000000,0.0,0.183735,0.406802,0.309757,0.0,1.0,...,0.633262,0.205882,0.199608,0.363986,0.0,0.333333,0.0,0.0,0.713178,0.724662
1,0.609195,0.250000,0.0,0.002770,0.0,0.283133,0.453019,0.352633,0.0,1.0,...,0.765458,0.279412,0.162813,0.411312,0.0,0.333333,0.0,0.0,0.666667,0.731014
2,0.252874,0.750000,0.0,0.005540,0.0,0.343373,0.369523,0.370527,0.0,1.0,...,0.795309,0.220588,0.171793,0.357445,0.0,0.166667,0.0,0.0,0.627907,0.621375
3,0.540230,0.500000,0.0,0.008310,0.0,0.343373,0.256159,0.331195,0.0,1.0,...,0.889126,0.294118,0.174889,0.166603,0.0,0.333333,0.0,0.0,0.573643,0.662386
4,0.390805,0.333333,0.0,0.011080,0.0,0.349398,0.257467,0.404625,0.0,1.0,...,0.746269,0.235294,0.174734,0.402078,0.0,0.416667,0.0,0.0,0.589147,0.704502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,0.114943,0.750000,0.0,0.518006,0.0,0.765060,0.683235,0.684166,0.0,1.0,...,0.234542,0.514706,0.091599,0.753367,0.0,0.666667,0.0,0.0,0.286822,0.089202
188,0.465517,0.666667,0.0,0.520776,0.0,0.894578,0.547853,0.772451,0.0,1.0,...,0.189765,0.661765,0.090670,0.744132,0.0,0.583333,0.0,0.0,0.263566,0.301712
189,0.344828,0.583333,0.0,0.523546,0.0,0.731928,0.614345,0.737677,0.0,1.0,...,0.287846,0.691176,0.065229,0.759523,0.0,0.833333,0.0,0.0,0.271318,0.239299
190,0.500000,0.166667,0.0,0.526316,0.0,0.641566,0.682799,0.734639,0.0,1.0,...,0.187633,0.617647,0.075704,0.740669,0.0,0.500000,0.0,0.0,0.240310,0.324910


In [30]:
train_df[(train_df['id'] == 1) & (train_df['cycle'] >= 0)]

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s16,s17,s18,s19,s20,s21,RUL,label1,label2,cycle_norm
0,1,1,0.459770,0.166667,0.0,0.0,0.183735,0.406802,0.309757,0.0,...,0.0,0.333333,0.0,0.0,0.713178,0.724662,191,0,0,0.000000
1,1,2,0.609195,0.250000,0.0,0.0,0.283133,0.453019,0.352633,0.0,...,0.0,0.333333,0.0,0.0,0.666667,0.731014,190,0,0,0.002770
2,1,3,0.252874,0.750000,0.0,0.0,0.343373,0.369523,0.370527,0.0,...,0.0,0.166667,0.0,0.0,0.627907,0.621375,189,0,0,0.005540
3,1,4,0.540230,0.500000,0.0,0.0,0.343373,0.256159,0.331195,0.0,...,0.0,0.333333,0.0,0.0,0.573643,0.662386,188,0,0,0.008310
4,1,5,0.390805,0.333333,0.0,0.0,0.349398,0.257467,0.404625,0.0,...,0.0,0.416667,0.0,0.0,0.589147,0.704502,187,0,0,0.011080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,1,188,0.114943,0.750000,0.0,0.0,0.765060,0.683235,0.684166,0.0,...,0.0,0.666667,0.0,0.0,0.286822,0.089202,4,1,2,0.518006
188,1,189,0.465517,0.666667,0.0,0.0,0.894578,0.547853,0.772451,0.0,...,0.0,0.583333,0.0,0.0,0.263566,0.301712,3,1,2,0.520776
189,1,190,0.344828,0.583333,0.0,0.0,0.731928,0.614345,0.737677,0.0,...,0.0,0.833333,0.0,0.0,0.271318,0.239299,2,1,2,0.523546
190,1,191,0.500000,0.166667,0.0,0.0,0.641566,0.682799,0.734639,0.0,...,0.0,0.500000,0.0,0.0,0.240310,0.324910,1,1,2,0.526316


In [31]:
# Функция для генерации меток
def gen_labels(id_df, seq_length, label):
    data_array = id_df[label].values  # Извлечение значений меток из DataFrame в массив
    num_elements = data_array.shape[0]  # Получение количества элементов в массиве
    return data_array[seq_length:num_elements, :]  # Возвращение значений меток, начиная с индекса seq_length и до конца массива

In [32]:
# Генерация меток
label_gen = [gen_labels(train_df[train_df['id']==id], sequence_length, ['label1']) 
             for id in train_df['id'].unique()]  # Генерация меток для каждого идентификатора в train_df
label_array = np.concatenate(label_gen).astype(np.float32)  # Объединение списка меток в один массив numpy и преобразование в тип данных np.float32
label_array.shape  # Вывод формы массива меток

(15631, 1)

In [33]:
label_array.shape[1]

1

In [34]:
# Построение нейронной сети
nb_features = seq_array.shape[2]  # Получение количества признаков из размерности seq_array
nb_out = label_array.shape[1]  # Получение количества выходных переменных из размерности label_array

model = Sequential()  # Инициализация последовательной модели

model.add(LSTM(
         input_shape=(sequence_length, nb_features),
         units=100,
         return_sequences=True))  # Добавление слоя LSTM с 100 юнитами, возвращающего последовательности
model.add(Dropout(0.2))  # Добавление слоя Dropout для регуляризации

model.add(LSTM(
          units=50,
          return_sequences=False))  # Добавление слоя LSTM с 50 юнитами, не возвращающего последовательности
model.add(Dropout(0.2))  # Добавление слоя Dropout для регуляризации

model.add(Dense(units=nb_out, activation='sigmoid'))  # Добавление полносвязного слоя с активацией sigmoid
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])  # Компиляция модели с выбранной функцией потерь, оптимизатором и метриками

In [35]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 50, 100)           50400     
                                                                 
 dropout (Dropout)           (None, 50, 100)           0         
                                                                 
 lstm_1 (LSTM)               (None, 50)                30200     
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 80651 (315.04 KB)
Trainable params: 80651 (315.04 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [36]:
# Процесс обучения модели нейронной сети
model.fit(seq_array, label_array, epochs=10, batch_size=200, validation_split=0.05, verbose=1,
          callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=0, mode='auto')])
# seq_array: input-data / входные данные
# label_array: output-data / выходные данные; предсказания модели
# epochs: одна полная итерация по всему набору обучающих данных. Один проход через весь набор данных считается одной эпохой.
# batch_size: определяет количество образцов обучающих данных, которые будут переданы в сеть для обработки за один раз.
# validation_split=0.05: Доля данных, которая будет использоваться для проверки во время обучения. (Во время каждой эпохи процесс 
# обучения будет вычислять потери и метрики на валидационном наборе данных, чтобы оценить, как хорошо модель обобщает данные, которые она еще не видела.)
# verbose=1: В этом режиме модель будет выводить прогресс обучения для каждой эпохи, включая информацию о потерях и метриках.
# callbacks: Список обратных вызовов, которые могут выполняться во время обучения. В данном случае используется обратный вызов 
# EarlyStopping, который остановит обучение, если значение функции потерь на проверочном наборе данных не улучшается.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.src.callbacks.History at 0x19c20710e50>

In [37]:
# Оценка производительности модели на обучающем наборе данных. Метод evaluate 
# вычисляет потери и метрики модели для переданных данных. В данном случае, seq_array 
# представляет собой входные данные, а label_array - соответствующие этим входным данным метки.

scores = model.evaluate(seq_array, label_array, verbose=1, batch_size=200)
print('Accurracy: {}'.format(scores[1]))

Accurracy: 0.9690998792648315


In [38]:
# Прогнозирование результатов и вычисление матрицы-путаницы на обучающих данных

# Матрица путаницы визуализирует точность классификатора, сравнивая фактические и прогнозируемые классы.
# Матрица двоичной путаницы состоит из квадратов:

#                       Predicted
#                   FALSE   |    TRUE
#           FALSE    (TN)   |    (FP)
#  Actual ------------------|--------------------------
#           TRUE     (FN)   |    (TP)

# TP: True Positive: прогнозируемые значения, правильно прогнозируемые как фактические положительные
# FP: Предсказанные значения неправильно предсказывают фактический положительный результат. т.е. отрицательные значения прогнозируются как положительные
# FN: False Negative: положительные значения прогнозируются как отрицательные
# TN: True Negative: прогнозируемые значения, правильно прогнозируемые как фактические негативные

# использование метода predict для получения вероятностей
y_pred_prob = model.predict(seq_array, batch_size=200)
# преобразование полученных вероятностей в бинарные предсказания с помощью сравнения с пороговым значением 0.5
y_pred = (y_pred_prob > 0.5).astype("float32")

# Сохранение истинных меток в y_true
y_true = label_array

# Вывод структуры матрицы неточностей
print('Матрица неточностей\n- По оси x находятся истинные метки.\n- По оси y находятся предсказанные метки')

# Вычисление матрицы неточностей с использованием функции confusion_matrix из библиотеки sklearn
cm = confusion_matrix(y_true, y_pred)

# Вывод матрицы неточностей
print(cm)

Матрица неточностей
- По оси x находятся истинные метки.
- По оси y находятся предсказанные метки
[[12497    34]
 [  449  2651]]


In [39]:
# Расчет метрик precision (точность) и recall (полнота)

# Precision (точность) - это метрика, которая оценивает долю правильно предсказанных 
# положительных классов от общего числа классифицированных как положительные. Формула для 
# вычисления precision: TP / (TP + FP), где TP - истинно положительные прогнозы, 
# а FP - ложно положительные прогнозы.

precision = precision_score(y_true, y_pred)

# Recall (полнота) - это метрика, которая оценивает долю правильно предсказанных положительных 
# классов от общего числа реальных положительных классов. Формула для вычисления recall: TP / (TP + FN), 
# где TP - истинно положительные прогнозы, а FN - ложно отрицательные прогнозы.

recall = recall_score(y_true, y_pred)

print( 'precision = ', precision, '\n', 'recall = ', recall)


precision =  0.9873370577281192 
 recall =  0.8551612903225806


Проверим метрики модели на тестировочных данных. Будем производить предсказания на 50 последних временных рядах для каждого двигателя.

In [40]:
# Генерация последовательностей
seq_array_test_last = [test_df[test_df['id']==id][sequence_cols].values[-sequence_length:] 
                       for id in test_df['id'].unique() if len(test_df[test_df['id']==id]) >= sequence_length]

seq_array_test_last = np.asarray(seq_array_test_last).astype(np.float32)
seq_array_test_last.shape

(93, 50, 25)

In [41]:
seq_array_test_last[0]

array([[0.3505747 , 0.8333333 , 0.        , ..., 0.        , 0.4728682 ,
        0.45457056],
       [0.39655173, 0.5833333 , 0.        , ..., 0.        , 0.6511628 ,
        0.5608948 ],
       [0.5114943 , 0.25      , 0.        , ..., 0.        , 0.6356589 ,
        0.4975145 ],
       ...,
       [0.43678162, 0.75      , 0.        , ..., 0.        , 0.41860464,
        0.4710025 ],
       [0.5804598 , 0.6666667 , 0.        , ..., 0.        , 0.3255814 ,
        0.45954156],
       [0.40804598, 0.8333333 , 0.        , ..., 0.        , 0.6124031 ,
        0.52444077]], dtype=float32)

In [42]:
test_df[test_df['id']==3]

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s16,s17,s18,s19,s20,s21,cycle_norm,RUL,label1,label2
80,3,1,0.494253,0.583333,0.0,0.0,0.246988,0.411598,0.441256,0.0,...,0.0,0.333333,0.0,0.0,0.658915,0.554819,0.00000,194,0,0
81,3,2,0.724138,0.250000,0.0,0.0,0.307229,0.572705,0.432984,0.0,...,0.0,0.416667,0.0,0.0,0.542636,0.586716,0.00277,193,0,0
82,3,3,0.534483,0.750000,0.0,0.0,0.533133,0.342926,0.427583,0.0,...,0.0,0.250000,0.0,0.0,0.426357,0.667219,0.00554,192,0,0
83,3,4,0.436782,0.416667,0.0,0.0,0.364458,0.424678,0.323599,0.0,...,0.0,0.583333,0.0,0.0,0.449612,0.653687,0.00831,191,0,0
84,3,5,0.477011,0.250000,0.0,0.0,0.382530,0.262699,0.398717,0.0,...,0.0,0.333333,0.0,0.0,0.519380,0.540044,0.01108,190,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,3,122,0.626437,0.083333,0.0,0.0,0.433735,0.584042,0.515024,0.0,...,0.0,0.416667,0.0,0.0,0.364341,0.493096,0.33518,73,0,0
202,3,123,0.172414,0.666667,0.0,0.0,0.400602,0.554829,0.463707,0.0,...,0.0,0.500000,0.0,0.0,0.395349,0.467827,0.33795,72,0,0
203,3,124,0.436782,0.750000,0.0,0.0,0.557229,0.424024,0.515361,0.0,...,0.0,0.583333,0.0,0.0,0.418605,0.471002,0.34072,71,0,0
204,3,125,0.580460,0.666667,0.0,0.0,0.611446,0.383693,0.512323,0.0,...,0.0,0.500000,0.0,0.0,0.325581,0.459542,0.34349,70,0,0


In [43]:
# Создаем маску, которая позволяет выбрать только те данные, для которых количество записей по
# идентификатору больше или равно значению переменной sequence_length.

y_mask = [len(test_df[test_df['id']==id]) >= sequence_length for id in test_df['id'].unique()]

# Генерация меток
label_array_test_last = test_df.groupby('id')['label1'].nth(-1)[y_mask].values
label_array_test_last = label_array_test_last.reshape(label_array_test_last.shape[0],1).astype(np.float32)
label_array_test_last.shape

(93, 1)

In [44]:
# Оценка производительности модели на тестировочном наборе данных. Метод evaluate 
# вычисляет потери и метрики модели для переданных данных. В данном случае, seq_array_test_last 
# представляет собой входные данные, а label_array_test_last - соответствующие этим входным данным метки.
scores_test = model.evaluate(seq_array_test_last, label_array_test_last, verbose=2)
print('Accurracy: {}'.format(scores_test[1]))

3/3 - 0s - loss: 0.0887 - accuracy: 0.9462 - 78ms/epoch - 26ms/step
Accurracy: 0.9462365508079529


In [45]:
# Прогнозирование результатов и вычисление матрицы-путаницы на тестировочных данных

# использование метода predict для получения вероятностей
y_pred_test_prob = model.predict(seq_array_test_last, batch_size=200)
# преобразование полученных вероятностей в бинарные предсказания с помощью сравнения с пороговым значением 0.5
y_pred_test = (y_pred_test_prob > 0.5).astype("float32")

y_true_test = label_array_test_last

# Вывод структуры матрицы неточностей
print('Матрица неточностей\n- По оси x находятся истинные метки.\n- По оси y находятся предсказанные метки')
cm = confusion_matrix(y_true_test, y_pred_test)
# Вывод матрицы неточностей
print(cm)

Матрица неточностей
- По оси x находятся истинные метки.
- По оси y находятся предсказанные метки
[[68  0]
 [ 5 20]]


In [46]:
# Расчет метрик precision (точность), recall (полнота)
precision_test = precision_score(y_true_test, y_pred_test)
recall_test = recall_score(y_true_test, y_pred_test)

# F1-score представляет собой среднее гармоническое precision и recall, обеспечивая баланс между ними
f1_test = 2 * (precision_test * recall_test) / (precision_test + recall_test)

print( 'Precision: ', precision_test, '\n', 'Recall: ', recall_test,'\n', 'F1-score:', f1_test )

Precision:  1.0 
 Recall:  0.8 
 F1-score: 0.888888888888889


In [47]:
test_df[test_df['RUL']<=40]

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s16,s17,s18,s19,s20,s21,cycle_norm,RUL,label1,label2
2103,18,121,0.390805,0.666667,0.0,0.0,0.602410,0.615653,0.382681,0.0,...,0.0,0.416667,0.0,0.0,0.449612,0.548467,0.332410,40,0,0
2104,18,122,0.339080,0.250000,0.0,0.0,0.373494,0.420754,0.477043,0.0,...,0.0,0.500000,0.0,0.0,0.379845,0.438277,0.335180,39,0,0
2105,18,123,0.609195,0.750000,0.0,0.0,0.391566,0.435797,0.352296,0.0,...,0.0,0.250000,0.0,0.0,0.558140,0.537006,0.337950,38,0,0
2106,18,124,0.385057,0.833333,0.0,0.0,0.373494,0.452583,0.469109,0.0,...,0.0,0.500000,0.0,0.0,0.534884,0.393123,0.340720,37,0,0
2107,18,125,0.551724,0.166667,0.0,0.0,0.472892,0.378461,0.490378,0.0,...,0.0,0.500000,0.0,0.0,0.356589,0.604529,0.343490,36,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13091,100,194,0.781609,0.500000,0.0,0.0,0.611446,0.619359,0.566172,0.0,...,0.0,0.500000,0.0,0.0,0.395349,0.418669,0.534626,24,1,1
13092,100,195,0.436782,0.416667,0.0,0.0,0.605422,0.537388,0.671843,0.0,...,0.0,0.583333,0.0,0.0,0.333333,0.528721,0.537396,23,1,1
13093,100,196,0.465517,0.250000,0.0,0.0,0.671687,0.482014,0.414754,0.0,...,0.0,0.583333,0.0,0.0,0.372093,0.429301,0.540166,22,1,1
13094,100,197,0.281609,0.583333,0.0,0.0,0.617470,0.522128,0.626435,0.0,...,0.0,0.583333,0.0,0.0,0.403101,0.518779,0.542936,21,1,1


In [48]:
label_array_test_last[37]

array([0.], dtype=float32)

In [49]:
new_seq_array_test_last = seq_array_test_last[37].reshape(1, 50, 25)
# использование метода predict для получения вероятностей
y_pred_test_prob = model.predict(new_seq_array_test_last, batch_size=200)
# преобразование полученных вероятностей в бинарные предсказания с помощью сравнения с пороговым значением 0.5
y_pred_test = (y_pred_test_prob > 0.5).astype("float32")
print(y_pred_test)

[[0.]]


In [50]:
new_seq_array_test_last = seq_array_test_last[0].reshape(1, 50, 25)