In [75]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline 

In [76]:
# Load the datasets
dataset_train = pd.read_csv('PM_train.txt', sep=' ', header=None).drop([26, 27], axis=1)
col_names = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21']
dataset_train.columns = col_names
print('Shape of Train dataset: ', dataset_train.shape)
dataset_train.head()

Shape of Train dataset:  (20631, 26)


Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [77]:
dataset_test = pd.read_csv('PM_test.txt', sep=' ', header=None).drop([26, 27], axis=1)
dataset_test.columns = col_names
print('Shape of Test dataset: ', dataset_train.shape)
dataset_train.head()

Shape of Test dataset:  (20631, 26)


Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [78]:
pm_truth = pd.read_csv('Truth Value.txt', sep=' ', header=None).drop([1], axis=1)
pm_truth.columns = ['more']
pm_truth['id'] = pm_truth.index + 1
pm_truth.head()

Unnamed: 0,more,id
0,93,1
1,91,2
2,95,3
3,111,4
4,96,5


In [84]:
rul = pd.DataFrame(dataset_test.groupby('id')['cycle'].max()).reset_index()
rul.columns = ['id', 'max']
rul.head()

Unnamed: 0,id,max
0,1,31
1,2,49
2,3,126
3,4,106
4,5,98


In [86]:
pm_truth['rtf'] = pm_truth['more'] + rul['max']
pm_truth.head()


Unnamed: 0,more,id,rtf
0,93,1,124.0
1,91,2,140.0
2,95,3,221.0
3,111,4,217.0
4,96,5,194.0


In [88]:
pm_truth.drop('more', axis=1, inplace=True)
dataset_test = dataset_test.merge(pm_truth, on=['id'], how='left')
dataset_test['ttf'] = dataset_test['rtf'] - dataset_test['cycle']
dataset_test.drop('rtf', axis=1, inplace=True)
dataset_test.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,ttf
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735,123.0
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916,122.0
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166,121.0
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737,120.0
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413,119.0


In [90]:
dataset_train['ttf'] = dataset_train.groupby(['id'])['cycle'].transform(max) - dataset_train['cycle']
dataset_train.head()


  dataset_train['ttf'] = dataset_train.groupby(['id'])['cycle'].transform(max) - dataset_train['cycle']


Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,ttf
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


In [92]:
df_train = dataset_train.copy()
df_test = dataset_test.copy()


In [94]:
features_col_name = ['setting1', 'setting2', 'setting3', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11',
                     's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21']
target_col_name = 'ttf'


In [96]:
sc = MinMaxScaler()
df_train[features_col_name] = sc.fit_transform(df_train[features_col_name])
df_test[features_col_name] = sc.transform(df_test[features_col_name])


In [98]:
X_train = df_train[features_col_name]
y_train = df_train[target_col_name]
X_test = df_test[features_col_name]
y_test = df_test[target_col_name]


In [100]:
# Create linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [102]:
# Predict and evaluate the model
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

In [104]:
print("Train set evaluation:")
print('Mean Squared Error:', mean_squared_error(y_train, y_pred_train))
print('R-squared:', r2_score(y_train, y_pred_train))


Train set evaluation:
Mean Squared Error: 1994.547737619331
R-squared: 0.5795961126406292


In [106]:
# If y_test and y_pred_test are pandas Series
if isinstance(y_test, pd.Series):
    print("NaN values in y_test:", y_test.isna().sum())
    y_test = y_test.fillna(y_test.mean())
    y_test = y_test.reset_index(drop=True)


NaN values in y_test: 647


In [108]:
# If y_test and y_pred_test are numpy arrays
if isinstance(y_test, np.ndarray):
    print("NaN values in y_test:", np.isnan(y_test).sum())
    y_test = np.where(np.isnan(y_test), np.nanmean(y_test), y_test)


In [110]:
# If y_pred_test is pandas Series
if isinstance(y_pred_test, pd.Series):
    print("NaN values in y_pred_test:", y_pred_test.isna().sum())
    y_pred_test = y_pred_test.fillna(y_pred_test.mean())
    y_pred_test = y_pred_test.reset_index(drop=True)


In [112]:
# If y_pred_test is numpy arrays
if isinstance(y_pred_test, np.ndarray):
    print("NaN values in y_pred_test:", np.isnan(y_pred_test).sum())
    y_pred_test = np.where(np.isnan(y_pred_test), np.nanmean(y_pred_test), y_pred_test)


NaN values in y_pred_test: 0


In [114]:
# Ensure both are numpy arrays
if not isinstance(y_test, np.ndarray):
    y_test = y_test.to_numpy()

In [116]:
if not isinstance(y_pred_test, np.ndarray):
    y_pred_test = y_pred_test.to_numpy()

In [118]:
# Ensure that the lengths of y_test and y_pred_test match
if len(y_test) != len(y_pred_test):
    min_len = min(len(y_test), len(y_pred_test))
    y_test = y_test[:min_len]
    y_pred_test = y_pred_test[:min_len]

In [120]:
# Calculate and print the metrics
print("\nTest set evaluation:")
print('Mean Squared Error:', mean_squared_error(y_test, y_pred_test))
print('R-squared:', r2_score(y_test, y_pred_test))


Test set evaluation:
Mean Squared Error: 4625.645675545877
R-squared: 0.004406514052377575


In [122]:
def prob_failure(machine_id):
    machine_df = df_test[df_test.id == machine_id]
    machine_test = machine_df[features_col_name]
    m_pred = model.predict(machine_test)
    last_prediction = m_pred[-1]

In [124]:
last_prediction = y_pred_test[-1] if len(y_pred_test) > 0 else None



In [126]:
# Calculate the probability of failure within 30 days
if last_prediction is not None:
    if last_prediction <= 30:
        failure_prob = 1.0
    else:
        failure_prob = 0.0
    print("Probability of failure within 30 days:", failure_prob)
else:
    print("No predictions available to calculate failure probability.")

Probability of failure within 30 days: 1.0


In [128]:
#y_pred_test = np.array([50, 45, 40, 35, 20, 25, 15, 10])

In [196]:
# Set the desired threshold
threshold_days = 3


In [198]:
# Ensure there are predictions available
if len(y_pred_test) > 0:
    # Define last_prediction
    last_prediction = y_pred_test[-1]


In [200]:
 # Calculate the probability of failure within the threshold_days
if last_prediction <= threshold_days:
        failure_prob = 1.0
else:
        failure_prob = 0.0

print(f"Probability of failure within {threshold_days} days:", failure_prob) 


Probability of failure within 3 days: 1.0


In [202]:
pip install streamlit

Note: you may need to restart the kernel to use updated packages.


In [137]:
import streamlit as st