In [32]:
import pandas as pd 
import numpy as np 
import seaborn as sns

from scipy import constants
import math
import scipy 


In [33]:
def various_data_info(df):
    print(df.describe())
    print(df.isna().sum())
    print(df.info())

def merger_with_duplicate_row_remover(df1 , df2 ):
    print("*"*100)
    if ("datetime" in df2.columns):
        merged_df =pd.merge(df1, df2, on=['datetime','machineID'],how='left')
        merged_df = merged_df.replace(np.NaN,0)
        print("Shape of left dataset:                             ",df1.shape)
        print("Shape of the right dataset:                        ",df2.shape)
        print("Shape of merged dataset before checking duplicates:",merged_df.shape)

        #creating an extra column that will have unique datetime+machineID
        merged_df['combo'] = merged_df['machineID'].astype(str) + merged_df['datetime'].astype(str) 
        # merged_df['combo'].value_counts() to check duplicates Anything greater than 1 will be duplicated
        li = merged_df['combo'].value_counts()
        valids = li[li > 1].index  
        print("Duplicate rows found:", len(valids))

        merged_df[merged_df['combo'].isin(valids)] #create a dataframe To get rows of deficit indices
        # Here dropping the duplicate rows becomes essential 
        merged_df = merged_df.drop_duplicates(subset=['combo']) 
        print("Duplicates rows removed:", len(valids)/2 )
        print("Shape of merged dataset after removing duplicate columns:", merged_df.shape)
    else:
        # Machine dataframe has no datatime plus no duplicates
        merged_df =pd.merge(df1, df2, on=['machineID'],how='left')
        merged_df = merged_df.replace(np.NaN,0)
        print("Shape of left dataset:                             ",df1.shape)
        print("Shape of the right dataset:                        ",df2.shape)
        print("Shape of merged dataset before checking duplicates:",merged_df.shape)
        
    return merged_df



In [34]:
df_failure = pd.read_csv('PdM_failures.csv')
df_errors = pd.read_csv('PdM_errors.csv')
df_machines = pd.read_csv('PdM_machines.csv')
df_maint = pd.read_csv('PdM_maint.csv')
df_telemetry = pd.read_csv('PdM_telemetry.csv')

In [35]:
df_list = [df_failure, df_errors, df_maint, df_machines]  
df_merged = df_telemetry.copy()
for i in df_list:
    df_merged = merger_with_duplicate_row_remover(df_merged,i)

df_merged = df_merged.rename(columns={'comp':'maint_comp'}) 

#df_merged['failure'] = pd.Categorical(df_merged['failure'])
df_merged['errorID'] = pd.Categorical(df_merged['errorID'])
df_merged['maint_comp'] = pd.Categorical(df_merged['maint_comp'])

df = df_merged.copy()

df['datetime'] = df['datetime'].astype("datetime64[ns]")
# Extract features from timestamp
df['hour'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.dayofweek
df['month'] = df['datetime'].dt.month
df['year'] = df['datetime'].dt.year
df['date'] = df['datetime'].dt.date
df['time'] = df['datetime'].dt.time
# 
df['rotate_in_radians'] = (df['rotate'] * np.pi/180)

****************************************************************************************************
Shape of left dataset:                              (876100, 6)
Shape of the right dataset:                         (761, 3)
Shape of merged dataset before checking duplicates: (876142, 7)
Duplicate rows found: 42
Duplicates rows removed: 21.0
Shape of merged dataset after removing duplicate columns: (876100, 8)
****************************************************************************************************
Shape of left dataset:                              (876100, 8)
Shape of the right dataset:                         (3919, 3)
Shape of merged dataset before checking duplicates: (876403, 9)
Duplicate rows found: 274
Duplicates rows removed: 137.0
Shape of merged dataset after removing duplicate columns: (876100, 9)
****************************************************************************************************
Shape of left dataset:                              (876100, 9)
Sh

In [6]:
rotation = np.deg2rad( df_telemetry['rotate'])
rotation

0         7.304274
1         7.029270
2         9.203991
3         6.041446
4         7.598760
            ...   
876095    6.897940
876096    7.787798
876097    7.815873
876098    7.221678
876099    8.658524
Name: rotate, Length: 876100, dtype: float64

In [7]:
rotation2 = (df_telemetry['rotate'] * (np.pi/180))
rotation2

0         7.304274
1         7.029270
2         9.203991
3         6.041446
4         7.598760
            ...   
876095    6.897940
876096    7.787798
876097    7.815873
876098    7.221678
876099    8.658524
Name: rotate, Length: 876100, dtype: float64

In [19]:
grouped = df.groupby(['machineID', 'date']).mean() 
grouped

['volt',
 'rotate',
 'pressure',
 'vibration',
 'age',
 'hour',
 'day_of_week',
 'month',
 'year']

In [31]:
#https://towardsdatascience.com/feature-engineering-on-time-series-data-transforming-signal-data-of-a-smartphone-accelerometer-for-72cbe34b8a60
list_volt = []
list_rotate = []
list_pressure = []
list_vibration = []
train_labels = []

window_size = 24
step_size = 7

#Create overlapping window
for i in range(0, df_train.shape[0] - window_size, step_size):
    volt = df_train['volt'].values[i: i + 24]
    rotate = df_train['rotate'].values[i: i + 24]
    pressure = df_train["pressure"].values[i: i + 24]
    vibration = df_train['vibration'].values[i: i + 24]
    #label = stats.mode(df_train['failure'][i: i + 100])[0]

    list_volt.append(volt)
    list_rotate.append(rotate)
    list_pressure.append(pressure)
    list_vibration.append(vibration)
    #train_labels.append(label)

NameError: name 'df_train' is not defined

### Hypothesis Testing 
Null Hypothesis: The sample data is from the original distribution (nothing interesting is going on; in other words the received data is based on the underlying population)


In [None]:
x_train = pd.DataFrame() 

# min
x_train['volt_min'] = pd.Series(list_volt).apply(lambda x: x.min())
x_train['rotate_min'] = pd.Series(list_rotate).apply(lambda x: x.min())
x_train['pressure_min'] = pd.Series(list_pressure).apply(lambda x: x.min())
x_train['vibration_min'] = pd.Series(list_vibration).apply(lambda x: x.min())


# max 
x_train['volt_max'] = pd.Series(list_volt).apply(lambda x: x.max())
x_train['rotate_max'] = pd.Series(list_rotate).apply(lambda x: x.max())
x_train['pressure_max'] = pd.Series(list_pressure).apply(lambda x: x.max())
x_train['vibration_max'] = pd.Series(list_vibration).apply(lambda x: x.max())

# Interquartile  range
x_train['volt_IQR'] = pd.Series(list_volt).apply(lambda x: np.percentile(x,75) - np.percentile(x,25))
x_train['rotate_IQR'] = pd.Series(list_rotate).apply(lambda x: np.percentile(x, 75) - np.percentile(x, 25))
x_train['pressure_IQR'] = pd.Series(list_pressure).apply(lambda x: np.percentile(x, 75) - np.percentile(x, 25))
x_train['vibration_IQR'] = pd.Series(list_vibration).apply(lambda x: np.percentile(x, 75) - np.percentile(x, 25))

# median
x_train['volt_median'] = pd.Series(list_volt).apply(lambda x: np.median(x))
x_train['rotate_median'] = pd.Series(list_rotate).apply(lambda x: np.median(x))
x_train['pressure_median'] = pd.Series(list_pressure).apply(lambda x: np.median(x))
x_train['vibration_median'] = pd.Series(list_vibration).apply(lambda x: np.median(x))

# mean 
x_train['volt_mean'] = pd.Series(list_volt).apply(lambda x: x.mean())
x_train['rotate_mean'] = pd.Series(list_rotate).apply(lambda x: x.mean())
x_train['pressure_mean'] = pd.Series(list_pressure).apply(lambda x: x.mean())
x_train['vibration_mean'] = pd.Series(list_vibration).apply(lambda x: x.mean())

# std
x_train['volt_std'] = pd.Series(list_volt).apply(lambda x: x.std())
x_train['rotate_std'] = pd.Series(list_rotate).apply(lambda x: x.std())
x_train['pressure_std'] = pd.Series(list_pressure).apply(lambda x: x.std())
x_train['vibration_std'] = pd.Series(list_vibration).apply(lambda x: x.std())

# peaks
x_train['volt_peaks'] = pd.Series(list_volt).apply(lambda x: len(find_peaks(x)[0]))
x_train['rotate_peaks'] = pd.Series(list_rotate).apply(lambda x: len(find_peaks(x)[0]))
x_train['pressure_peaks'] = pd.Series(list_pressure).apply(lambda x: len(find_peaks(x)[0]))
x_train['vibration_peaks'] = pd.Series(list_vibration).apply(lambda x: len(find_peaks(x)[0]))

# skewness
x_train['volt_skewness'] = pd.Series(list_volt).apply(lambda x: stats.skew(x))
x_train['rotate_skewness'] = pd.Series(list_rotate).apply(lambda x: stats.skew(x))
x_train['pressure_skewness'] = pd.Series(list_pressure).apply(lambda x: stats.skew(x))
x_train['vibration_skewness'] = pd.Series(list_vibration).apply(lambda x: stats.skew(x))

# Kurtosis 
x_train['volt_kurtosis'] = pd.Series(list_volt).apply(lambda x: stats.kurtosis(x))
x_train['rotate_kurtosis'] = pd.Series(list_rotate).apply(lambda x: stats.kurtosis(x))
x_train['pressure_kurtosis'] = pd.Series(list_pressure).apply(lambda x: stats.kurtosis(x))
x_train['vibration_kurtosis'] = pd.Series(list_vibration).apply(lambda x: stats.kurtosis(x))

# Signal Magnitude Area
x_train['sma'] = pd.Series(list_volt).apply(lambda x: np.sum(abs(x)/100)) \
    + pd.Series(list_rotate.apply(lambda x: np.sum(abs(x)/100)) + pd.Series(list_pressure).apply(lambda x: np.sum(abs(x)/100)) \
    + pd.Series(list_vibration).apply(lambda x: np.sum(abs(x)/100))



In [36]:
# Signal Magnitude Area
sma = pd.Series(list_volt).apply(lambda x: np.sum(abs(x)/100)) + pd.Series(list_rotate).apply(lambda x: np.sum(abs(x)/100)) + pd.Series(list_pressure).apply(lambda x: np.sum(abs(x)/100)) + pd.Series(list_vibration).apply(lambda x: np.sum(abs(x)/100))

  sma = pd.Series(list_volt).apply(lambda x: np.sum(abs(x)/100)) + pd.Series(list_rotate).apply(lambda x: np.sum(abs(x)/100)) + pd.Series(list_pressure).apply(lambda x: np.sum(abs(x)/100)) + pd.Series(list_vibration).apply(lambda x: np.sum(abs(x)/100))


If you have the rotation column given in RPM (rotations per minute) on an hourly basis and you want to derive the velocity values, you can use the following steps:

Convert RPM to Rotations per Second: Since the rotation values are given in RPM, you need to convert them to rotations per second (RPS) to align with the time unit. Divide the RPM values by 60 to convert them to RPS.

Convert Rotations per Second to Angular Velocity: Angular velocity is the rate of change of angle with respect to time. Multiply the RPS values by 2π to obtain the angular velocity values in radians per second.

Calculate Linear Velocity: If you know the radius of the rotating object, you can calculate linear velocity based on the angular velocity. Multiply the angular velocity values by the radius to obtain the linear velocity values.

Here's an example implementation in Python assuming you have a DataFrame named df with a column named 'rotation' containing the rotation values in RPM:

In [1]:
# Convert RPM to RPS
df['rotation_rps'] = df['rotate'] / 60

# Convert RPS to angular velocity in radians per second
df['angular_velocity'] = df['rotation_rps'] * 2 * np.pi

# Calculate linear velocity assuming a given radius (replace radius with the actual value)
radius = 0.5  # Example radius in meters
df['linear_velocity'] = df['angular_velocity'] * radius

NameError: name 'df' is not defined