# Condition Labeling

Labels each time-series data point in the cleaned dataset according to whether it is associated with a normal condition or with a failure mode. Takes input data from both the time-series file and a separate template file containing work-order information (both are read as Excel workbooks below). Assumes each failure mode applies to the data points in the 2 months before its failure date.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import time
from datetime import datetime as tm
import seaborn as sns

In [2]:
# Load the merged time-series dataset from its Excel workbook.
filename = "Merged Dataset.xlsx"
df = pd.read_excel(filename)

# Number of rows in the time-series table.
number_rows = df.shape[0]

# Preview the first few rows.
df.head()

Unnamed: 0.1,Unnamed: 0,Time,Vib,16DI226,16FI226,16FI172,16LC128,16TI8110,16TI089,16VE1135,...,16VE1133,16SC127,16SC126,16LS128,16VE1134,16VE1132,16ZS5163,Unnamed: 27,Unnamed: 28,Unnamed: 29
0,1.0,2008-04-21 10:44:00,0.231,-1.279732,2.542525,32.663965,50.444592,104.267584,635.30254,0.289514,...,,,,,,,,,NaT,
1,2.0,2008-05-12 09:56:00,0.219,-1.488527,3.076608,32.081648,50.828064,91.097248,645.419165,0.279166,...,,,,,,,,,2019-05-13 10:44:00,1.0
2,3.0,2008-06-09 10:17:00,0.277,1.573682,0.19103,26.772534,50.274291,98.008864,648.432065,0.335089,...,,,,,,,,,2019-10-25 10:44:00,1.0
3,4.0,2008-08-07 10:26:00,0.197,-2.447651,1.795644,30.372475,47.98755,106.899254,654.593062,0.249114,...,,,,,,,,,2015-10-20 10:44:00,1.0
4,5.0,2008-09-29 16:17:00,0.163,-1.132853,3.038294,28.865257,48.734111,91.61091,638.442097,0.235917,...,,,,,,,,,2013-09-10 10:44:00,1.0


In [3]:
# Load the work-order (failure) records from the template workbook.
failuresfile = "Work Order Template.xlsx"
dfFailures = pd.read_excel(failuresfile)

# Number of failure records in the template.
number_failures = dfFailures.shape[0]



In [4]:
def datetime_to_unix(dfname, colname):
    """Convert a datetime column to Unix epoch seconds, in place.

    pandas natively detects dates as datetimes; they are converted to epoch
    seconds here so that plain arithmetic can be run on them.

    Args:
        dfname: DataFrame holding the column to convert. Modified in place.
        colname: Name of the datetime column in ``dfname``.

    Returns:
        The same ``dfname`` object, with ``colname`` replaced by epoch
        seconds as floats. Missing dates (NaT/NaN/None) become NaN instead
        of raising.
    """

    def _to_epoch(stamp):
        # NaT does not support timetuple(), so pass missing values through as NaN.
        if pd.isna(stamp):
            return float('nan')
        # Datetime -> time_struct -> epoch; mktime reads the struct as local time.
        return time.mktime(stamp.timetuple())

    dfname[colname] = dfname[colname].apply(_to_epoch)
    return dfname

# Convert both date columns to epoch seconds so calculations can be run on them.
df = datetime_to_unix(df, 'Time')
dfFailures = datetime_to_unix(dfFailures, 'Failure Date')

# Pull the failure dates and their failure modes out as NumPy arrays of values.
failures_dates = dfFailures['Failure Date'].values
failures_modes = dfFailures['Failure Mode'].values

# Only the last expression of a notebook cell is rendered, so this shows head().
df.dtypes
df.head()

Unnamed: 0.1,Unnamed: 0,Time,Vib,16DI226,16FI226,16FI172,16LC128,16TI8110,16TI089,16VE1135,...,16VE1133,16SC127,16SC126,16LS128,16VE1134,16VE1132,16ZS5163,Unnamed: 27,Unnamed: 28,Unnamed: 29
0,1.0,1208793000.0,0.231,-1.279732,2.542525,32.663965,50.444592,104.267584,635.30254,0.289514,...,,,,,,,,,NaT,
1,2.0,1210604000.0,0.219,-1.488527,3.076608,32.081648,50.828064,91.097248,645.419165,0.279166,...,,,,,,,,,2019-05-13 10:44:00,1.0
2,3.0,1213025000.0,0.277,1.573682,0.19103,26.772534,50.274291,98.008864,648.432065,0.335089,...,,,,,,,,,2019-10-25 10:44:00,1.0
3,4.0,1218123000.0,0.197,-2.447651,1.795644,30.372475,47.98755,106.899254,654.593062,0.249114,...,,,,,,,,,2015-10-20 10:44:00,1.0
4,5.0,1222723000.0,0.163,-1.132853,3.038294,28.865257,48.734111,91.61091,638.442097,0.235917,...,,,,,,,,,2013-09-10 10:44:00,1.0


In [5]:
# Insert a new column for conditions. Every row starts as 'Normal' and is
# overwritten later if the Failures dataset indicates that a failure is
# associated with (happened soon after) the data point.
df['Condition'] = 'Normal'


In [13]:
def determine_failure_mode(row, failure_dates, failure_modes, date_span, verbose=True):
    """Label one data point with the failure mode of its nearest upcoming failure.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'Time' entry in
            epoch seconds and a 'Condition' entry that may be overwritten.
        failure_dates: Sequence of failure times in epoch seconds.
        failure_modes: Sequence of failure modes, positionally aligned with
            ``failure_dates``.
        date_span: Maximum number of seconds before a failure for a data point
            to be tagged with it (e.g. 5259600 for 2 months going back).
        verbose: When true (the default), print each failure mode assigned.

    Returns:
        ``row``, with 'Condition' set to the nearest future failure's mode if
        that failure lies within ``date_span`` seconds; otherwise unchanged.
    """
    date = row['Time']
    # Positive distance means the data point comes before the failure.
    distances = [failure_date - date for failure_date in failure_dates]

    # Closest failure strictly in the future; None when there is none
    # (no failures at all, only past failures, or NaN times).
    closest_failure = min((d for d in distances if d > 0), default=None)
    if closest_failure is None or closest_failure > date_span:
        return row

    # First position holding the minimal distance gives the matching mode.
    row['Condition'] = failure_modes[distances.index(closest_failure)]
    if verbose:
        print(row['Condition'])
    return row


In [15]:
# Assume every failure mode is best diagnosed from the data points in the
# 2 months prior to the failure.
lookback_seconds = 5259600
labeling_args = (failures_dates, failures_modes, lookback_seconds)
df = df.apply(determine_failure_mode, axis=1, args=labeling_args)
df['Condition'].describe()

test4
test4
test4
test4
test3
test2
test2
test2
test2
test2
test2
test2
test2
test2
test2
test2
test2
test4
test4
test4
test4
test3
test2
test2
test2
test2
test2
test2
test4
test4
test3
test3
test5
test5
test1
test1
test2
test2


count       5536
unique         6
top       Normal
freq        5498
Name: Condition, dtype: object

In [1]:
# Count how many data points carry each condition label.
# NOTE: `df` must already exist in the running session; the recorded output
# below is a NameError because `df` was not defined when this cell was run.
df['Condition'].value_counts()

NameError: name 'df' is not defined