In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import os
import ntpath
import pickle as pkl
import xlrd
import time
import string
import os
import glob
import math

from os import listdir
from os.path import isfile, join

### File Operations

In [2]:
def remove_file_in_folder(folder_path):
    '''
    Delete every entry directly inside ``folder_path`` except real sub-directories.

    folder_path - directory to empty; may be given with or without a
                  trailing path separator.

    Sub-directories (and their contents) are left untouched. A folder that
    does not exist simply results in nothing being removed.
    '''
    # os.path.join keeps the pattern inside the folder. Plain concatenation
    # ('dir' + '*') would also match siblings such as 'dir_backup'.
    pattern = os.path.join(folder_path, '*')
    for path in glob.glob(pattern):
        # os.remove raises on real directories, so skip them (symlinks are
        # still removed, as os.remove handles those).
        if os.path.isdir(path) and not os.path.islink(path):
            continue
        os.remove(path)

In [3]:
def create_dir(dir_path):
    '''
    Create ``dir_path`` (including any missing parent directories).
    Nothing happens when the path already exists.
    '''
    already_there = os.path.exists(dir_path)
    if already_there:
        return
    os.makedirs(dir_path)

### String operations

In [4]:
def remove_punctuation(x):
    '''
    Return ``x`` with every character listed in ``string.punctuation`` removed.
    '''
    strip_table = str.maketrans('', '', string.punctuation)
    return x.translate(strip_table)

In [5]:
def is_number(s):
    '''
    Return True when ``float(s)`` succeeds, False otherwise.

    Values that ``float`` cannot even attempt to convert (``None``, lists,
    dicts, ...) are reported as non-numeric instead of raising ``TypeError``.
    '''
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True

In [6]:
def get_time(dt_str):
    '''
    Parse a timestamp such as '12/30/2017 11:48:05 PM' into a datetime.
    Leading and trailing whitespace is ignored.
    '''
    return datetime.strptime(dt_str.strip(), '%m/%d/%Y %I:%M:%S %p')

In [7]:
def parse_tags_time(txt):
    '''
    Parse one record of the form
    @{PIPoint=SCTM:22GTWY_E403:FALE22E23SP.PNT; Value=60; Timestamp=12/30/2017 11:48:05 PM}

    Returns a tuple (pi_point, val, time):
      pi_point - text after '=' in the first field, with 'SCTM:' removed
      val      - float when the value text is numeric, the raw text otherwise,
                 or None when the second field has no '='
      time     - datetime parsed by get_time from the third field

    (None, None, None) is returned when the record has fewer than three
    ';'-separated fields, or when the first or third field has no '='.
    '''
    delimiter = ';'
    sub_delimiter = '='

    # Keep only the text between the first '{' and the first '}'.
    body = txt[txt.find('{')+1:txt.find('}')]
    fields = body.split(delimiter)
    if len(fields) < 3:
        return None, None, None

    point_parts = fields[0].split(sub_delimiter)
    if len(point_parts) < 2:
        # Malformed first field (no '='); indexing [1] here used to raise IndexError.
        return None, None, None
    pi_point = point_parts[1]

    val = None
    value_parts = fields[1].split(sub_delimiter)
    if len(value_parts) >= 2:
        val = value_parts[1]
        if is_number(val):
            val = float(val)

    time_parts = fields[2].split(sub_delimiter)
    if len(time_parts) < 2:
        return None, None, None
    timestamp = get_time(time_parts[1])

    return pi_point.replace('SCTM:', ''), val, timestamp

In [8]:
# Quick manual check of parse_tags_time on a sample record.
# NOTE: this first sample (numeric Value) is overwritten by the next line and is never parsed.
txt = '@{PIPoint=SCTM:22GTWY_E403:FALE22E23SP.PNT; Value=60; Timestamp=12/30/2017 11:48:05 PM}'
# Sample with a non-numeric Value; this is the record that is actually parsed below.
txt = '@{PIPoint=SCTM:22GTWY_E403:FALE22E23SP.PNT; Value=Error; Timestamp=12/30/2017 11:48:05 PM}'
a,b,c = parse_tags_time(txt)
print(a,b,c)

22GTWY_E403:FALE22E23SP.PNT Error 2017-12-30 23:48:05


In [9]:
def parse(txt):
    '''
    Parse one record of the form
    @{PIPoint=SCTM:22GTWY_E403:FALE22E23SP.PNT; Value=60; Timestamp=12/30/2017 11:48:05 PM}

    Returns the tuple (pi_point, val, time) produced by parse_tags_time.
    This function used to be a line-for-line copy of parse_tags_time; it now
    delegates to it so the parsing rules live in one place.
    '''
    return parse_tags_time(txt)

In [10]:
def longestSubstringFinder(string1, string2):
    '''
    Return the longest contiguous substring shared by string1 and string2.

    When several common substrings have the same maximal length, the one
    that starts earliest in string1 is returned. An empty string is returned
    when either input is empty or nothing is shared.

    The previous implementation (adapted from
    https://stackoverflow.com/questions/18715688/find-common-substring-between-two-strings)
    missed matches that ran to the end of string2 and matches that start
    later in string2 than in string1.
    '''
    from difflib import SequenceMatcher

    if not string1 or not string2:
        return ""

    # autojunk=False disables difflib's "popular element" heuristic so that
    # every character takes part in the match.
    matcher = SequenceMatcher(None, string1, string2, autojunk=False)
    found = matcher.find_longest_match(0, len(string1), 0, len(string2))
    return string1[found.a:found.a + found.size]

### Reading Data

In [11]:
def read_data_withfeature(input_path, print_debug = False):
    '''
    Load dataframes and key them by the single value found in their 'feature' column.

    input_path  - a directory (every '.pkl' / '.csv' file directly inside it
                  is read) or the path of one file. A directory may be given
                  with or without a trailing separator.
    print_debug - when True, print progress information.

    '.csv' files are read with pandas.read_csv; everything else is unpickled.
    Files whose 'feature' column is empty or holds more than one distinct
    value are skipped.

    Returns a dict {feature: dataframe}.
    Raises FileNotFoundError when input_path is neither a file nor a directory.
    '''
    df_features = {}

    if os.path.isdir(input_path):
        input_files = [join(input_path, f) for f in listdir(input_path)
                       if isfile(join(input_path, f)) and f.endswith(('.pkl', '.csv'))]
    elif os.path.isfile(input_path):
        # A single file: wrap it in a list so the loop below sees one path,
        # not the individual characters of the string.
        input_files = [input_path]
    else:
        raise FileNotFoundError('%s is neither a file nor a directory' % input_path)

    if print_debug:
        print('Number of files found in %s is %d ' % (input_path, len(input_files)))

    for input_file in input_files:
        if input_file.endswith('.csv'):
            df = pd.read_csv(input_file)
        else:
            # NOTE: unpickling can execute arbitrary code - only load trusted files.
            with open(input_file, 'rb') as f:
                df = pkl.load(f)

        unq_features = np.unique(df['feature'])
        if len(unq_features) == 0:
            continue
        if len(unq_features) > 1:
            if print_debug:
                print('There are %d features in file %s' % (len(unq_features), input_file))
            continue
        df_features[unq_features[0]] = df

    if print_debug:
        print('Number of features extracted from %d files is %d ' % (len(input_files), len(df_features)))

    return df_features

In [12]:
def read_data(input_path, print_debug = False):
    '''
    Load dataframes and key them by feature name derived from the file name.

    input_path  - a directory (every '.pkl' / '.csv' file directly inside it
                  is read) or the path of one file. A directory may be given
                  with or without a trailing separator.
    print_debug - when True, print progress information.

    The feature name is the file name without its extension, with '-'
    replaced by ':'. '.csv' files are read with pandas.read_csv; everything
    else is unpickled.

    Returns a dict {feature: dataframe}.
    Raises FileNotFoundError when input_path is neither a file nor a directory.
    '''
    df_features = {}

    if os.path.isdir(input_path):
        input_files = [join(input_path, f) for f in listdir(input_path)
                       if isfile(join(input_path, f)) and f.endswith(('.pkl', '.csv'))]
    elif os.path.isfile(input_path):
        # A single file: wrap it in a list so the loop below sees one path,
        # not the individual characters of the string.
        input_files = [input_path]
    else:
        raise FileNotFoundError('%s is neither a file nor a directory' % input_path)

    if print_debug:
        print('Number of files found in %s is %d ' % (input_path, len(input_files)))

    for input_file in input_files:
        feature, _ = os.path.splitext(os.path.basename(input_file))
        feature = feature.replace('-', ':')

        if input_file.endswith('.csv'):
            df = pd.read_csv(input_file)
        else:
            # NOTE: unpickling can execute arbitrary code - only load trusted files.
            with open(input_file, 'rb') as f:
                df = pkl.load(f)

        df_features[feature] = df

    if print_debug:
        print('Number of features extracted from %d files is %d ' % (len(input_files), len(df_features)))

    return df_features

### Generate Statistics Table

In [13]:
def generate_stats_df(df_features):
    '''
    Build a one-row-per-feature summary table.

    df_features - dict {feature: dataframe}; each dataframe must have a
                  'val' column and a 'datetime' column.

    Returns a dataframe with the columns feature, total_count,
    missing_val_count, min_date, max_date, max_val, min_val, variance, std,
    mean_val, median_val, kurt, skew and perc_missing (missing values as a
    percentage of total_count).
    '''
    columns = ['feature', 'total_count', 'missing_val_count', 'min_date', 'max_date',
               'max_val', 'min_val', 'variance', 'std', 'mean_val', 'median_val',
               'kurt', 'skew']

    # Collect plain records and build the frame once: growing a dataframe
    # row by row is slow and leaves every column with object dtype.
    records = []
    for feature, df in df_features.items():
        values = df['val']
        total = len(df)
        records.append({
            'feature': feature,
            'total_count': total,
            'missing_val_count': total - values.count(),
            'min_date': df['datetime'].min(),
            'max_date': df['datetime'].max(),
            'max_val': values.max(),
            'min_val': values.min(),
            'variance': values.var(),
            'std': values.std(),
            'mean_val': values.mean(),
            'median_val': values.median(),
            'kurt': values.kurt(),
            'skew': values.skew(),
        })

    df_stats = pd.DataFrame(records, columns=columns)

    # Get the percentage missing values
    df_stats['perc_missing'] = df_stats['missing_val_count']/df_stats['total_count'] * 100
    return df_stats

### Remove features with substantial missing values

In [14]:
def remove_missing_features(df_features, value_col, max_missing_vals_pcnt):
    '''
    Drop, in place, every feature whose value_col is missing in more than
    max_missing_vals_pcnt percent of its rows.

    The dict passed in is modified and is also the return value.
    '''
    def _missing_pct(frame):
        # Percentage of rows with no value in value_col.
        total_rows = len(frame)
        present = frame[value_col].count()
        return (total_rows-present)/total_rows * 100

    # Iterate over a snapshot of the keys because entries are deleted on the way.
    for name in list(df_features.keys()):
        if _missing_pct(df_features[name]) > max_missing_vals_pcnt:
            del df_features[name]

    return df_features

### Datetime <--> Integer Functions

In [15]:
def get_seconds_after(current_date, base_date):
    '''
    Whole seconds elapsed from base_date to current_date.

    Both dates are converted with time.mktime, which interprets them as
    local time; the difference is rounded to the nearest integer.
    '''
    # Unix timestamps, base first and current second.
    stamps = [time.mktime(moment.timetuple()) for moment in (base_date, current_date)]
    return round(stamps[1] - stamps[0])

In [16]:
def get_minutes_after(current_date, base_date):
    '''
    1-based minute index of current_date relative to base_date.

    The elapsed time (computed through time.mktime, i.e. in local time) is
    rounded to the nearest minute and 1 is added, so base_date itself maps to 1.
    '''
    start_ts = time.mktime(base_date.timetuple())
    end_ts = time.mktime(current_date.timetuple())
    elapsed_minutes = (end_ts - start_ts) / 60.0
    return round(elapsed_minutes) + 1

In [17]:
def get_hours_after(current_date, base_date):
    '''
    1-based hour index of current_date relative to base_date.

    The elapsed time (computed through time.mktime, i.e. in local time) is
    rounded to the nearest hour and 1 is added, so base_date itself maps to 1.
    '''
    start_ts = time.mktime(base_date.timetuple())
    end_ts = time.mktime(current_date.timetuple())
    elapsed_hours = (end_ts - start_ts) / 60.0 / 60.0
    return round(elapsed_hours) + 1

In [18]:
def get_days_after(current_date, base_date):
    '''
    1-based day index of current_date relative to base_date.

    The elapsed time (computed through time.mktime, i.e. in local time) is
    rounded to the nearest day and 1 is added, so base_date itself maps to 1.
    '''
    start_ts = time.mktime(base_date.timetuple())
    end_ts = time.mktime(current_date.timetuple())
    elapsed_days = (end_ts - start_ts) / 60.0 / 60.0 / 24
    return round(elapsed_days) + 1

In [19]:
def get_months_after(current_date, base_date):
    '''
    1-based calendar-month index of current_date relative to base_date
    (any date in base_date's own month maps to 1). Days are ignored.
    '''
    years_apart = current_date.year - base_date.year
    months_apart = current_date.month - base_date.month
    return years_apart * 12 + months_apart + 1

In [20]:
def get_years_after(current_date, base_date):
    '''
    1-based calendar-year index of current_date relative to base_date
    (any date in base_date's own year maps to 1). Months and days are ignored.
    '''
    years_apart = current_date.year - base_date.year
    return years_apart + 1

In [21]:
def get_time_from_minutes(time_in_mins, base_date):
    '''
    Return base_date shifted forward by time_in_mins minutes.
    '''
    offset = timedelta(minutes = time_in_mins)
    return base_date + offset

In [22]:
def get_granulairty_function(granularity, current_date, base_date):
    '''
    Convert current_date into an integer offset from base_date at the
    requested granularity.

    granularity - 'sec', 'min', 'hr', 'day', 'mon' or 'yr'; any other value
                  is treated like 'min'.
    '''
    converters = (
        ('sec', get_seconds_after),
        ('min', get_minutes_after),
        ('hr', get_hours_after),
        ('day', get_days_after),
        ('mon', get_months_after),
        ('yr', get_years_after),
    )

    for label, converter in converters:
        if granularity == label:
            return converter(current_date, base_date)

    # Default return function is for minutes
    return get_minutes_after(current_date, base_date)

### Data Aggregation for duplicate timestamps

In [23]:
def agg_data_dup_timestamps(df_features, 
                            feature_set, 
                            time_granularity, 
                            time_col, 
                            value_col, 
                            time_gran_col, 
                            base_date):
    '''
    Collapse each feature's readings to one mean value per time-granularity step.

    df_features      - dict {feature: dataframe}; the input frames are not modified
    feature_set      - features to process; None or an empty collection means all
    time_granularity - granularity code passed to get_granulairty_function
    time_col         - name of the timestamp column
    value_col        - name of the reading column
    time_gran_col    - name of the integer granularity column to create
    base_date        - reference date the granularity steps are counted from

    Readings that are missing or cannot be converted to a number (for
    example the text 'Error') are discarded before averaging.

    Returns a dict {feature: dataframe with columns [time_gran_col, value_col]}.
    '''
    df_features_e = {}

    if feature_set is None or len(feature_set) == 0:
        feature_set = list(df_features.keys())

    for feature in feature_set:

        print(feature + ' -- Started ', end='')

        source = df_features[feature]
        len_df_before = len(source)

        # The 'feature' column is redundant; frames loaded by file name may
        # not have it at all, so a missing column is not an error.
        df = source.drop(columns=['feature'], errors='ignore')

        # Drop exact duplicate rows before any conversion
        df = df.drop_duplicates()

        # Convert the value column to numeric to help in aggregation of
        # duplicate timestamps; non-numeric readings become NaN and are dropped.
        df = df.assign(**{value_col: pd.to_numeric(df[value_col], errors='coerce')})
        df = df.dropna(subset=[value_col])
        df = df.sort_values(by=[time_col], ascending=True)

        # Compute the granularity (only for the rows that survive the clean-up)
        df[time_gran_col] = df[time_col].apply(lambda x: get_granulairty_function(time_granularity, x, base_date))

        # AGGREGATE the duplicate timestamps - take the mean
        df = df[[time_gran_col, value_col]].groupby(time_gran_col).mean().reset_index(level=0)
        len_df_after = len(df)

        print('Ended - Aggregated %d rows to %d rows' % (len_df_before, len_df_after))

        df_features_e[feature] = df

    return df_features_e

### Writing down files

In [24]:
def write_feature_dict(dir_path, df_features, remove_existing=True):
    '''
    Pickle every dataframe in df_features into dir_path, one file per feature.

    dir_path        - target directory, with or without a trailing separator;
                      it is created when it does not exist yet
    df_features     - dict {feature: dataframe}
    remove_existing - when True, the files already in dir_path are removed first

    Each file is named after its feature with ':' replaced by '-' and the
    extension '.pkl'.
    '''
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)

    if remove_existing:
        # Remove the files from the directory. The trailing separator added
        # by join(..., '') keeps the clean-up inside dir_path.
        remove_file_in_folder(os.path.join(dir_path, ''))

    # Write the pickle files to the folder
    for feature, df in df_features.items():
        fname = feature.replace(':', '-')
        pkl_file = os.path.join(dir_path, fname + '.pkl')

        with open(pkl_file, 'wb') as f:
            pkl.dump(df, f, protocol=pkl.HIGHEST_PROTOCOL)

### Generate Master Dataframe for time

In [25]:
from datetime import datetime
def generate_master_df(time_granularity,
                       time_gran_col,
                       base_date,
                       end_date,
                       print_debug = False):
    '''
    Generates a master dataframe with a single integer column, time_gran_col,
    holding 1, 2, ..., N where N is the granularity index of end_date
    relative to base_date.

    time_granularity - one of 'sec' (seconds), 'min' (minutes), 'hr' (hour),
                       'day' (day), 'mon' (month), 'yr' (year)
    time_gran_col    - name of the column to create
    base_date        - date of reference since which the unit of time is computed
    end_date         - last date to cover
    print_debug      - when True, print the inputs and the resulting shape
    '''
    if print_debug:
        print('Granularity is', time_granularity, '\tStart Date = ', base_date, '\tEnd Date = ', end_date)

    last_step = get_granulairty_function(time_granularity, end_date, base_date)
    df_master = pd.DataFrame({time_gran_col: list(range(1, last_step+1))})

    if print_debug:
        print('Shape of the master dataframe is ', df_master.shape, 'with columns ', df_master.columns.values)

    return df_master

### Normalization

In [26]:
def scale_val(val, min_val, max_val):
    '''
    Min-max scale val using the range [min_val, max_val].
    A small constant (1e-7) is added to the denominator so that equal
    bounds do not divide by zero. None is passed through unchanged.
    '''
    if val is None:
        return None
    span = max_val-min_val + 1e-7
    return (val-min_val)/span

In [27]:
def lcl_divmul(val, div_by, mul_by):
    '''
    Divide val by div_by, round the quotient down, then multiply by mul_by.
    '''
    whole_units = math.floor(val/div_by)
    return whole_units * mul_by

### Renaming files inside sub directory

### Counting Frequency of read time intervals

In [28]:
from collections import Counter

def freq_intervals(df_features, feature_set, bin_size, min_occurence, percentage=False, plot=False):
    '''
    Count how often each gap between consecutive readings occurs, per feature.

    NOTE: the name of the granularity column is read from the notebook-level
    variable time_gran_col, which must be defined before this is called.
    NOTE: a function with this same name is defined again further down in
    this notebook; whichever definition is executed last is the one in effect.

    df_features   - dict {feature: dataframe}
    feature_set   - features to analyse; an empty collection means all
    bin_size      - width of the gap buckets, in granularity steps
    min_occurence - buckets seen fewer times than this are left out
    percentage    - report each bucket as a percentage of all gaps of the
                    feature instead of a raw count (percentages do not add
                    up to 100 when buckets are filtered out)
    plot          - draw one bar chart per feature

    Returns a dict {feature: {bucket label: count or percentage}}. A bucket
    label is 'low-high' with low = floor(gap / bin_size) * bin_size and
    high = low + bin_size - 1.
    '''
    prev_time_gran_col = 'prev_' + time_gran_col

    # Decide the axis label once, independent of plotting: it used to be
    # created only when plot=True and extended once per bucket.
    ylabel = 'Frequency (%)' if percentage else 'Frequency'

    if len(feature_set) == 0:
        feature_set = list(df_features.keys())

    if plot:
        total_plots = len(feature_set)
        cols_plot = 3
        rows_plot = math.ceil(total_plots/cols_plot)

        plt.rcParams['figure.figsize'] = [cols_plot * 10, rows_plot * 7] # Size of the plots
        plt.figure()

    frequencies = {}
    for idx, feature in enumerate(feature_set, start=1):

        print(feature + ' -- Started ', end='')

        df = df_features[feature].copy()

        # Gap between each reading and the one before it; the first row has
        # no predecessor and is dropped.
        df[prev_time_gran_col] = df[time_gran_col].shift(1)
        df = df.dropna(subset=[prev_time_gran_col])
        gaps = df[time_gran_col] - df[prev_time_gran_col]

        # Binning the dataset: every gap is mapped to the lower bound of its bucket
        buckets = [math.floor(gap/bin_size) * bin_size for gap in gaps]
        bucket_len = len(buckets)
        counter = Counter(buckets)

        frequency_dict = {}
        for lower, count in counter.items():
            if count < min_occurence:
                continue
            value = count / bucket_len * 100.0 if percentage else count
            # 'lower' is already a multiple of bin_size, so it must not be scaled again.
            label = str(lower) + '-' + str(lower + bin_size - 1)
            frequency_dict[label] = value

        if plot:
            x = list(frequency_dict.keys())
            y = list(frequency_dict.values())

            plt.subplot(rows_plot, cols_plot, idx)
            plt.ylabel(ylabel)
            plt.title(feature)
            plt.xticks(rotation=45)
            plt.bar(x,y)

            for i, v in enumerate(y):
                v = round(v, 2)
                plt.text(i-.25, v + .25, str(v), color='blue', fontweight='bold')

        print('Ended')

        frequencies[feature] = frequency_dict

    return frequencies

### Finding the missing intervals

In [29]:
from collections import Counter

def missing_data_intervals(df_features, feature_set, min_interval_span, plot=False):
    '''
    Find, per feature, the gaps between consecutive readings that span at
    least min_interval_span granularity steps.

    NOTE: this relies on two notebook-level variables that must be defined
    before it is called: time_gran_col (name of the granularity column) and
    base_date (reference date the granularity steps are counted from).
    NOTE(review): the start of each gap is converted with
    get_time_from_minutes, i.e. steps are treated as minutes - confirm this
    matches the granularity in use.
    NOTE: a function with this same name is defined again further down in
    this notebook; whichever definition is executed last is the one in effect.

    df_features       - dict {feature: dataframe}
    feature_set       - features to analyse; an empty collection means all
    min_interval_span - smallest gap (in granularity steps) to report
    plot              - draw one bar chart of the gap sizes per feature

    Returns a dict {feature: dataframe of the qualifying rows}, each with the
    extra columns 'prev_<time_gran_col>', 'diff_in_time' and 'prev_time'
    (calendar date on which the gap started).
    '''
    prev_time_gran_col = 'prev_' + time_gran_col
    prev_time_col = 'prev_' + 'time'
    diff_in_time = 'diff_in_time'
    ylabel = 'Frequency'

    if len(feature_set) == 0:
        feature_set = list(df_features.keys())

    if plot:
        total_plots = len(feature_set)
        cols_plot = 3
        rows_plot = math.ceil(total_plots/cols_plot)

        plt.rcParams['figure.figsize'] = [cols_plot * 10, rows_plot * 7] # Size of the plots
        plt.figure()

    gaps_by_feature = {}
    for idx, feature in enumerate(feature_set, start=1):

        print(feature + ' -- Started ', end='')

        # Work on the frames that were passed in (not on a notebook global).
        df = df_features[feature].copy()

        # Gap between each reading and the one before it; the first row has
        # no predecessor and is dropped.
        df[prev_time_gran_col] = df[time_gran_col].shift(1)
        df = df.dropna(subset=[prev_time_gran_col])
        df[diff_in_time] = df[time_gran_col] - df[prev_time_gran_col]

        # Find only the dataset which is at least min_interval_span size
        df = df.loc[df[diff_in_time] >= min_interval_span].copy()

        df[prev_time_col] = df[prev_time_gran_col].apply(lambda x: get_time_from_minutes(x, base_date).date())

        if plot:
            x = list(map(str, df[prev_time_col]))
            y = df[diff_in_time]

            plt.subplot(rows_plot, cols_plot, idx)
            plt.ylabel(ylabel)
            plt.title(feature)
            plt.xticks(rotation=45)
            plt.bar(x,y)

            for i, v in enumerate(y):
                v = round(v, 2)
                plt.text(i-.25, v + .25, str(v), color='blue', fontweight='bold')

        print('Ended')

        gaps_by_feature[feature] = df

    return gaps_by_feature

### Define Alarm Tag types

In [30]:
from collections import Counter

def freq_intervals(df_features, feature_set, bin_size, min_occurence, percentage=False, plot=False):
    '''
    Count how often each gap between consecutive readings occurs, per feature.
    Gaps are measured on the 'datetime_gran' column of each dataframe.

    df_features   - dict {feature: dataframe with a 'datetime_gran' column}
    feature_set   - features to analyse; an empty collection means all
    bin_size      - width of the gap buckets, in granularity steps
    min_occurence - buckets seen fewer times than this are left out
    percentage    - report each bucket as a percentage of all gaps of the
                    feature instead of a raw count (percentages do not add
                    up to 100 when buckets are filtered out)
    plot          - draw one bar chart per feature

    Returns a dict {feature: {bucket label: count or percentage}}. A bucket
    label is 'low-high' with low = floor(gap / bin_size) * bin_size and
    high = low + bin_size - 1.
    '''
    # Must be assigned before its first use: assigning it further down made
    # the name local and every call failed with UnboundLocalError.
    time_gran_col = 'datetime_gran'
    prev_time_gran_col = 'prev_' + time_gran_col

    # Decide the axis label once, independent of plotting: it used to be
    # created only when plot=True and extended once per bucket.
    ylabel = 'Frequency (%)' if percentage else 'Frequency'

    if len(feature_set) == 0:
        feature_set = list(df_features.keys())

    if plot:
        total_plots = len(feature_set)
        cols_plot = 3
        rows_plot = math.ceil(total_plots/cols_plot)

        plt.rcParams['figure.figsize'] = [cols_plot * 10, rows_plot * 7] # Size of the plots
        plt.figure()

    frequencies = {}
    for idx, feature in enumerate(feature_set, start=1):

        print(feature + ' -- Started ', end='')

        df = df_features[feature].copy()

        # Gap between each reading and the one before it; the first row has
        # no predecessor and is dropped.
        df[prev_time_gran_col] = df[time_gran_col].shift(1)
        df = df.dropna(subset=[prev_time_gran_col])
        gaps = df[time_gran_col] - df[prev_time_gran_col]

        # Binning the dataset: every gap is mapped to the lower bound of its bucket
        buckets = [math.floor(gap/bin_size) * bin_size for gap in gaps]
        bucket_len = len(buckets)
        counter = Counter(buckets)

        frequency_dict = {}
        for lower, count in counter.items():
            if count < min_occurence:
                continue
            value = count / bucket_len * 100.0 if percentage else count
            # 'lower' is already a multiple of bin_size, so it must not be scaled again.
            label = str(lower) + '-' + str(lower + bin_size - 1)
            frequency_dict[label] = value

        if plot:
            x = list(frequency_dict.keys())
            y = list(frequency_dict.values())

            plt.subplot(rows_plot, cols_plot, idx)
            plt.ylabel(ylabel)
            plt.title(feature)
            plt.xticks(rotation=45)
            plt.bar(x,y)

            for i, v in enumerate(y):
                v = round(v, 2)
                plt.text(i-.25, v + .25, str(v), color='blue', fontweight='bold')

        print('Ended')

        frequencies[feature] = frequency_dict

    return frequencies

In [31]:
from collections import Counter
from utils import get_time_from_minutes

def missing_data_intervals(df_features, feature_set, min_interval_span, plot=False):
    '''
    Find, per feature, the gaps between consecutive readings that span at
    least min_interval_span granularity steps. Gaps are measured on the
    'datetime_gran' column of each dataframe.

    NOTE: this relies on the notebook-level variable base_date (reference
    date the granularity steps are counted from), which must be defined
    before it is called.
    NOTE(review): the start of each gap is converted with
    get_time_from_minutes, i.e. steps are treated as minutes - confirm this
    matches the granularity in use.

    df_features       - dict {feature: dataframe with a 'datetime_gran' column}
    feature_set       - features to analyse; an empty collection means all
    min_interval_span - smallest gap (in granularity steps) to report
    plot              - draw one bar chart per feature that has at least one
                        qualifying gap; bar heights are the gap sizes divided
                        by min_interval_span

    Returns a dict {feature: dataframe of the qualifying rows}, each with the
    extra columns 'prev_datetime_gran', 'diff_in_time' and 'prev_time'
    (calendar date on which the gap started).
    '''
    # Must be assigned before its first use: assigning it further down made
    # the name local and every call failed with UnboundLocalError.
    time_gran_col = 'datetime_gran'
    prev_time_gran_col = 'prev_' + time_gran_col
    prev_time_col = 'prev_' + 'time'
    diff_in_time = 'diff_in_time'
    ylabel = 'Frequency'

    if len(feature_set) == 0:
        feature_set = list(df_features.keys())

    if plot:
        total_plots = len(feature_set)
        cols_plot = 3
        rows_plot = math.ceil(total_plots/cols_plot)

        plt.rcParams['figure.figsize'] = [cols_plot * 10, rows_plot * 7] # Size of the plots
        plt.figure()

    gaps_by_feature = {}
    idx = 1
    for feature in feature_set:

        print(feature + ' -- Started ', end='')

        # Work on the frames that were passed in (not on a notebook global).
        df = df_features[feature].copy()

        # Gap between each reading and the one before it; the first row has
        # no predecessor and is dropped.
        df[prev_time_gran_col] = df[time_gran_col].shift(1)
        df = df.dropna(subset=[prev_time_gran_col])
        df[diff_in_time] = df[time_gran_col] - df[prev_time_gran_col]

        # Find only the dataset which is at least min_interval_span size
        df = df.loc[df[diff_in_time] >= min_interval_span].copy()

        df[prev_time_col] = df[prev_time_gran_col].apply(lambda x: get_time_from_minutes(x, base_date).date())

        if plot:
            x = list(map(str, df[prev_time_col]))
            y = df[diff_in_time]/min_interval_span

            # Features without any qualifying gap do not use up a subplot slot.
            if len(x) > 0:

                plt.subplot(rows_plot, cols_plot, idx)
                plt.ylabel(ylabel)
                plt.title(feature)
                plt.xticks(rotation=45)
                plt.bar(x,y)

                idx += 1

                for i, v in enumerate(y):
                    v = round(v, 2)
                    plt.text(i-.25, v + .25, str(v), color='blue', fontweight='bold')

        print('Ended')

        gaps_by_feature[feature] = df

    return gaps_by_feature

22GTWY_E403:FALE22E23SP.PNT Error 2017-12-30 23:48:05


In [32]:
def tag_categories():
    '''
    Return a fresh dict mapping each tag category name to the list of tag
    names that belong to it.
    '''
    return {
        'vibration': [
            '05GTWY_BN06:XT61B10.PNT',
            '05GTWY_BN06:XT61B11.PNT',
            '05GTWY_BN06:XT61B12.PNT',
            '05GTWY_BN06:XT61B13.PNT',
            '05GTWY_BN06:XT61B17.PNT',
            '05GTWY_BN06:XT61B18.PNT',
            '05GTWY_BN06:XT61B19.PNT',
            '05GTWY_BN06:XT61B20.PNT',
            '05GTWY_BN06:ZT61B14.PNT',
            '05GTWY_BN06:ZT61B15.PNT',
        ],
        'temperature': [
            'TT61B01.PV',
            'TT61B02.PV',
            'TT61B03.PV',
            'TT61B04.PV',
            'TT61B05.PV',
            'TT61B06.PV',
            'TC63109E.AV',
            'TT63109.PV',
            '07DATASCRCP1:AI07003.PNT',
        ],
        'pressure': [
            'PIE61608.PV',
            'PIE61B00.PV',
            'PIE61B22.PV',
            'PIE61B23.PV',
            'PIE63113.PV',
            'PT61A98.PV',
            'PT61B00.PV',
            'PT63103.PV',
            'PC63112E.AV',
            'PT63112.PV',
        ],
        'level': [
            'LT63114.PV',
            'PC61A98.AV',
            'PC63112.AV',
            'TC63109.AV',
            'F61221VP',
            'T6150.PV',
            'T6151.PV',
            'T6152.PV',
            'T6153.PV',
        ],
        'alarm': [
            'FA61A99.PV',
            'LA63114.PV',
            'LAL63114.PV',
            'PA61B00.PV',
            'PA61B223.PV',
            'PA63110.PV',
            'PA63112.PV',
            'PA63113.PV',
            'PAH61A98.PV',
            'PAL61A98.PV',
            'PDA61B21.PV',
            'TA61B47.PV',
            'TA63109.PV',
            'TAE61B47.PV',
            'XA61B34.PV',
            'XA61B58.PV',
            'XAE61B34.PV',
        ],
        'flow': [
            'FT61A99.PV',
            'F61221',
        ],
        'status': [
            'P6302BDI.PV',
            'P6302BSD.PV',
        ],
        'setpoint': [
            '05GTWY_BN06:XT61B16.PNT',
            'PC61A98E.AV',
        ],
    }

In [33]:
def get_tags(tag_type):
    '''
    Return the list of tag names registered under tag_type in
    tag_categories(), or an empty list for an unknown category.
    '''
    return tag_categories().get(tag_type, [])

In [34]:
def get_tag_type(tag):
    '''
    Return the names of every category in tag_categories() whose tag list
    contains tag (an empty list when the tag is unknown).
    '''
    return [category
            for category, members in tag_categories().items()
            if tag in members]