In [1]:
import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple

In [2]:
# Root directory that load_dfs_by_source walks recursively looking for CSV files.
DATA_PATH = './data/'
# Source names; load_dfs_by_source selects a file when the name occurs in its path.
DATA_SOURCES = ['valve1','valve2','other']


In [3]:
def load_dfs_by_source(data_dir:str,data_source:str)-> pd.DataFrame:
    """
    Load every CSV file under ``data_dir`` that belongs to ``data_source``
    and return them as a single DataFrame sorted by timestamp.

    Args:
        data_dir (str): Directory that is walked recursively for ``.csv`` files.
        data_source (str): Source name (e.g. 'valve1', 'valve2', 'other').
            A file is selected when this string occurs anywhere in its path
            (plain substring match, directory part included).

    Returns:
        pd.DataFrame: Concatenation of all selected files, indexed by the
        parsed ``datetime`` column and sorted by that index.

    Raises:
        ValueError: If no CSV file under ``data_dir`` matches ``data_source``.
    """
    csv_paths = sorted(
        os.path.join(root, name)
        for root, _, names in os.walk(data_dir)
        for name in names
        if name.endswith(".csv")
    )

    # Filter before reading so files of other sources are never parsed.
    selected_paths = [path for path in csv_paths if data_source in path]
    if not selected_paths:
        # pd.concat([]) would raise a ValueError as well, but without saying
        # which directory/source was searched.
        raise ValueError(
            f"No CSV files matching source {data_source!r} found under {data_dir!r}"
        )

    frames = [
        pd.read_csv(path, sep=';', index_col='datetime', parse_dates=True)
        for path in selected_paths
    ]

    return pd.concat(frames).sort_index()

def _features_and_target(frame:pd.DataFrame)->Tuple[pd.DataFrame, np.ndarray]:
    """Split one partition into its feature columns and its 'anomaly' labels."""
    features = frame.drop(columns=['anomaly', 'changepoint'])
    return features, frame['anomaly'].values


def train_val_test_split_original(valve1_data_sets:pd.DataFrame)\
    ->Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray, np.ndarray]:
    """
    Split a labelled frame into train / validation / test partitions by
    row position (no shuffling).

    The first 70% of the rows become the training set. Of the remaining
    rows, the first 66% (rounded down) become the validation set and the
    rest the test set.

    Args:
        valve1_data_sets (pd.DataFrame): Frame containing the feature columns
            plus the 'anomaly' and 'changepoint' columns.

    Returns:
        x_train, x_valid, x_test, y_train, y_valid, y_test: the three feature
        frames (without 'anomaly' and 'changepoint') followed by the matching
        'anomaly' label arrays.

    Raises:
        KeyError: If 'anomaly' or 'changepoint' is missing from the frame.
    """
    n_rows = len(valve1_data_sets)

    # train : remainder = 7 : 3
    train_end = int(n_rows*0.7)
    # remainder -> valid : test is roughly 2 : 1
    valid_end = train_end + int((n_rows-train_end)*0.66)

    # .iloc makes the positional intent explicit regardless of the index type.
    x_train, y_train = _features_and_target(valve1_data_sets.iloc[:train_end])
    x_valid, y_valid = _features_and_target(valve1_data_sets.iloc[train_end:valid_end])
    x_test, y_test = _features_and_target(valve1_data_sets.iloc[valid_end:])

    return x_train, x_valid, x_test, y_train, y_valid, y_test

def data_perprocessing_original():
    """Placeholder: not implemented yet; currently does nothing and returns None."""
    pass

def smooth_curve(x, window_len=11, beta=2):
    """
    Smooth a 1-D signal with a normalised Kaiser window.

    The signal is extended at both ends with mirrored copies of its own
    samples, convolved with the window, and then trimmed so that the result
    has the same length as the input.

    Args:
        x: 1-dimensional array-like of samples.
        window_len (int): Length of the Kaiser window. Must be a positive odd
            number so the trimmed output lines up with the input. Default 11.
        beta (float): Shape parameter passed to ``np.kaiser``. Default 2.

    Returns:
        np.ndarray: Smoothed signal with ``len(x)`` samples.

    Raises:
        ValueError: If ``window_len`` is not a positive odd integer, or if
            ``x`` has fewer than ``window_len`` samples (the mirrored padding
            would then be incomplete and the output length wrong).
    """
    x = np.asarray(x)
    if window_len < 1 or window_len % 2 == 0:
        raise ValueError(f"window_len must be a positive odd integer, got {window_len}")
    if x.shape[0] < window_len:
        raise ValueError(
            f"input needs at least window_len={window_len} samples, got {x.shape[0]}"
        )

    # Mirror window_len-1 samples at each end so the 'valid' convolution
    # still covers the borders of the original signal.
    s = np.r_[x[window_len-1:0:-1], x, x[-1:-window_len:-1]]
    w = np.kaiser(window_len, beta)
    y = np.convolve(w/w.sum(), s, mode='valid')

    # The convolution returns len(x) + window_len - 1 samples; drop the
    # surplus symmetrically (5 per side for the default window of 11).
    half = (window_len-1)//2
    return y[half:len(y)-half]

def create_dataset(dataset,look_back=10):
    """
    Compute rolling mean / min / max over the first column of ``dataset``.

    Args:
        dataset: 2-D array; only column 0 is read.
        look_back (int): Number of consecutive rows in each window.

    Returns:
        np.ndarray: Array of shape ``(len(dataset) - look_back + 1, 3)`` whose
        rows hold ``[mean, min, max]`` of each window, in window order.
    """
    n_rows = len(dataset)
    stats = np.zeros((n_rows - look_back + 1, 3))

    # `stop` is the exclusive end of each window, so the window covers
    # rows [stop - look_back, stop).
    for row, stop in enumerate(range(look_back, n_rows + 1)):
        window = dataset[stop - look_back:stop, 0]
        stats[row, :] = np.array([
            np.mean(window, axis=0),
            np.min(window, axis=0),
            np.max(window, axis=0),
        ])

    return np.array(stats).reshape(-1, 3)

In [4]:
# One concatenated, index-sorted DataFrame per source name in DATA_SOURCES.
data_dict_by_source = {source:load_dfs_by_source(DATA_PATH,source) for source in DATA_SOURCES}

In [5]:
# Positional (unshuffled) train / validation / test split of the 'valve1' data only.
x_train, x_valid, x_test, y_train, y_valid, y_test = train_val_test_split_original(data_dict_by_source['valve1'])

In [6]:
# x_train

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


In [8]:
class TargetsWindow(BaseEstimator, TransformerMixin):
    """
    Align a label vector with features built from ``look_back``-row windows.

    A window of ``look_back`` rows produces its first output at row
    ``look_back - 1``, so the first ``look_back - 1`` labels have no matching
    feature row and are dropped.

    Args:
        look_back (int): Window length used for the features. Default 10.
    """

    def __init__(self,look_back:int=10) -> None:
        self.look_back = look_back

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn, returns ``self``."""
        return self

    def transform(self, X:np.array):
        """
        Drop the first ``look_back - 1`` labels.

        Args:
            X: 1-D array-like of labels (e.g. ``y_train``).

        Returns:
            pd.DataFrame: Single column 'anomaly' with a fresh 0-based index.
        """
        # Use the labels that were passed in; reading a notebook-level global
        # here would tie the transformer to one specific split.
        targets = np.asarray(X).ravel()

        return pd.DataFrame({'anomaly': targets[self.look_back-1:]})


class SmoothCurve(BaseEstimator, TransformerMixin):
    """Apply ``smooth_curve`` independently to every column of a 2-D input."""

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn, returns ``self``."""
        return self

    def transform(self, X:np.array):
        """
        Smooth each column of ``X``.

        Args:
            X: 2-D DataFrame or array of numeric values, one signal per column.

        Returns:
            np.ndarray: Float array of the same shape as ``X`` holding the
            smoothed columns.
        """
        # Force a float buffer: an output that inherited an integer dtype
        # would silently truncate the smoothed values.
        values = np.asarray(X, dtype=float)
        smoothed = np.empty_like(values)

        for col in range(values.shape[1]):
            smoothed[:, col] = smooth_curve(values[:, col])

        return smoothed


class CreateStatsDataframe(BaseEstimator, TransformerMixin):
    """
    Turn a 2-D feature array into a DataFrame of rolling statistics.

    For every input column, ``create_dataset`` supplies the mean, min and max
    over windows of ``look_back`` rows; the three statistics of each column
    are placed side by side in input-column order and labelled with
    ``stats_cols``.

    Args:
        look_back (int): Window length in rows. Default 10.
    """

    # Output column names: three statistics per input column, in input order.
    stats_cols=['A1_mean','A1_min','A1_max',
          'A2_mean','A2_min','A2_max',
          'Cur_mean','Cur_min','Cur_max',
          'Pre_mean','Pre_min','Pre_max',
          'Temp_mean','Temp_min','Temp_max',
          'Ther_mean','Ther_min','Ther_max',
          'Vol_mean','Vol_min','Vol_max',
          'Flow_mean','Flow_min','Flow_max']


    def __init__(self,look_back:int=10) -> None:
        self.look_back = look_back

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn, returns ``self``."""
        return self

    def transform(self, X):
        """
        Compute the rolling statistics for every column of ``X``.

        Args:
            X: 2-D array or DataFrame with one column per entry triple in
               ``stats_cols``.

        Returns:
            pd.DataFrame: ``len(X) - look_back + 1`` rows and
            ``len(stats_cols)`` columns.

        Raises:
            ValueError: If the number of columns in ``X`` does not match
                ``stats_cols``.
        """
        values = np.asarray(X)
        data_dim = values.shape[1]

        expected_dim = len(self.stats_cols) // 3
        if data_dim != expected_dim:
            raise ValueError(
                f"expected {expected_dim} input columns to match stats_cols, got {data_dim}"
            )

        # Collect the per-column blocks first and join them once, instead of
        # re-concatenating the growing result on every iteration.
        blocks = [
            create_dataset(values[:, i].reshape(-1, 1), look_back=self.look_back)
            for i in range(data_dim)
        ]
        X_win = np.concatenate(blocks, axis=-1)

        return pd.DataFrame(data=X_win, columns=self.stats_cols)

In [9]:
# Pipeline building blocks. The feature windowing and the target windowing use
# the same look_back so both outputs drop the same number of leading rows.
smooth = SmoothCurve()
scaler = StandardScaler()
df_creator = CreateStatsDataframe(look_back=10)
targets_win = TargetsWindow(look_back=10)

In [10]:
# Features: smooth each column -> standardise -> rolling mean/min/max DataFrame.
features_pipeline = Pipeline(
    [
        ('smooth',smooth),
        ('scaler',scaler),
        ('df_creator',df_creator)
    ]
)

# Targets: a single windowing step that trims the labels.
targets_pipeline = Pipeline([('targets_win',targets_win)])

In [11]:
# Fit the feature pipeline on the training features and display the resulting stats frame.
features_pipeline.fit_transform(x_train)

Unnamed: 0,A1_mean,A1_min,A1_max,A2_mean,A2_min,A2_max,Cur_mean,Cur_min,Cur_max,Pre_mean,...,Temp_max,Ther_mean,Ther_min,Ther_max,Vol_mean,Vol_min,Vol_max,Flow_mean,Flow_min,Flow_max
0,-2.679439,-2.865004,-2.451339,-0.692745,-0.766555,-0.560740,-0.023137,-0.057393,0.030316,1.620902,...,3.284588,2.029763,2.022074,2.037021,0.005519,-0.777578,0.981011,0.415308,0.385021,0.484809
1,-2.699386,-2.865004,-2.451339,-0.671569,-0.766555,-0.433652,-0.030842,-0.057393,0.018154,1.330175,...,3.284588,2.031201,2.023224,2.037021,-0.152550,-0.777578,0.910140,0.426186,0.385021,0.493801
2,-2.707761,-2.865004,-2.511303,-0.634760,-0.766555,-0.314949,-0.036504,-0.057393,0.008848,0.992358,...,3.284588,2.032564,2.023664,2.037021,-0.278967,-0.777578,0.710646,0.437376,0.385021,0.496921
3,-2.696608,-2.865004,-2.399775,-0.577114,-0.766555,-0.129678,-0.039949,-0.057393,-0.011042,0.642892,...,3.284588,2.033838,2.025164,2.037021,-0.383274,-0.777578,0.317025,0.453149,0.385021,0.542746
4,-2.663746,-2.865004,-2.256753,-0.507170,-0.766555,-0.021328,-0.041509,-0.057393,-0.025603,0.328793,...,3.284588,2.034965,2.028704,2.037021,-0.467132,-0.777578,-0.071414,0.469971,0.385021,0.553246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12699,1.382576,1.160770,1.751949,-0.419943,-0.642846,-0.120054,-0.099773,-0.113770,-0.086471,0.074434,...,-1.614974,-1.195367,-1.197172,-1.193615,0.474209,-0.239293,1.067034,0.624893,0.562813,0.682260
12700,1.447580,1.160770,1.920958,-0.358168,-0.642846,-0.001004,-0.102299,-0.113770,-0.086471,0.040007,...,-1.621192,-1.195247,-1.197172,-1.193615,0.331514,-0.359915,0.997815,0.635605,0.562813,0.689802
12701,1.538176,1.160770,2.094342,-0.284097,-0.586892,0.097864,-0.103829,-0.113770,-0.086471,-0.082566,...,-1.626568,-1.195255,-1.197172,-1.193615,0.199998,-0.359915,0.813437,0.645480,0.583189,0.689802
12702,1.650044,1.160770,2.303791,-0.210430,-0.549470,0.149778,-0.105232,-0.113770,-0.094699,-0.271264,...,-1.630165,-1.195310,-1.197172,-1.193939,0.099791,-0.359915,0.744264,0.653965,0.594894,0.689802


In [17]:
# Windowed training labels: the first look_back-1 entries are dropped.
targets_pipeline.transform(y_train) 

Unnamed: 0,anomaly
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
12699,0.0
12700,0.0
12701,0.0
12702,0.0


In [None]:
def calc_metrics(metrics,y_test,y_pred):
    """Placeholder: not implemented yet; ignores its arguments and returns None."""
    pass
def plot_metrics(metrics,metrics_res):
    """Placeholder: not implemented yet; ignores its arguments and returns None."""
    pass

class ModelEvaluation:
    """
    Holds the per-model inputs for an evaluation run.

    ``eval_config`` maps a model name to the data needed to evaluate it, e.g.::

        eval_config = {
            'lgb': {
                'y_test': pd.Series(),
                'y_pred': pd.Series()
            },
            'lgb_new': {
                'y_test': pd.Series(),
                'y_pred': pd.Series()
            }
        }
    """

    def __init__(self,eval_config) -> None:
        self.eval_config = eval_config

    def run(self):
        """
        Iterate over the configured models.

        The per-model evaluation step has not been written yet, so this raises
        as soon as there is a model to evaluate instead of silently doing
        nothing.

        Raises:
            NotImplementedError: If ``eval_config`` contains at least one model.
        """
        for model in self.eval_config.keys():
            # TODO: compute and collect the metrics for `model` here.
            raise NotImplementedError(
                f"evaluation of model {model!r} is not implemented yet"
            )

In [18]:

# eval_config = {
#       'lgb' : {
#           'y_test' : pd.Series(),
#           'y_pred': pd.Series()
#       } ,
#       'lgb_new': {
#           'y_test' : pd.Series(),
#           'y_pred': pd.Series()
#       }
# }

# Sketch of the per-model result structure: presumably a confusion matrix ('cm'),
# the ROC curve tuple and the AUC value for each model.
# NOTE(review): cm, fpr, tpr, thresholds and roc_auc are not defined in any cell
# visible in this notebook, so this cell fails on a fresh kernel; both models
# would also share the very same values. Confirm where these names should come from.
res = {
    'lgb' : {
        'cm' : cm,
        'roc_res' : (fpr, tpr, thresholds),
        'auc' : roc_auc
    },
    'lgb_new': {
        'cm' : cm,
        'roc_res' : (fpr, tpr, thresholds),
        'auc' : roc_auc
    }
}

  'y_test' : pd.Series(),
  'y_pred': pd.Series()
  'y_test' : pd.Series(),
  'y_pred': pd.Series()


{'y_test': Series([], dtype: float64), 'y_pred': Series([], dtype: float64)}

In [15]:
# # example for custom transformers
# from numpy.random import randint
# from sklearn.base import BaseEstimator, TransformerMixin


# class CustomTransformer(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self

#     def transform(self, X, y=None):
#         # Perform arbitrary transformation
#         X["random_int"] = randint(0, 10, X.shape[0])
#         return X
    
# df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})

# pipe = Pipeline(
#     steps=[
#         ("use_custom_transformer", CustomTransformer())
#     ]
# )
# transformed_df = pipe.fit_transform(df)

# print(df)