In [1]:
import numpy as np

from sklearn.neighbors import KernelDensity #사이킷런(머신러닝 #최근접 이웃(K-Nearest Neighbor)을 이용한 분류 - 유사한 특성을 가진 데이터는 유사한 범주에 속하는 경향이 있다. #커널 밀도 측정
from sklearn import metrics
import pandas as pd

import os as os
import time

# Shared parameters: input directory and output directory used by the functions below.
# Both start as None; a later configuration cell assigns the real locations.
path = target_path = None


In [2]:


## load_data: read a CSV file and keep only the last record per serial number per date
def load_data(file):
    """Load one telemetry CSV and reduce it to one row per drive per date.

    Reads ``path + '/' + file`` using the module-level ``cols`` (columns to
    load) and ``types_dict`` (dtype mapping), derives a categorical ``date``
    column from the first 10 characters of ``TIMESTAMP``, sorts by
    ``DriveSerialNumber`` then ``TIMESTAMP`` and keeps the last row of every
    (``DriveSerialNumber``, ``date``) pair.

    Parameters
    ----------
    file : str
        File name relative to the module-level ``path``.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated frame, including the added ``date`` column.
    """
    print('Loading {}: '.format(file) +
          time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    t_begin = time.time()

    # Load only the columns the KDE step needs.
    frame = pd.read_csv(path + '/' + file, usecols=cols, dtype=types_dict)

    # 'date' is stored as a category to reduce memory use.
    frame['date'] = frame['TIMESTAMP'].str[:10].astype('category')

    # After sorting, the last row of each (serial number, date) pair is the latest one.
    frame = (
        frame.sort_values(by=['DriveSerialNumber', 'TIMESTAMP'])
             .drop_duplicates(subset=['DriveSerialNumber', 'date'], keep='last')
    )

    t_finish = time.time()
    print('Loading data {} done - Elapsed time: '.format(file) +
          f'{(t_finish-t_begin):.3f} seconds')
    return frame


In [1]:
import datetime
import os as os
import time

## MTP / pyspark 라이브러리 호출
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import types
from pyspark.sql.types import *
from pyspark.sql.functions import *
# from pyspark.sql.functions import lower, upper, col
# from pyspark.sql.functions import when
import numpy as np
from sklearn.neighbors import KernelDensity 
from sklearn import metrics
#import pandas as pd #as-is
from pyspark.sql import Row

TypeError: an integer is required (got type bytes)

In [16]:
# Timing check: print a wall-clock timestamp before and after a plain pandas
# read of one weekly SMART CSV (the recorded run below took roughly two minutes).
# NOTE(review): the import cell directly above has `import pandas as pd`
# commented out, so `pd` here comes from the first import cell.
print('start',datetime.datetime.now())
df = pd.read_csv('./PM963_2021_w34_SMART.csv')
print('fin',datetime.datetime.now())

start 2022-06-20 13:15:39.267133
fin 2022-06-20 13:17:33.524878


In [17]:
# Timing check: read a CSV through the Spark CSV reader (header row, inferred
# schema, malformed rows dropped) and print timestamps before and after.
# NOTE(review): `sqlContext` is not created in any visible cell; the recorded
# run of this cell stops with "NameError: name 'sqlContext' is not defined".
# NOTE(review): `file` is also not assigned at top level before this cell in
# the visible notebook - confirm which input file was intended.
print('start',datetime.datetime.now())
data = sqlContext.read.format('com.databricks.spark.csv')\
                .options(header='true', inferSchema='true')\
                .option("mode", "DROPMALFORMED")\
                .load(file)
print('fin',datetime.datetime.now())

start 2022-06-20 13:17:33.607658


NameError: name 'sqlContext' is not defined

In [None]:

## load_data_w_fw: load a file together with firmware info and filter out FW == ERRORMOD rows
def load_data_w_fw(file):
    """Load one CSV including ``FirmwareRevision`` and split off ERRORMOD rows.

    The file is read from ``'./ssd/' + file`` (not from the module-level
    ``path``) with the module-level ``cols`` plus ``FirmwareRevision``. Rows
    whose ``FirmwareRevision`` equals ``'ERRORMOD'`` are written to
    ``target_path + '/FW_ERRORMOD_<logitem>_<file>'`` and removed; the
    remaining rows are reduced to the last record per
    (``DriveSerialNumber``, ``date``).

    Parameters
    ----------
    file : str
        File name inside ``./ssd/``.

    Returns
    -------
    pandas.DataFrame
        Sorted, ERRORMOD-free, de-duplicated frame with an added categorical
        ``date`` column (first 10 characters of ``TIMESTAMP``).
    """
    # Extend the shared column list / dtype map with the firmware column.
    fw_columns = cols + ['FirmwareRevision']
    fw_dtypes = {'FirmwareRevision': 'category'}
    try:
        fw_dtypes.update(types_dict)
    except TypeError:  # types_dict is None: nothing to merge
        pass

    print('Loading {}: '.format(file) +
          time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    t_begin = time.time()

    # Load the columns needed for KDE plus the firmware revision.
    frame = pd.read_csv('./ssd/' + file, usecols=fw_columns, dtype=fw_dtypes, encoding='ISO-8859-1')

    # 'date' is stored as a category to reduce memory use.
    frame['date'] = frame['TIMESTAMP'].str[:10].astype('category')
    frame = frame.sort_values(by=['DriveSerialNumber', 'TIMESTAMP'])

    # Rows reporting firmware 'ERRORMOD' are saved to their own file ...
    errormod_rows = frame[frame['FirmwareRevision'] == 'ERRORMOD']
    errormod_rows.to_csv(target_path+ '/' +'FW_ERRORMOD_{}_{}'.format(logitem, file), index=False)

    # ... and excluded before keeping the last record of each serial number / date.
    frame = (
        frame.drop(index=errormod_rows.index)
             .drop_duplicates(subset=['DriveSerialNumber', 'date'], keep='last')
    )

    t_finish = time.time()
    print('Loading data {} done - Elapsed time: '.format(file) +
          f'{(t_finish-t_begin):.3f} seconds')
    return frame


In [None]:


## get_kde_dist: estimate the distribution (KDE) of one sample of values
def get_kde_dist(array, n_layers=None, sampling=None):
    """Return a value grid and a normalized cumulative curve for one sample.

    NaN entries are dropped, the sample is trimmed to its 0.01 - 99.99
    percentile range, standardized, and a Gaussian ``KernelDensity`` is
    evaluated on ``n_layers`` evenly spaced points between the trimmed
    minimum and maximum. The density is integrated with the trapezoid rule
    and divided by its maximum.

    Parameters
    ----------
    array : array-like of float
        One-dimensional sample; may contain NaN.
    n_layers : int, optional
        Number of grid points (at least 2). Defaults to the module-level
        ``num_of_density_layer``.
    sampling : bool, optional
        When true, the KDE is fitted on a seeded random draw (with
        replacement) of one tenth of the sample. Defaults to the
        module-level ``data_sampling``.

    Returns
    -------
    (x, y_cdf) : tuple of pandas.Series, each of length ``n_layers``
        ``x`` is the grid in the original units. ``y_cdf[k]`` is the
        normalized area under the first ``k + 2`` grid points (the last
        entry repeats the full area).
        If the sample has no non-NaN values both Series are all NaN.
        If the trimmed range is narrower than 1e-3 (standardized units) the
        KDE is skipped, ``x`` is constant and ``y_cdf`` is all NaN (0 / 0).
    """
    if n_layers is None:
        n_layers = num_of_density_layer
    if sampling is None:
        sampling = data_sampling

    values = np.asarray(array, dtype=float)
    values = values[~np.isnan(values)]

    # No observations at all: percentiles and min/max are undefined, so
    # report "no data" as NaN rows of the expected length.
    if values.size == 0:
        return (pd.Series(np.full(n_layers, np.nan), name='x'),
                pd.Series(np.full(n_layers, np.nan), name='y_cdf'))

    # Trim extreme tails (0.01 ~ 99.99 percentile) for a better density estimate.
    lower_b = np.percentile(values, 0.01)
    upper_b = np.percentile(values, 99.99)
    trimmed = values[(values >= lower_b) & (values <= upper_b)]
    # Interpolated percentiles lie strictly inside the data range, so a tiny
    # sample (e.g. two distinct points) can lose every point; keep it untrimmed.
    if trimmed.size == 0:
        trimmed = values
    trimmed = trimmed.reshape(-1, 1)

    avg = np.average(trimmed)
    std = np.std(trimmed)
    normed = (trimmed - avg) / std if std != 0 else trimmed

    grid = np.linspace(np.min(normed), np.max(normed), n_layers)[:, np.newaxis]

    # When the first and last grid values coincide, a KDE is meaningless and
    # only costs time, so it is skipped (tolerance covers floating-point noise).
    if grid[-1, 0] - grid[0, 0] > 1e-3:
        if sampling:
            flat = normed.reshape(-1)
            # Local generator: same draws as np.random.seed(0) followed by
            # np.random.choice, without resetting the global random state.
            rng = np.random.RandomState(0)
            n_draw = max(1, len(flat) // 10)  # never fit on an empty draw
            train = rng.choice(flat, n_draw).reshape(-1, 1)
        else:
            train = normed
        kde = KernelDensity(kernel='gaussian').fit(train)
        pdf = np.exp(kde.score_samples(grid))
    else:
        grid[:, 0] = grid[-1, 0]  # remove floating-point jitter
        pdf = np.ones(n_layers)

    # The grid is already ascending, so no sorting is needed.
    xs = grid[:, 0]
    # Cumulative trapezoid areas in a single pass.
    segment_area = np.diff(xs) * (pdf[1:] + pdf[:-1]) / 2.0
    cumulative = np.cumsum(segment_area)
    # NOTE(review): entry k covers grid points 0..k+1, i.e. the curve is
    # shifted by one step and the last two entries are equal. Kept as-is
    # because downstream code consumes these values - confirm intent.
    area = np.append(cumulative, cumulative[-1])

    density = pd.DataFrame({'x': xs, 'y_pdf': pdf})
    density['y_cdf'] = area
    density['y_cdf'] = density['y_cdf'] / np.max(density['y_cdf'])
    density['x'] = density['x'] * std + avg  # back to original units

    return density['x'], density['y_cdf']


# Build the Critical list
def get_crit_list(item, df_pivot):
    """Collect the index labels that violate the critical limits for ``item``.

    The rule row ``crit_warn_info.loc[item.lower()]`` is split on commas.
    A ``CRITICAL_LOW`` / ``CRITICAL_HIGH`` entry is applied only when its
    first field is ``'VALUE'``; its second field is then the numeric limit.

    Parameters
    ----------
    item : str
        Item name; looked up in lower case in ``crit_warn_info``.
    df_pivot : pandas.DataFrame
        Table whose columns are checked one by one (NaN cells are ignored).

    Returns
    -------
    pandas.Index
        Union of the index labels with a value below the low limit or above
        the high limit in any column.
    """
    rule = crit_warn_info.loc[item.lower()].str.split(',')
    flagged = pd.DataFrame().index
    for day in df_pivot.columns:
        observed = df_pivot[day].dropna()
        if rule.CRITICAL_LOW[0] == 'VALUE':
            too_low = observed[observed < float(rule.CRITICAL_LOW[1])]
            flagged = flagged.union(too_low.index)

        if rule.CRITICAL_HIGH[0] == 'VALUE':
            too_high = observed[observed > float(rule.CRITICAL_HIGH[1])]
            flagged = flagged.union(too_high.index)
    return flagged


# Build the Warning list
def get_warn_list(item, df_pivot):
    """Collect the index labels that violate the warning limits for ``item``.

    The rule row ``crit_warn_info.loc[item.lower()]`` is split on commas.
    For ``WARNING_LOW`` / ``WARNING_HIGH`` the first field selects the mode:

    * ``'PPM'``   - the second field is a parts-per-million tail size; the
      limit is the matching percentile of the column
      (``ppm / 1000000 * 100`` from the bottom, or from the top for HIGH).
    * ``'VALUE'`` - the second field is the numeric limit itself.
    * anything else - no limit on that side.

    Parameters
    ----------
    item : str
        Item name; looked up in lower case in ``crit_warn_info``.
    df_pivot : pandas.DataFrame
        Table whose columns are checked one by one (NaN cells are ignored).

    Returns
    -------
    pandas.Index
        Union of the index labels outside the limits in any column.
    """
    rule = crit_warn_info.loc[item.lower()].str.split(',')

    warn_list = pd.DataFrame().index
    for column in df_pivot.columns:
        temp = df_pivot[column].dropna()
        # A column without observations cannot flag anything, and
        # np.percentile is not defined for an empty sample.
        if temp.empty:
            continue

        low_mode = rule.WARNING_LOW[0]
        if low_mode == 'PPM':
            lower_b = np.percentile(temp, float(rule.WARNING_LOW[1]) / 1000000 * 100)
            warn_list = warn_list.union(temp[temp < lower_b].index)
        elif low_mode == 'VALUE':
            warn_list = warn_list.union(temp[temp < float(rule.WARNING_LOW[1])].index)

        high_mode = rule.WARNING_HIGH[0]
        if high_mode == 'PPM':
            upper_b = np.percentile(temp, 100 - float(rule.WARNING_HIGH[1]) / 1000000 * 100)
            warn_list = warn_list.union(temp[temp > upper_b].index)
        elif high_mode == 'VALUE':
            warn_list = warn_list.union(temp[temp > float(rule.WARNING_HIGH[1])].index)

    return warn_list


# Build the Warning list together with the per-column boundaries
def get_warn_list_boundary(item, df_pivot):
    """Collect warning violations and the limits that were applied per column.

    The rule row ``crit_warn_info.loc[item.lower()]`` is split on commas.
    For ``WARNING_LOW`` / ``WARNING_HIGH`` the first field selects the mode:

    * ``'PPM'``   - the limit is a percentile of the column
      (``ppm / 1000000 * 100`` from the bottom, or from the top for HIGH).
    * ``'VALUE'`` - the second field is the numeric limit itself.
    * anything else - no limit; NaN is recorded as the boundary.

    Parameters
    ----------
    item : str
        Item name; looked up in lower case in ``crit_warn_info``.
    df_pivot : pandas.DataFrame
        Table whose columns are checked one by one (NaN cells are ignored).

    Returns
    -------
    warn_list : pandas.Index
        Union of the index labels outside the limits in any column.
    warn_boundary : pandas.DataFrame
        Columns ``'Lower'`` and ``'Upper'`` with one row per column of
        ``df_pivot``, in column order. A percentile limit of a column
        without observations is recorded as NaN.
    """
    rule = crit_warn_info.loc[item.lower()].str.split(',')

    warn_list = pd.DataFrame().index
    lower_list = []
    upper_list = []
    for column in df_pivot.columns:
        temp = df_pivot[column].dropna()
        has_data = len(temp) > 0

        low_mode = rule.WARNING_LOW[0]
        if low_mode in ('PPM', 'VALUE'):
            if low_mode == 'VALUE':
                lower_b = float(rule.WARNING_LOW[1])
            elif has_data:
                lower_b = np.percentile(temp, float(rule.WARNING_LOW[1]) / 1000000 * 100)
            else:
                # np.percentile is not defined for an empty sample.
                lower_b = np.nan
            lower_list.append(lower_b)
            warn_list = warn_list.union(temp[temp < lower_b].index)
        else:
            lower_list.append(np.nan)

        high_mode = rule.WARNING_HIGH[0]
        if high_mode in ('PPM', 'VALUE'):
            if high_mode == 'VALUE':
                upper_b = float(rule.WARNING_HIGH[1])
            elif has_data:
                upper_b = np.percentile(temp, 100 - float(rule.WARNING_HIGH[1]) / 1000000 * 100)
            else:
                upper_b = np.nan
            upper_list.append(upper_b)
            warn_list = warn_list.union(temp[temp > upper_b].index)
        else:
            upper_list.append(np.nan)

    # One boundary row per column so that rows line up with df_pivot.columns.
    warn_boundary = pd.DataFrame({'Lower': lower_list, 'Upper': upper_list})
    return warn_list, warn_boundary


# process_selected_column:
# processes the data of one selected column
# (saves the Critical / Warning serial-number lists and the distribution data as CSV files)
# Revised 2021.09.08: added an option to skip the KDE step
def process_selected_column(selected_column, data, kde=True):
    """Write the critical/warning lists and KDE tables for one column.

    Files are written to ``target_path + '/' + selected_column`` and named
    ``<prefix><file>``, where ``file`` is the module-level name of the input
    file currently being processed:

    * ``Crit_``     - index labels returned by ``get_crit_list``
    * ``Warn_``     - index labels returned by ``get_warn_list_boundary``
    * ``Boundary_`` - the warning boundaries per date
    * ``Value_`` / ``Weight_`` (only when ``kde`` is true) - per date, the
      value grid and the cumulative curve returned by ``get_kde_dist``

    Parameters
    ----------
    selected_column : str
        Column of ``data`` to process.
    data : pandas.DataFrame
        Must contain ``DriveSerialNumber``, ``date`` and ``selected_column``,
        with at most one row per (``DriveSerialNumber``, ``date``) pair
        (required by ``pivot``).
    kde : bool, default True
        When false, only the Crit/Warn/Boundary files are written.
    """
    print('{}: '.format(selected_column) +
          time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))

    # Pivot: DriveSerialNumber x date
    sub_df = data[['DriveSerialNumber', 'date', selected_column]]
    sub_df_pivot = sub_df.pivot(index='DriveSerialNumber', columns='date', values=selected_column)

    crit_list = get_crit_list(selected_column, sub_df_pivot)
    # Critical serial numbers are excluded before the warning limits are computed.
    sub_df_pivot = sub_df_pivot.loc[~sub_df_pivot.index.isin(crit_list)]
    warn_list, warn_boundary = get_warn_list_boundary(selected_column, sub_df_pivot)

    # makedirs also creates a missing target_path and tolerates an existing directory.
    out_dir = target_path + '/' + selected_column
    os.makedirs(out_dir, exist_ok=True)

    pd.DataFrame(crit_list).to_csv(out_dir + '/Crit_' + file)
    pd.DataFrame(warn_list).to_csv(out_dir + '/Warn_' + file)
    warn_boundary.to_csv(out_dir + '/Boundary_' + file)

    if kde:
        # Warning serial numbers are excluded as well before density estimation.
        sub_df_pivot = sub_df_pivot.loc[~sub_df_pivot.index.isin(warn_list)]
        ary_normal = np.array(sub_df_pivot)

        day = ary_normal.shape[1]
        SDist = np.zeros((day, num_of_density_layer))   # per date: value grid from get_kde_dist
        weight = np.zeros((day, num_of_density_layer))  # per date: normalized cumulative curve

        for i in range(day):
            SDist[i,], weight[i,] = get_kde_dist(ary_normal[:, i])

        # The result files sit directly in out_dir with a 'Value_' / 'Weight_'
        # name prefix, so no 'Value_' / 'Weight_' sub-directories are needed.
        pd.DataFrame(SDist).to_csv(out_dir + '/Value_' + file)
        pd.DataFrame(weight).to_csv(out_dir + '/Weight_' + file)


def KDE_main(target_product, target_logitems, num_of_last_week):
    """Generate the Crit/Warn/KDE files for the most recent input files.

    Sets the module-level configuration used by the helper functions
    (``product``, ``logitem``, ``cols``, ``types_dict``, ``crit_warn_info``,
    ``file_list``, ``data_sampling``, ``num_of_density_layer``, ``file``),
    then, for each selected file in ``./ssd/``, loads it with
    ``load_data_w_fw`` and runs ``process_selected_column`` on every item.

    Parameters
    ----------
    target_product : str
        ``'PM963'``, ``'PM983'``, ``'PM1725b'`` or ``'PM953'``.
    target_logitems : str
        ``'SMART'`` or ``'Ext_SMART'`` (``'Ext_SMART'`` has no item list for
        ``'PM1725b'``).
    num_of_last_week : int
        Number of files to process, counted from the end of the sorted
        directory listing. Zero or a negative number processes nothing.

    Raises
    ------
    ValueError
        If no item list is defined for the product / log item combination.
    """
    global path, target_path, product, logitem, system_info, cols, types_dict, crit_warn_info, file_list, data_sampling, num_of_density_layer, file

    product = target_product
    logitem = target_logitems

    print('Start making data for KDE')

    # System information columns
    system_info = ['TIMESTAMP', 'Cluster', 'NodeId', 'Generation', 'HwSkuId',
                   'DriveProductId', 'DriveSerialNumber', 'FirmwareRevision']

    # SMART items
    item_smart = ['CritWarning', 'Temperature', 'AvailableSpare', 'AvailSpareThreshold',
                  'PercentageUsed', 'DataUnitsRead', 'DataUnitsWritten',
                  'HostReadCommands', 'HostWriteCommands', 'ControllerBusyTime',
                  'PowerCycles', 'PowerOnHours', 'UnsafeShutdowns', 'MediaErrors',
                  'NumErrInfoLogEntries', 'WarnCompositeTempTime',
                  'CritCompositeTempTime', 'TempSensor1', 'TempSensor2', 'TempSensor3',
                  'TempSensor4', 'TempSensor5', 'TempSensor6', 'TempSensor7',
                  'TempSensor8']

    # All Extended SMART items - items that get no KDE are listed at the end
    item_ext_smart = ['Media_Units_Written', 'ECC_Iterations', 'Wear_Range_Delta',
                      'Unaligned_IO', 'Mapped_LBAs', 'Program_Fail_Count', 'Erase_Fail_Count',
                      'Capacitor_Health', 'Supported_Features', 'Power_Consumption',
                      'Temperature_Throttling']

    ## Item list per product / log item
    # Revised 2021.09.08: Critical Warning and Capacitor Health are also processed.
    # Every SMART slice skips index 3 (AvailSpareThreshold).
    item_list = None
    if logitem == 'SMART':
        if product == 'PM963':  # PM963: temperature sensors up to TempSensor2
            item_list = item_smart[0:3] + item_smart[4:19]
        elif product in ['PM983', 'PM1725b']:  # PM983 & PM1725b: up to TempSensor3
            item_list = item_smart[0:3] + item_smart[4:20]
        elif product == 'PM953':  # PM953: NVMe 1.1 spec - up to NumErrInfoLogEntries
            item_list = item_smart[0:3] + item_smart[4:15]
    elif logitem == 'Ext_SMART':
        if product in ['PM963', 'PM983']:
            item_list = item_ext_smart[:8]
        elif product == 'PM953':  # PM953: supports fewer Extended SMART items
            item_list = [item_ext_smart[0]] + item_ext_smart[2:5]

    # Fail with a clear message instead of an UnboundLocalError further down.
    if item_list is None:
        raise ValueError(
            'Unsupported product / log item combination: {!r} / {!r}'.format(product, logitem))

    # Columns to load: TIMESTAMP, DriveSerialNumber and the selected items
    cols = [system_info[0]] + [system_info[6]] + item_list
    print(cols)
    # Dtypes of the telemetry items: set when needed (default None)
    types_dict = None

    # Critical / Warning rules of the product
    crit_warn_info = pd.read_csv('Anomaly_Rulebase_{}.csv'.format(product), index_col=0)

    # Input files. NOTE(review): `path` is only printed; the listing (and
    # load_data_w_fw) use the fixed './ssd/' directory.
    print('file path :', path)
    file_list = os.listdir('./ssd/')
    file_list.sort()

    # Option: fit the KDE on a sample of the data to shorten the run time
    data_sampling = False

    num_of_density_layer =  100

    ### Density Estimation - All

    # Keep the last `num_of_last_week` files; a plain `[-0:]` slice would
    # select every file, so a non-positive count is handled explicitly.
    file_list = file_list[-num_of_last_week:] if num_of_last_week > 0 else []

    # load_data_w_fw and process_selected_column write below target_path.
    os.makedirs(target_path, exist_ok=True)

    for file in file_list:
        print('Processing {}: '.format(file) +
              time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
        start = time.time()
        data = load_data_w_fw(file)  # applies the FW ERRORMOD filter

        for selected_column in item_list:
            process_selected_column(selected_column, data)
        del data  # release memory before the next file

        end = time.time()
        print('Processing {} done - Elapsed time: '.format(file) +
              f'{(end-start):.3f} seconds')

In [16]:
## Run the make_EDA_plot step
# This cell sets the notebook-level parameters and runs KDE_main for the SMART items.
# Lines marked "#as-is" are the previous stand-alone version, kept commented out.

## MTP / commented out because the modules were merged into this notebook
# import KDEprocess_SMART_2021_v3 as kde_f
# import Visualization_SMART_2021_v3 as viz_f

# Shared parameter
num_of_last_week = 2 # number of input files (one per week) to extract KDE data for

## MTP / building the ADLS path
# # Primary storage info
# account_name = 'stpockr001' # fill in your primary account name
# container_name = 'poc-001' # fill in your container name
# #relative_path = 'EDA/Telemetry_raw/Daily/To_Weekly/PM963/SMART/' # fill in your relative folder path
# relative_path = 'EDA/' # fill in your relative folder path

# adls_path = 'abfss://%s@%s.dfs.core.windows.net/%s' % (container_name, account_name, relative_path)
# print('Primary storage account path: ' + adls_path)

# ####### PM963

# ## Input parameters: product / log item
product = 'PM963' # PM963, PM983, PM1725b, PM953

# # Location and list of the files to read
# #path = 'F:/Telemetry_raw/Daily/To_Weekly/{}/SMART'.format(product) #as-is
# NOTE(review): KDE_main only prints `path`; it lists and reads files from './ssd/'.
path = './'
# ## MTP / newly added
# path = adls_path + 'Telemetry_raw/Daily/To_Weekly/{}/SMART'.format(product)
# pathList = path + "/*"
# print('pathList: ' + pathList)
# df_pathList = spark.read.format("csv").load(pathList)
# files = df_pathList.inputFiles()    
# file_list = files.sort(reverse=True)


# # Where the files generated for the KDE visualization are stored
target_path = './target/'
# target_path = 'F:/Telemetry_output/MSC/{}'.format(product) #as-is

# ## MTP / changed
# target_path = adls_path + 'Telemetry_output/MSC/{}'.format(product)
# print('path : ' + path)
# print('target_path : ' + target_path)

# #file_list = os.listdir(path).sort() #as-is
# ### Make Data for KDE estimation
#kde_f.path = path # location of the SMART data #as-is
#kde_f.target_path = target_path # where the generated KDE data is stored #as-is

## MTP / kde_f prefix removed because the modules were merged into this notebook
# kde_f.KDE_main(product,'SMART', num_of_last_week) #as-is
# kde_f.KDE_main(product,'Ext_SMART', num_of_last_week) #as-is
KDE_main(product,'SMART', num_of_last_week)
# KDE_main(product,'Ext_SMART', num_of_last_week)


Start making data for KDE
file path : ./
Processing PM963_2021_w34_SMART.csv: 2022-06-17 10:10:34
Loading PM963_2021_w34_SMART.csv: 2022-06-17 10:10:34


KeyboardInterrupt: 

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
from matplotlib import cm
import os
from matplotlib.ticker import FuncFormatter
import time
from pandas.plotting import register_matplotlib_converters


#### Shared variables
# path = None
# target_path = None

# System information columns
system_info = [
    'TIMESTAMP',
    'Cluster',
    'NodeId',
    'Generation',
    'HwSkuId',
    'DriveProductId',
    'DriveSerialNumber',
    'FirmwareRevision',
]

# SMART items: the named attributes followed by TempSensor1 .. TempSensor8
item_smart = [
    'CritWarning', 'Temperature', 'AvailableSpare', 'AvailSpareThreshold',
    'PercentageUsed',
    'DataUnitsRead', 'DataUnitsWritten',
    'HostReadCommands', 'HostWriteCommands',
    'ControllerBusyTime', 'PowerCycles', 'PowerOnHours',
    'UnsafeShutdowns', 'MediaErrors', 'NumErrInfoLogEntries',
    'WarnCompositeTempTime', 'CritCompositeTempTime',
] + ['TempSensor{}'.format(sensor_no) for sensor_no in range(1, 9)]

# All Extended SMART items - items that get no KDE are listed at the end
item_ext_smart = [
    'Media_Units_Written',
    'ECC_Iterations',
    'Wear_Range_Delta',
    'Unaligned_IO',
    'Mapped_LBAs',
    'Program_Fail_Count',
    'Erase_Fail_Count',
    'Capacitor_Health',
    'Supported_Features',
    'Power_Consumption',
    'Temperature_Throttling',
]


def millions(x,pos):
    """Format ``x`` scaled by 1e-6 with no decimals and an ``M`` suffix.

    ``pos`` is accepted but not used.
    """
    scaled = x * 1e-6
    return '{:1.0f}M'.format(scaled)

def kilo(x,pos):
    """Format ``x`` scaled by 1e-3 with no decimals and a ``K`` suffix.

    ``pos`` is accepted but not used.
    """
    scaled = x * 1e-3
    return '{:1.0f}K'.format(scaled)

# When one DriveSerialNumber has several SSDUIDs, keep only the rows of the SSDUID that occurs most often.
# Revised 2021.08.03: SSDUIDs are converted to upper case before counting.
def get_df_1ssduid(df):
    """Keep only the rows that carry the most frequent ``SSDUID`` of ``df``.

    ``SSDUID`` values are upper-cased before counting, so spellings that
    differ only in case count as the same id. The input frame is left
    unchanged; the upper-casing is applied to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``SSDUID``.

    Returns
    -------
    pandas.DataFrame
        Rows whose upper-cased ``SSDUID`` equals the most frequent one, with
        ``SSDUID`` in upper case. Empty (same columns) when ``df`` has no
        non-missing ``SSDUID``.
    """
    # assign() works on a copy, so the frame handed in is never modified.
    normalized = df.assign(SSDUID=df['SSDUID'].str.upper())
    counts = normalized['SSDUID'].value_counts()  # most frequent first, NaN excluded
    if counts.empty:
        # No usable SSDUID at all: nothing to select.
        return normalized.iloc[0:0]
    return normalized[normalized['SSDUID'] == counts.index[0]]


def Visualization_main(target_product):
    global path, target_path, product, system_info, item_smart, item_ext_smart, cols, crit_warn_info, item_list

    product = target_product

    # 결과 파일 (Abnormal Data / Figure) 저장 경로
    if os.path.exists(target_path+'/0_Abnormal_Data') == False:
        os.mkdir(target_path+'/0_Abnormal_Data')
    if os.path.exists(target_path+'/1_Figure') == False:
        os.mkdir(target_path+'/1_Figure')

    register_matplotlib_converters()


    file_list = os.listdir('./target')
    file_list.sort()

    if product == 'PM963':  # PM963: Temp Sensor 2까지 있음
        item_list = item_smart[1:3] + item_smart[4:19]
    elif product in ['PM983', 'PM1725b']:  # PM983 & PM1725b: Temp Sensor 3까지 있음
        item_list = item_smart[1:3] + item_smart[4:20]
    elif product == 'PM953':  # PM953: NVMe 1.1 spec - No. of Err Info Log Entries까지
        item_list = item_smart[1:3] + item_smart[4:15]

        # Extended SMART 항목 포함 (제품별 차이)
    if product in ['PM963', 'PM983']:
        item_list = item_list + item_ext_smart[:7]
    elif product == 'PM953':
        item_list = item_list + [item_ext_smart[0]] + item_ext_smart[2:5]


    # Telemetry 항목값의 Type 지정 : 필요 시 설정 (기본 None)
    types_dict = None

    # 데이터 전체 column 로드 시 category화할 column들
    cols_category = system_info[1:6] + [system_info[7]]
    crit_warn_info = pd.read_csv('./Anomaly_Rulebase_{}.csv'.format(product), index_col=0)#./ 추가220615
    colormap1 = cm.viridis



    ###### Visualization (Long / Short)
    # 최신 File로부터 모든 항목의 Critical/Warning SSD의 SN 가져오기
    crit_list = pd.DataFrame().index
    warn_list = pd.DataFrame().index

    for selected_column in item_list:
        if False == (os.path.exists(target_path + '/' + selected_column + '/')):
            os.mkdir(target_path + '/' + selected_column + '/')
        else:
            pass
        tmp = os.listdir(target_path + '/' + selected_column + '/')
        tmp.sort()
        all_file_list = pd.DataFrame(tmp)
        print(all_file_list,'allfile_____')##220615 추가
        crit_file_list = list(all_file_list[(all_file_list.loc[:, 0].str.endswith(".csv") == True) &
                                            (all_file_list.loc[:, 0].str.startswith("Crit") == True)].loc[:, 0])
        warn_file_list = list(all_file_list[(all_file_list.loc[:, 0].str.endswith(".csv") == True) &
                                            (all_file_list.loc[:, 0].str.startswith("Warn") == True)].loc[:, 0])
        crit = pd.read_csv(target_path + '/' + selected_column + '/' + crit_file_list[-1], index_col=1).index
        print(crit)
        crit_list = crit_list.union(crit)
        warn = pd.read_csv(target_path + '/' + selected_column + '/' + warn_file_list[-1], index_col=1).index
        print(warn)
        warn_list = warn_list.union(warn)

    ssd_list = crit_list.union(warn_list)

    # 모든 Critical/Warning drive들의 long-term data DataFrame
    start = time.time()
    data_total = pd.DataFrame()
    print('ssd_list',ssd_list)
    for file in file_list:#file_list
        print('Visualization_main - Loading filename {}: '.format(file) + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
        tmp_data = pd.DataFrame()
        for chunk in pd.read_csv('./ssd/PM963_2021_w34_SMART.csv', chunksize=1000000, dtype=types_dict):
            chunk = chunk[chunk['DriveSerialNumber'].isin(ssd_list)]
            chunk['date'] = chunk.TIMESTAMP.str[:10]
            tmp_data = pd.concat([tmp_data, chunk])
        # Category화를 통한 소요 메모리 절약
        for col in cols_category:
            tmp_data[col] = tmp_data[col].astype('category')
        tmp_data['date'] = tmp_data['date'].astype('category')
        tmp_data = tmp_data.sort_values(by=['DriveSerialNumber', 'TIMESTAMP'])  # Sort by: DriveSerialNumber, TIMESTAMP

        data_total = pd.concat([data_total, tmp_data])

    print('data_total',data_total)
    data_total = data_total.sort_values(by=['DriveSerialNumber', 'TIMESTAMP'])  # Sort by: DriveSerialNumber, TIMESTAMP
    end = time.time()
    print(f'Elapsed time : {(end-start):.3f} seconds')

    # groupby 사용해서 각 DriveSerialNumber에 대해 유일한 SSDUID만 남기기
    start = time.time()
    grouped = data_total.groupby(by='DriveSerialNumber')
    data_long_fixed = grouped.apply(get_df_1ssduid)
    data_long_fixed = data_long_fixed.reset_index(drop=True)
    end = time.time()
    print(f'Elapsed time : {(end-start):.3f} seconds')
    print('Total no. of rows in crit/warn drive data : {}'.format(len(data_long_fixed)))

    # 마지막 주 데이터의 날짜 중 가장 오래된 값 - short data의 기준 날짜
    short_datetime = tmp_data['date'].astype('datetime64[ns]').min()

    # Generate short-term Crit/Warn data
    #data_long_fixed['date'] = pd.to_datetime(data_long_fixed['date'])
    # aithe
    data_long_fixed['date'] = pd.to_datetime(data_long_fixed['date']).astype('datetime64[ns]')
    data_short = data_long_fixed[data_long_fixed['date'] >= short_datetime]
    

    for selected_column in item_list:
        print('Processing {}: '.format(selected_column) +
              time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
        start = time.time()

        ## Critical / Warning SN 정보 및 데이터 분포 정보 불러오기
        tmp = os.listdir(target_path +'/' + selected_column + '/')
        tmp.sort()
        tmp_info = crit_warn_info.loc[selected_column.lower()]
        all_file_list = pd.DataFrame(tmp)
        value_file_list = list(all_file_list[(all_file_list.loc[:, 0].str.endswith(".csv") == True) &
                                             (all_file_list.loc[:, 0].str.startswith("Value") == True)].loc[:, 0])
        weight_file_list = list(all_file_list[(all_file_list.loc[:, 0].str.endswith(".csv") == True) &
                                              (all_file_list.loc[:, 0].str.startswith("Weight") == True)].loc[:, 0])
        crit_file_list = list(all_file_list[(all_file_list.loc[:, 0].str.endswith(".csv") == True) &
                                            (all_file_list.loc[:, 0].str.startswith("Crit") == True)].loc[:, 0])
        warn_file_list = list(all_file_list[(all_file_list.loc[:, 0].str.endswith(".csv") == True) &
                                            (all_file_list.loc[:, 0].str.startswith("Warn") == True)].loc[:, 0])
        boundary_file_list = list(all_file_list[(all_file_list.loc[:, 0].str.endswith(".csv") == True) &
                                                (all_file_list.loc[:, 0].str.startswith("Boundary") == True)].loc[:, 0])

        # 데이터 구간 정보
        for file in value_file_list:
            value_temp = pd.read_csv(target_path + '/' + selected_column + '/' + file, index_col=0)
            if file == value_file_list[0]:
                value_tot = value_temp
            else:
                value_tot = pd.concat((value_tot, value_temp), axis=0)

        value_tot = np.array(value_tot.reset_index().iloc[:, 1:])

        value_short = value_temp
        value_short = np.array(value_short.reset_index().iloc[:, 1:])

        # 데이터 분포 정보
        for file in weight_file_list:
            weight_temp = pd.read_csv(target_path +'/' + selected_column + '/' + file, index_col=0)
            if file == weight_file_list[0]:
                weight_tot = weight_temp
            else:
                weight_tot = pd.concat((weight_tot, weight_temp), axis=0)

        weight_tot = np.array(weight_tot.reset_index().iloc[:, 1:])

        weight_short = weight_temp
        weight_short = np.array(weight_short.reset_index().iloc[:, 1:])

        # Critical / Warning 정보: 최신 데이터 사용
        crit_file = crit_file_list[-1]
        crit_tot = pd.read_csv(target_path +'/' + selected_column + '/' + crit_file, index_col=1).index

        warn_file = warn_file_list[-1]
        warn_tot = pd.read_csv(target_path + '/' + selected_column + '/' + warn_file, index_col=1).index
        print('0----------')
        print(target_path + '/' + selected_column + '/' + warn_file)
        print(warn_tot)
        # Long-term Crit/Warn for selected item
        sub_df = data_long_fixed[['DriveSerialNumber', 'date', selected_column]]
        sub_df = sub_df.drop_duplicates(subset=['DriveSerialNumber', 'date'], keep='last')

        sub_df_pivot = sub_df.pivot(index='DriveSerialNumber', columns='date', values=selected_column)
        xdate_tot = pd.Series(pd.unique(data_long_fixed['date']))
        xdate_tot = xdate_tot.sort_values()
        xdate_tot = xdate_tot.reset_index(drop=True)

        sub_df_crit_tot = sub_df_pivot[sub_df_pivot.index.isin(crit_tot)]
        sub_df_warn_tot = sub_df_pivot[sub_df_pivot.index.isin(warn_tot)]

        # Short-term Crit/Warn for selected item
        sub_df_short = data_short[['DriveSerialNumber', 'date', selected_column]]
        sub_df_short = sub_df_short.drop_duplicates(subset=['DriveSerialNumber', 'date'], keep='last')
        print('1----------')
        print(sub_df_short)
        sub_df_pivot_short = sub_df_short.pivot(index='DriveSerialNumber', columns='date', values=selected_column)
        xdate_short = pd.Series(pd.unique(data_short['date']))
        xdate_short = xdate_short.sort_values()
        xdate_short = xdate_short.reset_index(drop=True)

        sub_df_crit_short = sub_df_pivot_short[sub_df_pivot_short.index.isin(crit_tot)]
        sub_df_warn_short = sub_df_pivot_short[sub_df_pivot_short.index.isin(warn_tot)]
        print('2----------')
        print(sub_df_warn_short)
        print('21----------')
        print(warn_tot)
        # Filter Crit SN - Rulebase 정보 사용
        crit_low = tmp_info.CRITICAL_LOW.split(',')
        crit_high = tmp_info.CRITICAL_HIGH.split(',')
        if crit_low[0] == 'VALUE':
            if crit_high[0] == 'VALUE':
                sub_df_crit_temp = sub_df_crit_short[(sub_df_crit_short < float(crit_low[1]))
                                                     | (sub_df_crit_short > float(crit_high[1]))]
            else:
                sub_df_crit_temp = sub_df_crit_short[(sub_df_crit_short < float(crit_low[1]))]
        elif crit_high[0] == 'VALUE':
            sub_df_crit_temp = sub_df_crit_short[(sub_df_crit_short > float(crit_high[1]))]
        else:  # Crit 조건 없을 때
            sub_df_crit_temp = sub_df_crit_short

        sub_df_crit_temp = sub_df_crit_temp.dropna(axis=0, how='all')
        crit_tot_new = sub_df_crit_temp.index

        sub_df_crit_short = sub_df_crit_short.loc[crit_tot_new]
        sub_df_crit_tot = sub_df_crit_tot.loc[crit_tot_new]

        # Filter Warn SN - 별도 추출한 Warning boundary 정보 적용
        boundary_file = boundary_file_list[-1]
        boundary = pd.read_csv(target_path +'/' + selected_column + '/' + boundary_file)

        warn_low = tmp_info.WARNING_LOW.split(',')
        warn_high = tmp_info.WARNING_HIGH.split(',')

        sub_df_warn_temp = sub_df_warn_short.copy()

        if warn_low[0] != 'EMPTY':
            if warn_high[0] != 'EMPTY':
                for i in range(len(sub_df_warn_temp.columns)):
                    warn_col_temp = sub_df_warn_short.iloc[:, i]
                    sub_df_warn_temp.iloc[:, i] = warn_col_temp[(warn_col_temp < boundary['Lower'][i])
                                                                | (warn_col_temp > boundary['Upper'][i])]
            else:
                for i in range(len(sub_df_warn_temp.columns)):
                    warn_col_temp = sub_df_warn_short.iloc[:, i]
                    sub_df_warn_temp.iloc[:, i] = warn_col_temp[warn_col_temp < boundary['Lower'][i]]
        elif warn_high[0] != 'EMPTY':
            print("--------------")
            print(sub_df_warn_temp)
            print("len:",len(sub_df_warn_temp.columns))
            for i in range(len(sub_df_warn_temp.columns)):
                print(i)
                warn_col_temp = sub_df_warn_short.iloc[:, i]
                sub_df_warn_temp.iloc[:, i] = warn_col_temp[warn_col_temp > boundary['Upper'][i]]

        sub_df_warn_temp = sub_df_warn_temp.dropna(axis=0, how='all')
        warn_tot_new = sub_df_warn_temp.index

        sub_df_warn_short = sub_df_warn_short.loc[warn_tot_new]
        sub_df_warn_tot = sub_df_warn_tot.loc[warn_tot_new]

        print('warn length (org) : {}'.format(len(warn_tot)))
        print('warn length (fix) : {}'.format(len(warn_tot_new)))

        ## Last Week Abnormal Data Save (Excel File) #############################

        sub_df_raw = data_short.set_index('DriveSerialNumber')
        sub_df_crit_raw = sub_df_raw[sub_df_raw.index.isin(crit_tot_new)]
        sub_df_crit_raw.to_csv(target_path + '/0_Abnormal_Data/Crit_' + selected_column + '.csv')
        sub_df_warn_raw = sub_df_raw[sub_df_raw.index.isin(warn_tot_new)]
        sub_df_warn_raw.to_csv(target_path + '/0_Abnormal_Data/Warn_' + selected_column + '.csv')

        ##########################################################################

        day = weight_tot.shape[0]
        n = weight_tot.shape[1]

        day_short = weight_short.shape[0]
        n_short = weight_short.shape[1]

        tmp = [[weight_tot[j, i + 1] - weight_tot[j, i] for i in range(n - 1)] for j in range(day - 1)]
        if not np.isnan(tmp).all():
            normalize = np.nanmax(tmp)  # 전체 weight 중 NaN 제외한 최대값
            weight_tot = np.nan_to_num(weight_tot)  # weight 중 일부만 NaN일 경우 해당 값들을 0으로 초기화
        else:
            normalize = np.nan  # 모든 값이 NaN일 경우 NaN

        tmp_short = [[weight_short[j, i + 1] - weight_short[j, i] for i in range(n_short - 1)] for j in
                     range(day_short - 1)]
        if not np.isnan(tmp_short).all():
            normalize_short = np.nanmax(tmp_short)
            weight_short = np.nan_to_num(weight_short)
        else:
            normalize_short = np.nan

        if tmp_info.Y_FORMAT == 'TB':
            value_tot = value_tot * 512 * 1000 / (1024 ** 4)
            value_short = value_short * 512 * 1000 / (1024 ** 4)

            sub_df_crit_tot = sub_df_crit_tot * 512 * 1000 / (1024 ** 4)
            sub_df_crit_short = sub_df_crit_short * 512 * 1000 / (1024 ** 4)

            sub_df_warn_tot = sub_df_warn_tot * 512 * 1000 / (1024 ** 4)
            sub_df_warn_short = sub_df_warn_short * 512 * 1000 / (1024 ** 4)

        heatmap_color1 = np.zeros((day - 1, n - 1))
        heatmap_opacity = np.zeros((day - 1, n - 1))

        ## axis zoom range 설정:
        # upper bound가 주어지지 않은 항목들에 대한 조건 설정 및
        # upper bound가 주어진 경우 실제 density heatmap 구간의 최대값과 비교하여 axis_max 설정
        # Long / Short 간 zoom in 구간 분리
        value_max_tot = np.amax(value_tot)  # density heatmap 구간의 최대값
        if not np.isnan(tmp_info.Y_MAX):
            if selected_column.startswith('Temp') == True:  # 온도의 경우 항상 지정된 범위로 설정
                value_max_tot = tmp_info.Y_MAX
            else:
                value_max_tot = min(value_max_tot, tmp_info.Y_MAX)
        else:
            if value_max_tot <= tmp_info.Y_MIN:  # density 구간 고정이며 0보다 작거나 같은 경우
                value_max_tot = 200
        axis_max_tot = value_max_tot + (value_max_tot - tmp_info.Y_MIN) * 0.1
        axis_min_tot = tmp_info.Y_MIN - (value_max_tot - tmp_info.Y_MIN) * 0.1

        value_max_short = np.amax(value_short)  # density heatmap 구간의 최대값
        if not np.isnan(tmp_info.Y_MAX):
            if selected_column.startswith('Temp') == True:  # 온도의 경우 항상 지정된 범위로 설정
                value_max_short = tmp_info.Y_MAX
            else:
                value_max_short = min(value_max_short, tmp_info.Y_MAX)
        else:
            if value_max_short <= tmp_info.Y_MIN:  # density 구간 고정이며 0보다 작거나 같은 경우
                value_max_short = 200
        axis_max_short = value_max_short + (value_max_short - tmp_info.Y_MIN) * 0.1
        axis_min_short = tmp_info.Y_MIN - (value_max_short - tmp_info.Y_MIN) * 0.1

        ##Long Term Plot (ax1 : Whole, ax2 : Zoom in)
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharex=True, figsize=(12, 3), dpi=100)

        # Plot for warning SSDs
        for ind in sub_df_warn_tot.index:
            tmp_warn = sub_df_warn_tot.loc[ind]
            tmp_warn_df = pd.DataFrame(
                {'tmp_warn_x': np.arange(0, day, 1), 'tmp_warn_y': tmp_warn[:tmp_warn.size]}).dropna()

            if len(tmp_warn_df) == 1:  # Use marker if only one value exists
                ax1.plot(pd.to_datetime(tmp_warn_df.index), tmp_warn_df.tmp_warn_y, color='grey', linewidth=0.5,
                         zorder=0,
                         marker='o', markersize=1)
            else:
                ax1.plot(pd.to_datetime(tmp_warn_df.index), tmp_warn_df.tmp_warn_y, color='grey', linewidth=0.5,
                         zorder=0)

            if (selected_column.startswith('Temp') == True or
                    selected_column.startswith('Avail') == True or
                    np.isnan(normalize) == True):
                if len(tmp_warn_df) == 1:  # Use marker if only one value exists
                    ax2.plot(pd.to_datetime(tmp_warn_df.index), tmp_warn_df.tmp_warn_y, color='grey', linewidth=0.5,
                             zorder=0,
                             marker='o', markersize=1)
                else:
                    ax2.plot(pd.to_datetime(tmp_warn_df.index), tmp_warn_df.tmp_warn_y, color='grey', linewidth=0.5,
                             zorder=0)

        # Plot for critical SSDs
        for ind in sub_df_crit_tot.index:
            tmp_crit = sub_df_crit_tot.loc[ind]
            tmp_crit_df = pd.DataFrame(
                {'tmp_crit_x': np.arange(0, day, 1), 'tmp_crit_y': tmp_crit[:tmp_crit.size]}).dropna()
            if len(tmp_crit_df) == 1:  # Use marker if only one value exists
                ax1.plot(pd.to_datetime(tmp_crit_df.index), tmp_crit_df.tmp_crit_y, color='red', linewidth=0.5,
                         zorder=20,
                         marker='o', markersize=1)
            else:
                ax1.plot(pd.to_datetime(tmp_crit_df.index), tmp_crit_df.tmp_crit_y, color='red', linewidth=0.5,
                         zorder=20)

            if (selected_column.startswith('Temp') == True or
                    selected_column.startswith('Avail') == True or
                    np.isnan(normalize) == True):
                if len(tmp_crit_df) == 1:  # Use marker if only one value exists
                    ax2.plot(pd.to_datetime(tmp_crit_df.index), tmp_crit_df.tmp_crit_y, color='red', linewidth=0.5,
                             zorder=20,
                             marker='o', markersize=1)
                else:
                    ax2.plot(pd.to_datetime(tmp_crit_df.index), tmp_crit_df.tmp_crit_y, color='red', linewidth=0.5,
                             zorder=20)

        # Heatmap plot
        if not np.isnan(normalize):  # Density 존재하지 않는 항목에 대해 heatmap 그리지 않도록 함
            for j in range(day - 1):
                for i in range(n - 1):
                    heatmap_color1[j, i] = ((1 - (weight_tot[j, i + 1] - weight_tot[
                        j, i]) / normalize)) * 0.8 + 0.2  # darkblue 0.2 ~ yellow 1.0
                    heatmap_opacity[j, i] = (weight_tot[j, i + 1] - weight_tot[j, i]) / normalize
                    ax1.fill_between((xdate_tot[j], xdate_tot[j + 1]), value_tot[(j, j + 1), i],
                                     value_tot[(j, j + 1), (i + 1)],
                                     facecolor=colormap1(heatmap_color1[j, i])[:3] + (
                                     heatmap_opacity[j, i] * 0.9 + 0.1,),
                                     edgecolor=colormap1(heatmap_color1[j, i])[:3] + (
                                     heatmap_opacity[j, i] * 0.9 + 0.1,), zorder=15)

                    ax2.fill_between((xdate_tot[j], xdate_tot[j + 1]), value_tot[(j, j + 1), i],
                                     value_tot[(j, j + 1), (i + 1)],
                                     facecolor=colormap1(heatmap_color1[j, i])[:3] + (
                                     heatmap_opacity[j, i] * 0.9 + 0.1,),
                                     edgecolor=colormap1(heatmap_color1[j, i])[:3] + (
                                     heatmap_opacity[j, i] * 0.9 + 0.1,), zorder=15)

        # Warn / Crit / Heatmap 모두 존재하지 않을 경우
        elif (sub_df_crit_tot.size == 0) and (sub_df_warn_tot.size == 0) and np.isnan(normalize):
            if selected_column.startswith('Avail') == True:
                ax1.plot(xdate_tot, [100 for i in range(len(xdate_tot))], color='black', linewidth=1)
                ax2.plot(xdate_tot, [100 for i in range(len(xdate_tot))], color='black', linewidth=1)
            else:
                ax1.plot(xdate_tot, [0 for i in range(len(xdate_tot))], color='black', linewidth=1)
                ax2.plot(xdate_tot, [0 for i in range(len(xdate_tot))], color='black', linewidth=1)

        ax1.set_title(selected_column + ' (Whole Trend)', fontsize=15)
        ax1.tick_params(labelsize=10)
        ax1.set_xlabel('Date', fontsize=13)
        if tmp_info.Y_FORMAT == 'TB':
            ax1.set_ylabel(selected_column + ' (TB)', fontsize=13)
        else:
            ax1.set_ylabel(selected_column, fontsize=13)

        ax2.set_title(selected_column + ' (Zoom In)', fontsize=15)
        ax2.tick_params(labelsize=10)
        ax2.set_xlabel('Date', fontsize=13)
        if tmp_info.Y_FORMAT == 'TB':
            ax2.set_ylabel(selected_column + ' (TB)', fontsize=13)
        else:
            ax2.set_ylabel(selected_column, fontsize=13)
        ax2.set_ylim(axis_min_tot, axis_max_tot)

        # y축 상 과도하게 큰 숫자는 scientific format으로 변경
        ax1.ticklabel_format(style='sci', axis='y', scilimits=(-3, 6), useOffset=False)
        ax2.ticklabel_format(style='sci', axis='y', scilimits=(-3, 6), useOffset=False)

        # Host Read/Write의 Million format은 위의 scientific format으로 대체
        #     ax1.ticklabel_format(style = 'plain', axis='y', useOffset=False)
        #     ax2.ticklabel_format(style = 'plain', axis='y', useOffset=False)

        #     if tmp_info.Y_FORMAT == "M":
        #         formatter = FuncFormatter(millions)
        #         ax1.yaxis.set_major_formatter(formatter)
        #         ax2.yaxis.set_major_formatter(formatter)

        # Customize xticks for long-term plot
        xticks = ax1.get_xticks()
        ax1.set_xticks(np.linspace(xticks[0], xticks[-1], 7))

        ax1.xaxis.set_major_formatter(DateFormatter('%b %d'))
        ax2.xaxis.set_major_formatter(DateFormatter('%b %d'))

        fig.tight_layout()
        fig.savefig(target_path +'/1_Figure/' + selected_column + '_LongTerm_Trend.png', dpi=100)

        plt.close(fig)
        fig.clf()

        ##Short Term
        fig_short, (ax3, ax4) = plt.subplots(nrows=1, ncols=2, sharex=True, figsize=(12, 4.6), dpi=100)

        # Plot for warning SSDs
        for ind in sub_df_warn_short.index:
            tmp_warn = sub_df_warn_short.loc[ind]
            tmp_warn_df = pd.DataFrame(
                {'tmp_warn_x': np.arange(0, day_short, 1), 'tmp_warn_y': tmp_warn[:tmp_warn.size]}).dropna()
            if len(tmp_warn_df) == 1:  # Use marker if only one value exists
                ax3.plot(pd.to_datetime(tmp_warn_df.index), tmp_warn_df.tmp_warn_y, color='grey', linewidth=0.5,
                         zorder=0,
                         marker='o', markersize=1)
            else:
                ax3.plot(pd.to_datetime(tmp_warn_df.index), tmp_warn_df.tmp_warn_y, color='grey', linewidth=0.5,
                         zorder=0)

            if (selected_column.startswith('Temp') == True or
                    selected_column.startswith('Avail') == True or
                    np.isnan(normalize_short) == True):
                if len(tmp_warn_df) == 1:  # Use marker if only one value exists
                    ax4.plot(pd.to_datetime(tmp_warn_df.index), tmp_warn_df.tmp_warn_y, color='grey', linewidth=0.5,
                             zorder=0,
                             marker='o', markersize=1)
                else:
                    ax4.plot(pd.to_datetime(tmp_warn_df.index), tmp_warn_df.tmp_warn_y, color='grey', linewidth=0.5,
                             zorder=0)

        # Plot for critical SSDs
        for ind in sub_df_crit_short.index:
            tmp_crit = sub_df_crit_short.loc[ind]
            tmp_crit_df = pd.DataFrame(
                {'tmp_crit_x': np.arange(0, day_short, 1), 'tmp_crit_y': tmp_crit[:tmp_crit.size]}).dropna()
            if len(tmp_crit_df) == 1:  # Use marker if only one value exists
                ax3.plot(pd.to_datetime(tmp_crit_df.index), tmp_crit_df.tmp_crit_y, color='red', linewidth=0.5,
                         zorder=20,
                         marker='o', markersize=1)
            else:
                ax3.plot(pd.to_datetime(tmp_crit_df.index), tmp_crit_df.tmp_crit_y, color='red', linewidth=0.5,
                         zorder=20)

            if (selected_column.startswith('Temp') == True or
                    selected_column.startswith('Avail') == True or
                    np.isnan(normalize_short) == True):
                if len(tmp_crit_df) == 1:  # Use marker if only one value exists
                    ax4.plot(pd.to_datetime(tmp_crit_df.index), tmp_crit_df.tmp_crit_y, color='red', linewidth=0.5,
                             zorder=20,
                             marker='o', markersize=1)
                else:
                    ax4.plot(pd.to_datetime(tmp_crit_df.index), tmp_crit_df.tmp_crit_y, color='red', linewidth=0.5,
                             zorder=20)

        # Heatmap plot
        if not np.isnan(normalize_short):  # Density 존재하지 않는 항목에 대해 heatmap 그리지 않도록 함
            for j in range(day_short - 1):
                for i in range(n_short - 1):
                    heatmap_color1[j, i] = ((1 - (weight_short[j, i + 1] - weight_short[
                        j, i]) / normalize_short)) * 0.8 + 0.2  # darkblue 0.2 ~ yellow 1.0
                    heatmap_opacity[j, i] = (weight_short[j, i + 1] - weight_short[j, i]) / normalize_short

                    ax3.fill_between((xdate_short[j], xdate_short[j + 1]), value_short[(j, j + 1), i],
                                     value_short[(j, j + 1), (i + 1)],
                                     facecolor=colormap1(heatmap_color1[j, i])[:3] + (
                                     heatmap_opacity[j, i] * 0.9 + 0.1,),
                                     edgecolor=colormap1(heatmap_color1[j, i])[:3] + (
                                     heatmap_opacity[j, i] * 0.9 + 0.1,), zorder=15)

                    ax4.fill_between((xdate_short[j], xdate_short[j + 1]), value_short[(j, j + 1), i],
                                     value_short[(j, j + 1), (i + 1)],
                                     facecolor=colormap1(heatmap_color1[j, i])[:3] + (
                                     heatmap_opacity[j, i] * 0.9 + 0.1,),
                                     edgecolor=colormap1(heatmap_color1[j, i])[:3] + (
                                     heatmap_opacity[j, i] * 0.9 + 0.1,), zorder=15)

        # Warn / Crit / Heatmap 모두 존재하지 않을 경우
        elif (sub_df_crit_short.size == 0) and (sub_df_warn_short.size == 0) and np.isnan(normalize_short):
            if selected_column.startswith('Avail') == True:
                ax3.plot(xdate_short, [100 for i in range(len(xdate_short))], color='black', linewidth=1)
                ax4.plot(xdate_short, [100 for i in range(len(xdate_short))], color='black', linewidth=1)
            else:
                ax3.plot(xdate_short, [0 for i in range(len(xdate_short))], color='black', linewidth=1)
                ax4.plot(xdate_short, [0 for i in range(len(xdate_short))], color='black', linewidth=1)

        ax3.set_title(selected_column + ' (Whole Trend)', fontsize=16)
        ax3.tick_params(labelsize=11)
        ax3.set_xlabel('Date', fontsize=14)
        if tmp_info.Y_FORMAT == 'TB':
            ax3.set_ylabel(selected_column + ' (TB)', fontsize=14)
        else:
            ax3.set_ylabel(selected_column, fontsize=14)

        ax4.set_title(selected_column + ' (Zoom In)', fontsize=16)
        ax4.tick_params(labelsize=11)
        ax4.set_xlabel('Date', fontsize=14)
        if tmp_info.Y_FORMAT == 'TB':
            ax4.set_ylabel(selected_column + ' (TB)', fontsize=14)
        else:
            ax4.set_ylabel(selected_column, fontsize=14)
        ax4.set_ylim(axis_min_short, axis_max_short)

        # y축 상 과도하게 큰 숫자는 scientific format으로 변경
        ax3.ticklabel_format(style='sci', axis='y', scilimits=(-3, 6), useOffset=False)
        ax4.ticklabel_format(style='sci', axis='y', scilimits=(-3, 6), useOffset=False)

        # Host Read/Write의 Million format은 위의 scientific format으로 대체
        #     ax3.ticklabel_format(style = 'plain', axis='y', useOffset=False)
        #     ax4.ticklabel_format(style = 'plain', axis='y', useOffset=False)

        #     if tmp_info.Y_FORMAT == "M":
        #         formatter = FuncFormatter(millions)
        #         ax3.yaxis.set_major_formatter(formatter)
        #         ax4.yaxis.set_major_formatter(formatter)

        # Customize xticks for short-term plot
        xticks = ax3.get_xticks()
        day_count = int(xticks[-1] - xticks[0]) + 1
        ax3.set_xticks(np.linspace(xticks[0], xticks[-1], day_count))

        ax3.xaxis.set_major_formatter(DateFormatter('%m-%d'))
        ax4.xaxis.set_major_formatter(DateFormatter('%m-%d'))

        fig_short.tight_layout()
        print('shortterm_trend')
        fig_short.savefig(target_path +'/1_Figure/' + selected_column + '_ShortTerm_Trend.png', dpi=100)

        plt.close(fig_short)
        fig_short.clf()

        end = time.time()
        print('Processing {} done - Elapsed time: '.format(selected_column) +
              f'{(end-start):.3f} seconds')




    #### Visualization - Critical Warning / Capacitor Health (Short Term)
    # items - SMART: Critical Warning, Ext_SMART: Capacitor Health
    # item 이름 및 정상 조건
    if product in ['PM963', 'PM983', 'PM953']:
        item_sp = ['CritWarning', 'Capacitor_Health']
    else:
        item_sp = ['CritWarning']

    cols_sp = system_info + ['SSDUID'] + item_sp

    # 먼저 last data 중에서 Critical SN 목록을 정리
    print('Loading last data file {}: '.format(file_list[-1]) + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    start = time.time()

    sn_critwarning = pd.Series(dtype='object')
    sn_capfail = pd.Series(dtype='object')
    for chunk in pd.read_csv(path + '/' + file_list[-1], chunksize=1000000, usecols=cols_sp, dtype=types_dict):
        chunk_critwarning = pd.Series(dtype='object')
        chunk_capfail = pd.Series(dtype='object')
        for item in item_sp:
            if item == 'CritWarning':
                default = 0
                item_crit = pd.Series(
                    chunk[(~chunk[item].isna()) & (chunk[item] != default)]['DriveSerialNumber'].unique())
                chunk_critwarning = pd.concat([chunk_critwarning, item_crit])
            elif item == 'Capacitor_Health':
                default = 100
                item_crit = pd.Series(
                    chunk[(~chunk[item].isna()) & (chunk[item] != default)]['DriveSerialNumber'].unique())
                chunk_capfail = pd.concat([chunk_capfail, item_crit])

        sn_critwarning = pd.concat([sn_critwarning, chunk_critwarning])
        sn_capfail = pd.concat([sn_capfail, chunk_capfail])

    sn_critwarning = pd.Series(sn_critwarning.unique())
    sn_capfail = pd.Series(sn_capfail.unique())

    sn_sp_total = pd.Series(pd.concat([sn_critwarning, sn_capfail]).unique())

    end = time.time()
    print(f'Elapsed time : {(end-start):.3f} seconds')


    # 모든 데이터들에서 Critical SN DataFrame 불러오기
    start = time.time()
    data_sp = pd.DataFrame()

    for file in file_list:
        print('Loading filename {}: '.format(file) + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
        tmp_data = pd.DataFrame()
        for chunk in pd.read_csv(path + '/' + file, chunksize=1000000, dtype=types_dict):
            chunk = chunk[chunk['DriveSerialNumber'].isin(sn_sp_total)]
            chunk['date'] = chunk.TIMESTAMP.str[:10]
            tmp_data = pd.concat([tmp_data, chunk])
        tmp_data = tmp_data.sort_values(by=['DriveSerialNumber', 'TIMESTAMP'])  # Sort by: DriveSerialNumber, TIMESTAMP

        data_sp = pd.concat([data_sp, tmp_data])
    data_sp = data_sp.sort_values(by=['DriveSerialNumber', 'TIMESTAMP'])  # Sort by: DriveSerialNumber, TIMESTAMP
    end = time.time()
    print(f'Elapsed time : {(end-start):.3f} seconds')

    # groupby 사용해서 각 DriveSerialNumber에 대해 유일한 SSDUID만 남기기
    grouped = data_sp.groupby(by='DriveSerialNumber')
    data_sp_long_fixed = grouped.apply(get_df_1ssduid)
    data_sp_long_fixed = data_sp_long_fixed.reset_index(drop=True)

    short_datetime = tmp_data['date'].astype('datetime64[ns]').min()

    # Generate short-term Crit/Warn data
    data_sp_long_fixed['date'] = pd.to_datetime(data_sp_long_fixed['date'])
    data_sp_short = data_sp_long_fixed[data_sp_long_fixed['date'] >= short_datetime]

    ## Abnormal Data Save & Short Term Plot
    for item in item_sp:
        sub_df = data_sp_short[['DriveSerialNumber', 'date', item]]
        sub_df = sub_df.drop_duplicates(subset=['DriveSerialNumber', 'date'], keep='last')

        sub_df_pivot = sub_df.pivot(index='DriveSerialNumber', columns='date', values=item)

        if item == 'CritWarning':
            crit_tot = sn_critwarning
            default = 0
        elif item == 'Capacitor_Health':
            crit_tot = sn_capfail
            default = 100

        sub_df_crit = sub_df_pivot[sub_df_pivot.index.isin(crit_tot)]

        # Filter Crit SN
        sub_df_crit_temp = sub_df_crit[sub_df_crit != default]

        sub_df_crit_temp = sub_df_crit_temp.dropna(axis=0, how='all')
        crit_tot_new = sub_df_crit_temp.index

        print('Before filtering: {}'.format(len(crit_tot)))
        print('After filtering: {}'.format(len(crit_tot_new)))

        sub_df_crit = sub_df_crit.loc[crit_tot_new]

        ## Last Week Abnormal Data Save (Excel File) #############################

        sub_df_raw = data_sp_short.set_index('DriveSerialNumber')
        sub_df_crit_raw = sub_df_raw[sub_df_raw.index.isin(crit_tot_new)]
        sub_df_crit_raw.to_csv(target_path +'/0_Abnormal_Data/Crit_' + item + '.csv')

        ##########################################################################

        fig, axw = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(6, 4.6), dpi=100)
        for ind in sub_df_crit.index:
            tmp_crit = sub_df_crit.loc[ind]
            tmp_crit_df = pd.DataFrame({'tmp_crit_y': tmp_crit[:tmp_crit.size]})
            axw.plot(pd.to_datetime(tmp_crit_df.index), tmp_crit_df.tmp_crit_y, color='red', linewidth=0.5, zorder=20)
        axw.plot(pd.to_datetime(tmp_crit_df.index), [default for i in range(len(tmp_crit_df.index))],
                 color='black', linewidth=1, zorder=25)
        axw.xaxis.set_major_formatter(DateFormatter('%m-%d'))

        axw.set_title('{} (Whole Trend)'.format(item), fontsize=16)
        axw.tick_params(labelsize=11)
        axw.set_xlabel('Date', fontsize=14)
        axw.set_ylabel(item, fontsize=14)

        if item == 'CritWarning':
            axw.set_ylim(-0.5, 33)
            axw.set_yticks((0, 2, 4, 8, 16, 32))

        # Customize xticks for short-term plot
        xticks = axw.get_xticks()
        day_count = int(xticks[-1] - xticks[0]) + 1
        axw.set_xticks(np.linspace(xticks[0], xticks[-1], day_count))

        fig.tight_layout()
        fig.savefig(target_path +'/1_Figure/{}_ShortTerm_Trend.png'.format(item), dpi=100)

    ## Long Term Plot (Optional)
    for item in item_sp:
        sub_df = data_sp_long_fixed[['DriveSerialNumber', 'date', item]]
        sub_df = sub_df.drop_duplicates(subset=['DriveSerialNumber', 'date'], keep='last')

        sub_df_pivot = sub_df.pivot(index='DriveSerialNumber', columns='date', values=item)

        if item == 'CritWarning':
            crit_tot = sn_critwarning
            default = 0
        elif item == 'Capacitor_Health':
            crit_tot = sn_capfail
            default = 100

        sub_df_crit = sub_df_pivot[sub_df_pivot.index.isin(crit_tot)]

        # Filter Crit SN
        sub_df_crit_temp = sub_df_crit[sub_df_crit != default]

        sub_df_crit_temp = sub_df_crit_temp.dropna(axis=0, how='all')
        crit_tot_new = sub_df_crit_temp.index

        print('Before filtering: {}'.format(len(crit_tot)))
        print('After filtering: {}'.format(len(crit_tot_new)))

        sub_df_crit = sub_df_crit.loc[crit_tot_new]

        fig, axw = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(6, 3), dpi=100)
        for ind in sub_df_crit.index:
            tmp_crit = sub_df_crit.loc[ind]
            tmp_crit_df = pd.DataFrame({'tmp_crit_y': tmp_crit[:tmp_crit.size]})
            axw.plot(pd.to_datetime(tmp_crit_df.index), tmp_crit_df.tmp_crit_y, color='red', linewidth=0.5, zorder=20)
        axw.plot(pd.to_datetime(tmp_crit_df.index), [default for i in range(len(tmp_crit_df.index))],
                 color='black', linewidth=1, zorder=25)
        axw.xaxis.set_major_formatter(DateFormatter('%m-%d'))

        axw.set_title('{} (Whole Trend)'.format(item), fontsize=16)
        axw.tick_params(labelsize=11)
        axw.set_xlabel('Date', fontsize=14)
        axw.set_ylabel(item, fontsize=14)

        if item == 'CritWarning':
            axw.set_ylim(-0.5, 33)
            axw.set_yticks((0, 2, 4, 8, 16, 32))

        fig.tight_layout()
        fig.savefig(target_path + '/1_Figure/{}_LongTerm_Trend.png'.format(item), dpi=100)


###### Visualization (Histogram)
def Visualization_hist(target_product, target_logitem):
    """Draw last-snapshot histograms and bar charts for one product / log item.

    Reads the last file (by sorted name) in the global ``path``, keeps the
    latest row per ``DriveSerialNumber``, and saves the following figures
    under ``target_path + '/1_Figure/'``:

    * ``<item>_Histogram.png`` for every item in the product-specific item
      list (``Wear_Range_Delta`` is excluded and drawn as a bar chart instead);
    * ``<item>_Barchart.png`` for the special items: ``CritWarning`` for
      SMART, ``Wear_Range_Delta`` and ``Capacitor_Health`` for Ext_SMART.

    Axis ranges and the optional TB conversion are driven by the ``Y_MIN``,
    ``Y_MAX`` and ``Y_FORMAT`` columns of ``Anomaly_Rulebase_<product>.csv``.

    Side effects: overwrites the globals ``product`` and ``logitem`` and
    creates ``target_path/<item>/`` and ``target_path/1_Figure/`` if missing.

    Args:
        target_product: Product name. Supported: 'PM963', 'PM983', 'PM1725b',
            'PM953' for SMART; 'PM963', 'PM983', 'PM953' for Ext_SMART.
        target_logitem: Either 'SMART' or 'Ext_SMART'.

    Raises:
        ValueError: If the log item or the product/log-item combination is
            not supported.
        FileNotFoundError: If ``path`` contains no files.
    """
    global path, target_path, product, logitem, system_info, item_smart, item_ext_smart

    product = target_product
    logitem = target_logitem

    # Validate up front so a typo fails with a clear message instead of an
    # UnboundLocalError further down (and before any file I/O is attempted).
    if logitem not in ('SMART', 'Ext_SMART'):
        raise ValueError(
            "Unsupported logitem {!r}: expected 'SMART' or 'Ext_SMART'".format(logitem))

    # Item list used for the histograms (kept separately from the trend plots)
    item_list_hist = None
    if logitem == 'SMART':
        if product == 'PM963':  # PM963: up to Temp Sensor 2
            item_list_hist = item_smart[1:3] + item_smart[4:19]
        elif product in ['PM983', 'PM1725b']:  # PM983 & PM1725b: up to Temp Sensor 3
            item_list_hist = item_smart[1:3] + item_smart[4:20]
        elif product == 'PM953':  # PM953: NVMe 1.1 spec - up to No. of Err Info Log Entries
            item_list_hist = item_smart[1:3] + item_smart[4:15]
    else:  # 'Ext_SMART'
        if product in ['PM963', 'PM983']:
            item_list_hist = item_ext_smart[:7]
        elif product == 'PM953':
            item_list_hist = [item_ext_smart[0]] + item_ext_smart[2:5]
    if item_list_hist is None:
        raise ValueError(
            'Unsupported product {!r} for logitem {!r}'.format(product, logitem))

    file_list = os.listdir(path)
    file_list.sort()
    if not file_list:
        raise FileNotFoundError('No data files found in {!r}'.format(path))

    # dtype mapping for telemetry columns: set when needed (default None)
    types_dict = None

    crit_warn_info = pd.read_csv('Anomaly_Rulebase_{}.csv'.format(product), index_col=0)

    # Columns to load for the histograms: only the ones actually needed.
    # NOTE(review): system_info[0] / system_info[6] are presumably the
    # DriveSerialNumber and TIMESTAMP columns used for de-duplication below
    # - confirm against the definition of system_info.
    cols_hist = [system_info[0]] + [system_info[6]] + item_list_hist
    if logitem == 'SMART':
        cols_hist = cols_hist + [item_smart[0]]  # Critical Warning
    else:
        cols_hist = cols_hist + [item_ext_smart[7]]  # Capacitor Health

    print(file_list)
    # Load last data (for histogram): keep only the last row per DriveSerialNumber
    last_file = path + '/' + file_list[-1]
    print('Loading last data file {}: '.format(file_list[-1]) + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    start = time.time()
    last_data = pd.DataFrame()
    # Read the file that the log line above announces (previously a fixed
    # PM963 file was read regardless of the requested product).
    # De-duplicating after every chunk keeps last_data at one row per drive.
    for chunk in pd.read_csv(last_file, usecols=cols_hist, chunksize=2000000, dtype=types_dict,
                             encoding='ISO-8859-1'):
        chunk = chunk.sort_values(by=['DriveSerialNumber', 'TIMESTAMP'])
        chunk = chunk.drop_duplicates(subset='DriveSerialNumber', keep='last')
        last_data = pd.concat([last_data, chunk])
        last_data = last_data.sort_values(by=['DriveSerialNumber', 'TIMESTAMP'])
        last_data = last_data.drop_duplicates(subset='DriveSerialNumber', keep='last')
    end = time.time()
    print(f'Elapsed time : {(end-start):.3f} seconds')

    # savefig does not create missing directories
    os.makedirs(target_path + '/1_Figure', exist_ok=True)

    # (Short Term) Histogram
    bin_num = 50
    for selected_column in item_list_hist:
        if selected_column == 'Wear_Range_Delta':  # shown as a bar chart instead of a histogram
            continue
        hist_data = np.array(last_data[selected_column])
        hist_data = hist_data[~np.isnan(hist_data)]

        # Per-item output directory (added 220615); exist_ok avoids the
        # exists()/mkdir() race and also creates missing parents.
        os.makedirs(target_path + '/' + selected_column + '/', exist_ok=True)
        tmp_info = crit_warn_info.loc[selected_column.lower()]

        if hist_data.size == 0:
            # A percentile of an empty array is undefined and would produce an
            # invalid histogram range, so there is nothing to draw.
            print('Skipping {} histogram: no non-NaN data'.format(selected_column))
            continue

        if tmp_info.Y_FORMAT == 'TB':
            # NOTE(review): presumably the raw counter is in units of
            # 1000 x 512-byte blocks - confirm against the SMART spec.
            hist_data = hist_data * 512 * 1000 / (1024 ** 4)

        # Histogram range: 99th percentile, widened to the rule-based Y_MAX
        # when one is given; otherwise fall back to 10 for non-positive values.
        value_99p = np.percentile(hist_data, 99)
        if not np.isnan(tmp_info.Y_MAX):
            value_99p = max(value_99p, tmp_info.Y_MAX)
        elif value_99p <= 0:
            value_99p = 10
        # 10 % padding on both sides; the lower bound never goes below 0
        margin = (value_99p - tmp_info.Y_MIN) * 0.1
        axis_max = value_99p + margin
        axis_min = max(0, tmp_info.Y_MIN - margin)

        fig_hist, axh = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(6, 3), dpi=100)
        axh.hist(hist_data, bins=bin_num,
                 range=(axis_min, axis_max), color='skyblue', edgecolor='grey', linewidth=1)

        axh.set_title(selected_column + ' (Histogram)', fontsize=15)
        axh.tick_params(labelsize=11)

        if tmp_info.Y_FORMAT == 'TB':
            axh.set_xlabel(selected_column + ' (TB)', fontsize=13)
        else:
            axh.set_xlabel(selected_column, fontsize=13)

        axh.set_ylabel('SSD Count (pcs)', fontsize=13)
        axh.yaxis.set_major_formatter(FuncFormatter(kilo))

        fig_hist.tight_layout()
        fig_hist.savefig(target_path + '/1_Figure/' + selected_column + '_Histogram.png', dpi=100)

        # Release the figure; many figures are created in this loop
        plt.close(fig_hist)

    # Bar chart for special items
    # (SMART: Critical Warning, Ext_SMART: Capacitor Health & Wear Range Delta)
    if logitem == 'SMART':
        hist_sp_items = ['CritWarning']
    else:
        hist_sp_items = ['Wear_Range_Delta', 'Capacitor_Health']

    # item -> (bar width, x-axis limits)
    bar_settings = {
        'CritWarning': (0.02 * 33, (-1, 32)),
        'Wear_Range_Delta': (0.02 * 6, (-0.5, 5.5)),
        'Capacitor_Health': (0.02 * 101, (-5, 105)),
    }

    for hist_item in hist_sp_items:
        width, xlim = bar_settings[hist_item]
        value_count = last_data[hist_item].dropna().value_counts()

        fig_bar, ax = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(6, 3), dpi=100)
        ax.bar(value_count.index, value_count, color='skyblue', edgecolor='grey', width=width, linewidth=1)

        ax.set_xlim(xlim)
        ax.set_title('{} (Bar Chart)'.format(hist_item), fontsize=15)
        ax.tick_params(labelsize=11)

        ax.set_xlabel(hist_item, fontsize=13)
        ax.set_ylabel('SSD Count (pcs)', fontsize=13)
        ax.yaxis.set_major_formatter(FuncFormatter(kilo))

        fig_bar.tight_layout()
        fig_bar.savefig(target_path + '/1_Figure/{}_Barchart.png'.format(hist_item), dpi=100)

        # Previously the bar-chart figures were never closed and accumulated
        plt.close(fig_bar)

In [11]:
# Run the main visualization entry point for the currently selected product.
# NOTE(review): the output recorded for this cell ends in a UnicodeDecodeError
# ('utf-8' codec) - presumably a CSV is read without an explicit encoding
# somewhere in this call; confirm before re-running.
Visualization_main(product)

                                   0
0  Boundary_PM963_2021_w34_SMART.csv
1      Crit_PM963_2021_w34_SMART.csv
2                             Value_
3     Value_PM963_2021_w34_SMART.csv
4      Warn_PM963_2021_w34_SMART.csv
5                            Weight_
6    Weight_PM963_2021_w34_SMART.csv allfile_____
Index(['3345_3430_4A70_4822_0025_3858_0000_0001.',
       '3345_3430_4A73_6411_0025_3858_0000_0001.',
       '3345_3430_4A81_3737_0025_3842_0000_0001.',
       '3345_3430_4AA0_3657_0025_3842_0000_0001.',
       '3345_3430_4AA9_5739_0025_3842_0000_0001.',
       '3345_3430_4AB0_3428_0025_3858_0000_0001.',
       '3345_3430_4AB1_4428_0025_3858_0000_0001.',
       '3345_3430_4AC0_1244_0025_3842_0000_0001.',
       '3345_3430_4AC0_7585_0025_3858_0000_0001.',
       '3345_3430_4AC5_2100_0025_3842_0000_0001.',
       '3345_3430_4AC6_6773_0025_3842_0000_0001.',
       '3345_3430_4AC6_7068_0025_3842_0000_0001.',
       '3345_3430_4AC6_7242_0025_3842_0000_0001.',
       '3345_3430_4AC7_3916_

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd2 in position 16: invalid continuation byte

In [3]:
### Make Plot for EDA
## MTP / viz_f removed because the modules were merged into this notebook
# viz_f.path = path # location of the SMART data #as-is
# viz_f.target_path = target_path # location where the KDE-generated data is stored #as-is
# viz_f.Visualization_main(product) #as-is
# viz_f.Visualization_hist(product,'SMART') #as-is
# viz_f.Visualization_hist(product,'Ext_SMART') #as-is
# Visualization_main(product)
# Only the SMART histogram pass is active in this cell.
# NOTE(review): the recorded output shows "NameError: name 'product' is not
# defined" - this cell depends on `product` being set by a cell that must be
# run first.
Visualization_hist(product,'SMART')
# Visualization_hist(product,'Ext_SMART')

NameError: name 'product' is not defined

Index(['3345_3430_4A70_4822_0025_3858_0000_0001.',
       '3345_3430_4A73_6411_0025_3858_0000_0001.',
       '3345_3430_4A81_3737_0025_3842_0000_0001.',
       '3345_3430_4AA0_3657_0025_3842_0000_0001.',
       '3345_3430_4AA9_5739_0025_3842_0000_0001.',
       '3345_3430_4AB0_3428_0025_3858_0000_0001.',
       '3345_3430_4AB1_4428_0025_3858_0000_0001.',
       '3345_3430_4AC0_1244_0025_3842_0000_0001.',
       '3345_3430_4AC0_7585_0025_3858_0000_0001.',
       '3345_3430_4AC5_2100_0025_3842_0000_0001.',
       '3345_3430_4AC6_6773_0025_3842_0000_0001.',
       '3345_3430_4AC6_7068_0025_3842_0000_0001.',
       '3345_3430_4AC6_7242_0025_3842_0000_0001.',
       '3345_3430_4AC7_3916_0025_3858_0000_0001.',
       '3345_3430_4AC8_2436_0025_3842_0000_0001.',
       '3345_3430_4B20_8367_0025_3858_0000_0001.',
       '3345_3430_4B58_6755_0025_3858_0000_0001.',
       '3345_3430_4B72_1596_0025_3858_0000_0001.',
       '3345_3430_4B76_0412_0025_3858_0000_0001.',
       '3345_3430_4B80_5841_002

In [None]:


####### PM983
product = 'PM983' # PM963, PM983, PM1725b, PM953

# Input file path and file list
#path = 'F:/Telemetry_raw/Daily/To_Weekly/{}/SMART'.format(product) #as-is

## MTP / newly added
path = adls_path + 'Telemetry_raw/Daily/To_Weekly/{}/SMART'.format(product)
pathList = path + "/*"
print('pathList: ' + pathList)
df_pathList = spark.read.format("csv").load(pathList)
files = df_pathList.inputFiles()
# list.sort() sorts in place and returns None, so the previous
# `file_list = files.sort(reverse=True)` always left file_list as None.
# NOTE(review): the visualization functions sort ascending and treat
# file_list[-1] as the latest file; confirm that descending order is intended here.
file_list = sorted(files, reverse=True)


# Location where the files generated for KDE visualization are stored
#target_path = 'F:/Telemetry_output/MSC/{}'.format(product) #as-is

## MTP / changed
target_path = adls_path + 'Telemetry_output/MSC/{}'.format(product)
print('path : ' + path)
print('target_path : ' + target_path)

#file_list = os.listdir(path).sort() #as-is
### Make Data for KDE estimation
#kde_f.path = path # location of the SMART data #as-is
#kde_f.target_path = target_path # location where the KDE-generated data is stored #as-is

## MTP / kde_f removed because the modules were merged into this notebook
# kde_f.KDE_main(product,'SMART', num_of_last_week) #as-is
# kde_f.KDE_main(product,'Ext_SMART', num_of_last_week) #as-is
KDE_main(product,'SMART', num_of_last_week)
KDE_main(product,'Ext_SMART', num_of_last_week)


In [None]:

### Make Plot for EDA
## MTP / viz_f removed because the modules were merged into this notebook
# viz_f.path = path # location of the SMART data #as-is
# viz_f.target_path = target_path # location where the KDE-generated data is stored #as-is
# viz_f.Visualization_main(product) #as-is
# viz_f.Visualization_hist(product,'SMART') #as-is
# viz_f.Visualization_hist(product,'Ext_SMART') #as-is
# Run the main visualization, then the histogram pass for each log item.
# Visualization_hist reads the notebook-level globals `path` and `target_path`,
# so the configuration cell that sets them (and `product`) must run first.
Visualization_main(product)
Visualization_hist(product,'SMART')
Visualization_hist(product,'Ext_SMART')
