In [None]:
### 라이브러리 선언


## 표준 라이브러리
import os
import random


## 서드파티 라이브러리
import numpy as np
import pandas as pd

In [None]:
### Data Range
# Nominal experiment grid: the preprocessing loop visits every combination of
# (total flow rate, equivalence ratio, mix ratio) listed below.
# NOTE: find_directory() converts each value with str() to build the data
# directory path, so the literal form matters (0 -> "0", 1 -> "1").
# Do not rewrite 0 as 0.0 or 1 as 1.0 unless the directories are renamed too.


## Total Flow Rates
total_flow_rates = [80, 90, 100, 110, 120, 130, 140]


## Equivalence Ratios
equivalence_ratios = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]


## Mix Ratios
mix_ratios = [0, 3.75, 7.5, 11.25, 15, 18.75, 22.5, 26.25, 30]

In [None]:
### Declaration of Variables


## Number of OES spectra kept per experiment condition
## (preprocess_data keeps this many columns of each Excel sheet)
NUM_OF_SPECTRUM = 500

## Number of averaged spectra per experiment condition
## (distribute_data appends this many samples for each condition)
NUM_OF_AVG_SPECTRUM = 100

## Running index of the experiment condition being processed.
## NOTE: despite the constant-style name this is mutable state. The preprocessing
## loop increments it once per condition and distribute_data reads it as the
## row index into actual_condition_df.
NUM_OF_CONDITION = 0


## Threshold used to route a non-training condition to the validation set:
## the preprocessing loop draws random.random() and sends the condition to the
## validation set when the draw is <= this value, otherwise to the test set.
VALID_DATA_RATIO = 0.4

In [None]:
### Loading Data from Excel File
# Both locations can be overridden with environment variables so the notebook can
# run on another machine without editing code. When the variables are not set,
# the original absolute paths are used unchanged.


## Root directory of the spectrum data
## (find_directory appends one sub-directory per TFR / ER / MR value)
spectrum_file_path = os.environ.get(
    "OES_SPECTRUM_DIR", "/Users/RFL/SeoyeonYoun/PT/OES_data"
)


## Excel file with the actual experimental condition data
## (columns 0, 1 and 2 are read as the TFR, ER and MR labels)
actual_condition_file_path = os.environ.get(
    "OES_ACTUAL_CONDITION_FILE", "/Users/RFL/SeoyeonYoun/PT/ActualCondition_data.xlsx"
)
actual_condition_df = pd.read_excel(actual_condition_file_path)

In [None]:
### Input and Output Dataset Lists (Train, Test, and Valid)
# Every name is bound to its own, initially empty list.


## Input X = Spectrum Data
X_Train, X_Test, X_Valid = [], [], []


## Output Y = TotalFlowRate, EquivalenceRatio, MixRatio
Y_TFR_Train, Y_TFR_Test, Y_TFR_Valid = [], [], []

Y_ER_Train, Y_ER_Test, Y_ER_Valid = [], [], []

Y_MR_Train, Y_MR_Test, Y_MR_Valid = [], [], []

In [None]:
### File Directory Function and File Name Function


## File Directory Function
def find_directory(forder_path, v_tfr, v_er, v_mr):
    """Return the directory that holds the data of one experiment condition.

    The path is ``forder_path/<v_tfr>/<v_er>/<v_mr>``; each condition value is
    converted with ``str()`` before joining.

    Raises:
        FileNotFoundError: if the resulting path does not exist.
    """
    # One path component per condition value
    path_parts = [str(value) for value in (v_tfr, v_er, v_mr)]
    file_directory = os.path.join(forder_path, *path_parts)

    # Hand the path back when it exists, otherwise report what is missing
    if os.path.exists(file_directory):
        return file_directory

    raise FileNotFoundError(f'Directory not found: {file_directory}')


## File Name Function
def find_name(file_directory):
    """Return the full path of the spectrum workbook stored in ``file_directory``.

    Only regular files whose name ends in ``.xlsx`` (case-insensitive) are
    considered. Hidden files (leading ``.``, e.g. ``.DS_Store``) and Excel lock
    files (leading ``~$``) are skipped. The candidates are sorted by name so the
    selected file does not depend on the arbitrary order of ``os.listdir``.

    Args:
        file_directory: directory to search.

    Returns:
        Path of the first matching ``.xlsx`` file in name order.

    Raises:
        FileNotFoundError: if the directory contains no usable ``.xlsx`` file.
    """
    # Keep real workbooks only
    xlsx_files = sorted(
        entry
        for entry in os.listdir(file_directory)
        if entry.lower().endswith('.xlsx')
        and not entry.startswith(('~$', '.'))
        and os.path.isfile(os.path.join(file_directory, entry))
    )

    # Fail when there is no workbook to read
    if not xlsx_files:
        raise FileNotFoundError(f'No .xlsx files found in {file_directory}')

    return os.path.join(file_directory, xlsx_files[0])

In [None]:
### Data Preprocessing Function

def preprocess_data(name_of_file):
    """Load one spectrum workbook and return its cleaned, block-averaged spectra.

    Processing steps:
      1. Read the sheet without a header row and keep the positional slice
         rows 5..1604, columns 1..NUM_OF_SPECTRUM.
      2. Cast to float and mark negative values and values above 20000 as NaN.
      3. Fill NaN by linear interpolation (pandas default axis, i.e. along each
         column), then forward/backward fill whatever is still missing.
      4. Average consecutive columns in groups of
         ``NUM_OF_SPECTRUM // NUM_OF_AVG_SPECTRUM`` (5 with the current constants).

    Args:
        name_of_file: path of the Excel file to read.

    Returns:
        numpy.ndarray with one row per kept sheet row and one column per
        averaged group of spectra.

    Raises:
        RuntimeError: if the file cannot be read or processed. The original
            exception is chained so the root cause stays visible.
    """
    try:
        # Read the Excel file as a DataFrame and cut it to the wanted range
        spectrum_data = pd.read_excel(name_of_file, header=None)
        spectrum_data = spectrum_data.iloc[5:1605, 1:NUM_OF_SPECTRUM+1]

        # Convert to float so that NaN can be stored and interpolated
        spectrum_data = spectrum_data.astype(float)

        # Replace negative values and values above 20000 with NaN
        out_of_range = (spectrum_data < 0) | (spectrum_data > 20000)
        spectrum_data = spectrum_data.mask(out_of_range)

        # Interpolate the NaN values
        spectrum_data = spectrum_data.interpolate(method='linear', limit_direction='both')

        # Fill anything still missing from the neighbouring values
        # (.ffill()/.bfill() replace the deprecated fillna(method=...))
        spectrum_data = spectrum_data.ffill().bfill()

        # Average consecutive spectra (columns) group by group.
        # Grouping is done on the transposed frame because groupby(axis=1)
        # is deprecated in pandas.
        group_size = NUM_OF_SPECTRUM // NUM_OF_AVG_SPECTRUM
        group_ids = np.arange(spectrum_data.shape[1]) // group_size
        spectrum_data_avg = spectrum_data.T.groupby(group_ids).mean().T

        # Convert the DataFrame to a NumPy array
        return spectrum_data_avg.values

    except Exception as e:
        # Raise instead of returning the message: a returned message would only
        # fail later, far from the cause, when the caller indexes the result.
        raise RuntimeError(f'Error reading {name_of_file}: {e}') from e

In [None]:
### Data Distribution Function

def distribute_data(x, y_tfr, y_er, y_mr, spectrum_data):
    """Append one condition's averaged spectra and their labels to the given lists.

    For each of the first NUM_OF_AVG_SPECTRUM columns of ``spectrum_data`` the
    column is appended to ``x``, and the values in columns 0, 1 and 2 of row
    NUM_OF_CONDITION of ``actual_condition_df`` are appended to ``y_tfr``,
    ``y_er`` and ``y_mr``. All three names are module-level globals.

    Returns:
        The same four lists (modified in place) as ``(x, y_tfr, y_er, y_mr)``.
    """
    condition_row = NUM_OF_CONDITION

    # One sample per averaged spectrum; every sample carries the same labels
    for column in range(NUM_OF_AVG_SPECTRUM):
        x.append(spectrum_data[:, column])
        for labels, label_column in ((y_tfr, 0), (y_er, 1), (y_mr, 2)):
            labels.append(actual_condition_df.iloc[condition_row, label_column])

    return x, y_tfr, y_er, y_mr

In [None]:
### Data Preprocessing and Distribution

# Start from a clean state so that re-running this cell neither duplicates samples
# nor leaves the condition counter out of step with the rows of actual_condition_df.
NUM_OF_CONDITION = 0
X_Train, X_Test, X_Valid = [], [], []
Y_TFR_Train, Y_TFR_Test, Y_TFR_Valid = [], [], []
Y_ER_Train, Y_ER_Test, Y_ER_Valid = [], [], []
Y_MR_Train, Y_MR_Test, Y_MR_Valid = [], [], []

# Dedicated seeded generator: the Valid/Test split is reproducible across runs
# and the global `random` state is left untouched.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

for index_TFR, value_TFR in enumerate(total_flow_rates):
    for index_ER, value_ER in enumerate(equivalence_ratios):
        for index_MR, value_MR in enumerate(mix_ratios):

            # Locate the directory and the file of this condition
            file_directory = find_directory(spectrum_file_path, value_TFR, value_ER, value_MR)
            file_name = find_name(file_directory)

            # Preprocess the spectrum data
            spectrum_data_avg = preprocess_data(file_name)

            # Stop right here if preprocessing did not produce an array, instead of
            # failing later inside distribute_data with an unrelated-looking error.
            if not isinstance(spectrum_data_avg, np.ndarray):
                raise RuntimeError(
                    f'Preprocessing failed @ {value_TFR}, {value_ER}, {value_MR}: {spectrum_data_avg}'
                )

            # Random draw that decides between the validation and the test set
            VALID_SCORE = split_rng.random()

            # Conditions with an even index sum go to Train; the others are split
            # between Valid and Test. (index sum % 2 can only be 0 or 1, so no
            # further branch is needed.)
            if (index_TFR + index_ER + index_MR) % 2 == 0:
                X_Train, Y_TFR_Train, Y_ER_Train, Y_MR_Train = distribute_data(X_Train, Y_TFR_Train, Y_ER_Train, Y_MR_Train, spectrum_data_avg)
                print([value_TFR, value_ER, value_MR])
            elif VALID_DATA_RATIO >= VALID_SCORE:
                X_Valid, Y_TFR_Valid, Y_ER_Valid, Y_MR_Valid = distribute_data(X_Valid, Y_TFR_Valid, Y_ER_Valid, Y_MR_Valid, spectrum_data_avg)
            else:
                X_Test, Y_TFR_Test, Y_ER_Test, Y_MR_Test = distribute_data(X_Test, Y_TFR_Test, Y_ER_Test, Y_MR_Test, spectrum_data_avg)

            # Advance the global condition index read by distribute_data
            NUM_OF_CONDITION += 1

In [None]:
### Check Results of Data Preprocessing and Distribution - (1)


## Longest and shortest spectrum length in the Train, Test and Valid datasets
for _label, _dataset in (("X_Train", X_Train), ("X_Test", X_Test), ("X_Valid", X_Valid)):
    print(f"{_label} Data의 경우")
    _lengths = [len(_spectrum) for _spectrum in _dataset]
    print(max(_lengths))
    print(min(_lengths))

In [None]:
### Check Results of Data Preprocessing and Distribution - (2)


## Function for Making Listed List Flat 
def flatten_list(nested_list):
    """Return a new list holding the items of every sub-sequence, in order.

    Only one level of nesting is removed.
    """
    flat_list = []
    for sublist in nested_list:
        flat_list.extend(sublist)
    return flat_list


## Maximum and minimum value of the Train, Test and Valid spectrum datasets
for _label, _dataset in (("X_Train", X_Train), ("X_Test", X_Test), ("X_Valid", X_Valid)):
    print(f"{_label} Data의 경우")
    # Flatten once and reuse the result for both extrema: flattening walks every
    # value of the dataset and is by far the most expensive step here.
    _flat_values = flatten_list(_dataset)
    print(f'최대값: {max(_flat_values)}')
    print(f'최소값: {min(_flat_values)}')

In [None]:
### Check Results of Data Preprocessing and Distribution - (3)


## Number of experiment conditions held by every input (X) and output (Y) list.
## Each condition contributes NUM_OF_AVG_SPECTRUM samples, so the sample count is
## divided by that constant instead of a hard-coded 100.
for _prefix, _splits in (
    ("X", (X_Train, X_Test, X_Valid)),
    ("Y_TFR", (Y_TFR_Train, Y_TFR_Test, Y_TFR_Valid)),
    ("Y_ER", (Y_ER_Train, Y_ER_Test, Y_ER_Valid)),
    ("Y_MR", (Y_MR_Train, Y_MR_Test, Y_MR_Valid)),
):
    for _split_name, _samples in zip(("Train", "Test", "Valid"), _splits):
        print(f'{_prefix}_{_split_name} Data Condition = {len(_samples) // NUM_OF_AVG_SPECTRUM}')

In [None]:
### Change into NumPy Array


## Input X
X_Train, X_Test, X_Valid = map(np.array, (X_Train, X_Test, X_Valid))


## Output Y_TFR
Y_TFR_Train, Y_TFR_Test, Y_TFR_Valid = map(np.array, (Y_TFR_Train, Y_TFR_Test, Y_TFR_Valid))


## Output Y_ER
Y_ER_Train, Y_ER_Test, Y_ER_Valid = map(np.array, (Y_ER_Train, Y_ER_Test, Y_ER_Valid))


## Output Y_MR
Y_MR_Train, Y_MR_Test, Y_MR_Valid = map(np.array, (Y_MR_Train, Y_MR_Test, Y_MR_Valid))

In [None]:
### Check Results of NumPy Array - (4)


## Input X
print(f'X_Train의 크기: {X_Train.shape}')
print(f'X_Test의 크기: {X_Test.shape}')
print(f'X_Valid의 크기: {X_Valid.shape}')


## Output Y_TFR
print(f'Y_TFR_Train의 크기: {Y_TFR_Train.shape}')
print(f'Y_TFR_Test의 크기: {Y_TFR_Test.shape}')
print(f'Y_TFR_Valid의 크기: {Y_TFR_Valid.shape}')


## Output Y_ER
## (these lines used to print the Y_TFR shapes under the Y_ER labels)
print(f'Y_ER_Train의 크기: {Y_ER_Train.shape}')
print(f'Y_ER_Test의 크기: {Y_ER_Test.shape}')
print(f'Y_ER_Valid의 크기: {Y_ER_Valid.shape}')


## Output Y_MR
print(f'Y_MR_Train의 크기: {Y_MR_Train.shape}')
print(f'Y_MR_Test의 크기: {Y_MR_Test.shape}')
print(f'Y_MR_Valid의 크기: {Y_MR_Valid.shape}')

In [None]:
### Normalization Function of Spectrum Data

def normalize_x(x_target, x_train, x_test, x_valid):
    """Scale ``x_target`` by the largest value found in the three spectrum datasets.

    The divisor is the maximum over ``x_train``, ``x_test`` and ``x_valid``, so
    all three datasets end up on the same scale. Empty datasets are ignored when
    looking for that maximum. The result has the shape of ``x_target``; no
    particular spectrum length is assumed.

    Args:
        x_target: array of spectra to normalize.
        x_train, x_test, x_valid: datasets used to determine the common maximum.

    Returns:
        A new float array equal to ``x_target / max``.

    Raises:
        ValueError: if ``x_train``, ``x_test`` and ``x_valid`` are all empty.
    """
    # Largest value over the non-empty datasets (np.max fails on empty input)
    reference_maxima = [
        np.max(dataset) for dataset in (x_train, x_test, x_valid) if np.size(dataset) > 0
    ]
    if not reference_maxima:
        raise ValueError('Cannot normalize: x_train, x_test and x_valid are all empty')
    max_x_spectrum = np.max(reference_maxima)

    # Print the maximum so it can be checked by eye
    print(f'Normalization을 위한 Spectrum 데이터 중 최대값: {max_x_spectrum}')

    # Vectorized division; the float cast keeps the result floating-point
    # even for integer input
    return np.asarray(x_target, dtype=float) / max_x_spectrum

In [None]:
### Normalization of Input X


## Normalization of X_Train, X_Test and X_Valid (in this order).
## Each call receives all three datasets, so every result is scaled by the same maximum.
normalized_X_Train, normalized_X_Test, normalized_X_Valid = (
    normalize_x(_target, X_Train, X_Test, X_Valid)
    for _target in (X_Train, X_Test, X_Valid)
)

In [None]:
### Normalization Function of Actual Condition

def normalize_y(y_target_train, y_target_test, y_target_valid, actual_condition_dataframe, column_index):
    """Min-max scale the labels of one target with the range of its condition column.

    The minimum and maximum are taken from column ``column_index`` of
    ``actual_condition_dataframe`` and applied identically to the train, test
    and valid labels: ``(y - min) / (max - min)``.

    Args:
        y_target_train, y_target_test, y_target_valid: label arrays to scale.
        actual_condition_dataframe: DataFrame holding the condition values.
        column_index: positional index of the column that provides min and max.

    Returns:
        Tuple ``(train, test, valid)`` of scaled arrays.

    Raises:
        ValueError: if the column range is zero or not finite. The scaled labels
            would otherwise silently become inf/NaN.
    """
    # Minimum and maximum of the target condition
    target_column = actual_condition_dataframe.iloc[:, column_index]
    min_y_target = target_column.min()
    max_y_target = target_column.max()

    # Guard the denominator before using it
    y_range = max_y_target - min_y_target
    if not np.isfinite(y_range) or y_range == 0:
        raise ValueError(
            f'Cannot normalize column {column_index}: min={min_y_target}, max={max_y_target}'
        )

    def _scale(values):
        # Same affine transform for every dataset
        return (np.asarray(values) - min_y_target) / y_range

    return _scale(y_target_train), _scale(y_target_test), _scale(y_target_valid)

In [None]:
### Normalization of Output Y


## NOTE: each statement overwrites its own inputs, so running this cell a second
## time would normalize values that are already normalized.


## Normalization of Y_TFR (column 0 of actual_condition_df)
Y_TFR_Train, Y_TFR_Test, Y_TFR_Valid = normalize_y(
    Y_TFR_Train, Y_TFR_Test, Y_TFR_Valid,
    actual_condition_dataframe=actual_condition_df, column_index=0,
)


## Normalization of Y_ER (column 1 of actual_condition_df)
Y_ER_Train, Y_ER_Test, Y_ER_Valid = normalize_y(
    Y_ER_Train, Y_ER_Test, Y_ER_Valid,
    actual_condition_dataframe=actual_condition_df, column_index=1,
)


## Normalization of Y_MR (column 2 of actual_condition_df)
Y_MR_Train, Y_MR_Test, Y_MR_Valid = normalize_y(
    Y_MR_Train, Y_MR_Test, Y_MR_Valid,
    actual_condition_dataframe=actual_condition_df, column_index=2,
)

In [None]:
### Save the variables as NumPy files
# NOTE: the np.save calls below are wrapped in a bare triple-quoted string, so
# they are NOT executed. Remove the surrounding quotes to actually write the
# .npy files (they would be written relative to the current working directory).
"""
np.save("Normalized_X_Train.npy", normalized_X_Train)
np.save("Normalized_X_Test.npy", normalized_X_Test)
np.save("Normalized_X_Valid.npy", normalized_X_Valid)

np.save("Y_TFR_Train.npy", Y_TFR_Train)
np.save("Y_TFR_Test.npy", Y_TFR_Test)
np.save("Y_TFR_Valid.npy", Y_TFR_Valid)

np.save("Y_ER_Train.npy", Y_ER_Train)
np.save("Y_ER_Test.npy", Y_ER_Test)
np.save("Y_ER_Valid.npy", Y_ER_Valid)

np.save("Y_MR_Train.npy", Y_MR_Train)
np.save("Y_MR_Test.npy", Y_MR_Test)
np.save("Y_MR_Valid.npy", Y_MR_Valid)

print("NumPy 배열 변수들 저장 완료!")
"""