In [2]:
import pandas as pd
import numpy as np

from scipy import stats
import ast
from cfg_basic import *

In [12]:
class DataGenerator:
    """Generate synthetic block records from per-group statistical models.

    All inputs are Excel workbooks whose paths are read from
    ``Configure().data_params``.  Every record belongs to a "group" (the
    value of the ``선종_블록`` column, e.g. ``'CN_E'``) that is split into a
    ship type (first two characters) and a block type (last character).

    Parameters
    ----------
    num_blocks : int, default 50
        Number of rows produced by :meth:`generate`.
    seed : int or None, default None
        When given, NumPy's global random state is seeded with it so that a
        run can be reproduced.  The ``scipy.stats`` ``rvs`` calls in this
        class are made without ``random_state`` and therefore draw from the
        same global state.  ``None`` leaves the random state untouched.
    """

    # Groups that reuse the weight model (and weight limits) of another group.
    _WEIGHT_MODEL_FALLBACK = {'CN_T': 'CN_D', 'LN_D': 'LN_E', 'VL_D': 'VL_B'}

    # Spellings accepted in the model workbook that differ from the scipy name.
    # The original code looked for 'reyleigh'; scipy only provides 'rayleigh'.
    _DISTRIBUTION_ALIASES = {'reyleigh': 'rayleigh'}

    # Process types and the count column that holds their frequency (same order).
    _PROCESS_TYPES = ['평중조', '곡중조', '대조중조', 'Final조립']
    _PROCESS_COUNT_COLUMNS = ['panel_count', 'curve_count', 'big_count', 'final_count']

    def __init__(self, num_blocks=50, seed=None):
        self.cfg = Configure()

        self.num_blocks = num_blocks

        if seed is not None:
            np.random.seed(seed)

        # Per-sheet cache of the property distribution models (see _load_property_model).
        self._property_models = {}

        # Records with all missing data removed (this frame is used from here on).
        self.df_revised_for_group = pd.read_excel(self.cfg.data_params['data_revised_filepath'])

        # Number of records per group.
        self.df_group_count = pd.read_excel(self.cfg.data_params['data_group'])

        # Per-group weight model.
        self.df_W_model = pd.read_excel(self.cfg.data_params['model_for_W'])

        # Per-group H01 model.
        self.df_H01_model = pd.read_excel(self.cfg.data_params['model_for_H01'])

        # Per-group H02 model.
        self.df_H02_model = pd.read_excel(self.cfg.data_params['model_for_H02'])

        # Per-group duration model.
        self.df_duration_model = pd.read_excel(self.cfg.data_params['model_for_duration'])

    @staticmethod
    def _select_group(df, group_code):
        """Return the rows of ``df`` whose ``선종_블록`` equals ``group_code``."""
        return df[df['선종_블록'] == group_code]

    def _load_property_model(self, property):
        """Return the distribution model sheet for ``property``, reading it once.

        The sheet is read from ``data_params['model_for_property']`` and its
        ``best_params`` column is parsed from text into Python literals.  The
        parsed frame is cached on the instance, so later edits to the workbook
        are not seen by an already-used generator.
        """
        # setdefault keeps this working for instances created without __init__.
        cache = self.__dict__.setdefault('_property_models', {})
        if property not in cache:
            df_property = pd.read_excel(self.cfg.data_params['model_for_property'], sheet_name=property)
            df_property['best_params'] = df_property['best_params'].apply(ast.literal_eval)
            cache[property] = df_property
        return cache[property]

    @classmethod
    def _resolve_distribution(cls, name):
        """Map a distribution name from the model workbook to a scipy.stats distribution.

        Raises
        ------
        ValueError
            If ``name`` (after alias resolution) is not a scipy.stats distribution.
        """
        canonical = cls._DISTRIBUTION_ALIASES.get(name, name)
        distribution = getattr(stats, str(canonical), None)
        if not isinstance(distribution, (stats.rv_continuous, stats.rv_discrete)):
            raise ValueError("Unsupported distribution name: %r" % (name,))
        return distribution

    def generate_group(self):
        """Pick a group at random and split it into ship type and block type.

        Groups are drawn with the probabilities in the ``Proportion`` column;
        the probabilities are re-normalised so that rounding in the workbook
        cannot make the draw fail.

        Returns
        -------
        tuple
            ``(group, ship_type, block_type)`` where ``ship_type`` is the first
            two characters of the group code and ``block_type`` the last one.
        """
        codes = self.df_group_count['선종_블록'].to_numpy()
        weights = self.df_group_count['Proportion'].to_numpy(dtype=float)
        weights = weights / weights.sum()

        group = np.random.choice(codes, p=weights)
        ship_type = group[0:2]
        block_type = group[-1]

        return (group, ship_type, block_type)

    def generate_process(self, group_code):
        """Draw a process type for ``group_code`` following its observed frequencies.

        The probability of each process type is its count column divided by
        the group's ``count`` column.
        """
        df_process_group = self._select_group(self.df_group_count, group_code)
        count = df_process_group['count'].values[0]
        proportion_list = [df_process_group[column].values[0] / count
                           for column in self._PROCESS_COUNT_COLUMNS]

        process_type = np.random.choice(self._PROCESS_TYPES, p=proportion_list)
        return process_type

    def calculate_interval(self):
        """Draw the gap between consecutive start dates (geometric, starting at 0)."""
        p = self.cfg.data_params['p_for_interval']
        # scipy's geometric distribution starts at 1, so shift it to start at 0.
        interval = stats.geom.rvs(p, loc=-1)

        return interval

    def generate_property(self, group_code, process_type, property):
        """Draw one dimension value (e.g. 'L', 'B' or 'H') from its fitted distribution.

        The distribution name and parameters are looked up in the sheet named
        ``property`` of the property model workbook for the given group and
        process type.  The drawn value is clamped to the range observed for
        the group in ``df_revised_for_group`` and rounded down to one decimal.

        Raises
        ------
        ValueError
            If ``group_code`` is one of the sampled groups (those are handled
            inside :meth:`generate` and have no distribution to draw from), if
            no model row matches, or if the distribution name is unsupported.
        """
        if group_code in self.cfg.data_params['group_sampling']:
            # Sampled groups are computed inside generate(); previously this
            # path failed later with an unbound local variable.
            raise ValueError(
                "Group %r is a sampled group; its properties are taken from observed "
                "rows in generate() and cannot be drawn here." % (group_code,))

        df_code = self._select_group(self.df_revised_for_group, group_code)

        df_property = self._load_property_model(property)
        matches = df_property[(df_property['선종_블록'] == group_code)
                              & (df_property['process_type'] == process_type)]
        if matches.empty:
            raise ValueError(
                "No %r distribution model for group %r and process type %r."
                % (property, group_code, process_type))

        best_distribution_name = matches['best_distribution_name'].values[0]
        best_params = matches['best_params'].values[0]

        distribution = self._resolve_distribution(best_distribution_name)
        property_value = distribution.rvs(*best_params)

        # Keep the value inside the range observed for this group.
        upper_limit = df_code[property].max()
        lower_limit = df_code[property].min()
        if property_value > upper_limit:
            property_value = upper_limit
        elif property_value < lower_limit:
            property_value = lower_limit

        # Round down to one decimal place.
        property_value = np.floor(property_value * 10) / 10

        return property_value

    def generate_weight(self, group_code, process_type, length, breadth, height):
        """Compute a weight from ``length * breadth * height`` plus Gaussian noise.

        For 'Final조립' the weight is ``coef * LBH + noise``.  For every other
        process type the regression value is first scaled by
        ``data_params['weight_max_limit_ratio']``.  The result is clamped to
        the range of ``W`` observed for the group's 'Final조립' records and
        truncated to an integer.  CN_T, LN_D and VL_D use the model and limits
        of CN_D, LN_E and VL_B respectively.
        """
        model_group = self._WEIGHT_MODEL_FALLBACK.get(group_code, group_code)

        df_revised_for_weight = self._select_group(self.df_revised_for_group, model_group)
        df_model = self._select_group(self.df_W_model, model_group)

        reg_coef = df_model['coef'].values[0]
        noise = df_model['std'].values[0]

        df_revised_for_final = df_revised_for_weight[df_revised_for_weight['공종_명칭'] == 'Final조립']
        max_weight = df_revised_for_final['W'].max()
        min_weight = df_revised_for_final['W'].min()

        LBH_value = length * breadth * height

        if process_type == 'Final조립':
            weight = reg_coef * LBH_value + np.random.normal(0, noise)
        else:
            # Weight fitting for the non-final processes.
            y_pred = reg_coef * LBH_value
            max_limit = y_pred * self.cfg.data_params['weight_max_limit_ratio']

            weight = max_limit + np.random.normal(0, noise)

        if weight < min_weight:
            weight = min_weight
        elif weight > max_weight:
            weight = max_weight

        weight = np.int64(weight)

        return weight

    def generate_workload_h01(self, group_code, length, breadth):
        """Compute workload H01 from length, breadth and their product plus noise.

        The value is raised to the group's observed minimum of ``H01`` when it
        falls below it, then truncated to an integer.
        """
        df_for_H01 = self._select_group(self.df_revised_for_group, group_code)
        df_model = self._select_group(self.df_H01_model, group_code)

        min_limit = df_for_H01['H01'].min()

        reg_coef = [df_model[column].values[0] for column in ('coef_0', 'coef_1', 'coef_2')]
        noise = df_model['std'].values[0]

        workload_h01 = (reg_coef[0] * length + reg_coef[1] * breadth
                        + reg_coef[2] * (length * breadth) + np.random.normal(0, noise))

        if workload_h01 < min_limit:
            workload_h01 = min_limit

        workload_h01 = np.int64(workload_h01)

        return workload_h01

    def generate_workload_h02(self, group_code, workload_h01):
        """Compute workload H02 as ``coef * workload_h01`` plus noise.

        The value is clamped to the group's observed range of ``H02`` and
        truncated to an integer.
        """
        df_for_H02 = self._select_group(self.df_revised_for_group, group_code)
        df_model = self._select_group(self.df_H02_model, group_code)

        min_limit = df_for_H02['H02'].min()
        max_limit = df_for_H02['H02'].max()

        reg_coef = df_model['coef'].values[0]
        noise = df_model['std'].values[0]

        workload_h02 = reg_coef * workload_h01 + np.random.normal(0, noise)
        if workload_h02 < min_limit:
            workload_h02 = min_limit
        elif workload_h02 > max_limit:
            workload_h02 = max_limit

        workload_h02 = np.int64(workload_h02)

        return workload_h02

    def generate_duration(self, group_code, workload_H01, workload_H02, weight):
        """Compute the duration from both workloads and the weight plus noise.

        The value is raised to the group's observed minimum of ``계획공기`` when
        it falls below it, then truncated to an integer.
        """
        df_for_duration = self._select_group(self.df_revised_for_group, group_code)
        min_limit = df_for_duration['계획공기'].min()

        df_model = self._select_group(self.df_duration_model, group_code)
        reg_coef = [df_model[column].values[0] for column in ('coef_0', 'coef_1', 'coef_2')]
        noise = df_model['std'].values[0]

        duration = (reg_coef[0] * workload_H01 + reg_coef[1] * workload_H02
                    + reg_coef[2] * weight + np.random.normal(0, noise))

        if duration < min_limit:
            duration = min_limit

        duration = np.int64(duration)

        return duration

    def calculate_buffer(self, process_type):
        """Return the slack added to the due date (not written to any column).

        'Final조립' always gets 2; other process types get a geometric draw
        shifted to start at 0.
        """
        if process_type == 'Final조립':
            buffer = 2
        else:
            p = self.cfg.data_params['p_for_buffer']
            buffer = stats.geom.rvs(p, loc=-1)

        return buffer

    def generate(self, file_path=None):
        """Generate ``num_blocks`` synthetic block rows.

        Parameters
        ----------
        file_path : str or None, default None
            When given, the result is also written to this Excel file in a
            sheet named "blocks" without the index column.

        Returns
        -------
        pandas.DataFrame
            One row per block with the columns Block_Name, Block_ID,
            Process_Type, Ship_Type, Block_Type, Start_Date, Duration,
            Due_Date, Workload_H01, Workload_H02, Weight, Length, Breadth
            and Height.
        """
        columns = ["Block_Name", "Block_ID", "Process_Type", "Ship_Type", "Block_Type", "Start_Date", "Duration", "Due_Date",
                   "Workload_H01", "Workload_H02", "Weight", "Length", "Breadth", "Height"]

        rows = []
        previous_start_date = None

        for j in range(self.num_blocks):
            name = "J-%d" % j
            block_id = j

            # The group itself is not a column; it selects the per-group data and models.
            group_code, ship_type, block_type = self.generate_group()

            process_type = self.generate_process(group_code)

            if previous_start_date is None:
                start_date = 0  # the first start date is fixed at 0
            else:
                # Later start dates are the previous start date plus a random interval.
                start_date = previous_start_date + self.calculate_interval()

            buffer = self.calculate_buffer(process_type)

            if group_code not in self.cfg.data_params['group_sampling']:
                length = self.generate_property(group_code, process_type, 'L')
                breadth = self.generate_property(group_code, process_type, 'B')
                height = self.generate_property(group_code, process_type, 'H')

                weight = self.generate_weight(group_code, process_type, length, breadth, height)

                workload_h01 = self.generate_workload_h01(group_code, length, breadth)
                workload_h02 = self.generate_workload_h02(group_code, workload_h01)
                duration = self.generate_duration(group_code, workload_h01, workload_h02, weight)

            else:
                # Sampled groups: copy the values of one randomly chosen observed row.
                df_group_code = self._select_group(self.df_revised_for_group, group_code).reset_index(drop=True)
                idx = np.random.choice(df_group_code.shape[0])

                length = df_group_code.loc[idx, 'L']
                breadth = df_group_code.loc[idx, 'B']
                height = df_group_code.loc[idx, 'H']

                if process_type == 'Final조립':
                    weight = df_group_code.loc[idx, 'W']
                else:
                    # For the non-final processes the weight is computed instead of copied.
                    weight = self.generate_weight(group_code, process_type, length, breadth, height)

                workload_h01 = df_group_code.loc[idx, 'H01']
                workload_h02 = df_group_code.loc[idx, 'H02']
                duration = df_group_code.loc[idx, '계획공기']

            due_date = start_date + duration + buffer - 1

            rows.append([name, block_id, process_type, ship_type, block_type, start_date, duration, due_date,
                         workload_h01, workload_h02, weight, length, breadth, height])
            previous_start_date = start_date

        df_blocks = pd.DataFrame(rows, columns=columns)

        if file_path is not None:
            # The context manager closes the workbook even if writing fails.
            with pd.ExcelWriter(file_path) as writer:
                df_blocks.to_excel(writer, sheet_name="blocks", index=False)

        return df_blocks

In [13]:
# Build a generator with the default number of blocks and create one synthetic data set.
# NOTE(review): no random seed is set before this call, so every run produces different rows.
data_gen = DataGenerator()
df_blocks = data_gen.generate()

In [14]:
# Display the generated block table.
df_blocks

Unnamed: 0,Block_Name,Block_ID,Process_Type,Ship_Type,Block_Type,Start_Date,Duration,Due_Date,Workload_H01,Workload_H02,Weight,Length,Breadth,Height
0,J-0,0,곡중조,LP,S,0,5,5,486,268,38,18.4,4.7,5.9
1,J-1,1,Final조립,LP,D,0,8,9,759,238,123,14.0,18.4,5.8
2,J-2,2,Final조립,CN,F,0,5,6,872,27,171,13.7,20.1,6.0
3,J-3,3,Final조립,LP,V,0,10,11,521,385,174,20.1,14.4,11.5
4,J-4,4,Final조립,LP,E,0,10,11,413,403,52,12.0,7.4,5.7
5,J-5,5,Final조립,LN,E,0,3,4,994,528,102,19.4,10.5,3.9
6,J-6,6,Final조립,LN,F,0,9,10,683,519,76,14.7,15.8,7.6
7,J-7,7,대조중조,CN,E,0,5,8,334,1082,19,13.7,11.2,3.0
8,J-8,8,곡중조,CN,F,0,2,1,457,395,40,11.2,8.7,5.2
9,J-9,9,곡중조,CN,T,0,2,2,220,56,33,5.5,17.4,5.5


In [15]:
# Save the generated example to Excel.
# NOTE(review): unlike DataGenerator.generate(file_path=...), this call also writes the
# DataFrame index as the first column and uses the default sheet name — confirm that
# downstream readers expect that layout.
df_blocks.to_excel('../data/데이터 생성 예시_13.xlsx')

In [7]:
# Load the per-group count table directly from the path stored in the configuration.
cfg = Configure()
df_group = pd.read_excel(cfg.data_params['data_group'])

In [8]:
# Display the group count table.
df_group

Unnamed: 0.1,Unnamed: 0,선종_블록,count,Proportion,panel_count,curve_count,big_count,final_count
0,0,CN_E,1192,0.171807,110,241,317,524
1,1,CN_B,808,0.11646,8,446,293,61
2,2,LN_E,718,0.103488,92,240,54,332
3,3,CN_S,690,0.099452,62,136,65,427
4,4,CN_F,397,0.057221,3,217,46,131
5,5,LN_F,299,0.043096,2,162,21,114
6,6,LN_B,294,0.042375,0,54,98,142
7,7,CN_A,249,0.035889,8,60,76,105
8,8,LP_S,224,0.032286,0,68,116,40
9,9,LN_A,215,0.030989,8,108,26,73


In [14]:
# Draw a single length ('L') value for group CN_E and process type 'Final조립'.
# NOTE(review): the recorded output of this cell is a 3-tuple (value, distribution name,
# parameters), whereas generate_property as defined in this notebook returns only the
# value — the output presumably comes from a different version of the class; re-run to confirm.
generator = DataGenerator()
generator.generate_property('CN_E', 'Final조립', 'L')

(13.8, 'cauchy', (15.5153747120682, 1.1792547426357187))

In [15]:
# Keep a notebook-level reference to the cleaned source records held by the generator
# (later cells read this variable), then display it.
df_revised_for_group = generator.df_revised_for_group

df_revised_for_group

Unnamed: 0.1,Unnamed: 0,index,정반_코드,호선_코드,블록,단위블록_CODE,공종_명칭,stage_코드,선종_코드,취부팀_코드,...,실적공기,H00,H01,H02,W,B,L,H,A,선종_블록
0,0,2,M212,CN047,S22S0,11A0,Final조립,S032,CN,B02,...,22,0,1469,766,150,17.5,21.0,8.0,367.5,CN_S
1,1,3,M024,CN011,E22P0,11A0,곡중조,S021,CN,B04,...,7,17,475,914,0,13.0,18.0,4.5,234.0,CN_E
2,2,4,M211,BC001,F51P0,05A0,Final조립,S032,BC,B02,...,15,0,623,386,126,17.5,23.0,6.0,402.5,BC_F
3,3,9,M012,TK011,F51S0,11B0,곡중조,S021,VL,A03,...,7,40,267,145,0,6.5,7.0,9.5,45.5,VL_F
4,4,12,M025,LN017,B19S0,03A0,Final조립,S032,LN,A05,...,11,70,492,243,124,18.0,19.0,3.0,342.0,LN_B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6933,6933,11955,M221,CN034,B20P0,03B0,대조중조,S031,CN,B09,...,7,0,521,593,0,8.0,19.5,2.5,156.0,CN_B
6934,6934,11959,M011,LP004,S13P0,03A0,곡중조,S221,LP,B07,...,6,0,318,0,0,5.0,19.5,6.5,97.5,LP_S
6935,6935,11960,M011,LP004,S13S0,03A0,곡중조,S221,LP,B07,...,5,0,318,0,0,5.0,19.5,6.5,97.5,LP_S
6936,6936,11971,M033,LN004,S11P0,11A0,Final조립,S032,LN,B08,...,5,0,427,435,121,24.5,19.0,9.5,465.5,LN_S


In [36]:
# Export, for every group / process type / dimension, the distribution reported by
# generate_property to one Excel sheet per dimension ('L', 'B', 'H').
generator = DataGenerator()

columns = ['선종_블록', 'process_type', 'property', 'best_distribution_name', 'best_params']

file_path = '../data/그룹별 제원 변수.xlsx'

with pd.ExcelWriter(file_path) as writer:
    # The loop variable is deliberately not called `property`: at notebook scope that
    # name would replace the builtin `property` for every later cell.
    for property_name in ['L', 'B', 'H']:
        property_rows = []
        for code in df_group['선종_블록']:
            df_group_code = df_revised_for_group[df_revised_for_group['선종_블록'] == code]
            for process in ['Final조립', '평중조', '곡중조', '대조중조']:
                # Skip process types that never occur for this group.
                if process not in df_group_code['공종_명칭'].values:
                    continue
                # NOTE(review): this unpacking needs generate_property to return
                # (value, distribution name, parameters); the class defined in this
                # notebook returns only the value — confirm which version is intended.
                property_value, best_distribution_name, best_params = generator.generate_property(code, process, property_name)
                property_rows.append([code, process, property_name, best_distribution_name, best_params])
        # df_property stays bound after the loop; a later cell inspects its dtypes.
        df_property = pd.DataFrame(property_rows, columns=columns)

        if not df_property.empty:
            df_property.to_excel(writer, sheet_name=property_name, index=False)

Exception ignored in: <function ZipFile.__del__ at 0x0000027257BE5580>
Traceback (most recent call last):
  File "C:\Users\kimjh\AppData\Local\Programs\Python\Python312\Lib\zipfile\__init__.py", line 1940, in __del__
    self.close()
  File "C:\Users\kimjh\AppData\Local\Programs\Python\Python312\Lib\zipfile\__init__.py", line 1957, in close
    self.fp.seek(self.start_dir)
ValueError: seek of closed file


In [37]:
# Inspect the column dtypes of the last exported property table.
df_property.dtypes

선종_블록                     object
process_type              object
property                  object
best_distribution_name    object
best_params               object
dtype: object

In [18]:
# Rebind df_group to the group count table already loaded by the generator.
df_group = generator.df_group_count

In [19]:
# Display the group count table.
df_group

Unnamed: 0.1,Unnamed: 0,선종_블록,count,Proportion,panel_count,curve_count,big_count,final_count
0,0,CN_E,1192,0.171807,110,241,317,524
1,1,CN_B,808,0.11646,8,446,293,61
2,2,LN_E,718,0.103488,92,240,54,332
3,3,CN_S,690,0.099452,62,136,65,427
4,4,CN_F,397,0.057221,3,217,46,131
5,5,LN_F,299,0.043096,2,162,21,114
6,6,LN_B,294,0.042375,0,54,98,142
7,7,CN_A,249,0.035889,8,60,76,105
8,8,LP_S,224,0.032286,0,68,116,40
9,9,LN_A,215,0.030989,8,108,26,73


In [20]:
# Row labels of the groups whose count for each process type is zero.
idx_panel_0, idx_curve_0, idx_big_0, idx_final_0 = (
    df_group.index[(df_group[count_column] == 0).to_numpy()]
    for count_column in ('panel_count', 'curve_count', 'big_count', 'final_count')
)

In [21]:
# Group codes whose panel_count is zero.
panel_0 = df_group['선종_블록'].loc[idx_panel_0]

panel_0

6     LN_B
8     LP_S
10    LP_V
12    TK_S
14    LN_S
17    LP_D
21    PT_A
22    TK_A
24    TK_F
25    LP_A
26    LP_F
28    LP_B
31    TK_B
32    BC_F
33    BC_B
35    VL_S
36    VL_F
37    VL_B
38    PT_L
39    PT_D
40    PT_R
41    BC_A
42    VL_D
43    VL_E
44    VL_A
Name: 선종_블록, dtype: object

In [22]:
# Group codes whose curve_count is zero.
curve_0 = df_group['선종_블록'].loc[idx_curve_0]

curve_0

10    LP_V
14    LN_S
15    CN_D
18    PT_E
20    LN_G
31    TK_B
34    BC_S
35    VL_S
37    VL_B
38    PT_L
39    PT_D
40    PT_R
41    BC_A
42    VL_D
43    VL_E
44    VL_A
Name: 선종_블록, dtype: object

In [23]:
# Group codes whose big_count is zero.
big_0 = df_group['선종_블록'].loc[idx_big_0]
big_0

20    LN_G
24    TK_F
27    CN_T
29    LN_D
33    BC_B
38    PT_L
39    PT_D
40    PT_R
43    VL_E
44    VL_A
Name: 선종_블록, dtype: object

In [25]:
# Try to draw a length value for group CN_T.
# NOTE(review): the recorded output of this cell is
# "ValueError: zero-size array to reduction operation minimum which has no identity";
# the cell is left in a failing state — confirm whether CN_T has usable records or remove the cell.
generator.generate_property('CN_T', 'Final조립', 'L')

ValueError: zero-size array to reduction operation minimum which has no identity

In [11]:
# NOTE(review): the recorded output of this cell is a FileNotFoundError for a path under
# 'results/', so constructing Configure apparently touches the file system and depends on
# the working directory — confirm before relying on this cell.
cfg = Configure()

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'results/20250710_15h_57m_43s'

In [12]:
# Show the configured output folder path.
# NOTE(review): the recorded output is "NameError: name 'cfg' is not defined" — presumably
# because the preceding Configure() call failed in that session; re-run after fixing it.
cfg.data_params['folderpath']

NameError: name 'cfg' is not defined