<a href="https://colab.research.google.com/github/Sachini-H/regulated-synthetic-data/blob/main/multiple_lines_multiple_days.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This script can be used to generate the following events given some KPI constarints for one palnned date for one particular line:

*   FTT
*   Defective
*   Defects
*   Reject
*   Rectified
*   Manpower
*   Reject reasons
*   Change

The inputs we need are:

1. A plan for one day/ multipe days  for the line of interest
2. An hourly target table of KPIs for each day (date column and hour column both)
3. General schedule (Assuming we reuse the same schedule for all days)
4. An example template for each event type (just to order the columns)





   









In [111]:
# All imports
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import string
import uuid
from google.colab import drive

In [112]:
# Mount google drive and provide the file path that contains contraints in
# excel form
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/plan_data/constraints.xlsx'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [113]:
# Read constraints sheet
xls = pd.read_excel(file_path,
                    sheet_name=["plan","schedule", "target_kpis",
                                "ftt", "defective", "reject", "defects",
                                "rectified", "manpower", "rr"])

# Access individual sheets using sheet names
plan = xls["plan"]
schedule = xls["schedule"]
target_kpis = xls["target_kpis"]
ftt = xls["ftt"]
defective = xls["defective"]
reject = xls["reject"]
defects = xls["defects"]
rectified = xls["rectified"]
manpower = xls["manpower"]
reject_reason = xls["rr"]

In [114]:
# Timezone of the factory
hour_offset = 5.5

In [115]:
# Dictionary containing endline_qc id for each line
station_id = {"F1M7": "F1M7ST02", "F1M8": "F1M8ST02"}

In [116]:
# Function to generate random timestamps per-day that lies between a given start
# and end times

def generate_random_timestamps(start_time, end_time, min_increment,
                               max_increment, num_increments):
    """
    Generate a sorted list of timestamps with random increments for a
    specified date and time range.

    :param start_time: The start datetime.
    :param end_time: The end datetime.
    :param min_increment: Minimum increment in minutes.
    :param max_increment: Maximum increment in minutes.
    :param num_increments: Number of random increments to generate.
    :return: A DataFrame with the sorted timestamps.
    """
    # Generate random increments
    increments = [random.randint(min_increment, max_increment) for _ in
                  range(num_increments)]

    # Generate timestamps
    timestamps = []
    current_time = start_time

    while current_time < end_time:
        timestamps.append(current_time)
        increment = timedelta(seconds=random.choice(increments))
        current_time += increment
        if current_time > end_time:
            break

    # Ensure timestamps are sorted
    timestamps.sort()

    # Create a DataFrame
    df = pd.DataFrame({'ts': timestamps})

    return df

In [117]:
module_ids = plan["module_id"].unique()
module_ids

array(['F1M7', 'F1M8'], dtype=object)

In [118]:
all_dates = {}
for module_id in module_ids:
  all_dates[module_id] = []
  for date in plan[plan["module_id"] == module_id]["date"]:
    all_dates[module_id].append(date)
all_dates


{'F1M7': [Timestamp('2024-07-26 00:00:00'),
  Timestamp('2024-07-27 00:00:00'),
  Timestamp('2024-07-28 00:00:00'),
  Timestamp('2024-07-29 00:00:00'),
  Timestamp('2024-07-30 00:00:00'),
  Timestamp('2024-07-31 00:00:00'),
  Timestamp('2024-08-01 00:00:00')],
 'F1M8': [Timestamp('2024-07-26 00:00:00'),
  Timestamp('2024-07-27 00:00:00'),
  Timestamp('2024-07-28 00:00:00'),
  Timestamp('2024-07-29 00:00:00'),
  Timestamp('2024-07-30 00:00:00'),
  Timestamp('2024-07-31 00:00:00'),
  Timestamp('2024-08-01 00:00:00')]}

In [119]:
# Logic to determine function parameters(with an offset for factory loacl time)

dates ={}
starts = {}
ends={}
num_increments_dict = {}

for module_id in module_ids:

  date_list =[]
  start_time_list = []
  end_time_list = []
  num_increments_list = []

  for index in range(len(all_dates[module_id])):
    date = all_dates[module_id][index].to_pydatetime().date()
    start_time_str = pd.to_timedelta(schedule["start_time"].iloc[0]
                                     .strftime('%H:%M:%S'))- pd.Timedelta(
                                         hours=hour_offset)
    end_time_str = pd.to_timedelta(
        schedule["end_time"].iloc[-1].strftime('%H:%M:%S'))- pd.Timedelta(
            hours=hour_offset)

    start_time = datetime.combine(date, (datetime.min + start_time_str).time())
    end_time = datetime.combine(date, (datetime.min+ end_time_str).time())

    num_increments = plan["qty"].max()
    # print(date)
    # print(start_time)
    # print(end_time)
    # print(num_increments)

    date_list.append(date)
    start_time_list.append(start_time)
    end_time_list.append(end_time)
    num_increments_list.append(num_increments)

  dates[module_id] = date_list
  starts[module_id] = start_time_list
  ends[module_id] = end_time_list
  num_increments_dict[module_id] = num_increments_list



  #num_increments = 1000

min_increment = 2
max_increment = 6

In [120]:
# Verification
print(starts)


{'F1M7': [datetime.datetime(2024, 7, 26, 2, 0), datetime.datetime(2024, 7, 27, 2, 0), datetime.datetime(2024, 7, 28, 2, 0), datetime.datetime(2024, 7, 29, 2, 0), datetime.datetime(2024, 7, 30, 2, 0), datetime.datetime(2024, 7, 31, 2, 0), datetime.datetime(2024, 8, 1, 2, 0)], 'F1M8': [datetime.datetime(2024, 7, 26, 2, 0), datetime.datetime(2024, 7, 27, 2, 0), datetime.datetime(2024, 7, 28, 2, 0), datetime.datetime(2024, 7, 29, 2, 0), datetime.datetime(2024, 7, 30, 2, 0), datetime.datetime(2024, 7, 31, 2, 0), datetime.datetime(2024, 8, 1, 2, 0)]}


**FTT**

In [121]:
# Generate a vey large amount of timestamps(We want to make sure we generate
# more datapoints than the constrained amount so that we can randomly drop
# certain instances)

ftts = {}

for module_id in module_ids:
  ts_df = pd.DataFrame()
  date_list = dates[module_id]
  starts_list = starts[module_id]
  ends_list= ends[module_id]
  num_increments_list = num_increments_dict[module_id]
  for i in range(len(date_list)):
    ts_df= pd.concat([ts_df, generate_random_timestamps(starts_list[i],
                                                        ends_list[i],
                                                        min_increment,
                                                        max_increment,
                                    num_increments_list[i])])
  ts_df['module_id'] = module_id
  ts_df.reset_index(drop=True, inplace=True)
  ftts[module_id] = ts_df

ftts['F1M7']

Unnamed: 0,ts,module_id
0,2024-07-26 02:00:00,F1M7
1,2024-07-26 02:00:05,F1M7
2,2024-07-26 02:00:08,F1M7
3,2024-07-26 02:00:10,F1M7
4,2024-07-26 02:00:15,F1M7
...,...,...
72632,2024-08-01 13:29:41,F1M7
72633,2024-08-01 13:29:43,F1M7
72634,2024-08-01 13:29:47,F1M7
72635,2024-08-01 13:29:52,F1M7


Now, I assume we have the trained synthesizers for each event type given below which we can load and use to generate more samples. But we may have to alter categorical data column values according to the plan's bctx and org. (So just learned numerical distributions are learned.)

In [122]:
!pip install sdv



In [123]:
# To import a synthesizer
import sdv
from sdv.single_table import CTGANSynthesizer

In [124]:
# Import the pretrained synthesizer for FTT
ftt_synthesizer = CTGANSynthesizer.load(
    filepath='/content/drive/My Drive/plan_data/ftt_synthesizer.pkl'
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [125]:
# Generate some instances (to the same length as timestamps)
syn_ftts = {}
for module_id in ftts.keys():
  ts_df = ftts[module_id]
  synthetic_ftt = ftt_synthesizer.sample(num_rows=len(ts_df))
  synthetic_ftt['evt_gmt'] = ts_df['ts']
  synthetic_ftt['module_id'] = module_id
  syn_ftts[module_id] = synthetic_ftt


Sampling rows: 100%|██████████| 72637/72637 [00:20<00:00, 3571.38it/s]
Sampling rows: 100%|██████████| 72374/72374 [00:07<00:00, 9785.74it/s]


In [126]:
# Function to determine the date based on a time threshold
def get_factory_date(dt):

    dt = dt + timedelta(hours=hour_offset)
    if dt.time() < pd.to_datetime('05:00:00').time():
        return (dt - timedelta(days=1)).date()
    else:
        return dt.date()

In [127]:
# Apply factory_date logic to evt_gmt
for module_id in syn_ftts.keys():
  synthetic_ftt = syn_ftts[module_id]
  synthetic_ftt['evt_gmt'] = pd.to_datetime(synthetic_ftt['evt_gmt'])
  synthetic_ftt['factory_date'] = synthetic_ftt['evt_gmt'].apply(get_factory_date)
  syn_ftts[module_id] = synthetic_ftt

In [128]:
# Verification
syn_ftts['F1M7'].head()

Unnamed: 0,evt_gmt,factory_date,plan_id,shift_id,shift,tenant_id,tenant,cluster_id,cluster,factory_id,...,size_id,size,evn,ftt_count,defective_count,defect_count,reject_count,rectified_count,input_count,rw_defective_count
0,2024-07-26 02:00:00,2024-07-26,sdv-id-ZmALYB,1,Morning,1,mas,1,kreeda,ABC001,...,1,XL,ftt,7,0,0,0,0,0,0
1,2024-07-26 02:00:05,2024-07-26,sdv-id-ZhJSKp,1,Morning,1,mas,1,kreeda,ABC001,...,1,M,ftt,6,0,0,0,0,0,0
2,2024-07-26 02:00:08,2024-07-26,sdv-id-RnJjTg,1,Morning,1,mas,1,kreeda,ABC001,...,1,XL,ftt,8,0,0,0,0,0,0
3,2024-07-26 02:00:10,2024-07-26,sdv-id-jGEmvR,1,Morning,1,mas,1,kreeda,ABC001,...,1,M,ftt,8,0,0,0,0,0,0
4,2024-07-26 02:00:15,2024-07-26,sdv-id-agVyXC,1,Morning,1,mas,1,kreeda,ABC001,...,1,XL,ftt,8,0,0,0,0,0,0


In [129]:
# For the ease of manipulation, let us assume that we add piece by piece in
# all scenarios.
# This functions sets a value column to a given value.

def edit_value(df, column, value):
  df[column] = value

In [130]:
for module_id in syn_ftts.keys():
  synthetic_ftt = syn_ftts[module_id]
  edit_value(synthetic_ftt, "ftt_count", 1)
  syn_ftts[module_id] = synthetic_ftt

In [131]:
#syn_ftts['F1M7'].head()

In [132]:
# Find the seconds of the day for each event
def find_working_hour(column, hours):
  local_time = column + timedelta(hours=hours)
  return local_time.hour * 3600 + local_time.minute * 60 + local_time.second


In [133]:
for module_id in syn_ftts.keys():
  synthetic_ftt = syn_ftts[module_id]
  synthetic_ftt['factory_time'] = synthetic_ftt['evt_gmt'].apply(
      find_working_hour, args=(hour_offset,))
  syn_ftts[module_id] = synthetic_ftt


In [134]:
#syn_ftts['F1M8'].head()

In [135]:
# Add the seconds of the day to the schedule df
schedule['start_time'] = pd.to_datetime(schedule['start_time'],
                                        format='%H:%M:00').dt.time
schedule['seconds_of_day'] = schedule['start_time'].apply(lambda x: x.hour * 3600 + x.minute * 60)

In [136]:
# Verification
schedule

Unnamed: 0,hour,start_time,end_time,seconds_of_day
0,1,07:30:00,08:30:00,27000
1,2,08:30:00,09:30:00,30600
2,3,09:30:00,10:30:00,34200
3,4,10:30:00,11:30:00,37800
4,5,11:30:00,13:00:00,41400
5,6,13:00:00,14:00:00,46800
6,7,14:00:00,15:00:00,50400
7,8,15:00:00,16:00:00,54000
8,9,16:00:00,17:00:00,57600
9,10,17:00:00,18:00:00,61200


In [137]:
# Perform a backward-looking merge
for module_id in syn_ftts.keys():
  synthetic_ftt = syn_ftts[module_id]
  synthetic_ftt = synthetic_ftt.sort_values('factory_time')
  synthetic_ftt = pd.merge_asof(synthetic_ftt, schedule, left_on='factory_time',
                                right_on='seconds_of_day', direction='backward')


  synthetic_ftt.drop(columns=['factory_time', 'start_time', 'end_time',
                              'seconds_of_day'], inplace=True)
  synthetic_ftt = synthetic_ftt.rename(columns={'hour': 'Hour',
                                                'factory_date': 'Date',
                                                'module_id': 'Line'})
  syn_ftts[module_id] = synthetic_ftt



In [138]:
#syn_ftts['F1M8']

In [139]:
target_kpis

Unnamed: 0,Date,Line,Hour,FTT,SMV,TPPM,Defective,Defect,Rectified,Reject,Operator,Helper,Man power
0,2024-07-26,F1M7,1,91,8.28,1.666667,9,13,3,0,23,5,28
1,2024-07-26,F1M7,2,95,8.28,1.666667,5,7,4,2,23,5,28
2,2024-07-26,F1M7,3,95,8.28,1.666667,6,12,2,2,23,5,28
3,2024-07-26,F1M7,4,92,8.28,1.666667,1,1,1,1,23,5,28
4,2024-07-26,F1M7,5,93,8.28,1.666667,5,9,3,2,23,5,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,2024-08-01,F1M8,7,70,8.28,1.210000,2,2,2,0,14,6,20
150,2024-08-01,F1M8,8,68,8.28,1.210000,2,2,1,1,14,6,20
151,2024-08-01,F1M8,9,74,8.28,1.210000,1,2,1,0,14,6,20
152,2024-08-01,F1M8,10,80,8.28,1.210000,1,1,1,0,14,6,20


In [140]:
# Function to extract only the necessary attribute targets
def get_relevant_target(columns):
  targets = target_kpis[columns]
  return targets

In [141]:
# Traget for FTTs
targets_ftt = get_relevant_target(["Date","Line", "Hour", "FTT"])
targets_ftt[77:]

Unnamed: 0,Date,Line,Hour,FTT
77,2024-07-26,F1M8,1,40
78,2024-07-26,F1M8,2,42
79,2024-07-26,F1M8,3,44
80,2024-07-26,F1M8,4,52
81,2024-07-26,F1M8,5,50
...,...,...,...,...
149,2024-08-01,F1M8,7,70
150,2024-08-01,F1M8,8,68
151,2024-08-01,F1M8,9,74
152,2024-08-01,F1M8,10,80


In [142]:
def adjust_hourly_counts(df, target_counts, count_column, hour_column='Hour',
                         date_column='Date', module_column='Line'):
    """
    Adjust the counts in each hour, date, module_id group of a DataFrame to
    match the target counts.

    :param df: DataFrame containing hour, date, and count columns.
    :param target_counts: DataFrame with hour, date, and target_count columns.
    :param count_column: Name of the column containing count values.
    :param hour_column: Name of the column containing hour values.
    :param date_column: Name of the column containing date values.
    :param module_column: Name of the column containing module_id values.
    :return: DataFrame with adjusted counts.
    """

    # Ensure date and hour columns are of the same type (datetime64[ns])
    target_counts[date_column] = pd.to_datetime(target_counts[date_column])
    df[date_column] = pd.to_datetime(df[date_column])
    df[hour_column] = pd.to_datetime(df[hour_column], format='%H').dt.hour

    # Merge the dataframes on the 'hour','date','module_id' columns
    merged_df = df.merge(target_counts, on=[hour_column, date_column,
                                            module_column], how='left')

    # Create a list to store the adjusted groups
    result_groups = []

    # Iterate over each hour and date group
    for (date, hour, module), group in merged_df.groupby([date_column,
                                                          hour_column,
                                                          module_column]):
        # Calculate the difference between the current count and the target count
        current_count = len(group)
        target_count = group[count_column].iloc[0]
        difference = current_count - target_count
        #print(current_count, target_count)

        # Drop rows randomly to meet the target count
        if difference > 0:
            drop_indices = group.sample(n=difference).index
            group = group.drop(drop_indices)

        # Append the group to the result list
        result_groups.append(group)

    # Concatenate the adjusted groups into a single DataFrame
    result_df = pd.concat(result_groups, ignore_index=True)

    # Drop the 'target_count' column as it's no longer needed
    #result_df = result_df.drop(columns=[count_column])

    return result_df




In [143]:
for module_id in syn_ftts.keys():
  synthetic_ftt = syn_ftts[module_id]
  synthetic_ftt_regulated = adjust_hourly_counts(synthetic_ftt, targets_ftt,"FTT")
  synthetic_ftt_regulated = synthetic_ftt_regulated.reindex(columns=ftt.columns)
  synthetic_ftt_regulated['factory_date'] = synthetic_ftt_regulated['evt_gmt'].apply(get_factory_date)
  synthetic_ftt_regulated['module_id'] = module_id
  syn_ftts[module_id] = synthetic_ftt_regulated


In [144]:
plan_for_bctx = plan.groupby('module_id').first().reset_index()
plan_for_bctx

Unnamed: 0,module_id,plan_id,plan_type,created_ts,modified_ts,date,shift_id,shift,tenant_id,tenant,...,color,qty,mins,operator,helper,ironman,qc,manpower,smv,produced_mins
0,F1M7,12f0ffe7-c509-49b5-b203-83be908b2881,2,2024-07-26 10:00:00,2024-07-26 14:00:00,2024-07-26,1,Morning,,,...,Print,1100,660,23,5,0,0,28,8.28,9108
1,F1M8,13f0ffe7-c509-49b5-b203-83be908b2881,2,2024-07-26 10:00:00,2024-07-26 14:00:00,2024-07-26,1,Morning,,,...,Print,500,660,12,5,0,0,17,8.28,4140


In [145]:
def edit_org(df,columns,module_id):
  """
  Find the corresponding org values from plan per line

  :param df : Dataframe with synthetic data
  :param columns: List of columns to edit
  :param module_id: Line id
  :return: Edited dataframe
  """
  plan_row = plan_for_bctx[(plan_for_bctx["module_id"] == module_id)]
  for column in columns:
     df[column] = plan_row[column].iloc[0]
  return df


In [146]:
bctx_columns = [ "tenant_id", "tenant","cluster_id", "cluster", "factory_id",
                "factory","unit_id", "unit", "building_id","building","floor_id",
                 "floor", "zone_id", "zone", "section_id","section",
                 "department_id","department", "module_id","module", "stp"]

for module_id in syn_ftts.keys():
  synthetic_ftt_regulated = syn_ftts[module_id]
  synthetic_ftt_regulated =edit_org(synthetic_ftt_regulated,bctx_columns,module_id)
  syn_ftts[module_id] = synthetic_ftt_regulated



In [147]:
def edit_bctx(df,columns,module_id):

    """
    Find the corresponding bctx and org values from plan per line per day

    :param df : Dataframe with synthetic data
    :param columns: List of columns to edit
    :param module_id: Line id
    :return: Edited dataframe
    """

    # Ensure date_column is in datetime format
    df["factory_date"] = pd.to_datetime(df["factory_date"])
    plan["date"] = pd.to_datetime(plan["date"])

    current_plan = plan[plan["module_id"] == module_id]

    # Iterate through each row in the df
    for i in range(len(df)):
        # Get the date for the current row in df
        current_date = df.loc[i, "factory_date"]

        # Find the corresponding row in the plan DataFrame
        plan_row = current_plan[(current_plan["date"] == current_date) ]

        if not plan_row.empty:
            for column in columns:
                df.at[i, column] = plan_row.iloc[0][column]
        else:
            print(f"Date {current_date} not found in plan DataFrame")

    return df

In [148]:
# edit bctx and values as needed
bctx_columns = ["plan_id", "shift_id", "shift","buyer_id","buyer","vendor_id",
                        "vendor", "po_id", "po", "so_id","so","li_id","li","fabric_type_id",
                        "fabric_type","product_category_id", "product_category",
                        "product_type_id", "product_type", "style_id", "style", "color_id", "color" ]

for module_id in syn_ftts.keys():
  synthetic_ftt_regulated = syn_ftts[module_id]
  synthetic_ftt_regulated =edit_bctx(synthetic_ftt_regulated,
                                            bctx_columns, module_id)
  syn_ftts[module_id] = synthetic_ftt_regulated

In [149]:
# Change the line_id manually
for module_id in syn_ftts.keys():
  synthetic_ftt_regulated = syn_ftts[module_id]
  synthetic_ftt_regulated['stp'] = synthetic_ftt_regulated['stp'] + '-' + station_id[module_id]
  synthetic_ftt_regulated['station_id'] = station_id[module_id]
  syn_ftts[module_id] = synthetic_ftt_regulated



In [150]:
synthetic_ftt_regulated = pd.concat(syn_ftts.values(), ignore_index=True)

In [151]:
#synthetic_ftt_regulated.to_csv('synthetic_ftt_regulated.csv', index=False)

In [152]:
synthetic_ftt_regulated

Unnamed: 0,evt_gmt,factory_date,plan_id,shift_id,shift,tenant_id,tenant,cluster_id,cluster,factory_id,...,size_id,size,evn,ftt_count,defective_count,defect_count,reject_count,rectified_count,input_count,rw_defective_count
0,2024-07-26 02:00:08,2024-07-26,12f0ffe7-c509-49b5-b203-83be908b2881,1,Morning,,,,,ID_PAS,...,1,XL,ftt,1,0,0,0,0,0,0
1,2024-07-26 02:00:23,2024-07-26,12f0ffe7-c509-49b5-b203-83be908b2881,1,Morning,,,,,ID_PAS,...,1,XL,ftt,1,0,0,0,0,0,0
2,2024-07-26 02:00:25,2024-07-26,12f0ffe7-c509-49b5-b203-83be908b2881,1,Morning,,,,,ID_PAS,...,1,M,ftt,1,0,0,0,0,0,0
3,2024-07-26 02:01:38,2024-07-26,12f0ffe7-c509-49b5-b203-83be908b2881,1,Morning,,,,,ID_PAS,...,1,L,ftt,1,0,0,0,0,0,0
4,2024-07-26 02:01:44,2024-07-26,12f0ffe7-c509-49b5-b203-83be908b2881,1,Morning,,,,,ID_PAS,...,1,M,ftt,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11146,2024-08-01 13:26:31,2024-08-01,13f0ffe7-c509-49b5-b203-83be908b2887,1,Morning,,,,,ID_PAS,...,1,L,ftt,1,0,0,0,0,0,0
11147,2024-08-01 13:26:41,2024-08-01,13f0ffe7-c509-49b5-b203-83be908b2887,1,Morning,,,,,ID_PAS,...,1,M,ftt,1,0,0,0,0,0,0
11148,2024-08-01 13:26:57,2024-08-01,13f0ffe7-c509-49b5-b203-83be908b2887,1,Morning,,,,,ID_PAS,...,1,L,ftt,1,0,0,0,0,0,0
11149,2024-08-01 13:27:57,2024-08-01,13f0ffe7-c509-49b5-b203-83be908b2887,1,Morning,,,,,ID_PAS,...,1,M,ftt,1,0,0,0,0,0,0


**DEFECTIVES**

In [153]:
defectives = {}

for module_id in module_ids:
  ts_df = pd.DataFrame()
  date_list = dates[module_id]
  starts_list = starts[module_id]
  ends_list= ends[module_id]
  num_increments_list = num_increments_dict[module_id]
  for i in range(len(date_list)):
    ts_df= pd.concat([ts_df, generate_random_timestamps(starts_list[i],
                                                        ends_list[i],
                                                        min_increment,
                                                        max_increment,
                                    num_increments_list[i])])
  ts_df['module_id'] = module_id
  ts_df.reset_index(drop=True, inplace=True)
  defectives[module_id] = ts_df

In [154]:
# Import the pretrained synthesizer for defective
defective_synthesizer = CTGANSynthesizer.load(
    filepath='/content/drive/My Drive/plan_data/defective_synthesizer.pkl'
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [155]:
# Generate some instances (to the same length as timestamps)
syn_defectives ={}
for module_id in defectives.keys():
  ts_defective = defectives[module_id]
  synthetic_defective = defective_synthesizer.sample(num_rows=len(ts_defective))
  synthetic_defective['evt_gmt'] = ts_defective['ts']
  synthetic_defective['evt_gmt'] = pd.to_datetime(synthetic_defective['evt_gmt'])
  synthetic_defective['factory_date'] = synthetic_defective['evt_gmt'].apply(get_factory_date)
  synthetic_defective['module_id'] = module_id
  syn_defectives[module_id] = synthetic_defective

Sampling rows: 100%|██████████| 72648/72648 [00:10<00:00, 6721.25it/s]
Sampling rows: 100%|██████████| 72605/72605 [00:10<00:00, 6993.18it/s]


In [156]:
syn_defectives['F1M7'].head()

Unnamed: 0,evt_gmt,factory_date,plan_id,shift_id,shift,tenant_id,tenant,cluster_id,cluster,factory_id,...,size,evn,gid,ftt_count,defective_count,defect_count,reject_count,rectified_count,input_count,rw_defective_count
0,2024-07-26 02:00:00,2024-07-26,sdv-id-ZmALYB,1,Morning,1,mas,1,kreeda,ABC001,...,XL,defective,sdv-id-wnVsWx,0,1,1,0,0,0,0
1,2024-07-26 02:00:02,2024-07-26,sdv-id-ZhJSKp,1,Morning,1,mas,1,kreeda,ABC001,...,XL,defective,sdv-id-GrLoVq,0,1,1,0,0,0,0
2,2024-07-26 02:00:08,2024-07-26,sdv-id-RnJjTg,1,Morning,1,mas,1,kreeda,ABC001,...,XL,defective,sdv-id-HRikVU,0,1,1,0,0,0,0
3,2024-07-26 02:00:12,2024-07-26,sdv-id-jGEmvR,1,Morning,1,mas,1,kreeda,ABC001,...,XL,defective,sdv-id-DgKEuo,0,1,3,0,0,0,0
4,2024-07-26 02:00:14,2024-07-26,sdv-id-agVyXC,1,Morning,1,mas,1,kreeda,ABC001,...,L,defective,sdv-id-AQYBhy,0,1,1,0,0,0,0


In [157]:
# Edit defective value
for module_id in syn_defectives.keys():
  synthetic_defective = syn_defectives[module_id]
  edit_value(synthetic_defective, "defective_count", 1)
  syn_defectives[module_id] = synthetic_defective

In [158]:
# Function to generate gid
def generate_random_string(length=8):
    characters = string.ascii_uppercase + string.digits
    return ''.join(random.choice(characters) for _ in range(length))

In [159]:
# Generate gid
for module_id in syn_defectives.keys():
  synthetic_defective = syn_defectives[module_id]
  synthetic_defective["gid"]= [generate_random_string() for _ in range(len(synthetic_defective))]
  synthetic_defective= synthetic_defective.drop_duplicates(subset=['gid'])
  synthetic_defective['factory_time'] = synthetic_defective['evt_gmt'].apply(find_working_hour, args=(hour_offset,))
  syn_defectives[module_id] = synthetic_defective

In [160]:
# Perform a backward-looking merge
for module_id in syn_defectives.keys():
  synthetic_defective = syn_defectives[module_id]
  synthetic_defective = synthetic_defective.sort_values('factory_time')
  synthetic_defective = pd.merge_asof(synthetic_defective, schedule, left_on='factory_time', right_on='seconds_of_day', direction='backward')
  synthetic_defective.drop(columns=['factory_time', 'start_time', 'end_time', 'seconds_of_day'], inplace=True)
  synthetic_defective = synthetic_defective.rename(columns={'hour': 'Hour',
                                                            'factory_date': 'Date',
                                                            'module_id': 'Line'})
  syn_defectives[module_id] = synthetic_defective




In [161]:
# Target for defectives
targets_defective = get_relevant_target(["Date","Line", "Hour",
                                         "Defective", "Defect"])
targets_defective

Unnamed: 0,Date,Line,Hour,Defective,Defect
0,2024-07-26,F1M7,1,9,13
1,2024-07-26,F1M7,2,5,7
2,2024-07-26,F1M7,3,6,12
3,2024-07-26,F1M7,4,1,1
4,2024-07-26,F1M7,5,5,9
...,...,...,...,...,...
149,2024-08-01,F1M8,7,2,2
150,2024-08-01,F1M8,8,2,2
151,2024-08-01,F1M8,9,1,2
152,2024-08-01,F1M8,10,1,1


In [162]:
# Adjust the defective count to tally with the given target
for module_id in syn_defectives.keys():
  synthetic_defective = syn_defectives[module_id]
  defective_adjusted = adjust_hourly_counts(synthetic_defective, targets_defective,
                                          "Defective")
  defective_adjusted.drop(columns=['Defective','Defect'], inplace=True)
  syn_defectives[module_id] = defective_adjusted




In [163]:
def distribute_defects(total_defects, num_defectives):
  """
    Distribute a given number of total defects between a specified number
    of defective events.

    :param total_sum: The total defects to be distributed.
    :param num_events: The number of defective events.
    :return: A list with the distributed values.
  """
  # Initial distribution: assign each event a base value

  base_defects = total_defects // num_defectives
  remaining = total_defects % num_defectives

  distribution = [base_defects] * num_defectives

  # Randomly distribute the remaining value
  for _ in range(remaining):
      distribution[np.random.randint(0, num_defectives)] += 1

  return distribution



In [164]:
def generate_hourly_counts(defectives, targets, hour_column='Hour',date_column= 'Date',line_column= 'Line', count_column='defect_count'):
    """
    Generate counts for each hour of each day group of a DataFrame to match the target counts by
    distributing the target sum among all rows in the hour + date group.

    :param defectives: DataFrame containing hour and count columns.
    :param targets: DataFrame with hour and target_count columns.
    :param hour_column: Name of the column containing hour values.
    :param date_column: Name of the column containing date values.
    :param line_column: Name of the column containing module_id values.
    :param count_column: Name of the column containing count values.
    :return: DataFrame with generated counts.
    """
    # Merge the dataframes on the 'hour and date' column
    merged_df = defectives.merge(targets, on=[date_column, hour_column,
                                              line_column], how='left')

    # Create a new column for generated counts
    merged_df['generated_count'] = 0

    # Iterate over each hour+date+line group
    for (date, hour, line), group in merged_df.groupby([date_column,
                                                        hour_column,
                                                        line_column]):
        # Get the target sum for this date + hour+line
        target_sum = group['Defect'].iloc[0]

        # Generate the new counts by distributing the target sum
        generated_counts = distribute_defects(target_sum, len(group))
        merged_df.loc[group.index, 'generated_count'] = generated_counts

    # # Drop the 'target_count' column as it's no longer needed
    # result_df = merged_df.drop(columns=['target_count'])

    return merged_df


In [165]:
for module_id in syn_defectives.keys():
  synthetic_defective = syn_defectives[module_id]
  merged_df = generate_hourly_counts(synthetic_defective, targets_defective)
  merged_df['defect_count']= merged_df['generated_count']
  merged_df= merged_df.rename(columns={'Date': 'factory_date',
                                       'Line': 'module_id'})
  syn_defectives[module_id] = merged_df



In [166]:
# Function to add unit_id, unit columns

def add_unit_columns(df):
    # Get the index of the 'department_id' column
    department_index = df.columns.get_loc('department_id')

    # Insert 'unit_id' column with null values before 'department_id'
    df.insert(department_index, 'unit', [None] * len(df))

    # Insert 'unit' column with null values before 'department_id' (again since 'department_id' index has shifted)
    df.insert(department_index, 'unit_id', [None] * len(df))

    return df

In [167]:
# edit bctx and values as needed
editable_org = ["tenant_id", "tenant","cluster_id", "cluster", "factory_id",
                "factory","unit_id", "unit", "building_id","building","floor_id",
                "floor", "zone_id", "zone","department_id","department",
                "section_id","section", "module_id","module", "stp" ]

editable_bctx = [ "plan_id", "shift_id", "shift",  "buyer_id","buyer","vendor_id",
                  "vendor", "po_id", "po", "so_id","so","li_id","li",
                  "fabric_type_id","fabric_type","product_category_id",
                  "product_category","product_type_id", "product_type",
                  "style_id", "style", "color_id", "color" ]

defects_for_defect={}

for module_id in syn_defectives.keys():
  synthetic_defective_regulated = syn_defectives[module_id]
  synthetic_defective_regulated = edit_org(synthetic_defective_regulated,
                                           editable_org,module_id)
  synthetic_defective_regulated = edit_bctx(synthetic_defective_regulated,
                                            editable_bctx,module_id)
  synthetic_defective_regulated ['stp'] = synthetic_defective_regulated ['stp'] + '-' +   station_id[module_id]
  synthetic_defective_regulated['station_id'] = station_id[module_id]
  defects_for_defect[module_id] = synthetic_defective_regulated.drop(
      columns=['generated_count','Defective', 'Defect'])
  merged_df = synthetic_defective_regulated.drop(
      columns=['generated_count','Hour', 'Defective', 'Defect'])
  syn_defectives[module_id] = merged_df

#merged_df.to_csv('synthetic_defective_regulated.csv', index=False)

In [168]:
synthetic_defective_regulated = pd.concat(syn_defectives.values(), ignore_index=True)


In [169]:
#synthetic_defective_regulated.to_csv('synthetic_defective_regulated.csv', index=False)

In [102]:
synthetic_defective_regulated.columns

Index(['evt_gmt', 'factory_date', 'plan_id', 'shift_id', 'shift', 'tenant_id',
       'tenant', 'cluster_id', 'cluster', 'factory_id', 'factory',
       'building_id', 'building', 'unit_id', 'unit', 'department_id',
       'department', 'floor_id', 'floor', 'zone_id', 'zone', 'section_id',
       'section', 'module_id', 'module', 'station_id', 'station', 'hsk', 'buf',
       'apt', 'stp', 'stpv', 'buyer_id', 'buyer', 'vendor_id', 'vendor',
       'po_id', 'po', 'so_id', 'so', 'li_id', 'li', 'fabric_type_id',
       'fabric_type', 'product_category_id', 'product_category',
       'product_type_id', 'product_type', 'style_id', 'style', 'color_id',
       'color', 'size_id', 'size', 'evn', 'ftt_count', 'defective_count',
       'defect_count', 'reject_count', 'rectified_count', 'input_count',
       'rw_defective_count'],
      dtype='object')

In [178]:
synthetic_defective_regulated['defect_count'].sum()

548

**DEFECTS**

In [196]:
# To create the defects dataframe, explode each of the  events in defective data,
# defect_count times
def generate_defects_base(columns):
  df_repeated = pd.DataFrame()
  for col in columns:
    repeated_df = []
    for index, row in synthetic_defective_regulated.iterrows():
        repeated_df.extend([row[col]] * row['defect_count'])
    df_repeated[col] = repeated_df
  return df_repeated

In [197]:
columns_for_defects = ["evt_gmt",	"factory_date",	"shift_id",	"shift",
                        "tenant_id",	"tenant",	"cluster_id",	"cluster",
                        "factory_id",	"factory",	"building_id",	"building",
                        "unit_id", "unit","department_id",	"department",
                        "floor_id",	"floor","zone_id",	"zone",	"section_id",
                         "section",	"module_id","module","stp","size",	"hsk",
                        "buf","apt","stpv","buyer_id","buyer","vendor_id",
                        "vendor","po_id","po","so_id","so","li_id","li",
                        "fabric_type_id",	"fabric_type",	"product_category_id",
                        "product_category",	"product_type_id",	"product_type",
                        "style_id",	"style","color_id",	"color","size_id","gid"]
defects_base = generate_defects_base(columns_for_defects)

In [198]:
# Findout already processed columns
added_columns = defects_base.columns

In [199]:
# # Findout what other columns should be in defects data

# for column_name in added_columns:
#     defects_temp = defects.drop(column_name, axis=1)
# defects_temp.head(200)

In [200]:
# Import the pretrained synthesizer for defective
defects_synthesizer = CTGANSynthesizer.load(
    filepath="/content/drive/My Drive/plan_data/defects_synthesizer.pkl"
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [201]:
# Generate synthetic dataset that is the same length as aggregated defective data
synthetic_defects = defects_synthesizer.sample(num_rows=len(defects_base))

Sampling rows: 100%|██████████| 548/548 [00:00<00:00, 4332.84it/s]


In [202]:
#Concatenate the aggregated defective data with generated defect-specific data
synthetic_defects = pd.concat([defects_base, synthetic_defects], axis=1)

In [203]:
# Manually change event type
synthetic_defects["evn"]= "defect"

In [204]:
# Reorder the data columns
synthetic_defects = synthetic_defects.reindex(columns=defects.columns)
#synthetic_defects.head(20)

In [205]:
#synthetic_defects.columns

In [206]:
#synthetic_defects.to_csv('sythetic_defects_regulated.csv', index=False)

**RECTIFIED**

In [244]:
# We are going to derive rectified as portion of events from the defectives
# with the working hour attached.
defectives = defects_for_defect.copy()
for module_id in defectives.keys():
  defectives[module_id].rename(columns={'factory_date': 'Date',
                                        'module_id': 'Line'}, inplace=True)


In [245]:
# Target for rectified
targets_rectified = get_relevant_target(["Date", "Hour", "Line", "Rectified"])
targets_rectified

Unnamed: 0,Date,Hour,Line,Rectified
0,2024-07-26,1,F1M7,3
1,2024-07-26,2,F1M7,4
2,2024-07-26,3,F1M7,2
3,2024-07-26,4,F1M7,1
4,2024-07-26,5,F1M7,3
...,...,...,...,...
149,2024-08-01,7,F1M8,2
150,2024-08-01,8,F1M8,1
151,2024-08-01,9,F1M8,1
152,2024-08-01,10,F1M8,1


In [246]:
def pick_rectified_events(all_defective, targets, hour_column='Hour',
                          date_column = 'Date', line_column='Line',
                          datetime_column='evt_gmt',
                          count_column='Rectified'):
    """
    Pick the first n events from the raw events DataFrame for each hour group based on the counts DataFrame.

    :param all_defective: DataFrame containing raw events with an hour and datetime column.
    :param targets: DataFrame containing the count of events per hour.
    :param hour_column: Name of the column containing hour values.
    :param date_column: Name of the column containing date values.
    :param line_column: Name of the column containing module_id values.
    :param datetime_column: Name of the column containing datetime values.
    :param count_column: Name of the column containing count values.
    :return: DataFrame with the selected events.
    """
    # Ensure the datetime column is of datetime type
    all_defective[datetime_column] = pd.to_datetime(all_defective[datetime_column])

    # Merge the raw_df and counts_df on the date + hour column
    merged_df = all_defective.merge(targets, on=[date_column, hour_column,
                                                 line_column], how='left')

    # Initialize an empty DataFrame to store the selected events
    selected_events = pd.DataFrame(columns=all_defective.columns)

    # # Iterate over each date+hour+line group
    # for (date, hour, line), group in merged_df.groupby([date_column, hour_column,
    #                                                     line_column]):
    #     # Get the target sum for this date + hour
    #     target_sum = group['defect_count'].iloc[0]

    # Iterate over each hour group
    for (date, hour, line), group in merged_df.groupby([date_column,
                                                        hour_column,
                                                        line_column]):
        # Get the target count for this hour
        target_count = group[count_column].iloc[0]

        # Sort the group by the datetime column
        sorted_group = group.sort_values(by=datetime_column)

        # Select the first n events
        selected_n_events = sorted_group.head(target_count)

        # Append the selected events to the result DataFrame
        selected_events = pd.concat([selected_events, selected_n_events])

    # Drop the count column from the result DataFrame
    #selected_events = selected_events.drop(columns=[count_column])

    return selected_events


In [247]:
# Convert both columns to datetime for merging
targets_rectified['Date'] = pd.to_datetime(targets_rectified['Date'])

for module_id in defectives.keys():
  defectives[module_id]['Date'] = pd.to_datetime(defectives[module_id]['Date'])


In [232]:
#defectives['F1M7'].columns

In [248]:
# Select certain defectives and transform them to rectified events
for module_id in defectives.keys():
  rectified_events = pick_rectified_events(defectives[module_id], targets_rectified)
  rectified_events.drop(columns=['Hour', 'Rectified'], inplace=True)
  rectified_events['evn'] = 'rectified'
  rectified_events['defective_count'] = 0
  rectified_events['defect_count'] = 0
  rectified_events['rectified_count'] = 1
  defectives[module_id] = rectified_events

  selected_events = pd.concat([selected_events, selected_n_events])
  selected_events = pd.concat([selected_events, selected_n_events])


In [250]:
# Function to increment a timestamp by 1-5 seconds randomly
def increment_timestamp(timestamp):
    random_seconds = random.randint(1, 5)
    return timestamp + timedelta(seconds=random_seconds)

In [251]:
# Apply the function to the 'timestamp' column and reorder the data columns
for module_id in defectives.keys():
  defectives[module_id]['evt_gmt'] = defectives[module_id]['evt_gmt'].apply(increment_timestamp)
  defectives[module_id].rename(columns={'Date': 'factory_date',
                                        'Line': 'module_id'}, inplace=True)
  #defectives[module_id]['stp'] = defectives[module_id]['stp'] + '-' + station_id[module_id]
  defectives[module_id] = defectives[module_id].reindex(columns=rectified.columns)


In [252]:
synthetic_rectified = pd.concat(defectives.values(), ignore_index=True)

In [253]:
#synthetic_rectified.to_csv('synthetic_rectified_regulated.csv', index=False)

**REJECT**

In [266]:
# Generate a vey large amount of timestamps(We want to make sure we generate
# more datapoints than the constrained amount so that we can randomly drop
# certain instances)

rejects = {}

for module_id in module_ids:
  ts_df = pd.DataFrame()
  date_list = dates[module_id]
  starts_list = starts[module_id]
  ends_list= ends[module_id]
  num_increments_list = num_increments_dict[module_id]
  for i in range(len(date_list)):
    ts_df= pd.concat([ts_df, generate_random_timestamps(starts_list[i],
                                                        ends_list[i],
                                                        min_increment,
                                                        max_increment,
                                    num_increments_list[i])])
  ts_df['module_id'] = module_id
  ts_df.reset_index(drop=True, inplace=True)
  rejects[module_id] = ts_df



rejects['F1M7'].head()

Unnamed: 0,ts,module_id
0,2024-07-26 02:00:00,F1M7
1,2024-07-26 02:00:02,F1M7
2,2024-07-26 02:00:06,F1M7
3,2024-07-26 02:00:10,F1M7
4,2024-07-26 02:00:14,F1M7


In [267]:
# Import the pretrained synthesizer for Reject
reject_synthesizer = CTGANSynthesizer.load(
    filepath='/content/drive/My Drive/plan_data/reject_synthesizer.pkl'
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [268]:
syn_rejects ={}
for module_id in rejects.keys():
  ts_rejects = rejects[module_id]
  synthetic_rejects = reject_synthesizer.sample(num_rows=len(ts_rejects))
  synthetic_rejects['evt_gmt'] = ts_rejects['ts']
  synthetic_rejects['evt_gmt'] = pd.to_datetime(synthetic_rejects['evt_gmt'])
  synthetic_rejects['factory_date'] = synthetic_rejects['evt_gmt'].apply(get_factory_date)
  synthetic_rejects['module_id'] = module_id
  synthetic_rejects["gid"]= [generate_random_string() for _ in range(len(synthetic_rejects))]
  synthetic_rejects['factory_time'] = synthetic_rejects['evt_gmt'].apply(find_working_hour, args=(hour_offset,))
  syn_rejects[module_id] = synthetic_rejects

Sampling rows: 100%|██████████| 72928/72928 [00:09<00:00, 7753.01it/s]
Sampling rows: 100%|██████████| 72448/72448 [00:10<00:00, 6978.94it/s]


In [269]:
# Target for rejects
targets_reject = get_relevant_target(["Date", "Hour", "Line", "Reject"])
targets_reject

Unnamed: 0,Date,Hour,Line,Reject
0,2024-07-26,1,F1M7,0
1,2024-07-26,2,F1M7,2
2,2024-07-26,3,F1M7,2
3,2024-07-26,4,F1M7,1
4,2024-07-26,5,F1M7,2
...,...,...,...,...
149,2024-08-01,7,F1M8,0
150,2024-08-01,8,F1M8,1
151,2024-08-01,9,F1M8,0
152,2024-08-01,10,F1M8,0


In [270]:
# Perform a backward-looking merge
for module_id in syn_rejects.keys():
  synthetic_reject = syn_rejects[module_id]
  synthetic_reject = synthetic_reject.sort_values('factory_time')
  synthetic_reject = pd.merge_asof(synthetic_reject, schedule,
                                   left_on='factory_time',
                                   right_on='seconds_of_day',
                                   direction='backward')
  synthetic_reject.drop(columns=['factory_time', 'start_time',
                                 'end_time', 'seconds_of_day'], inplace=True)
  synthetic_reject = synthetic_reject.rename(columns={'hour': 'Hour',
                                                      'factory_date': 'Date',
                                                      'module_id' :'Line'})
  syn_rejects[module_id] = synthetic_reject


In [274]:
# Adjust the reject count to tally with the given target
for module_id in syn_rejects.keys():
  synthetic_reject = syn_rejects[module_id]
  reject_adjusted = adjust_hourly_counts(synthetic_reject, targets_reject,
                                          "Reject")
  reject_adjusted.rename(columns={'Date': 'factory_date', 'Line': 'module_id'},
                         inplace=True)
  reject_adjusted =  reject_adjusted.reindex(columns=reject.columns)
  syn_rejects[module_id] = reject_adjusted


In [275]:
# edit bctx and values as needed
org_columns = [ "tenant_id", "tenant","cluster_id", "cluster", "factory_id",
               "factory","unit_id", "unit", "building_id","building",
                "department_id","department","floor_id", "floor", "zone_id",
                "zone", "section_id","section", "module_id","module", "stp",  ]

bctx_columns = ["plan_id", "shift_id", "shift","buyer_id","buyer","vendor_id",
                "vendor", "po_id", "po", "so_id","so","li_id","li","fabric_type_id",
                "fabric_type","product_category_id", "product_category",
                "product_type_id", "product_type", "style_id", "style",
                "color_id", "color"]



for module_id in syn_rejects.keys():
  synthetic_reject_regulated = syn_rejects[module_id]
  synthetic_reject_regulated = edit_org(synthetic_reject_regulated,
                                         org_columns,module_id)
  synthetic_reject_regulated = edit_bctx(synthetic_reject_regulated,
                                          bctx_columns,module_id)
  synthetic_reject_regulated['stp'] = synthetic_reject_regulated['stp'] + '-' + station_id[module_id]
  synthetic_reject_regulated['station_id'] = station_id[module_id]
  edit_value(synthetic_reject_regulated, "reject_count", 1)
  syn_rejects[module_id] = synthetic_reject_regulated





In [276]:
synthetic_reject_regulated = pd.concat(syn_rejects.values(), ignore_index=True)

In [277]:
#synthetic_reject_regulated.to_csv('synthetic_reject_regulated.csv', index=False)

**MANPOWER**

Generate categorical data columns using for example, ftt events and generate the numbers directly using schedule.

In [386]:
schedule

Unnamed: 0,hour,start_time,end_time,seconds_of_day
0,1,07:30:00,08:30:00,27000
1,2,08:30:00,09:30:00,30600
2,3,09:30:00,10:30:00,34200
3,4,10:30:00,11:30:00,37800
4,5,11:30:00,13:00:00,41400
5,6,13:00:00,14:00:00,46800
6,7,14:00:00,15:00:00,50400
7,8,15:00:00,16:00:00,54000
8,9,16:00:00,17:00:00,57600
9,10,17:00:00,18:00:00,61200


In [417]:
def expand_schedule_with_dates(schedule_df, date_list):
    """
    Repeat the schedule DataFrame for each date in the date_list and add the relevant date.

    :param schedule_df: The schedule DataFrame with 11 hours.
    :param date_list: List of dates to repeat the schedule for.
    :return: A DataFrame with the schedule repeated for each date, with the date added to each row.
    """
    # Create an empty DataFrame to store the results
    expanded_df = pd.DataFrame()

    # Repeat the schedule DataFrame for each date
    for date in date_list:
        temp_df = schedule_df.copy()  # Copy the schedule DataFrame
        temp_df['Date'] = date  # Add the date column
        expanded_df = pd.concat([expanded_df, temp_df], ignore_index=True)

    return expanded_df

In [418]:
# Create a tweaked schedule per module
new_schedules = {}
for module_id in dates.keys():
  dates_of_module = dates[module_id]
  new_schedule = expand_schedule_with_dates(schedule, dates_of_module)
  new_schedule['module_id'] = module_id
  new_schedules[module_id] = new_schedule


In [419]:
# Combine the new schedules to one dataframe
new_schedule = pd.concat(new_schedules.values(), ignore_index=True)
#new_schedule

In [420]:
# Just using synthetic_ftt_regulated dataframe to extract the underlying org
manpower_cat = synthetic_ftt_regulated.head(len(new_schedule))
manpower_cat.drop(columns=['plan_id', 'evn', 'ftt_count', 'defective_count', 'defect_count',
                           'reject_count', 'rectified_count', 'input_count',
                           'rw_defective_count', 'buyer_id','buyer','vendor_id',
                           'vendor',	'po_id'	,'po',	'so_id',	'so',	'li_id'	,
                           'li',	'fabric_type_id',	'fabric_type',	'product_category_id',
                           'product_category',	'product_type_id',	'product_type',
                           'style_id',	'style',	'color_id',	'color'	,'size_id',	'size'], inplace=True)


In [421]:
# Iterate over each datetime object in the Series and format it

start_time_str = [(dt.strftime('%H:%M:%S')) for dt in new_schedule["start_time"]]
ts_manpower = pd.DataFrame()
ts_manpower['start_time'] = start_time_str
ts_manpower['module_id'] = new_schedule['module_id']
#ts_manpower





In [422]:
# Generate timestamps for manpower eventsand tweak major columns for a future merge
manpower_cat['evt_gmt'] = pd.to_datetime(new_schedule['Date'].astype(str) + ' ' +ts_manpower['start_time'] )
manpower_cat['module_id'] = new_schedule['module_id']
manpower_cat['factory_date']= manpower_cat['evt_gmt'].apply(get_factory_date)
manpower_cat['Hour'] = new_schedule['hour']
#manpower_cat

In [423]:
# Target for defectives (This example target does not have ironman and qc)
targets_manpower = get_relevant_target(["Date", "Hour","Line", "Operator", "Helper", "Man power"
                                        #"Iron man", "QC"
                                        ])
targets_manpower

Unnamed: 0,Date,Hour,Line,Operator,Helper,Man power
0,2024-07-26,1,F1M7,23,5,28
1,2024-07-26,2,F1M7,23,5,28
2,2024-07-26,3,F1M7,23,5,28
3,2024-07-26,4,F1M7,23,5,28
4,2024-07-26,5,F1M7,23,5,28
...,...,...,...,...,...,...
149,2024-08-01,7,F1M8,14,6,20
150,2024-08-01,8,F1M8,14,6,20
151,2024-08-01,9,F1M8,14,6,20
152,2024-08-01,10,F1M8,14,6,20


In [424]:
# lookup the hourly values from given target
manpower_cat.rename(columns={'factory_date': 'Date', 'module_id': 'Line'}, inplace=True)
manpower_cat['Date'] = pd.to_datetime(manpower_cat['Date'])

# Ensure 'Date' in targets_manpower is also datetime
targets_manpower['Date'] = pd.to_datetime(targets_manpower['Date'])


synthetic_manpower = manpower_cat.merge(targets_manpower, on=["Date", 'Hour', 'Line'], how='left')
synthetic_manpower.drop(columns=['Hour'], inplace=True)
synthetic_manpower.rename(columns={'Operator': 'operator', 'Helper': 'helper', 'Man power': 'manpower'}, inplace=True)
synthetic_manpower

Unnamed: 0,evt_gmt,Date,shift_id,shift,tenant_id,tenant,cluster_id,cluster,factory_id,factory,...,station,hsk,buf,apt,stp,stpv,eid,operator,helper,manpower
0,2024-07-26 07:30:00,2024-07-26,1,Morning,,,,,ID_PAS,PT.Purnama Asih Sur,...,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,3a9f33db-d51d-4888-bd8d-59191947272e,23,5,28
1,2024-07-26 08:30:00,2024-07-26,1,Morning,,,,,ID_PAS,PT.Purnama Asih Sur,...,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,e8fe3a5c-838f-42a9-822e-81d89271467f,23,5,28
2,2024-07-26 09:30:00,2024-07-26,1,Morning,,,,,ID_PAS,PT.Purnama Asih Sur,...,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,e77e07fd-a9e0-48da-82a1-21fdf7505348,23,5,28
3,2024-07-26 10:30:00,2024-07-26,1,Morning,,,,,ID_PAS,PT.Purnama Asih Sur,...,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,ce23fc96-735e-4576-82f6-fae3b5ef85d6,23,5,28
4,2024-07-26 11:30:00,2024-07-26,1,Morning,,,,,ID_PAS,PT.Purnama Asih Sur,...,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,f3fbf1cb-0fad-4bbe-baa9-95242fb67551,23,5,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,2024-08-01 14:00:00,2024-08-01,1,Morning,,,,,ID_PAS,PT.Purnama Asih Sur,...,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,2e955041-92fb-4f4c-ad89-6395dcae8e57,14,6,20
150,2024-08-01 15:00:00,2024-08-01,1,Morning,,,,,ID_PAS,PT.Purnama Asih Sur,...,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,0db09921-3428-4287-8114-d1a9515287f9,14,6,20
151,2024-08-01 16:00:00,2024-08-01,1,Morning,,,,,ID_PAS,PT.Purnama Asih Sur,...,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,c0dfa83e-3240-435b-b980-eec3afb58c61,14,6,20
152,2024-08-01 17:00:00,2024-08-01,1,Morning,,,,,ID_PAS,PT.Purnama Asih Sur,...,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,fcbe9ca1-0699-45a4-b612-79ce8b1ea2c4,14,6,20


In [425]:
# Add missing column and values if not present in target
if "iron man" not in synthetic_manpower.columns:
    synthetic_manpower["ironman"] = 0
if "qc" not in synthetic_manpower.columns:
    synthetic_manpower["qc"] = 0

In [426]:
# Reformatting columns
synthetic_manpower.rename(columns={'Date': 'factory_date',
                                   'Line': 'module_id'}, inplace=True)

In [427]:
# edit bctx and values as needed
editable_bctx = ['shift_id', 'shift']
editable_org = ['tenant_id', 'tenant',
       'cluster_id', 'cluster', 'factory_id', 'factory', 'building_id',
       'building', 'unit_id', 'unit', 'department_id', 'department', 'floor_id', 'floor',
       'zone_id', 'zone', 'section_id', 'section', 'module_id', 'module',"stp"
       ]

syn_manpower = {}
# Breaking up the synthetic_manpower dataframe into modulewise cases and
# editing bctx and org
# Also adjust the time to gmt
for module_id in synthetic_manpower.module_id.unique():
  modulewise_mp = synthetic_manpower[synthetic_manpower['module_id'] == module_id]
  modulewise_mp.reset_index(drop=True, inplace=True)
  modulewise_mp = edit_bctx(modulewise_mp, editable_bctx, module_id)
  modulewise_mp = edit_org(modulewise_mp, editable_org, module_id)
  modulewise_mp['stp'] = modulewise_mp['stp'] +  '-' + station_id[module_id]
  modulewise_mp['station_id'] = station_id[module_id]
  modulewise_mp['evt_gmt'] = pd.to_datetime(modulewise_mp['evt_gmt'])
  modulewise_mp['evt_gmt'] = modulewise_mp['evt_gmt'] - pd.to_timedelta(
      hour_offset, unit='h')
  modulewise_mp = modulewise_mp.reindex(columns=manpower.columns)
  syn_manpower[module_id] = modulewise_mp

In [428]:
synthetic_manpower= pd.concat(syn_manpower.values(), ignore_index=True)

In [429]:
#synthetic_manpower.to_csv('synthetic_manpower_regulated.csv', index=False)

**Now we are going to generate an eid for all synthetic data**

In [430]:
def add_unique_event_ids_and_save(*dfs, filenames):
    """
    Add unique event_id column to multiple DataFrames and save each to a CSV file.

    Parameters:
    *dfs : list of pd.DataFrame
        List of DataFrames to which unique event_ids will be added.
    filenames : list of str
        List of filenames corresponding to each DataFrame for saving as CSV.

    Returns:
    list of pd.DataFrame
        List of DataFrames with unique event_ids added.
    """

    # Calculate the total number of rows across all DataFrames
    total_rows = sum(len(df) for df in dfs)

    # Generate unique event_ids
    event_ids = [str(uuid.uuid4()) for _ in range(total_rows)]

    # Initialize the start index for slicing event_ids
    start_idx = 0

    # Function to move the last column to the first position
    def move_last_column_to_first(df):
        cols = df.columns.tolist()
        new_cols = [cols[-1]] + cols[:-1]
        return df[new_cols]

    # Assign unique event_ids to each DataFrame and save to CSV
    for df, filename in zip(dfs, filenames):
        end_idx = start_idx + len(df)
        df['eid'] = event_ids[start_idx:end_idx]
        start_idx = end_idx
        df = move_last_column_to_first(df)
        df.to_csv(filename, index=False)





# Filenames for saving the CSVs
filenames = ['regulated_synthetic_ftt.csv', 'regulated_synthetic_defective.csv',
             'regulated_synthetic_defect.csv', 'regulated_synthetic_rectified.csv',
             'regulated_synthetic_reject.csv','regulated_synthetic_manpower.csv']



# Add unique event_ids to DataFrames and save to CSVs
add_unique_event_ids_and_save(synthetic_ftt_regulated,synthetic_defective_regulated,
                              synthetic_defects,synthetic_rectified,
                              synthetic_reject_regulated,synthetic_manpower
                              ,filenames=filenames)



In [431]:
# Adding plan to a seperate csv file
plan.to_csv('plan.csv', index=False)

In [432]:
# Function to bring the added eid to the very begining
def move_last_column_to_front(df):
    """
    Moves the last column of the DataFrame to the front.

    Parameters:
    df (pd.DataFrame): The DataFrame to be processed.

    Returns:
    pd.DataFrame: The DataFrame with the last column moved to the front.
    """
    # Get the last column name
    last_column = df.columns[-1]

    # Reorder columns with the last column first
    new_order = [last_column] + list(df.columns[:-1])

    # Reindex the DataFrame
    df = df[new_order]

    return df


In [433]:
# Drop gid if present and combine ftt,reject,defective and rectified as production data


def generate_production_dataset(df_list, column_name):
    """
    Checks if a particular column exists in each DataFrame in the list,
    drops that column if it exists, and then merges all DataFrames vertically.

    Parameters:
    df_list (list of pd.DataFrame): List of DataFrames to be processed.
    column_name (str): The name of the column to check and drop.

    Returns:
    pd.DataFrame: The merged DataFrame.
    """
    processed_dfs = []

    for df in df_list:
        if column_name in df.columns:
            df = df.drop(columns=[column_name])
        processed_dfs.append(df)

    # Concatenate all DataFrames vertically
    merged_df = pd.concat(processed_dfs, ignore_index=True)
    merged_df = move_last_column_to_front(merged_df)

    merged_df.to_csv('production.csv', index=False)



In [434]:
# Generate production dataset

df_list = [synthetic_ftt_regulated,synthetic_defective_regulated,
           synthetic_rectified,synthetic_reject_regulated]
column_name = 'gid'
generate_production_dataset(df_list, column_name)

**REJECT REASON**

In [435]:
synthetic_reject_regulated.columns

Index(['evt_gmt', 'factory_date', 'plan_id', 'shift_id', 'shift', 'tenant_id',
       'tenant', 'cluster_id', 'cluster', 'factory_id', 'factory',
       'building_id', 'building', 'unit_id', 'unit', 'department_id',
       'department', 'floor_id', 'floor', 'zone_id', 'zone', 'section_id',
       'section', 'module_id', 'module', 'station_id', 'station', 'hsk', 'buf',
       'apt', 'stp', 'stpv', 'buyer_id', 'buyer', 'vendor_id', 'vendor',
       'po_id', 'po', 'so_id', 'so', 'li_id', 'li', 'fabric_type_id',
       'fabric_type', 'product_category_id', 'product_category',
       'product_type_id', 'product_type', 'style_id', 'style', 'color_id',
       'color', 'size_id', 'size', 'gid', 'evn', 'ftt_count',
       'defective_count', 'defect_count', 'reject_count', 'rectified_count',
       'input_count', 'rw_defective_count', 'eid'],
      dtype='object')

In [436]:
rejects_for_rr = synthetic_reject_regulated.copy()
rejects_for_rr.drop(columns=['evn', 'ftt_count','defective_count',
                             'defect_count', 'reject_count', 'rectified_count',
                             'input_count', 'rw_defective_count'], inplace=True)
rejects_for_rr.head()

Unnamed: 0,evt_gmt,factory_date,plan_id,shift_id,shift,tenant_id,tenant,cluster_id,cluster,factory_id,...,product_type_id,product_type,style_id,style,color_id,color,size_id,size,gid,eid
0,2024-07-26 03:23:13,2024-07-26,12f0ffe7-c509-49b5-b203-83be908b2881,1,Morning,,,,,ID_PAS,...,1,Top,1,3667M,1,Print,1,M,J4U460W4,7130a048-2ce4-4523-946f-5b365152ece6
1,2024-07-26 03:56:27,2024-07-26,12f0ffe7-c509-49b5-b203-83be908b2881,1,Morning,,,,,ID_PAS,...,1,Top,1,3667M,1,Print,1,XL,GGREPUO0,a699e226-829e-4a14-9ec4-a72844b4e927
2,2024-07-26 04:34:44,2024-07-26,12f0ffe7-c509-49b5-b203-83be908b2881,1,Morning,,,,,ID_PAS,...,1,Top,1,3667M,1,Print,1,L,GPXWGF5D,a01209a5-65b1-4791-aa67-a49192a35bfd
3,2024-07-26 04:48:20,2024-07-26,12f0ffe7-c509-49b5-b203-83be908b2881,1,Morning,,,,,ID_PAS,...,1,Top,1,3667M,1,Print,1,XL,66AKPFH8,393ebd18-055c-42cf-81a9-b429dce0bb77
4,2024-07-26 05:40:01,2024-07-26,12f0ffe7-c509-49b5-b203-83be908b2881,1,Morning,,,,,ID_PAS,...,1,Top,1,3667M,1,Print,1,L,GPG1Y4C6,754c66aa-422d-4a72-a0a4-2b328e91d815


In [437]:
# Import the pretrained synthesizer for reject reasons
rr_synthesizer = CTGANSynthesizer.load(
    filepath='/content/drive/My Drive/plan_data/rr_synthesizer.pkl'
)

In [438]:
# Generate a set of synthetic reject reasons
synthetic_rr = rr_synthesizer.sample(num_rows=len(rejects_for_rr))


Sampling rows: 100%|██████████| 224/224 [00:00<00:00, 4379.92it/s]


In [439]:
synthetic_rr_regulated = pd.concat([rejects_for_rr, synthetic_rr], axis=1)
synthetic_rr_regulated.head()

Unnamed: 0,evt_gmt,factory_date,plan_id,shift_id,shift,tenant_id,tenant,cluster_id,cluster,factory_id,...,color_id,color,size_id,size,gid,eid,sign,reject_reason_id,reject_reason_code,reject_reason
0,2024-07-26 03:23:13,2024-07-26,12f0ffe7-c509-49b5-b203-83be908b2881,1,Morning,,,,,ID_PAS,...,1,Print,1,M,J4U460W4,7130a048-2ce4-4523-946f-5b365152ece6,1,5,304,Fabric Defect
1,2024-07-26 03:56:27,2024-07-26,12f0ffe7-c509-49b5-b203-83be908b2881,1,Morning,,,,,ID_PAS,...,1,Print,1,XL,GGREPUO0,a699e226-829e-4a14-9ec4-a72844b4e927,1,5,304,Fabric Defect
2,2024-07-26 04:34:44,2024-07-26,12f0ffe7-c509-49b5-b203-83be908b2881,1,Morning,,,,,ID_PAS,...,1,Print,1,L,GPXWGF5D,a01209a5-65b1-4791-aa67-a49192a35bfd,1,2,301,Uneven
3,2024-07-26 04:48:20,2024-07-26,12f0ffe7-c509-49b5-b203-83be908b2881,1,Morning,,,,,ID_PAS,...,1,Print,1,XL,66AKPFH8,393ebd18-055c-42cf-81a9-b429dce0bb77,1,4,303,Needle Defect
4,2024-07-26 05:40:01,2024-07-26,12f0ffe7-c509-49b5-b203-83be908b2881,1,Morning,,,,,ID_PAS,...,1,Print,1,L,GPG1Y4C6,754c66aa-422d-4a72-a0a4-2b328e91d815,1,9,308,Other


In [440]:
eid_column = synthetic_rr_regulated['eid']

In [441]:
synthetic_rr_regulated = synthetic_rr_regulated.reindex(columns=reject_reason.columns)
synthetic_rr_regulated = pd.concat([eid_column, synthetic_rr_regulated], axis=1)
synthetic_rr_regulated.head()

Unnamed: 0,eid,evt_gmt,factory_date,shift_id,shift,tenant_id,tenant,cluster_id,cluster,factory_id,...,style,color_id,color,size_id,size,sign,gid,reject_reason_id,reject_reason_code,reject_reason
0,7130a048-2ce4-4523-946f-5b365152ece6,2024-07-26 03:23:13,2024-07-26,1,Morning,,,,,ID_PAS,...,3667M,1,Print,1,M,1,J4U460W4,5,304,Fabric Defect
1,a699e226-829e-4a14-9ec4-a72844b4e927,2024-07-26 03:56:27,2024-07-26,1,Morning,,,,,ID_PAS,...,3667M,1,Print,1,XL,1,GGREPUO0,5,304,Fabric Defect
2,a01209a5-65b1-4791-aa67-a49192a35bfd,2024-07-26 04:34:44,2024-07-26,1,Morning,,,,,ID_PAS,...,3667M,1,Print,1,L,1,GPXWGF5D,2,301,Uneven
3,393ebd18-055c-42cf-81a9-b429dce0bb77,2024-07-26 04:48:20,2024-07-26,1,Morning,,,,,ID_PAS,...,3667M,1,Print,1,XL,1,66AKPFH8,4,303,Needle Defect
4,754c66aa-422d-4a72-a0a4-2b328e91d815,2024-07-26 05:40:01,2024-07-26,1,Morning,,,,,ID_PAS,...,3667M,1,Print,1,L,1,GPG1Y4C6,9,308,Other


In [442]:
synthetic_rr_regulated.to_csv('regulated_synthetic_rr.csv', index=False)

**CHANGE**

In [460]:
# Load synthetic manpower dataset
synthetic_manpower = pd.read_csv('regulated_synthetic_manpower.csv')

In [461]:
synthetic_manpower['change_type'] = 'manpower'
synthetic_manpower['scope'] = 'production'
synthetic_manpower['change'] = synthetic_manpower.apply(lambda row: {
        "ironman": str(row['ironman']),
        "helper": str(row['helper']),
        "operator": str(row['operator'])
}, axis=1)

In [462]:
synthetic_manpower.drop(columns=['manpower', 'operator', 'helper', 'ironman',
       'qc'], inplace=True)

In [463]:
synthetic_manpower.head()

Unnamed: 0,eid,evt_gmt,factory_date,shift_id,shift,tenant_id,tenant,cluster_id,cluster,factory_id,...,station_id,station,hsk,buf,apt,stp,stpv,change_type,scope,change
0,322d44e4-4345-4c68-a5cb-522af35e54f0,2024-07-26 02:00:00,2024-07-26,1,Morning,,,,,ID_PAS,...,F1M7ST02,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,manpower,production,"{'ironman': '0', 'helper': '5', 'operator': '23'}"
1,a68b9c58-d2c1-40d2-be83-3520ecd8f1a6,2024-07-26 03:00:00,2024-07-26,1,Morning,,,,,ID_PAS,...,F1M7ST02,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,manpower,production,"{'ironman': '0', 'helper': '5', 'operator': '23'}"
2,a312c4f5-e9e6-461d-9bbe-aecc97ede390,2024-07-26 04:00:00,2024-07-26,1,Morning,,,,,ID_PAS,...,F1M7ST02,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,manpower,production,"{'ironman': '0', 'helper': '5', 'operator': '23'}"
3,0ef4f34b-eafb-449b-95ca-c64afb5d992f,2024-07-26 05:00:00,2024-07-26,1,Morning,,,,,ID_PAS,...,F1M7ST02,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,manpower,production,"{'ironman': '0', 'helper': '5', 'operator': '23'}"
4,fe6613c6-6928-441f-a0a8-f22a6498646f,2024-07-26 06:00:00,2024-07-26,1,Morning,,,,,ID_PAS,...,F1M7ST02,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,manpower,production,"{'ironman': '0', 'helper': '5', 'operator': '23'}"


In [464]:
# Copy the same skeleton as manpower change
synthetic_bctx_temp = synthetic_manpower.copy()

In [465]:
# Convert factort_date to a datetime for a plan merge
synthetic_bctx_temp['factory_date'] = pd.to_datetime(synthetic_bctx_temp['factory_date'])

In [466]:
# Merge plan and change dataframes on plan date and get the bctx in the required
# nested format
synthetic_bctx= pd.merge(synthetic_bctx_temp, plan[['date','module_id','buyer','po','style',
                                                    'color']], how='left',
                         left_on=['factory_date', 'module_id'], right_on=['date','module_id'])
synthetic_bctx['change_type'] = 'running_order'
synthetic_bctx['scope'] = 'application'
synthetic_bctx['change'] = synthetic_bctx.apply(lambda row: {
    "buyer": row['buyer'],
    "po": row['po'],
    "style": row['style'],
    "color": row['color']
}, axis=1)
synthetic_bctx.drop(columns=['date','buyer','po','style','color'], inplace=True)
synthetic_bctx['factory_date'] = synthetic_bctx['factory_date'].dt.date
synthetic_bctx.head(25)

Unnamed: 0,eid,evt_gmt,factory_date,shift_id,shift,tenant_id,tenant,cluster_id,cluster,factory_id,...,station_id,station,hsk,buf,apt,stp,stpv,change_type,scope,change
0,322d44e4-4345-4c68-a5cb-522af35e54f0,2024-07-26 02:00:00,2024-07-26,1,Morning,,,,,ID_PAS,...,F1M7ST02,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,running_order,application,"{'buyer': 'M&S', 'po': 2011242746, 'style': '3..."
1,a68b9c58-d2c1-40d2-be83-3520ecd8f1a6,2024-07-26 03:00:00,2024-07-26,1,Morning,,,,,ID_PAS,...,F1M7ST02,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,running_order,application,"{'buyer': 'M&S', 'po': 2011242746, 'style': '3..."
2,a312c4f5-e9e6-461d-9bbe-aecc97ede390,2024-07-26 04:00:00,2024-07-26,1,Morning,,,,,ID_PAS,...,F1M7ST02,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,running_order,application,"{'buyer': 'M&S', 'po': 2011242746, 'style': '3..."
3,0ef4f34b-eafb-449b-95ca-c64afb5d992f,2024-07-26 05:00:00,2024-07-26,1,Morning,,,,,ID_PAS,...,F1M7ST02,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,running_order,application,"{'buyer': 'M&S', 'po': 2011242746, 'style': '3..."
4,fe6613c6-6928-441f-a0a8-f22a6498646f,2024-07-26 06:00:00,2024-07-26,1,Morning,,,,,ID_PAS,...,F1M7ST02,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,running_order,application,"{'buyer': 'M&S', 'po': 2011242746, 'style': '3..."
5,58108adb-52d8-4dc0-951a-b91143e837e9,2024-07-26 07:30:00,2024-07-26,1,Morning,,,,,ID_PAS,...,F1M7ST02,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,running_order,application,"{'buyer': 'M&S', 'po': 2011242746, 'style': '3..."
6,1d2e0a66-b5de-4bf1-8553-7b1aa5ad452a,2024-07-26 08:30:00,2024-07-26,1,Morning,,,,,ID_PAS,...,F1M7ST02,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,running_order,application,"{'buyer': 'M&S', 'po': 2011242746, 'style': '3..."
7,ae434296-0600-4ca7-a146-8417de12df78,2024-07-26 09:30:00,2024-07-26,1,Morning,,,,,ID_PAS,...,F1M7ST02,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,running_order,application,"{'buyer': 'M&S', 'po': 2011242746, 'style': '3..."
8,db46ad84-8428-44b3-ba1a-fd138f5fb264,2024-07-26 10:30:00,2024-07-26,1,Morning,,,,,ID_PAS,...,F1M7ST02,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,running_order,application,"{'buyer': 'M&S', 'po': 2011242746, 'style': '3..."
9,764d58e5-28b1-468e-9b4d-4d5b66851899,2024-07-26 11:30:00,2024-07-26,1,Morning,,,,,ID_PAS,...,F1M7ST02,Station 2,primary,out,endline_qc,ID_PAS-SW-BD1-F1-F1M7-F1M7ST02,1.0.0,running_order,application,"{'buyer': 'M&S', 'po': 2011242746, 'style': '3..."


In [467]:
regulated_synthetic_change = pd.concat([synthetic_manpower, synthetic_bctx], axis=0)
regulated_synthetic_change.to_csv('regulated_synthetic_change.csv', index=False)

In [468]:
len(regulated_synthetic_change)

308