# 데이터 불러오기

In [3]:
import os
import pandas as pd

# Root directory; every immediate sub-directory is scanned for CSV files.
folder_path = './ISCX-VPN-NonVPN_data/'
# Both dicts are keyed by sub-folder name and hold parallel lists:
# all_df[k][i] is the frame read from the file whose short name is all_source[k][i].
all_df = {}
all_source = {}

# os.listdir() returns entries in arbitrary order; sorting keeps the load order
# (and every report / export derived from it) identical from run to run.
for sub_folder in sorted(os.listdir(folder_path)):
  full_path = os.path.join(folder_path, sub_folder)
  if not os.path.isdir(full_path):
    continue

  all_folder_df = []
  all_folder_source = []
  for filename in sorted(os.listdir(full_path)):
    if not filename.endswith('.csv'):
      continue
    file_path = os.path.join(full_path, filename)
    # Short display name: strip the common dataset prefix and the extension.
    extracted = filename.replace('TimeBasedFeatures-Dataset-', '').replace('.csv', '')
    try:
      df = pd.read_csv(file_path)
    except Exception as e:
      # Best effort: report the unreadable file and keep loading the rest.
      print(f"⚠️ 에러 발생: {sub_folder}/{extracted} → {e}")
      continue
    # Appended together, only after a successful read, so the two lists stay aligned.
    all_folder_df.append(df)
    all_folder_source.append(extracted)

  all_df[sub_folder] = all_folder_df
  all_source[sub_folder] = all_folder_source

# Report the (rows, columns) shape of every loaded frame, grouped by sub-folder.
for sub_folder, loaded_frames in all_df.items():
  print(f"=========={sub_folder}==========")
  for df, source in zip(loaded_frames, all_source[sub_folder]):
    row_count, col_count = df.shape
    shape = f"({row_count:<5}, {col_count})"
    print(f"{source:<12} : {shape}")
print("====================================")

Scenario B-ARFF_60s : (15515, 24)
Scenario B-ARFF_30s-AllinOne : (14651, 24)
Scenario B-ARFF_120s : (10782, 24)
Scenario B-ARFF_60s-AllinOne : (15515, 24)
Scenario B-ARFF_120s-AllinOne : (10782, 24)
Scenario B-ARFF_30s : (14651, 24)
Scenario B-ARFF_15s-AllinOne : (18758, 24)
Scenario B-ARFF_15s : (18758, 24)
Scenario A2-ARFF_30s-NO-VPN : (6917 , 24)
Scenario A2-ARFF_30s-VPN : (7734 , 24)
Scenario A2-ARFF_15s-VPN : (9793 , 24)
Scenario A2-ARFF_120s-NO-VPN : (5151 , 24)
Scenario A2-ARFF_120s-VPN : (5631 , 24)
Scenario A2-ARFF_60s-VPN : (6935 , 24)
Scenario A2-ARFF_15s-NO-VPN : (8965 , 24)
Scenario A2-ARFF_60s-NO-VPN : (8580 , 24)
Scenario A1-ARFF_60s-VPN : (15515, 24)
Scenario A1-ARFF_15s-VPN : (18758, 24)
Scenario A1-ARFF_120s-VPN : (10782, 24)
Scenario A1-ARFF_30s-VPN : (14651, 24)


In [4]:
# List the non-numeric columns of every loaded frame, grouped by sub-folder.
for sub_folder, loaded_frames in all_df.items():
    print(f"=========={sub_folder}==========")
    for df, source in zip(loaded_frames, all_source[sub_folder]):
        non_numeric = df.select_dtypes(exclude='number')
        non_numeric_cols = list(non_numeric.columns)
        print(non_numeric_cols)

['class1']
['class1']
['class1']
['class1']
['class1']
['class1']
['class1']
['class1']
['class1']
['class1']
['class1']
['class1']
['class1']
['class1']
['class1']
['class1']
['class1']
['class1']
['class1']
['class1']


# 데이터 매핑 함수 선언
## (범주형 데이터 숫자형으로 변환)

## 중복 데이터 처리

In [5]:
def drop_duplicates(df_work: pd.DataFrame):
  """Remove fully duplicated rows from ``df_work`` in place and report the counts.

  The first occurrence of each duplicated row is kept and the surviving rows
  keep their original index labels. Nothing is returned; one summary line
  (rows before - rows removed -> rows after) is printed.
  """
  rows_before = df_work.shape[0]
  df_work.drop_duplicates(inplace=True)
  rows_after = df_work.shape[0]

  removed = rows_before - rows_after
  print(f"중복 데이터 처리 : {rows_before} - {removed} -> {rows_after}")

## 누락 데이터 처리

In [6]:
def processing_missing(df_work: pd.DataFrame, n_neighbors: int = 10):
  """Fill missing numeric values of ``df_work`` in place with KNN imputation.

  Only numeric columns are imputed; missing values in other columns are left
  alone and simply show up in the printed summary.

  The imputer is run only when at least one numeric column has both missing
  and observed values. Otherwise the frame is left untouched (dtypes
  included), which also covers empty frames and frames without numeric
  columns. Numeric columns that are missing everywhere are excluded from the
  imputation: KNNImputer drops such features from its output by default,
  which would make the result narrower than the column list written back.

  When imputation does run, the imputer's output is written back to every
  numeric column that took part in it.

  Args:
    df_work: Frame to modify in place.
    n_neighbors: Number of neighbours handed to ``KNNImputer``.

  Returns:
    None. Prints the per-column missing counts before and after.
  """
  start_missing = df_work.isnull().sum()
  start_missing = start_missing[start_missing > 0].to_dict()

  numeric_cols = df_work.select_dtypes(include='number').columns
  df_numeric = df_work[numeric_cols]

  # Per-column flags as plain boolean arrays (robust when there are no columns).
  has_gap = df_numeric.isnull().any().to_numpy(dtype=bool)
  has_value = df_numeric.notnull().any().to_numpy(dtype=bool)

  if (has_gap & has_value).any():
    # Imported lazily so that frames needing no imputation never require sklearn.
    from sklearn.impute import KNNImputer

    # Every column with at least one observed value takes part, complete
    # columns included: they still inform the neighbour distances.
    usable_cols = numeric_cols[has_value]
    df_usable = df_work[usable_cols]

    imputer = KNNImputer(n_neighbors=n_neighbors)
    df_imputed_numeric = pd.DataFrame(
      imputer.fit_transform(df_usable),
      columns=usable_cols,
      index=df_usable.index,
    )
    df_work[usable_cols] = df_imputed_numeric

  finish_missing = df_work.isnull().sum()
  finish_missing = finish_missing[finish_missing > 0].to_dict()
  print(f"누락 데이터 처리 후: {start_missing} -> {finish_missing}")

## 이상치 처리

In [7]:
def processing_outlier(df_work: pd.DataFrame, label_col: str = 'class1',
                       iqr_factor: float = 1.5):
    """Drop per-class IQR outliers from ``df_work`` in place.

    Rows are grouped by ``label_col``. Within each group a row is an outlier
    when any numeric column lies below ``Q1 - iqr_factor * IQR`` or above
    ``Q3 + iqr_factor * IQR``, with the quartiles computed on that group only.
    Missing numeric values never count as outliers. Rows whose label is
    missing form a group of their own instead of being discarded.

    Afterwards the surviving rows are grouped by class in order of first
    appearance (original order preserved inside each class) and the index is
    reset to 0..n-1.

    Args:
        df_work: Frame to modify in place; its index is discarded.
        label_col: Column holding the class label.
        iqr_factor: Multiplier applied to the IQR to build the fences.

    Returns:
        None. Prints rows before, rows removed and rows remaining.
    """
    first = len(df_work)
    numeric_cols = df_work.select_dtypes(include='number').columns

    # The index is reset at the end anyway; doing it first gives every row a
    # unique positional label, so duplicated labels cannot drop extra rows.
    df_work.reset_index(drop=True, inplace=True)
    labels = df_work[label_col]

    # Rank of each row's class by first appearance, used to regroup the rows.
    group_rank = pd.Series(0, index=df_work.index)
    outlier_rows = set()

    for rank, cls in enumerate(labels.unique()):
        # A missing label never compares equal to itself, so match it explicitly.
        in_class = labels.isna() if pd.isna(cls) else labels == cls
        group_rank[in_class] = rank

        values = df_work.loc[in_class, numeric_cols]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        margin = iqr_factor * (q3 - q1)

        # Compare every column against its own fences; flag a row on any hit.
        too_low = values.lt(q1 - margin, axis=1)
        too_high = values.gt(q3 + margin, axis=1)
        flagged = (too_low | too_high).any(axis=1)
        outlier_rows.update(flagged[flagged].index)

    # Remove the flagged rows from the caller's frame itself.
    df_work.drop(index=list(outlier_rows), inplace=True)

    # Regroup by class: label each row with its class rank and sort stably,
    # which keeps the original order inside every class.
    df_work.index = group_rank.loc[df_work.index].to_numpy()
    df_work.sort_index(kind='mergesort', inplace=True)
    df_work.reset_index(drop=True, inplace=True)

    print(f"전체 처리 후: {first} - {len(outlier_rows)} → {len(df_work)}")

# 데이터 전처리

In [10]:
# Data preprocessing: run every cleaning step on a copy of each loaded frame.
all_df_preprocess = {}
for sub_folder, raw_frames in all_df.items():
    print(f"=========={sub_folder}==========")
    all_folder_df_preprocess = []
    for df, source in zip(raw_frames, all_source[sub_folder]):
        # Work on a copy so the frames stored in all_df stay untouched.
        df_preprocess = df.copy()
        print("=====================================")
        print(f"{source} - Preprocess")
        # Each step modifies df_preprocess in place, in this order.
        for step in (drop_duplicates, processing_missing, processing_outlier):
            step(df_preprocess)
        all_folder_df_preprocess.append(df_preprocess)
    print("=====================================")
    all_df_preprocess[sub_folder] = all_folder_df_preprocess

Scenario B-ARFF_60s - Preprocess
중복 데이터 처리 : 15515 - 786 -> 14729
누락 데이터 처리 후: {} -> {}
전체 처리 후: 14729 - 8815 → 5914
Scenario B-ARFF_30s-AllinOne - Preprocess
중복 데이터 처리 : 14651 - 1027 -> 13624
누락 데이터 처리 후: {} -> {}
전체 처리 후: 13624 - 7810 → 5814
Scenario B-ARFF_120s - Preprocess
중복 데이터 처리 : 10782 - 1193 -> 9589
누락 데이터 처리 후: {} -> {}
전체 처리 후: 9589 - 6105 → 3484
Scenario B-ARFF_60s-AllinOne - Preprocess
중복 데이터 처리 : 15515 - 817 -> 14698
누락 데이터 처리 후: {} -> {}
전체 처리 후: 14698 - 8910 → 5788
Scenario B-ARFF_120s-AllinOne - Preprocess
중복 데이터 처리 : 10782 - 1212 -> 9570
누락 데이터 처리 후: {} -> {}
전체 처리 후: 9570 - 5960 → 3610
Scenario B-ARFF_30s - Preprocess
중복 데이터 처리 : 14651 - 1013 -> 13638
누락 데이터 처리 후: {} -> {}
전체 처리 후: 13638 - 8005 → 5633
Scenario B-ARFF_15s-AllinOne - Preprocess
중복 데이터 처리 : 18758 - 706 -> 18052
누락 데이터 처리 후: {} -> {}
전체 처리 후: 18052 - 10543 → 7509
Scenario B-ARFF_15s - Preprocess
중복 데이터 처리 : 18758 - 684 -> 18074
누락 데이터 처리 후: {} -> {}
전체 처리 후: 18074 - 9988 → 8086
Scenario A2-ARFF_30s-NO-V

## 데이터 전처리 후 shape 변화

In [11]:
# Show each frame's shape before and after preprocessing, grouped by sub-folder.
for sub_folder, processed_frames in all_df_preprocess.items():
  print(f"=========={sub_folder}==========")
  triples = zip(all_df[sub_folder], processed_frames, all_source[sub_folder])
  for df, df_preprocess, source in triples:
    rows_before, cols_before = df.shape
    rows_after, cols_after = df_preprocess.shape
    before = f"({rows_before:<5}, {cols_before})"
    after = f"({rows_after:<5}, {cols_after})"
    print(f"{source:<15} : {before} -> {after}")
print("====================================")

Scenario B-ARFF_60s : (15515, 24) -> (5914 , 24)
Scenario B-ARFF_30s-AllinOne : (14651, 24) -> (5814 , 24)
Scenario B-ARFF_120s : (10782, 24) -> (3484 , 24)
Scenario B-ARFF_60s-AllinOne : (15515, 24) -> (5788 , 24)
Scenario B-ARFF_120s-AllinOne : (10782, 24) -> (3610 , 24)
Scenario B-ARFF_30s : (14651, 24) -> (5633 , 24)
Scenario B-ARFF_15s-AllinOne : (18758, 24) -> (7509 , 24)
Scenario B-ARFF_15s : (18758, 24) -> (8086 , 24)
Scenario A2-ARFF_30s-NO-VPN : (6917 , 24) -> (2912 , 24)
Scenario A2-ARFF_30s-VPN : (7734 , 24) -> (3188 , 24)
Scenario A2-ARFF_15s-VPN : (9793 , 24) -> (4392 , 24)
Scenario A2-ARFF_120s-NO-VPN : (5151 , 24) -> (1801 , 24)
Scenario A2-ARFF_120s-VPN : (5631 , 24) -> (1881 , 24)
Scenario A2-ARFF_60s-VPN : (6935 , 24) -> (3047 , 24)
Scenario A2-ARFF_15s-NO-VPN : (8965 , 24) -> (4286 , 24)
Scenario A2-ARFF_60s-NO-VPN : (8580 , 24) -> (3325 , 24)
Scenario A1-ARFF_60s-VPN : (15515, 24) -> (2545 , 24)
Scenario A1-ARFF_15s-VPN : (18758, 24) -> (4236 , 24)
Scenario A1-ARFF

In [14]:
# Write every preprocessed frame to <save_dir>/<sub_folder>/<source>.csv.
save_dir = "preprocessed_data"
os.makedirs(save_dir, exist_ok=True)

for sub_folder, processed_frames in all_df_preprocess.items():
    target_dir = os.path.join(save_dir, sub_folder)
    for df_preprocess, source in zip(processed_frames, all_source[sub_folder]):
        filename = f"{source}.csv"
        print(filename)
        filepath = os.path.join(target_dir, filename)
        # Created on demand, so a sub-folder without frames gets no directory.
        os.makedirs(target_dir, exist_ok=True)
        df_preprocess.to_csv(filepath, index=False)

Scenario B-ARFF_60s.csv
Scenario B-ARFF_30s-AllinOne.csv
Scenario B-ARFF_120s.csv
Scenario B-ARFF_60s-AllinOne.csv
Scenario B-ARFF_120s-AllinOne.csv
Scenario B-ARFF_30s.csv
Scenario B-ARFF_15s-AllinOne.csv
Scenario B-ARFF_15s.csv
Scenario A2-ARFF_30s-NO-VPN.csv
Scenario A2-ARFF_30s-VPN.csv
Scenario A2-ARFF_15s-VPN.csv
Scenario A2-ARFF_120s-NO-VPN.csv
Scenario A2-ARFF_120s-VPN.csv
Scenario A2-ARFF_60s-VPN.csv
Scenario A2-ARFF_15s-NO-VPN.csv
Scenario A2-ARFF_60s-NO-VPN.csv
Scenario A1-ARFF_60s-VPN.csv
Scenario A1-ARFF_15s-VPN.csv
Scenario A1-ARFF_120s-VPN.csv
Scenario A1-ARFF_30s-VPN.csv
