# 데이터셋 선정 및 전처리

In [1]:
import os
import pickle

import numpy as np
import pandas as pd

from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

In [2]:
def invert_input_array(input_array, feature_metadata):
    """Map an encoded feature matrix back to the original feature values.

    Parameters
    ----------
    input_array : array-like
        Encoded samples, one row per sample. A 1-D input is treated as a
        single sample, and plain lists/tuples are converted to an ndarray.
    feature_metadata : dict
        Maps each original column name to a dict holding:
        ``'type'`` (``'categorical'`` or anything else, treated as
        numerical), ``'index'`` (sequence of column positions for
        categorical features, a single position otherwise) and
        ``'encoder'`` (object exposing ``inverse_transform``).

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``feature_metadata``, in the same order,
        holding the flattened result of each encoder's
        ``inverse_transform``.
    """
    # Plain Python sequences cannot be indexed with ``[:, i]``; convert them.
    # Objects that already expose ``ndim`` are passed through unchanged.
    if not hasattr(input_array, "ndim"):
        input_array = np.asarray(input_array)
    # A single sample given as a flat vector becomes a one-row matrix.
    if input_array.ndim == 1:
        input_array = input_array.reshape(1, -1)

    inverted_data = {}

    for col, meta in feature_metadata.items():
        if meta['type'] == 'categorical':
            # Slice out the block of one-hot columns belonging to this feature.
            start_idx, end_idx = meta['index'][0], meta['index'][-1] + 1
            encoded = input_array[:, start_idx:end_idx]
        else:
            # Numerical features occupy one column; the encoder expects 2-D input.
            encoded = input_array[:, meta['index']].reshape(-1, 1)

        inverted_col = meta['encoder'].inverse_transform(encoded)
        inverted_data[col] = inverted_col.flatten()

    # Assemble the restored columns into a DataFrame.
    return pd.DataFrame(inverted_data)

# # input_array에서 원래의 데이터 복원
# inverted_df = invert_input_array(input_array, feature_metadata)

# # 결과 확인
# print(inverted_df.head())

In [3]:
# dataset_ids = [2, 144, 222, 186, 14]

# # 출력 파일 설정
# output_file = "dataset_info.txt"

# with open(output_file, "w", encoding="utf-8") as f:
#     for dataset_id in dataset_ids:
#         dataset = fetch_ucirepo(id=dataset_id)
#         f.write(f"Dataset ID: {dataset_id}\n")
#         f.write(f"Dataset: {dataset.metadata.name}\n")
#         f.write(f"Dataset size (rows, columns): {dataset.data.features.shape}\n")
#         f.write(dataset.data.features.head(3).to_markdown() + "\n\n")
        
#         # 변수 정보 출력
#         if 'additional_info' in dataset.metadata and 'variable_info' in dataset.metadata['additional_info']:
#             f.write("Variable Info:\n")
#             f.write(dataset.metadata['additional_info']['variable_info'] + "\n\n")
        
#         # 타겟 분포 출력
#         ratio = dataset.data.targets.value_counts() / len(dataset.data.targets)
#         f.write("Target Distribution:\n")
#         f.write(ratio.to_markdown() + "\n\n")
#         f.write("=" * 60 + "\n\n")

# print(f"Output saved to {output_file}")


In [4]:
# dataset_ids = [2, 144, 222, 186]
# for d_id in dataset_ids:
#     raw_data = fetch_ucirepo(id=d_id)
#     dataset_nm = raw_data.metadata['name']
#     raw_data.data.features.to_csv(f"data/{dataset_nm}/raw_data.csv", index=False)

In [6]:
# UCI dataset id 2: encode every feature, fit two baseline classifiers and
# save the train/test split together with the fitted encoders.
# (Presumably the Adult / Census Income data, given the ">50K" target - confirm.)
dataset_id = 2
dataset = fetch_ucirepo(id=dataset_id)
features = dataset.data.features
targets = dataset.data.targets

# feature_metadata[col] records, per original column:
#   'type'     - "categorical" or "numerical"
#   'encoder'  - the fitted OneHotEncoder / StandardScaler
#   'cat_dist' - (categorical only) category frequencies, in encoder order
#   'index'    - column position(s) of the feature inside input_array
# NOTE(review): the encoders are fitted on the full dataset before the
# train/test split below, so test rows influence the scaling statistics -
# confirm this is intended.
feature_metadata = {}
input_data = []
start_idx = 0
for col in features.columns:
    column = features[col]
    if column.dtype == "object":
        onehot = OneHotEncoder(handle_unknown='ignore')
        # Missing values become their own "missing" category.
        feature_val = column.fillna("missing")
        preprocessed = onehot.fit_transform(feature_val.values.reshape(-1, 1)).toarray()
        # Relative frequency of each category, reordered to match the one-hot columns.
        cat_dist = feature_val.value_counts(dropna=False) / len(features)
        cat_dist = cat_dist.loc[onehot.categories_[0]].values
        n_cols = preprocessed.shape[1]
        feature_metadata[col] = {
            'type': "categorical",
            'encoder': onehot,
            'cat_dist': cat_dist,
            'index': np.arange(start_idx, start_idx + n_cols),
        }
        start_idx += n_cols
    else:
        scaler = StandardScaler()
        preprocessed = scaler.fit_transform(column.values.reshape(-1, 1))
        feature_metadata[col] = {
            'type': "numerical",
            'encoder': scaler,
            'index': start_idx,
        }
        start_idx += 1

    input_data.append(preprocessed)

print(feature_metadata)

input_array = np.concatenate(input_data, axis=1)

# Binary target: 1 for either spelling of the ">50K" label, 0 otherwise.
is_positive = targets.isin([">50K", ">50K."])
print(is_positive.value_counts() / len(targets))
y = is_positive.values.astype(int)[:, 0]

X_train, X_test, y_train, y_test = train_test_split(input_array, y, test_size=0.2, random_state=42, stratify=y)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
print("Logistic Regression Train Accuracy:", lr.score(X_train, y_train))
print("Logistic Regression Test Accuracy:", lr.score(X_test, y_test))

# Seeded like the split above so the reported MLP accuracies are reproducible.
mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)
print("MLP Train Accuracy:", mlp.score(X_train, y_train))
print("MLP Test Accuracy:", mlp.score(X_test, y_test))

path = f"data/{dataset.metadata['name']}"
# exist_ok avoids the separate existence check (and its race) on re-runs.
os.makedirs(path, exist_ok=True)

np.save(f"{path}/X_train.npy", X_train)
np.save(f"{path}/X_test.npy", X_test)
np.save(f"{path}/y_train.npy", y_train)
np.save(f"{path}/y_test.npy", y_test)

# The fitted encoders are pickled so the encoding can be inverted later.
with open(f"{path}/feature_metadata.pkl", "wb") as f:
    pickle.dump(feature_metadata, f)

{'age': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 0}, 'workclass': {'type': 'categorical', 'encoder': OneHotEncoder(handle_unknown='ignore'), 'cat_dist': array([3.75905983e-02, 2.93190287e-02, 6.42070349e-02, 2.04741821e-04,
       6.94197617e-01, 3.47037386e-02, 7.90712911e-02, 4.05593547e-02,
       4.29957823e-04, 1.97166373e-02]), 'index': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])}, 'fnlwgt': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 11}, 'education': {'type': 'categorical', 'encoder': OneHotEncoder(handle_unknown='ignore'), 'cat_dist': array([0.02843864, 0.03709922, 0.01345154, 0.00505712, 0.01042136,
       0.01955284, 0.01547848, 0.03277917, 0.04219729, 0.16430531,
       0.01216166, 0.32316449, 0.0543999 , 0.00169936, 0.01707547,
       0.22271815]), 'index': array([12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27])}, 'education-num': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 28}, 'marital-status': {'type

In [7]:
# UCI dataset id 144: encode every feature, fit two baseline classifiers and
# save the train/test split together with the fitted encoders.
dataset_id = 144
dataset = fetch_ucirepo(id=dataset_id)
features = dataset.data.features
targets = dataset.data.targets

# Disabled type-only metadata pass; the loop below records the same 'type'
# entry plus the encoder information.
# feature_metadata = {}
# for col in dataset.data.features.columns:
#     feature_metadata[col] = {}
#     if dataset.data.features[col].dtype == "object":
#         feature_metadata[col]['type'] = "categorical"
#     else:
#         feature_metadata[col]['type'] = "numerical"

# feature_metadata

# feature_metadata[col] records, per original column:
#   'type'     - "categorical" or "numerical"
#   'encoder'  - the fitted OneHotEncoder / StandardScaler
#   'cat_dist' - (categorical only) category frequencies, in encoder order
#   'index'    - column position(s) of the feature inside input_array
# NOTE(review): the encoders are fitted on the full dataset before the
# train/test split below, so test rows influence the scaling statistics -
# confirm this is intended.
feature_metadata = {}
input_data = []
start_idx = 0
for col in features.columns:
    column = features[col]
    if column.dtype == "object":
        onehot = OneHotEncoder(handle_unknown='ignore')
        # Missing values become their own "missing" category.
        feature_val = column.fillna("missing")
        preprocessed = onehot.fit_transform(feature_val.values.reshape(-1, 1)).toarray()
        # Relative frequency of each category, reordered to match the one-hot columns.
        cat_dist = feature_val.value_counts(dropna=False) / len(features)
        cat_dist = cat_dist.loc[onehot.categories_[0]].values
        n_cols = preprocessed.shape[1]
        feature_metadata[col] = {
            'type': "categorical",
            'encoder': onehot,
            'cat_dist': cat_dist,
            'index': np.arange(start_idx, start_idx + n_cols),
        }
        start_idx += n_cols
    else:
        scaler = StandardScaler()
        preprocessed = scaler.fit_transform(column.values.reshape(-1, 1))
        feature_metadata[col] = {
            'type': "numerical",
            'encoder': scaler,
            'index': start_idx,
        }
        start_idx += 1

    input_data.append(preprocessed)

print(feature_metadata)

input_array = np.concatenate(input_data, axis=1)

# Binary target: 1 where the raw target equals 2, 0 otherwise.
is_positive = targets.isin([2])
print(is_positive.value_counts() / len(targets))
y = is_positive.values.astype(int)[:, 0]

X_train, X_test, y_train, y_test = train_test_split(input_array, y, test_size=0.2, random_state=42, stratify=y)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
print("Logistic Regression Train Accuracy:", lr.score(X_train, y_train))
print("Logistic Regression Test Accuracy:", lr.score(X_test, y_test))

# Seeded like the split above so the reported MLP accuracies are reproducible.
mlp = MLPClassifier(hidden_layer_sizes=(16, 16), max_iter=400, random_state=42)
mlp.fit(X_train, y_train)
print("MLP Train Accuracy:", mlp.score(X_train, y_train))
print("MLP Test Accuracy:", mlp.score(X_test, y_test))

path = f"data/{dataset.metadata['name']}"
# exist_ok avoids the separate existence check (and its race) on re-runs.
os.makedirs(path, exist_ok=True)

np.save(f"{path}/X_train.npy", X_train)
np.save(f"{path}/X_test.npy", X_test)
np.save(f"{path}/y_train.npy", y_train)
np.save(f"{path}/y_test.npy", y_test)

# The fitted encoders are pickled so the encoding can be inverted later.
with open(f"{path}/feature_metadata.pkl", "wb") as f:
    pickle.dump(feature_metadata, f)

{'Attribute1': {'type': 'categorical', 'encoder': OneHotEncoder(handle_unknown='ignore'), 'cat_dist': array([0.274, 0.269, 0.063, 0.394]), 'index': array([0, 1, 2, 3])}, 'Attribute2': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 4}, 'Attribute3': {'type': 'categorical', 'encoder': OneHotEncoder(handle_unknown='ignore'), 'cat_dist': array([0.04 , 0.049, 0.53 , 0.088, 0.293]), 'index': array([5, 6, 7, 8, 9])}, 'Attribute4': {'type': 'categorical', 'encoder': OneHotEncoder(handle_unknown='ignore'), 'cat_dist': array([0.234, 0.103, 0.012, 0.181, 0.28 , 0.012, 0.022, 0.05 , 0.009,
       0.097]), 'index': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19])}, 'Attribute5': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 20}, 'Attribute6': {'type': 'categorical', 'encoder': OneHotEncoder(handle_unknown='ignore'), 'cat_dist': array([0.603, 0.103, 0.063, 0.048, 0.183]), 'index': array([21, 22, 23, 24, 25])}, 'Attribute7': {'type': 'categorical', 'encoder': OneHotEncoder(h



In [8]:
# UCI dataset id 222: encode every feature, fit two baseline classifiers and
# save the train/test split together with the fitted encoders.
dataset_id = 222
dataset = fetch_ucirepo(id=dataset_id)
features = dataset.data.features
targets = dataset.data.targets

# Disabled type-only metadata pass; the loop below records the same 'type'
# entry plus the encoder information.
# feature_metadata = {}
# for col in dataset.data.features.columns:
#     feature_metadata[col] = {}
#     if dataset.data.features[col].dtype == "object":
#         feature_metadata[col]['type'] = "categorical"
#     else:
#         feature_metadata[col]['type'] = "numerical"

# feature_metadata

# feature_metadata[col] records, per original column:
#   'type'     - "categorical" or "numerical"
#   'encoder'  - the fitted OneHotEncoder / StandardScaler
#   'cat_dist' - (categorical only) category frequencies, in encoder order
#   'index'    - column position(s) of the feature inside input_array
# NOTE(review): the encoders are fitted on the full dataset before the
# train/test split below, so test rows influence the scaling statistics -
# confirm this is intended.
feature_metadata = {}
input_data = []
start_idx = 0
for col in features.columns:
    column = features[col]
    if column.dtype == "object":
        onehot = OneHotEncoder(handle_unknown='ignore')
        # Missing values become their own "missing" category.
        feature_val = column.fillna("missing")
        preprocessed = onehot.fit_transform(feature_val.values.reshape(-1, 1)).toarray()
        # Relative frequency of each category, reordered to match the one-hot columns.
        cat_dist = feature_val.value_counts(dropna=False) / len(features)
        cat_dist = cat_dist.loc[onehot.categories_[0]].values
        n_cols = preprocessed.shape[1]
        feature_metadata[col] = {
            'type': "categorical",
            'encoder': onehot,
            'cat_dist': cat_dist,
            'index': np.arange(start_idx, start_idx + n_cols),
        }
        start_idx += n_cols
    else:
        scaler = StandardScaler()
        preprocessed = scaler.fit_transform(column.values.reshape(-1, 1))
        feature_metadata[col] = {
            'type': "numerical",
            'encoder': scaler,
            'index': start_idx,
        }
        start_idx += 1

    input_data.append(preprocessed)

print(feature_metadata)

input_array = np.concatenate(input_data, axis=1)

# Binary target: 1 where the raw target is 'yes', 0 otherwise.
is_positive = targets.isin(['yes'])
print(is_positive.value_counts() / len(targets))
y = is_positive.values.astype(int)[:, 0]

X_train, X_test, y_train, y_test = train_test_split(input_array, y, test_size=0.2, random_state=42, stratify=y)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
print("Logistic Regression Train Accuracy:", lr.score(X_train, y_train))
print("Logistic Regression Test Accuracy:", lr.score(X_test, y_test))

# Seeded like the split above so the reported MLP accuracies are reproducible.
mlp = MLPClassifier(hidden_layer_sizes=(16, 16), max_iter=400, random_state=42)
mlp.fit(X_train, y_train)
print("MLP Train Accuracy:", mlp.score(X_train, y_train))
print("MLP Test Accuracy:", mlp.score(X_test, y_test))

path = f"data/{dataset.metadata['name']}"
# exist_ok avoids the separate existence check (and its race) on re-runs.
os.makedirs(path, exist_ok=True)

np.save(f"{path}/X_train.npy", X_train)
np.save(f"{path}/X_test.npy", X_test)
np.save(f"{path}/y_train.npy", y_train)
np.save(f"{path}/y_test.npy", y_test)

# The fitted encoders are pickled so the encoding can be inverted later.
with open(f"{path}/feature_metadata.pkl", "wb") as f:
    pickle.dump(feature_metadata, f)

{'age': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 0}, 'job': {'type': 'categorical', 'encoder': OneHotEncoder(handle_unknown='ignore'), 'cat_dist': array([0.11437482, 0.21525735, 0.03289023, 0.02742695, 0.20919688,
       0.00637013, 0.05007631, 0.03492513, 0.09188029, 0.02074716,
       0.16803433, 0.02882042]), 'index': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])}, 'marital': {'type': 'categorical', 'encoder': OneHotEncoder(handle_unknown='ignore'), 'cat_dist': array([0.11517109, 0.60193316, 0.28289576]), 'index': array([13, 14, 15])}, 'education': {'type': 'categorical', 'encoder': OneHotEncoder(handle_unknown='ignore'), 'cat_dist': array([0.04107407, 0.15153392, 0.51319369, 0.29419831]), 'index': array([16, 17, 18, 19])}, 'default': {'type': 'categorical', 'encoder': OneHotEncoder(handle_unknown='ignore'), 'cat_dist': array([0.98197341, 0.01802659]), 'index': array([20, 21])}, 'balance': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 22}, 

In [9]:
# UCI dataset id 186: encode every feature, fit two baseline classifiers and
# save the train/test split together with the fitted encoders.
dataset_id = 186
dataset = fetch_ucirepo(id=dataset_id)
features = dataset.data.features
targets = dataset.data.targets

# Disabled type-only metadata pass; the loop below records the same 'type'
# entry plus the encoder information.
# feature_metadata = {}
# for col in dataset.data.features.columns:
#     feature_metadata[col] = {}
#     if dataset.data.features[col].dtype == "object":
#         feature_metadata[col]['type'] = "categorical"
#     else:
#         feature_metadata[col]['type'] = "numerical"

# feature_metadata

# feature_metadata[col] records, per original column:
#   'type'     - "categorical" or "numerical"
#   'encoder'  - the fitted OneHotEncoder / StandardScaler
#   'cat_dist' - (categorical only) category frequencies, in encoder order
#   'index'    - column position(s) of the feature inside input_array
# NOTE(review): the encoders are fitted on the full dataset before the
# train/test split below, so test rows influence the scaling statistics -
# confirm this is intended.
feature_metadata = {}
input_data = []
start_idx = 0
for col in features.columns:
    column = features[col]
    if column.dtype == "object":
        onehot = OneHotEncoder(handle_unknown='ignore')
        # Missing values become their own "missing" category.
        feature_val = column.fillna("missing")
        preprocessed = onehot.fit_transform(feature_val.values.reshape(-1, 1)).toarray()
        # Relative frequency of each category, reordered to match the one-hot columns.
        cat_dist = feature_val.value_counts(dropna=False) / len(features)
        cat_dist = cat_dist.loc[onehot.categories_[0]].values
        n_cols = preprocessed.shape[1]
        feature_metadata[col] = {
            'type': "categorical",
            'encoder': onehot,
            'cat_dist': cat_dist,
            'index': np.arange(start_idx, start_idx + n_cols),
        }
        start_idx += n_cols
    else:
        scaler = StandardScaler()
        preprocessed = scaler.fit_transform(column.values.reshape(-1, 1))
        feature_metadata[col] = {
            'type': "numerical",
            'encoder': scaler,
            'index': start_idx,
        }
        start_idx += 1

    input_data.append(preprocessed)

print(feature_metadata)

input_array = np.concatenate(input_data, axis=1)

# Binary target: 1 where the raw target is 7, 8 or 9, 0 otherwise.
is_positive = targets.isin([7,8,9])
print(is_positive.value_counts() / len(targets))
y = is_positive.values.astype(int)[:, 0]

X_train, X_test, y_train, y_test = train_test_split(input_array, y, test_size=0.2, random_state=42, stratify=y)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
print("Logistic Regression Train Accuracy:", lr.score(X_train, y_train))
print("Logistic Regression Test Accuracy:", lr.score(X_test, y_test))

# Seeded like the split above so the reported MLP accuracies are reproducible.
mlp = MLPClassifier(hidden_layer_sizes=(16, 16), max_iter=400, random_state=42)
mlp.fit(X_train, y_train)
print("MLP Train Accuracy:", mlp.score(X_train, y_train))
print("MLP Test Accuracy:", mlp.score(X_test, y_test))

path = f"data/{dataset.metadata['name']}"
# exist_ok avoids the separate existence check (and its race) on re-runs.
os.makedirs(path, exist_ok=True)

np.save(f"{path}/X_train.npy", X_train)
np.save(f"{path}/X_test.npy", X_test)
np.save(f"{path}/y_train.npy", y_train)
np.save(f"{path}/y_test.npy", y_test)

# The fitted encoders are pickled so the encoding can be inverted later.
with open(f"{path}/feature_metadata.pkl", "wb") as f:
    pickle.dump(feature_metadata, f)

{'fixed_acidity': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 0}, 'volatile_acidity': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 1}, 'citric_acid': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 2}, 'residual_sugar': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 3}, 'chlorides': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 4}, 'free_sulfur_dioxide': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 5}, 'total_sulfur_dioxide': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 6}, 'density': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 7}, 'pH': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 8}, 'sulphates': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 9}, 'alcohol': {'type': 'numerical', 'encoder': StandardScaler(), 'index': 10}}
quality
False      0.803448
True       0.196552
Name: count, dtype: float64
Logistic Regression Train Accuracy: 0.818549162

MLP Train Accuracy: 0.8716567250336733
MLP Test Accuracy: 0.8276923076923077




In [10]:
# Breast Cancer is a small dataset, yet its numerical data is stored as
# categorical values, which makes the preprocessing comparatively complex.
# Since enough sufficiently large datasets are already available, there is
# little need to add this one.
dataset_id = 14
dataset = fetch_ucirepo(id=dataset_id)

# Only the per-column type is recorded here: object-dtype columns are
# labelled "categorical", every other dtype "numerical".
feature_metadata = {
    col: {
        'type': "categorical"
        if dataset.data.features[col].dtype == "object"
        else "numerical"
    }
    for col in dataset.data.features.columns
}

feature_metadata

# feature_metadata = {}
# input_data = []
# start_idx = 0
# for col in dataset.data.features.columns:
#     feature_metadata[col] = {}
#     if dataset.data.features[col].dtype == "object":
#         feature_metadata[col]['type'] = "categorical"
#         onehot = OneHotEncoder()
#         preprocessed = onehot.fit_transform(dataset.data.features[col].values.reshape(-1, 1)).toarray()
#         cat_dist = dataset.data.features[col].value_counts(dropna=False) / len(dataset.data.features)
#         cat_dist = cat_dist.loc[onehot.categories_[0]].values
#         feature_metadata[col]['encoder'] = onehot
#         feature_metadata[col]['cat_dist'] = cat_dist
#         feature_metadata[col]['index'] = np.arange(start_idx, start_idx + preprocessed.shape[1])
#         start_idx += preprocessed.shape[1]
#     else:
#         feature_metadata[col]['type'] = "numerical"
#         scaler = StandardScaler()
#         preprocessed = scaler.fit_transform(dataset.data.features[col].values.reshape(-1, 1))
#         feature_metadata[col]['encoder'] = scaler
#         feature_metadata[col]['index'] = start_idx
#         start_idx += 1

#     input_data.append(preprocessed)

# print(feature_metadata)

# input_array = np.concatenate(input_data, axis=1)

# print(dataset.data.targets.isin([7,8,9]).value_counts() / len(dataset.data.targets))
# y = dataset.data.targets.isin([7,8,9]).values.astype(int)[:,0]

# X_train, X_test, y_train, y_test = train_test_split(input_array, y, test_size=0.2, random_state=42, stratify=y)

# lr = LogisticRegression(max_iter=1000)
# lr.fit(X_train, y_train)
# print("Logistic Regression Train Accuracy:", lr.score(X_train, y_train))
# print("Logistic Regression Test Accuracy:", lr.score(X_test, y_test))

# mlp = MLPClassifier(hidden_layer_sizes=(16, 16), max_iter=400)
# mlp.fit(X_train, y_train)
# print("MLP Train Accuracy:", mlp.score(X_train, y_train))
# print("MLP Test Accuracy:", mlp.score(X_test, y_test))

# path = f"data/{dataset.metadata['name']}"
# if not os.path.exists(path):
#     os.makedirs(path)

# np.save(f"{path}/X_train.npy", X_train)
# np.save(f"{path}/X_test.npy", X_test)
# np.save(f"{path}/y_train.npy", y_train)
# np.save(f"{path}/y_test.npy", y_test)

# with open(f"{path}/feature_metadata.pkl", "wb") as f:
#     pickle.dump(feature_metadata, f)

{'age': {'type': 'categorical'},
 'menopause': {'type': 'categorical'},
 'tumor-size': {'type': 'categorical'},
 'inv-nodes': {'type': 'categorical'},
 'node-caps': {'type': 'categorical'},
 'deg-malig': {'type': 'numerical'},
 'breast': {'type': 'categorical'},
 'breast-quad': {'type': 'categorical'},
 'irradiat': {'type': 'categorical'}}