In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time

from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder

import os

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Input CSVs: the original dataset and its synthesized ("fake") counterpart.
original_path = "/content/drive/MyDrive/Evaluation_synthesized_data/categorical_original.csv"
fake_path = '/content/drive/MyDrive/Evaluation_synthesized_data/categorical_OI_11_00_fake.csv'

# Read both files into DataFrames, original first, then fake.
original_data, fake_data = (
    pd.read_csv(csv_file) for csv_file in (original_path, fake_path)
)

In [None]:
# 1. Align both frames on the columns they share.
# The shared columns are listed in the order they appear in the original file.
# Building the list from a set intersection would give an arbitrary order that
# can differ between kernel restarts (string hashing is randomized), so the
# feature order handed to the GMM would not be reproducible.
fake_column_set = set(fake_data.columns)
common_cols = list(
    dict.fromkeys(col for col in original_data.columns if col in fake_column_set)
)
original_data = original_data[common_cols]
fake_data = fake_data[common_cols]

# 2. Encode categorical variables (a GMM only accepts numeric input).
def encode_categoricals(df):
    """Return a copy of ``df`` with every object-dtype column integer-encoded.

    Each object column is converted to strings and its distinct values are
    mapped to the codes ``0..k-1`` in sorted label order. The mapping is derived
    from the values present in ``df`` itself, so two separate calls on different
    frames may assign different codes to the same label.

    Missing values are kept as NaN instead of being stringified into a
    ``"nan"`` category, so a later ``dropna()`` can still remove those rows.
    Columns that contain missing values therefore come back as float; columns
    without missing values come back as int64.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with object columns replaced by numeric codes.
    """
    encoded = df.copy()
    for col in encoded.select_dtypes(include='object').columns:
        values = encoded[col]
        missing = values.isna().to_numpy()
        # sort=True assigns codes by sorted label order.
        codes, _ = pd.factorize(values[~missing].astype(str), sort=True)
        if missing.any():
            # NaN cannot live in an integer column, so fall back to float.
            numeric = np.full(len(encoded), np.nan, dtype='float64')
            numeric[~missing] = codes
        else:
            numeric = codes.astype('int64')
        # Assigning a NumPy array is positional, so the frame's index is irrelevant.
        encoded[col] = numeric
    return encoded

# Encode both datasets with ONE shared label mapping. Encoding each frame on its
# own derives the integer codes from that frame's own set of categories, so the
# same category could get different codes in the real and the synthetic data,
# and the GMM would then compare unrelated numbers.
n_original = len(original_data)
combined_encoded = encode_categoricals(
    pd.concat([original_data, fake_data], ignore_index=True)
)

# Split the combined frame back into its two parts and restore each original index.
original_encoded = combined_encoded.iloc[:n_original].copy()
original_encoded.index = original_data.index
fake_encoded = combined_encoded.iloc[n_original:].copy()
fake_encoded.index = fake_data.index

# 3. Drop rows with missing values (filling them in would be the alternative).
original_encoded = original_encoded.dropna()
fake_encoded = fake_encoded.dropna()

# 4. Keep only the numeric columns (a GMM supports numeric input only).
numerical_cols = original_encoded.select_dtypes(include=np.number).columns.tolist()
X_fake = fake_encoded[numerical_cols]
X_real = original_encoded[numerical_cols]

In [None]:
# 5. Fit a GMM and evaluate it.
def evaluate_gmm_log_likelihood(X_train, X_eval, n_components=10,
                                covariance_type='full', random_state=42):
    """Fit a Gaussian mixture on ``X_train`` and score ``X_eval`` under it.

    Parameters
    ----------
    X_train : array-like
        Data the mixture model is fitted on.
    X_eval : array-like
        Data whose average log-likelihood under the fitted model is returned.
    n_components : int, default 10
        Number of mixture components.
    covariance_type : str, default 'full'
        Passed through to ``GaussianMixture``.
    random_state : int or None, default 42
        Passed through to ``GaussianMixture``.

    Returns
    -------
    tuple
        ``(log_likelihood, elapsed_time)``: the average log-likelihood of
        ``X_eval`` and the time in seconds spent fitting. Scoring is not timed.
    """
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # interval is not distorted by system clock adjustments.
    start_time = time.perf_counter()

    gmm = GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        random_state=random_state,
    )
    gmm.fit(X_train)

    elapsed_time = time.perf_counter() - start_time

    log_likelihood = gmm.score(X_eval)  # average log-likelihood
    return log_likelihood, elapsed_time

# Fit on the synthetic data, then score the real data under that model.
gmm_ll, elapsed_time = evaluate_gmm_log_likelihood(X_train=X_fake, X_eval=X_real)
print(
    f"📊 GMM 기반 평균 Log-Likelihood (real data): {gmm_ll:.4f}",
    f"실행 시간: {elapsed_time:.2f}초",
    sep="\n",
)

In [None]:
# Train on the real data and evaluate on the real data (upper bound on the
# achievable score).
# evaluate_gmm_log_likelihood returns (log_likelihood, elapsed_time); unpack it
# so oracle_ll holds only the score instead of the whole tuple.
oracle_ll, oracle_elapsed_time = evaluate_gmm_log_likelihood(X_real, X_real)
print("Oracle log-likelihood (upper bound):", oracle_ll)