In [1]:
import re
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the raw dataset. The path is relative, so it resolves against the
# notebook's current working directory.
url = '../data/dataset.csv'
df = pd.read_csv(filepath_or_buffer=url)

# Report the (rows, columns) shape, then preview the first five rows as the
# cell's rendered output.
print('원본 데이터 형태:', df.shape)
df.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: '../data/dataset.csv'

In [None]:
# Inspect the target column before encoding it.
# A cell renders only its LAST expression, so the intermediate summaries are
# printed explicitly; as bare expressions they were computed and discarded.
print(df['Target'].value_counts())
print('유효 라벨 수:', df['Target'].count())
print('타깃 결측 행 수:', int(df['Target'].isna().sum()))

# Distinct labels present in the data (rendered as the cell output).
df['Target'].unique()

array(['Dropout', 'Graduate', 'Enrolled'], dtype=object)

In [None]:
# 2. Target encoding and basic cleanup
# Binary task: rows labelled 'Enrolled' are dropped, the remaining labels are
# encoded as 'Dropout' -> 0 and 'Graduate' -> 1.
TARGET_MAPPING = {'Dropout': 0, 'Graduate': 1}

df = df.copy()
removed_enrolled = df.loc[df['Target'] == 'Enrolled'].shape[0]
if removed_enrolled:
    print(f"Enrolled 라벨 {removed_enrolled}건 제거")
    # .copy() so the column assignment below writes to an independent frame
    # instead of a filtered view (avoids chained-assignment warnings).
    df = df[df['Target'] != 'Enrolled'].copy()

# Validate BEFORE mapping: Series.map turns every unmapped label into NaN, so a
# check performed afterwards could only ever report [nan] rather than the
# labels that are actually undefined. Missing (NaN) labels are reported too.
missing_labels = df.loc[~df['Target'].isin(list(TARGET_MAPPING)), 'Target']
if not missing_labels.empty:
    raise ValueError(f'정의되지 않은 타깃 라벨이 있습니다: {missing_labels.unique()}')
df['Target'] = df['Target'].map(TARGET_MAPPING)

# Remove duplicated rows and report how many were dropped.
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f'중복 제거: {before - after}행 삭제 (현재 {after}행)')

# Missing-value summary: show only the columns that contain missing values.
missing_summary = df.isnull().sum()
print('결측치 현황:')
print(missing_summary[missing_summary > 0] if missing_summary.any() else '결측치 없음')

Enrolled 라벨 794건 제거
중복 제거: 0행 삭제 (현재 3630행)
결측치 현황:
결측치 없음


In [None]:
# 3. Split features / target and analyse column dtypes
# NOTE(review): the correlation filter below is computed against the target
# over the whole dataset, i.e. before any train/test split, so held-out rows
# influence which features are kept. Consider moving it after the split.

# Tunable thresholds for the automatic column filters.
MIN_ABS_TARGET_CORR = 0.02  # numeric columns with |corr(target)| below this are dropped
MAX_UNIQUE_RATIO = 0.6      # categorical columns with a higher unique/row ratio are dropped

# "id" as a standalone token (id, ID, student_id, "customer id") ...
ID_TOKEN_PATTERN = re.compile(r'(?<![A-Za-z])id(?![A-Za-z])', re.IGNORECASE)
# ... or as a camelCase segment (userId, studentID).
ID_CAMEL_PATTERN = re.compile(r'(?<=[a-z])I[Dd](?![a-z])')

feature_cols = [col for col in df.columns if col != 'Target']
X = df[feature_cols]
y = df['Target']

columns_to_drop = set()

# 3-1. Drop identifier-like and constant columns.
# A plain substring test ('id' in name) would also flag unrelated columns such
# as "Paid" or "Valid", so "id" is matched only as a separate token.
id_like = [
    col
    for col in feature_cols
    if ID_TOKEN_PATTERN.search(str(col)) or ID_CAMEL_PATTERN.search(str(col))
]
constant_cols = [col for col in feature_cols if X[col].nunique(dropna=False) <= 1]
columns_to_drop.update(id_like)
columns_to_drop.update(constant_cols)

# 3-2. Drop numeric columns that are (almost) uncorrelated with the target.
# A NaN correlation (e.g. zero variance) is treated as "no signal".
numeric_candidates = X.select_dtypes(include=['number']).columns.tolist()
low_corr_numeric = []
for col in numeric_candidates:
    corr = X[col].corr(y)
    if pd.isna(corr) or abs(corr) < MIN_ABS_TARGET_CORR:
        low_corr_numeric.append(col)
columns_to_drop.update(low_corr_numeric)

# 3-3. Drop categorical columns whose share of unique values is excessive.
categorical_candidates = X.select_dtypes(include=['object', 'category']).columns.tolist()
high_cardinality = [
    col
    for col in categorical_candidates
    if (X[col].nunique(dropna=False) / len(X)) > MAX_UNIQUE_RATIO
]
columns_to_drop.update(high_cardinality)

if columns_to_drop:
    print('제거 대상 컬럼:', sorted(columns_to_drop))
    # Pass an ordered list rather than a set: deterministic and list-like.
    X = X.drop(columns=sorted(columns_to_drop))
else:
    print('제거 대상 컬럼 없음')

# Recompute the column groups on the reduced feature frame.
feature_cols = X.columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

print(f'총 특징 수: {len(feature_cols)}')
print(f' - 수치형 ({len(numeric_cols)}개): {numeric_cols}')
print(f' - 범주형 ({len(categorical_cols)}개): {categorical_cols}')

제거 대상 컬럼: ['Course', 'Educational special needs', "Father's qualification", 'International', 'Nacionality', 'Unemployment rate']
총 특징 수: 28
 - 수치형 (28개): ['Marital status', 'Application mode', 'Application order', 'Daytime/evening attendance', 'Previous qualification', "Mother's qualification", "Mother's occupation", "Father's occupation", 'Displaced', 'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'Age at enrollment', 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)', 'Inflation rate', 'GDP']
 - 범주형 (0개): []


In [None]:
# 4. Build and apply the preprocessing pipeline
# Numeric columns are standardised; categorical columns are one-hot encoded
# into a dense array, with categories unseen at fit time encoded as all zeros.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Columns not listed in either group are discarded (remainder='drop').
preprocessor = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
)

X_processed = preprocessor.fit_transform(X)

# Names of the derived columns, in transformer order: the numeric columns
# first, followed by the one-hot encoded columns (if there are any).
feature_names = list(numeric_cols)
if categorical_cols:
    encoded_cat_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
    feature_names += encoded_cat_names.tolist()

# Wrap the transformed array in a DataFrame that reuses df's row index.
X_processed_df = pd.DataFrame(data=X_processed, index=df.index, columns=feature_names)

print('전처리 완료:')
print(' - 변환된 특징 행렬 형태:', X_processed_df.shape)
print(' - 예시 5행:')
X_processed_df.head()

전처리 완료:
 - 변환된 특징 행렬 형태: (3630, 28)
 - 예시 5행:


Unnamed: 0,Marital status,Application mode,Application order,Daytime/evening attendance,Previous qualification,Mother's qualification,Mother's occupation,Father's occupation,Displaced,Debtor,...,Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Inflation rate,GDP
0,-0.300686,0.226505,2.436822,0.35585,-0.392879,0.049009,-0.327219,0.544637,0.906297,-0.358302,...,-2.083224,-0.189871,-0.287686,-2.782691,-1.958586,-1.429014,-1.831085,-0.190148,0.121615,0.774119
1,-0.300686,-0.154237,-0.562471,0.35585,-0.392879,-1.283593,-0.866764,-0.846007,0.906297,-0.358302,...,0.685217,-0.189871,-0.287686,-0.131002,-0.444817,0.468555,0.662383,-0.190148,-1.10607,0.353704
2,-0.300686,-1.106094,2.436822,0.35585,-0.392879,1.04846,0.751869,0.544637,0.906297,-0.358302,...,-2.083224,-0.189871,-0.287686,-0.131002,-1.958586,-1.429014,-1.831085,-0.190148,0.121615,0.774119
3,-0.300686,0.226505,0.187353,0.35585,-0.392879,1.15951,-0.327219,-0.846007,0.906297,-0.358302,...,0.572219,-0.189871,-0.287686,-0.131002,0.564362,0.152293,0.431281,-0.190148,-1.467154,-1.376634
4,1.330837,0.98799,-0.562471,-2.810171,-0.392879,1.04846,0.751869,0.544637,-1.103391,-0.358302,...,0.355641,-0.189871,-0.287686,-0.131002,-0.444817,0.468555,0.54075,-0.190148,-1.10607,0.353704


In [None]:
# 5. Train/test split (for later modelling)
# Split the RAW features first, then fit the preprocessing on the training rows
# only. Splitting the already-transformed matrix would let the scaler/encoder
# statistics of the test rows leak into the training features.
# The split membership depends only on the number of rows, `y` and
# `random_state`, so the same rows land in train/test as when splitting the
# transformed frame.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Unfitted copy with identical parameters; the `preprocessor` fitted on the
# full dataset (used for the exported frame) is left untouched.
split_preprocessor = clone(preprocessor)
X_train_values = split_preprocessor.fit_transform(X_train_raw)
X_test_values = split_preprocessor.transform(X_test_raw)

# Column names derived from the train-fitted transformers (numeric columns
# first, then one-hot columns), so they always match the transformed width.
split_feature_names = list(numeric_cols)
if categorical_cols:
    split_feature_names += (
        split_preprocessor.named_transformers_['cat']
        .get_feature_names_out(categorical_cols)
        .tolist()
    )

X_train = pd.DataFrame(X_train_values, columns=split_feature_names, index=X_train_raw.index)
X_test = pd.DataFrame(X_test_values, columns=split_feature_names, index=X_test_raw.index)

print('Train/Test 분할 완료:')
print(' - X_train:', X_train.shape)
print(' - X_test :', X_test.shape)
print(' - y_train 분포:')
print(y_train.value_counts(normalize=True).rename('ratio'))

Train/Test 분할 완료:
 - X_train: (2904, 28)
 - X_test : (726, 28)
 - y_train 분포:
Target
1    0.608471
0    0.391529
Name: ratio, dtype: float64


In [None]:
# 6. Save the preprocessed data
# `Path` comes from the notebook's top import cell.

# Transformed features plus the encoded target; the assignment aligns on the
# row index.
processed_dataset = X_processed_df.copy()
processed_dataset['Target'] = y

output_path = Path('../data/preprocessed/dataset_preprocessed.csv')
if output_path.exists():
    # An existing export is never overwritten; delete the file to regenerate it.
    print(f"{output_path} 파일이 이미 존재합니다. 저장을 건너뜁니다.")
else:
    # to_csv does not create missing directories, so make sure the target
    # folder exists before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    processed_dataset.to_csv(output_path, index=False)
    print(f"전처리된 데이터를 {output_path}로 저장했습니다.")

processed_dataset.head()

전처리된 데이터를 ..\csv\test.csv로 저장했습니다.


Unnamed: 0,Marital status,Application mode,Application order,Daytime/evening attendance,Previous qualification,Mother's qualification,Mother's occupation,Father's occupation,Displaced,Debtor,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Inflation rate,GDP,Target
0,-0.300686,0.226505,2.436822,0.35585,-0.392879,0.049009,-0.327219,0.544637,0.906297,-0.358302,...,-0.189871,-0.287686,-2.782691,-1.958586,-1.429014,-1.831085,-0.190148,0.121615,0.774119,0
1,-0.300686,-0.154237,-0.562471,0.35585,-0.392879,-1.283593,-0.866764,-0.846007,0.906297,-0.358302,...,-0.189871,-0.287686,-0.131002,-0.444817,0.468555,0.662383,-0.190148,-1.10607,0.353704,1
2,-0.300686,-1.106094,2.436822,0.35585,-0.392879,1.04846,0.751869,0.544637,0.906297,-0.358302,...,-0.189871,-0.287686,-0.131002,-1.958586,-1.429014,-1.831085,-0.190148,0.121615,0.774119,0
3,-0.300686,0.226505,0.187353,0.35585,-0.392879,1.15951,-0.327219,-0.846007,0.906297,-0.358302,...,-0.189871,-0.287686,-0.131002,0.564362,0.152293,0.431281,-0.190148,-1.467154,-1.376634,1
4,1.330837,0.98799,-0.562471,-2.810171,-0.392879,1.04846,0.751869,0.544637,-1.103391,-0.358302,...,-0.189871,-0.287686,-0.131002,-0.444817,0.468555,0.54075,-0.190148,-1.10607,0.353704,1
