In [92]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# 1. Load the raw dataset from a CSV file relative to the notebook's working
#    directory, report its (rows, columns) shape, and preview the first rows.
url = './dataset.csv'
df = pd.read_csv(filepath_or_buffer=url)
print('원본 데이터 형태:', df.shape)
df.head(n=5)

원본 데이터 형태: (4424, 35)


Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [93]:
# Sanity-check the raw target labels before encoding them.
# A notebook cell only renders its LAST expression, so the intermediate checks
# are printed explicitly instead of being evaluated and silently discarded.
target_raw = df['Target']

# Per-label row counts; dropna=False so missing labels get their own row.
print(target_raw.value_counts(dropna=False))
# Non-null label count vs. number of rows whose label is missing.
print(f'비결측 라벨 수: {target_raw.count()} / 결측 라벨 수: {target_raw.isna().sum()}')

# Distinct label values (rendered as the cell output).
target_raw.unique()

array(['Dropout', 'Graduate', 'Enrolled'], dtype=object)

In [94]:
# 2. Target encoding and basic cleanup
# Binary target: 'Dropout' -> 0, while 'Graduate' and 'Enrolled' are both mapped to 1.
TARGET_MAPPING = {'Dropout': 0, 'Graduate': 1, 'Enrolled': 1}

df = df.copy()

# Validate the raw labels BEFORE overwriting the column. Once .map() has run,
# every unmapped label is NaN, so inspecting the column afterwards can only ever
# report `[nan]` rather than the offending label values.
# NaN is not a key of TARGET_MAPPING, so missing labels are rejected here as well.
unmapped_mask = ~df['Target'].isin(list(TARGET_MAPPING))
if unmapped_mask.any():
    missing_labels = df.loc[unmapped_mask, 'Target']
    raise ValueError(f'정의되지 않은 타깃 라벨이 있습니다: {missing_labels.unique()}')
df['Target'] = df['Target'].map(TARGET_MAPPING)

# Remove duplicate rows (the index is not reset; later cells reuse df.index).
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f'중복 제거: {before - after}행 삭제 (현재 {after}행)')

# Missing-value summary: per-column null counts, showing only columns that have any.
missing_summary = df.isnull().sum()
print('결측치 현황:')
print(missing_summary[missing_summary > 0] if missing_summary.any() else '결측치 없음')

중복 제거: 0행 삭제 (현재 4424행)
결측치 현황:
결측치 없음


In [95]:
# 3. Separate features from the target and group feature columns by dtype
y = df['Target']

# Every column except 'Target' is treated as a feature.
feature_cols = df.columns[df.columns != 'Target'].tolist()
X = df.loc[:, feature_cols]

# Columns are classified purely by pandas dtype: number -> numeric,
# object/category -> categorical. Any other dtype lands in neither list.
numeric_cols = list(X.select_dtypes(include=['number']).columns)
categorical_cols = list(X.select_dtypes(include=['object', 'category']).columns)

print(f'총 특징 수: {len(feature_cols)}')
print(f' - 수치형: {numeric_cols}')
print(f' - 범주형: {categorical_cols}')

총 특징 수: 34
 - 수치형: ['Marital status', 'Application mode', 'Application order', 'Course', 'Daytime/evening attendance', 'Previous qualification', 'Nacionality', "Mother's qualification", "Father's qualification", "Mother's occupation", "Father's occupation", 'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'Age at enrollment', 'International', 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)', 'Unemployment rate', 'Inflation rate', 'GDP']
 - 범주형: []


In [96]:
# 4. Build and apply the preprocessing pipeline
# Numeric columns are standardised; categorical columns are one-hot encoded
# (categories unseen at fit time are ignored at transform time, output is dense).
# Columns in neither list are dropped (remainder='drop').
#
# NOTE(review): the preprocessor is fitted on every row of X, so the scaling
# statistics in X_processed_df include rows that a later cell places in the
# test split. Do not treat a hold-out slice of X_processed_df as unseen data.
# NOTE(review): columns are routed by dtype only; integer-coded columns such as
# 'Course' or 'Marital status' look like category codes but are scaled as
# numeric here -- confirm this is intended.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

column_steps = [
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='drop')

X_processed = preprocessor.fit_transform(X)

# Recover output column names: numeric names pass through unchanged, followed by
# the expanded one-hot names (only when there are categorical columns at all).
feature_names = list(numeric_cols)
if categorical_cols:
    fitted_encoder = preprocessor.named_transformers_['cat']
    encoded_cat_names = fitted_encoder.get_feature_names_out(categorical_cols)
    feature_names += encoded_cat_names.tolist()

# Keep the original row index so the matrix stays aligned with y.
X_processed_df = pd.DataFrame(data=X_processed, index=df.index, columns=feature_names)

print('전처리 완료:')
print(' - 변환된 특징 행렬 형태:', X_processed_df.shape)
print(' - 예시 5행:')
X_processed_df.head()

전처리 완료:
 - 변환된 특징 행렬 형태: (4424, 34)
 - 예시 5행:


Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,-0.294829,0.210069,2.490896,-1.823744,0.350082,-0.386404,-0.145586,0.075111,-0.584526,-0.329669,...,-0.199273,-0.282442,-2.838337,-2.04263,-1.471527,-1.963489,-0.199441,-0.287638,0.124386,0.765761
1,-0.294829,-0.167406,-0.554068,0.254153,0.350082,-0.386404,-0.145586,-1.254495,-1.21838,-0.829997,...,-0.199273,-0.282442,-0.105726,-0.522682,0.518904,0.659562,-0.199441,0.876222,-1.105222,0.347199
2,-0.294829,-1.111094,2.490896,-1.131112,0.350082,-0.386404,-0.145586,1.072315,0.954834,0.670987,...,-0.199273,-0.282442,-0.105726,-2.04263,-1.471527,-1.963489,-0.199441,-0.287638,0.124386,0.765761
3,-0.294829,0.210069,0.207173,1.177663,0.350082,-0.386404,-0.145586,1.183116,0.954834,-0.329669,...,-0.199273,-0.282442,-0.105726,0.490616,0.187165,0.41645,-0.199441,-0.813253,-1.466871,-1.375511
4,1.356212,0.965018,-0.554068,-1.592866,-2.85647,-0.386404,-0.145586,1.072315,1.045384,0.670987,...,-0.199273,-0.282442,-0.105726,-0.522682,0.518904,0.531608,-0.199441,0.876222,-1.105222,0.347199


In [97]:
# 5. Train/validation split (for later modelling)
# Split the RAW features first and fit the preprocessing on the training rows
# only. Splitting a matrix that was already scaled with statistics from every
# row leaks test-set information into the training features.
# The same test_size / stratify / random_state are used on the same rows, so the
# row partition is identical to splitting the pre-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# clone() returns an unfitted copy with the same parameters, so the preprocessor
# fitted on the full data in the previous step is left untouched.
split_preprocessor = clone(preprocessor)
X_train_arr = split_preprocessor.fit_transform(X_train_raw)
X_test_arr = split_preprocessor.transform(X_test_raw)

# Column names are derived from the train-fitted encoder: the categories it
# learned may differ from those of an encoder fitted on all rows.
split_feature_names = list(numeric_cols)
if categorical_cols:
    split_feature_names.extend(
        split_preprocessor.named_transformers_['cat']
        .get_feature_names_out(categorical_cols)
        .tolist()
    )

# Preserve the original row index so X_* stays aligned with y_*.
X_train = pd.DataFrame(X_train_arr, columns=split_feature_names, index=X_train_raw.index)
X_test = pd.DataFrame(X_test_arr, columns=split_feature_names, index=X_test_raw.index)

print('Train/Test 분할 완료:')
print(' - X_train:', X_train.shape)
print(' - X_test :', X_test.shape)
print(' - y_train 분포:')
print(y_train.value_counts(normalize=True).rename('ratio'))

Train/Test 분할 완료:
 - X_train: (3539, 34)
 - X_test : (885, 34)
 - y_train 분포:
Target
1    0.678723
0    0.321277
Name: ratio, dtype: float64


In [98]:
# 6. Persist the preprocessed data
from pathlib import Path

# Processed features plus the encoded target as the last column
# (y is aligned to the feature rows by index).
processed_dataset = X_processed_df.assign(Target=y)

# Write the CSV only when the file is absent; an existing file is never
# overwritten, so it may hold the result of an earlier run.
output_path = Path('test.csv')
if not output_path.exists():
    processed_dataset.to_csv(output_path, index=False)
    print(f"전처리된 데이터를 {output_path}로 저장했습니다.")
else:
    print(f"{output_path} 파일이 이미 존재합니다. 저장을 건너뜁니다.")

processed_dataset.head()

전처리된 데이터를 test.csv로 저장했습니다.


Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,-0.294829,0.210069,2.490896,-1.823744,0.350082,-0.386404,-0.145586,0.075111,-0.584526,-0.329669,...,-0.282442,-2.838337,-2.04263,-1.471527,-1.963489,-0.199441,-0.287638,0.124386,0.765761,0
1,-0.294829,-0.167406,-0.554068,0.254153,0.350082,-0.386404,-0.145586,-1.254495,-1.21838,-0.829997,...,-0.282442,-0.105726,-0.522682,0.518904,0.659562,-0.199441,0.876222,-1.105222,0.347199,1
2,-0.294829,-1.111094,2.490896,-1.131112,0.350082,-0.386404,-0.145586,1.072315,0.954834,0.670987,...,-0.282442,-0.105726,-2.04263,-1.471527,-1.963489,-0.199441,-0.287638,0.124386,0.765761,0
3,-0.294829,0.210069,0.207173,1.177663,0.350082,-0.386404,-0.145586,1.183116,0.954834,-0.329669,...,-0.282442,-0.105726,0.490616,0.187165,0.41645,-0.199441,-0.813253,-1.466871,-1.375511,1
4,1.356212,0.965018,-0.554068,-1.592866,-2.85647,-0.386404,-0.145586,1.072315,1.045384,0.670987,...,-0.282442,-0.105726,-0.522682,0.518904,0.531608,-0.199441,0.876222,-1.105222,0.347199,1
