# 정제된 데이터로 머신 러닝 모델 성능 평가

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [11]:
df = pd.read_csv('./data/Academic_student.csv')
df

Unnamed: 0,Student_ID,Age,Gender,Marital_Status,Course_Chosen,Application_Mode,Residence_Location,Parental_Education,Parental_Income_Level,Employment_Status,Semester_Enrolled_Units,Semester_Credited_Units,Semester_Evaluated_Units,Semester_Approved_Units,Semester_Average_Grade,Retention,Unemployment_Rate,Inflation_Rate,Regional_GDP,Year
0,STUD_1,30,Female,Married,Management,In-person,Rural,High School,83337,Employed,6,5,5,5,2.34,1,5.0,2.5,25000,2021
1,STUD_3,29,Female,Widowed,Journalism,In-person,Urban,PhD,117523,Employed,7,3,2,3,2.29,1,5.0,2.5,25000,2021
2,STUD_5,26,Other,Married,Agronomy,Online,Rural,Master,98967,Part-time,2,3,4,3,2.50,0,5.0,2.5,25000,2021
3,STUD_6,18,Other,Widowed,Agronomy,In-person,Rural,Bachelor,20900,Part-time,4,7,7,2,1.41,0,5.0,2.5,25000,2021
4,STUD_7,23,Male,Widowed,Agronomy,Online,Suburban,Master,54013,Part-time,7,4,7,5,2.57,0,5.0,2.5,25000,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,STUD_112,29,Male,Married,Education,Referral,Rural,PhD,72964,Employed,3,0,4,8,0.82,0,5.0,2.5,25000,2021
496,STUD_163,21,Male,Married,Design,Online,Urban,Bachelor,80747,Part-time,5,4,6,4,2.59,1,5.0,2.5,25000,2021
497,STUD_188,27,Other,Single,Technologies,In-person,Urban,Bachelor,41074,Unemployed,1,3,4,8,2.67,1,5.0,2.5,25000,2021
498,STUD_402,29,Other,Single,Agronomy,Referral,Urban,PhD,101174,Part-time,8,5,3,2,1.95,0,5.0,2.5,25000,2021


In [12]:
print(df['Marital_Status'].unique())
print(df['Residence_Location'].unique())
print(df['Parental_Education'].unique())
print(df['Employment_Status'].unique())

['Married' 'Widowed' 'Single' 'Divorced']
['Rural' 'Urban' 'Suburban']
['High School' 'PhD' 'Master' 'Bachelor' nan]
['Employed' 'Part-time' 'Unemployed']


In [13]:
# 데이터 전처리

# Gender (Female : 1, Male, 2, Other : 3)
# Marital_Status (Married : 1, Widowed : 2, Single : 3, Divorced : 4)
# Residence_Location (Rural : 1, Urban : 2, Suburban : 3)
# Parental_Education (High School : 1, PhD : 2, Master : 3, Bachelor : 4, Nan 값은 drop)
# Employment_Status (Employed : 1, Part-time : 2, Unemployed : 3)
# Year 컬럼 drop

# 1. Parental_Education 열의 결측치(NaN)가 있는 행 제거
df.dropna(subset=['Parental_Education'], inplace=True)

# 2. 컬럼 인코딩
# Gender
df['Gender'] = df['Gender'].map({'Female': 1, 'Male': 2, 'Other': 3})

# Marital_Status
df['Marital_Status'] = df['Marital_Status'].map({'Married': 1, 'Widowed': 2, 'Single': 3, 'Divorced': 4})

# Residence_Location
df['Residence_Location'] = df['Residence_Location'].map({'Rural': 1, 'Urban': 2, 'Suburban': 3})

# Parental_Education
df['Parental_Education'] = df['Parental_Education'].map({'High School': 1, 'PhD': 2, 'Master': 3, 'Bachelor': 4})

# Employment_Status
df['Employment_Status'] = df['Employment_Status'].map({'Employed': 1, 'Part-time': 2, 'Unemployed': 3})

# 3. Year 열 제거
df.drop('Year', axis=1, inplace=True)

# Course_Chosen, Application_Mode 컬럼 제거
df = df.drop(['Course_Chosen', 'Application_Mode'], axis=1)

print("--- 전처리 후 데이터프레임 ---")
display(df)

--- 전처리 후 데이터프레임 ---


Unnamed: 0,Student_ID,Age,Gender,Marital_Status,Residence_Location,Parental_Education,Parental_Income_Level,Employment_Status,Semester_Enrolled_Units,Semester_Credited_Units,Semester_Evaluated_Units,Semester_Approved_Units,Semester_Average_Grade,Retention,Unemployment_Rate,Inflation_Rate,Regional_GDP
0,STUD_1,30,1,1,1,1,83337,1,6,5,5,5,2.34,1,5.0,2.5,25000
1,STUD_3,29,1,2,2,2,117523,1,7,3,2,3,2.29,1,5.0,2.5,25000
2,STUD_5,26,3,1,1,3,98967,2,2,3,4,3,2.50,0,5.0,2.5,25000
3,STUD_6,18,3,2,1,4,20900,2,4,7,7,2,1.41,0,5.0,2.5,25000
4,STUD_7,23,2,2,3,3,54013,2,7,4,7,5,2.57,0,5.0,2.5,25000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,STUD_112,29,2,1,1,2,72964,1,3,0,4,8,0.82,0,5.0,2.5,25000
496,STUD_163,21,2,1,2,4,80747,2,5,4,6,4,2.59,1,5.0,2.5,25000
497,STUD_188,27,3,3,2,4,41074,3,1,3,4,8,2.67,1,5.0,2.5,25000
498,STUD_402,29,3,3,2,2,101174,2,8,5,3,2,1.95,0,5.0,2.5,25000


In [14]:
df.isnull().sum()
df.dropna(subset=['Semester_Average_Grade'], inplace=True)

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score, f1_score

# 분류 모델들
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# 부스팅 계열
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

X = df.drop(['Student_ID', 'Retention'], axis=1)
y = df['Retention']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
models = {
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),  
    "XGBoost": XGBClassifier(
        random_state=42,
        eval_metric="logloss"
    )
}

In [17]:
# --- 2) 학습 & 평가 유틸 ---
def eval_model(name, model, X_tr, y_tr, X_te, y_te):
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)

    # 확률 점수 얻기 (predict_proba가 없으면 decision_function을 확률처럼 정규화)
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_te)[:, 1]
    elif hasattr(model, "decision_function"):
        df = model.decision_function(X_te)
        # min-max로 [0,1] 스케일 (순위 기반 ROC-AUC에는 큰 문제 없음)
        df_min, df_max = df.min(), df.max()
        y_score = (df - df_min) / (df_max - df_min + 1e-9)
    else:
        # 확률이 전혀 없으면 예측(0/1)로 대체(ROC-AUC는 덜 의미 있음)
        y_score = y_pred

    rep = classification_report(y_te, y_pred, output_dict=True, zero_division=0)
    auc = roc_auc_score(y_te, y_score)
    tn, fp, fn, tp = confusion_matrix(y_te, y_pred).ravel()

    row = {
        "Model": name,
        "Accuracy": rep["accuracy"],
        "Precision(1)": rep["1"]["precision"],
        "Recall(1)": rep["1"]["recall"],
        "F1(1)": rep["1"]["f1-score"],
        "ROC-AUC": auc,
        "TP": tp, "FP": fp, "TN": tn, "FN": fn
    }
    return row, rep

# --- 3) 전체 모델 실행 ---
rows = []
reports = {}
for name, m in models.items():
    try:
        row, rep = eval_model(name, m, X_train, y_train, X_test, y_test)
        rows.append(row)
        reports[name] = rep
    except Exception as e:
        print(f"[WARN] {name} 실행 중 오류 → 스킵: {e}")

df_results = pd.DataFrame(rows).sort_values("ROC-AUC", ascending=False)
print("=== Model comparison (no stratify, no scaling) ===")
print(df_results.to_string(index=False))

best_model_name = df_results.iloc[0]["Model"]
print(f"\nBest (by ROC-AUC): {best_model_name}")

=== Model comparison (no stratify, no scaling) ===
           Model  Accuracy  Precision(1)  Recall(1)    F1(1)  ROC-AUC  TP  FP  TN  FN
GradientBoosting  0.571429      0.600000      0.525 0.560000 0.556081  21  14  23  19
    DecisionTree  0.545455      0.560976      0.575 0.567901 0.544257  23  18  19  17
         XGBoost  0.480519      0.500000      0.425 0.459459 0.523649  17  17  20  23
    RandomForest  0.454545      0.466667      0.350 0.400000 0.515878  14  16  21  26

Best (by ROC-AUC): GradientBoosting
