In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Load the pre-split feature/label tables (train first, then test).
# All four CSVs are read as UTF-8, in the order the names are unpacked.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (
        "datasets/X_train.csv",
        "datasets/y_train.csv",
        "datasets/X_test.csv",
        "datasets/y_test.csv",
    )
)

# Encode string (object-dtype) columns as integer codes.
# Every fitted encoder is kept in `label_encoders` (column name -> LabelEncoder)
# so the exact same mapping can be saved with the model and reused at
# inference time; the save step below reads this dictionary.
label_encoders = {}
for col in X_train.select_dtypes(include='object').columns:
    le = LabelEncoder()
    # Use the same missing-value placeholder ('') for both splits so that
    # NaN is encoded identically in train and test.
    train_values = X_train[col].fillna('')
    # Always include the placeholder in the fitted vocabulary; otherwise a
    # NaN that appears only in the test split would be an unseen label and
    # make transform() raise ValueError.
    le.fit(pd.concat([train_values, pd.Series([''], dtype=object)], ignore_index=True))
    X_train[col] = le.transform(train_values)
    # NOTE: categories present only in the test split still raise ValueError
    # here, because LabelEncoder has no unknown-value handling.
    X_test[col] = le.transform(X_test[col].fillna(''))
    label_encoders[col] = le

# Hyperparameter grid for GridSearchCV: 2 * 3 * 2 * 1 = 12 candidates.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'class_weight': ['balanced']
}

# Model and grid search: 3-fold CV, candidates ranked by F1, all cores used.
# scoring='f1' is the binary F1 score, so the target is expected to be binary.
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='f1', n_jobs=-1)
# y_train is read from CSV as a DataFrame; passing a column vector makes
# scikit-learn emit a DataConversionWarning on every fit. Squeeze the single
# label column into a 1-D Series (a multi-column frame is passed unchanged).
grid_search.fit(X_train, y_train.squeeze(axis="columns"))

# Take the best estimator found by the search and predict the test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Compute the test-set metrics first, then print the summary:
# chosen hyperparameters, overall accuracy, and the per-class report.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("✅ Best Params:", grid_search.best_params_)
print("✅ Accuracy:", test_accuracy)
print(test_report)



In [None]:
import joblib
from sklearn.metrics import f1_score, recall_score

# Artifact to persist: the tuned model plus everything needed to reuse it.
# The metrics are computed from the current test-set predictions rather than
# typed in by hand, so the saved numbers always describe the saved model.
# They are rounded to 3 decimals, the precision previously stored here.
# `label_encoders` must map each encoded column name to its fitted
# LabelEncoder; it has to be built by the encoding step before this cell runs.
save_dict = {
    'model': best_model,
    'encoders': label_encoders,
    'features': X_train.columns,
    'metrics': {
        'model_name': 'RandomForest',
        'accuracy': round(float(accuracy_score(y_test, y_pred)), 3),
        # Binary F1 / recall for the positive class (pos_label=1), matching
        # the scoring='f1' criterion used during the grid search.
        'f1_score': round(float(f1_score(y_test, y_pred)), 3),
        'recall': round(float(recall_score(y_test, y_pred)), 3)
    }
}

# Write the artifact to disk.
joblib.dump(save_dict, 'rf.model.joblib')
print("✅ rf.model.joblib 저장 완료")
