In [1]:
import pandas as pd


In [2]:
df = pd.read_csv("../data/sec_timeline_scaled_recovery_rate.csv")

In [3]:
df.head()

Unnamed: 0,term,emp_length,annual_inc,dti,delinq_2yrs,fico_avg,inq_last_6mths,open_acc,pub_rec,revol_bal,...,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,loan_status,grade,sub_grade,loan_amnt,int_rate,recovery_rate
0,0,0.272727,0.578659,0.034688,1,0.512129,0,0.155556,0,0.544497,...,False,False,False,True,0,0.0,0.117647,0.566905,0.0797,0.993053
1,1,1.0,0.626931,0.124273,1,0.053908,1,0.266667,0,0.659641,...,False,False,False,True,1,0.666667,0.676471,0.875639,0.2499,0.210523
2,0,0.0,0.598518,0.068131,0,0.862534,0,0.2,0,0.559936,...,False,False,False,False,1,0.0,0.029412,0.790812,0.0707,0.66302
3,0,0.545455,0.584349,0.028188,0,0.215633,0,0.077778,1,0.598166,...,False,False,False,False,0,0.166667,0.205882,0.501238,0.1042,0.976153
4,0,1.0,0.594195,0.10604,0,0.080863,0,0.211111,0,0.658498,...,False,False,True,False,0,0.166667,0.205882,0.841755,0.0999,1.0


In [14]:
############### Library ###############
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error
import joblib

############### Data ###############
file_path = '../data/sec_timeline_scaled_recovery_rate.csv'
df = pd.read_csv(file_path)

# X와 y 분리
y = df['recovery_rate']
X = df.drop('recovery_rate', axis=1)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Train, Valid, Test 셋 분할 (대략 64% / 16% / 20%)
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=RANDOM_SEED)

############### Feature Selection Functions ###############
# 단일 validation 셋을 이용한 forward search
def forward_search(model, X_train, y_train, X_valid, y_valid, max_features=None):
    if max_features is None:
        max_features = X_train.shape[1]
    
    features = []
    remaining_features = list(X_train.columns)
    best_score = float('inf')
    scores_history = []
    feature_history = []
    
    for i in range(max_features):
        best_new_score = float('inf')
        best_feature = None
        
        for feature in remaining_features:
            current_features = features + [feature]
            # 학습 및 예측
            model.fit(X_train[current_features], y_train)
            y_valid_pred = model.predict(X_valid[current_features])
            current_score = mean_absolute_error(y_valid, y_valid_pred)
            
            if current_score < best_new_score:
                best_new_score = current_score
                best_feature = feature
        
        if best_new_score < best_score:
            features.append(best_feature)
            remaining_features.remove(best_feature)
            best_score = best_new_score
            scores_history.append(best_score)
            feature_history.append(best_feature)
            print(f"Forward Search - Added feature: {best_feature}, MAE: {best_score:.4f}, Total features: {len(features)}")
        else:
            break
            
    return features, scores_history, feature_history

# 단일 validation 셋을 이용한 backward search
def backward_search(model, X_train, y_train, X_valid, y_valid, min_features=1):
    features = list(X_train.columns)
    best_score = float('inf')
    scores_history = []
    removed_features = []
    
    # 초기 성능 계산 (전체 feature 사용)
    model.fit(X_train[features], y_train)
    y_valid_pred = model.predict(X_valid[features])
    best_score = mean_absolute_error(y_valid, y_valid_pred)
    
    while len(features) > min_features:
        best_new_score = float('inf')
        worst_feature = None
        
        for feature in features:
            current_features = [f for f in features if f != feature]
            model.fit(X_train[current_features], y_train)
            y_valid_pred = model.predict(X_valid[current_features])
            current_score = mean_absolute_error(y_valid, y_valid_pred)
            
            if current_score < best_new_score:
                best_new_score = current_score
                worst_feature = feature
        
        if best_new_score <= best_score:
            features.remove(worst_feature)
            best_score = best_new_score
            scores_history.append(best_score)
            removed_features.append(worst_feature)
            print(f"Backward Search - Removed feature: {worst_feature}, MAE: {best_score:.4f}, Remaining features: {len(features)}")
        else:
            break
            
    return features, scores_history, removed_features

############### Model Training Function ###############
def train_models(X_train, y_train, X_valid, y_valid, method='selection'):
    """
    Train regression models using specified feature selection method.
    method: 'selection' (전체 feature 사용), 'forward', or 'backward'
    """
    # 기본 회귀 모델들
    base_learners = [
        ('ridge', Ridge()),
        ('lasso', Lasso()),
        ('elasticnet', ElasticNet()),
    ]
    
    model_results = {}
    # 반드시 포함해야 하는 도메인 feature
    domain_features = ['emp_length', 'dti', 'revol_util', 'fico_avg']
    
    # 결과 저장 폴더 (모델 저장 경로: "./model")
    output_dir = "./model"
    os.makedirs(output_dir, exist_ok=True)
    
    for name, model in base_learners:
        print(f"\nProcessing {name} model using {method} method...")
        try:
            model_results[name] = {
                'model_type': 'Linear',
                'model_name': name,
                'selected_features': None,
                'n_features': None,
                'mae_score': None
            }
            
            # Feature selection 적용
            if method == 'forward':
                selected_features, _, feature_history = forward_search(model, X_train, y_train, X_valid, y_valid)
                model_results[name]['feature_order'] = ', '.join(feature_history)
            elif method == 'backward':
                selected_features, _, removed_features = backward_search(model, X_train, y_train, X_valid, y_valid)
                model_results[name]['feature_order'] = ', '.join(removed_features)
            else:  # 'selection' 기본: 전체 feature 사용
                selected_features = X_train.columns.tolist()
            
            # 도메인 feature 반드시 포함
            selected_features = list(set(domain_features) | set(selected_features))
            # (선택된 feature가 20개를 초과하면 도메인 feature 우선 + 앞쪽 일부 선택)
            if len(selected_features) > 20:
                non_domain = [feat for feat in selected_features if feat not in domain_features]
                selected_features = domain_features + non_domain[:(20 - len(domain_features))]
            
            model_results[name]['selected_features'] = ', '.join(selected_features)
            model_results[name]['n_features'] = len(selected_features)
            
            # 선택된 feature로 학습
            X_train_sel = X_train[selected_features]
            X_valid_sel = X_valid[selected_features]
            model.fit(X_train_sel, y_train)
            y_valid_pred = model.predict(X_valid_sel)
            mae = mean_absolute_error(y_valid, y_valid_pred)
            model_results[name]['mae_score'] = mae
            
            # 모델 저장 (파일명: recovery_pred_<모델이름>.pkl)
            save_path = os.path.join(output_dir, f"recovery_pred_{name}.pkl")
            joblib.dump(model, save_path)
            print(f"{name} model saved to {save_path} (MAE: {mae:.4f})")
            
        except Exception as e:
            print(f"Error processing {name}: {str(e)}")
            continue
            
    results_df = pd.DataFrame.from_dict(model_results, orient='index')
    results_df['method'] = method
    results_df.to_csv(os.path.join(output_dir, f"regression_models_results_{method}.csv"), index=False)
    return results_df

############### 모델 학습 실행 ###############
# 여러 방법('selection', 'forward', 'backward')에 대해 모델 학습
for method in ['selection', 'forward', 'backward']:
    print(f"\nTraining models with {method} method...")
    results = train_models(X_train, y_train, X_valid, y_valid, method=method)
    print(f"\nResults for {method} method:")
    print(results)



Training models with selection method...

Processing ridge model using selection method...
ridge model saved to ./model\recovery_pred_ridge.pkl (MAE: 0.1502)

Processing lasso model using selection method...
lasso model saved to ./model\recovery_pred_lasso.pkl (MAE: 0.1754)

Processing elasticnet model using selection method...
elasticnet model saved to ./model\recovery_pred_elasticnet.pkl (MAE: 0.1754)

Results for selection method:
           model_type  model_name  \
ridge          Linear       ridge   
lasso          Linear       lasso   
elasticnet     Linear  elasticnet   

                                            selected_features  n_features  \
ridge       emp_length, dti, revol_util, fico_avg, total_a...          20   
lasso       emp_length, dti, revol_util, fico_avg, total_a...          20   
elasticnet  emp_length, dti, revol_util, fico_avg, total_a...          20   

            mae_score     method  
ridge        0.150152  selection  
lasso        0.175420  selection 