# 특성 중요도

In [None]:
def plot_feature_importance(model):
    """Draw a bar chart of the model's feature importances, largest first.

    Args:
        model: Fitted estimator exposing a ``feature_importances_`` array.

    Note:
        Feature names are read from the columns of the module-level
        ``X_train``; its column order is expected to match the order of
        ``model.feature_importances_``.
    """
    scores = model.feature_importances_

    # Positions of the features ordered from most to least important.
    ranking = np.argsort(scores)[::-1]
    ranked_scores = scores[ranking]
    ranked_names = [X_train.columns[pos] for pos in ranking]

    plt.figure(figsize=(10, 6))
    plt.bar(
        range(len(ranked_names)),
        ranked_scores,
        tick_label=ranked_names,
    )
    # Vertical tick labels keep long feature names from overlapping.
    plt.xticks(rotation=90)
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.title('Feature Importance Plot')
    plt.show()

# 모델 평가(회귀)


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy import stats

def evaluate_regression_model(y_true, y_pred, plot_residuals=True):
    """Print regression metrics and optionally draw three diagnostic plots.

    Printed metrics: RMSE, MAE, R-squared and adjusted R-squared. The
    adjusted R-squared takes the number of predictors from the module-level
    ``X_train`` (``X_train.shape[1]``).

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        plot_residuals: When true, show a residual plot, a normal Q-Q plot
            of the residuals and an observed-vs-predicted scatter plot.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r_squared = r2_score(y_true, y_pred)

    # Adjusted R-squared penalises R-squared for the number of predictors.
    n_obs = len(y_true)
    n_predictors = X_train.shape[1]
    adjusted_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_predictors - 1)

    report = (
        ("RMSE", rmse),
        ("MAE", mae),
        ("R-squared", r_squared),
        ("Adjusted R-squared", adjusted_r_squared),
    )
    for label, value in report:
        print(label + ":", value)

    if not plot_residuals:
        return

    errors = y_true - y_pred

    plt.figure(figsize=(18, 6))

    # Panel 1: residuals against predictions, with a zero reference line.
    plt.subplot(1, 3, 1)
    plt.scatter(y_pred, errors, color='blue')
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.axhline(y=0, color='r', linestyle='-')
    plt.grid(True)

    # Panel 2: normal Q-Q plot of the residuals. probplot draws on the
    # current axes and sets its own labels, which are overridden below.
    plt.subplot(1, 3, 2)
    stats.probplot(errors, dist="norm", plot=plt)
    plt.title("Q-Q Plot")
    plt.xlabel("Theoretical quantiles")
    plt.ylabel("Ordered Values")
    plt.grid(True)

    # Panel 3: observed vs. predicted values with the identity line.
    plt.subplot(1, 3, 3)
    observed = list(y_true)
    lo, hi = min(observed), max(observed)
    plt.scatter(observed, y_pred, color='blue', label='Observed vs. Predicted')
    plt.plot([lo, hi], [lo, hi], color='red', linestyle='--', label='Identity Line')
    plt.xlabel('Observed Values')
    plt.ylabel('Predicted Values')
    plt.title('Nonlinear Fitting Plot')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()

# 98% 신뢰 구간 추정

In [None]:
import statsmodels.api as sm
from sklearn.metrics import mean_pinball_loss

def plot_pred_ints(model, X_test, y_test):
    """Build a table of quantile-based interval bounds for a fitted ensemble.

    The model kind is detected from ``model.estimators_``: an array-like
    object with a ``shape`` attribute is handled as gradient boosting, and
    any other iterable of fitted sub-estimators is handled as a forest.
    Detection is explicit, so an error raised while handling one kind is
    propagated instead of silently falling through to the other.

    Args:
        model: Fitted ensemble exposing ``estimators_`` and ``predict``.
        X_test: Feature matrix to predict on.
        y_test: Observed targets aligned with ``X_test``.

    Returns:
        pandas.DataFrame with one row per test sample.
        Gradient boosting columns: ``lower``, ``upper``, ``med``, ``pred``,
        ``actual``.
        Forest columns: ``0.01``, ``0.05``, ``0.50``, ``0.95``, ``0.99``,
        ``actual``, ``interval``, ``pred``.
    """
    if hasattr(model.estimators_, "shape"):
        return _gbr_pred_ints(model, X_test, y_test)
    return _forest_pred_ints(model, X_test, y_test)


def _gbr_pred_ints(model, X_test, y_test):
    """Fit 1%/50%/99% quantile boosters and tabulate their predictions."""
    # Reuse the fitted model's hyper-parameters, except the two that the
    # quantile regressors must set themselves.
    shared_params = model.get_params()
    shared_params.pop('alpha', None)
    shared_params.pop('loss', None)

    # NOTE: the quantile models are trained on X_train / y_train, and
    # GradientBoostingRegressor is likewise resolved from the enclosing
    # module (notebook) scope; all three must be defined there.
    quantile_preds = {}
    for label, alpha in (('lower', 0.01), ('med', 0.5), ('upper', 0.99)):
        gbr = GradientBoostingRegressor(loss="quantile", alpha=alpha, **shared_params)
        quantile_preds[label] = gbr.fit(X_train, y_train).predict(X_test)

    return pd.DataFrame({
        'lower': quantile_preds['lower'],
        'upper': quantile_preds['upper'],
        'med': quantile_preds['med'],
        'pred': list(model.predict(X_test)),
        'actual': list(y_test),
    })


def _forest_pred_ints(model, X_test, y_test):
    """Tabulate row-wise quantiles of the per-tree predictions."""
    quantiles = [0.01, 0.05, 0.50, 0.95, 0.99]

    # One column per sub-estimator, one row per test sample.
    tree_preds = pd.concat(
        [pd.Series(est.predict(X_test).round(3)) for est in model.estimators_],
        axis=1,
    )

    # Computed once, after every tree's predictions have been collected.
    result = pd.DataFrame(
        {f"{q:.2f}": tree_preds.quantile(q=q, axis=1) for q in quantiles}
    )
    result['actual'] = list(y_test)
    # Interval width: distance between the widest pair of quantiles.
    result['interval'] = result[f"{max(quantiles):.2f}"] - result[f"{min(quantiles):.2f}"]
    result = result.round(3)

    # The ensemble's own prediction is appended after rounding, unrounded.
    result['pred'] = model.predict(X_test)
    return result

# 98% 신뢰 구간 추정 시각화

In [None]:
def calculate_accuracy(q5, q95, actual):
    """Return the fraction of actual values lying within their interval.

    Args:
        q5: Lower bounds, one per observation.
        q95: Upper bounds, one per observation.
        actual: Observed values, aligned with the bounds.

    Returns:
        Share of observations with ``lower <= actual <= upper`` (bounds
        inclusive), computed over ``len(actual)``.
    """
    inside_flags = [
        (value >= low) & (value <= high)
        for value, low, high in zip(actual, q5, q95)
    ]
    return sum(inside_flags) / len(actual)

def viz_plot_pred_ints(df, n=50):
    """Print interval coverage and plot the interval for a random row sample.

    Accepts either frame layout produced by ``plot_pred_ints``: one with
    ``lower``/``upper`` bound columns, or one with ``0.01``/``0.99`` bound
    columns. Both layouts must also carry ``actual`` and ``pred`` columns.

    Coverage (share of actual values inside the bounds) is computed over
    every row of ``df`` and printed; the plot shows only the sampled rows.

    Args:
        df: Interval table as described above.
        n: Number of rows to sample without replacement. Capped at
            ``len(df)`` so that small frames do not raise.

    Raises:
        ValueError: If ``df`` is empty or has neither pair of bound columns.
    """
    if len(df) == 0:
        raise ValueError("df is empty; nothing to plot")

    # Pick the bound columns by name rather than by column count.
    available = set(df.columns)
    if {'lower', 'upper'} <= available:
        low_col, high_col = 'lower', 'upper'
    elif {'0.01', '0.99'} <= available:
        low_col, high_col = '0.01', '0.99'
    else:
        raise ValueError(
            "df must contain either 'lower'/'upper' or '0.01'/'0.99' columns"
        )

    # Sample row positions (not index labels) so that frames with a
    # non-default index are handled correctly.
    sample_size = min(n, len(df))
    positions = np.random.choice(len(df), sample_size, replace=False)
    x_axis = np.arange(sample_size)

    lower = df[low_col].tolist()
    upper = df[high_col].tolist()
    actual = df['actual'].tolist()
    pred = df['pred'].tolist()

    lower_sample = [lower[i] for i in positions]
    upper_sample = [upper[i] for i in positions]
    actual_sample = [actual[i] for i in positions]
    pred_sample = [pred[i] for i in positions]

    # Coverage over the whole frame, not just the sampled rows.
    accuracy = calculate_accuracy(lower, upper, actual)
    accuracy_percentage = accuracy * 100
    print(f'Accuracy: {accuracy_percentage:.2f}%')

    plt.figure(figsize=(10, 8))
    plt.fill_between(x_axis, lower_sample, upper_sample, color='lightgray')
    plt.vlines(x_axis, lower_sample, upper_sample, color='black', linestyle='solid', linewidth=2, alpha=0.5)
    plt.scatter(x_axis, actual_sample, label='actual')
    plt.scatter(x_axis, pred_sample, label='pred')

    plt.title('98% Confidence Interval Random index')
    plt.xlabel('Index')
    plt.ylabel('Actual Value')
    plt.legend()
    plt.show()