In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# CSVファイルを読み込み
df = pd.read_csv('../data/src/structural-features.csv')

# 特徴量と目的変数を分離
feature_columns = ['1-distance_v1', '1-interval_v1', 'order_v1', 'rally_v1']
X = df[feature_columns]
y = df['average_evaluation']

print("===== データ概要 =====")
print(f"サンプル数: {len(df)}")
print(f"特徴量: {feature_columns}")
print(f"\n評価値の分布:")
print(f"平均: {y.mean():.3f}")
print(f"標準偏差: {y.std():.3f}")
print(f"最小値: {y.min():.3f}")
print(f"最大値: {y.max():.3f}")

# Leave-One-Out交差検証の準備
loo = LeaveOneOut()
n_samples = len(X)

# 各モデルの結果を格納する辞書
results = {
    '重回帰分析': {'predictions': [], 'model': LinearRegression()},
    'Ridge回帰': {'predictions': [], 'model': Ridge(alpha=1.0)},
    'Lasso回帰': {'predictions': [], 'model': Lasso(alpha=0.1)}
}

# 標準化のためのスケーラー
scaler = StandardScaler()

# Leave-One-Out交差検証の実行
print("\n===== Leave-One-Out交差検証実行中 =====")
for train_idx, test_idx in loo.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # データの標準化
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # 各モデルで学習と予測
    for model_name, model_info in results.items():
        model = model_info['model']
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        model_info['predictions'].append(y_pred[0])

# 評価指標の計算
print("\n===== 各モデルの性能評価 =====")
print(f"{'モデル':<15} {'RMSE':<10} {'MAE':<10}")
print("-" * 35)

for model_name, model_info in results.items():
    predictions = np.array(model_info['predictions'])
    rmse = np.sqrt(mean_squared_error(y, predictions))
    mae = mean_absolute_error(y, predictions)
    
    print(f"{model_name:<15} {rmse:<10.4f} {mae:<10.4f}")
    
    # 結果を保存
    model_info['rmse'] = rmse
    model_info['mae'] = mae

# 全データで各モデルを学習して係数を確認
print("\n===== 各特徴量の回帰係数 =====")
X_scaled = scaler.fit_transform(X)

for model_name, model_info in results.items():
    model = model_info['model']
    model.fit(X_scaled, y)
    
    print(f"\n{model_name}:")
    print(f"  切片: {model.intercept_:.4f}")
    for i, feature in enumerate(feature_columns):
        print(f"  {feature}: {model.coef_[i]:.4f}")

# 予測値と実測値の相関
print("\n===== 予測精度の詳細分析 =====")
best_model_name = min(results, key=lambda x: results[x]['rmse'])
print(f"最も精度の高いモデル: {best_model_name}")

# 実測値と予測値の相関係数
for model_name, model_info in results.items():
    predictions = np.array(model_info['predictions'])
    correlation = np.corrcoef(y, predictions)[0, 1]
    print(f"{model_name}の相関係数: {correlation:.4f}")

# 予測誤差の分析
print("\n===== 予測誤差の分析 =====")
best_predictions = results[best_model_name]['predictions']
errors = y - best_predictions

print(f"平均誤差: {np.mean(errors):.4f}")
print(f"誤差の標準偏差: {np.std(errors):.4f}")
print(f"最大過大評価: {np.max(errors):.4f}")
print(f"最大過小評価: {np.min(errors):.4f}")

# 各特徴量の重要度（標準化後の係数の絶対値）
print("\n===== 特徴量の重要度 =====")
model = results['重回帰分析']['model']
model.fit(X_scaled, y)
feature_importance = np.abs(model.coef_)
feature_importance_normalized = feature_importance / np.sum(feature_importance)

for i, feature in enumerate(feature_columns):
    print(f"{feature}: {feature_importance_normalized[i]:.3f}")

===== データ概要 =====
サンプル数: 20
特徴量: ['1-distance_v1', '1-interval_v1', 'order_v1', 'rally_v1']

評価値の分布:
平均: 2.483
標準偏差: 0.688
最小値: 1.667
最大値: 4.000

===== Leave-One-Out交差検証実行中 =====

===== 各モデルの性能評価 =====
モデル             RMSE       MAE       
-----------------------------------
重回帰分析           0.6761     0.5326    
Ridge回帰         0.6696     0.5312    
Lasso回帰         0.6799     0.5534    

===== 各特徴量の回帰係数 =====

重回帰分析:
  切片: 2.4833
  1-distance_v1: 0.1985
  1-interval_v1: 0.1651
  order_v1: 0.0398
  rally_v1: 0.2831

Ridge回帰:
  切片: 2.4833
  1-distance_v1: 0.1919
  1-interval_v1: 0.1587
  order_v1: 0.0321
  rally_v1: 0.2683

Lasso回帰:
  切片: 2.4833
  1-distance_v1: 0.1055
  1-interval_v1: 0.0634
  order_v1: 0.0000
  rally_v1: 0.1972

===== 予測精度の詳細分析 =====
最も精度の高いモデル: Ridge回帰
重回帰分析の相関係数: 0.2918
Ridge回帰の相関係数: 0.2927
Lasso回帰の相関係数: 0.1575

===== 予測誤差の分析 =====
平均誤差: 0.0518
誤差の標準偏差: 0.6676
最大過大評価: 1.3395
最大過小評価: -1.5114

===== 特徴量の重要度 =====
1-distance_v1: 0.289
1-interval_v1: 0.240
order_v1: 0.05