<a href="https://colab.research.google.com/github/ShinjiroTsuku/Ani-Aide_Hackathon/blob/main/Estimate_GPT_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Install required libraries (this cell contains no install commands)


# Change the working directory to the main-experiment folder; every relative
# 'Output/...' path used below is resolved against it.
%cd /content/drive/MyDrive/研究室/解析データ/M1/本実験

import numpy as np
import pandas as pd
import pickle
import json
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
import os
from openai import OpenAI
import time
from datetime import datetime, timedelta
from tqdm import tqdm
import re
from google.colab import userdata

# Start measuring execution time: start_time is a raw epoch timestamp and
# start_datetime is the human-readable counterpart that gets printed.
start_time = time.time()
start_datetime = datetime.now()
print(f"処理開始時刻: {start_datetime.strftime('%Y-%m-%d %H:%M:%S')}")

# 機械学習関連
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score,
                            recall_score, f1_score, precision_recall_curve, auc)
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
import shap

# Create the output directory tree for the GPT-4o + Transformer experiment.
# os.makedirs builds missing parents and exist_ok=True makes re-runs harmless.
for _subdir in ('aupr', 'selected_features', 'analysis', 'analysis/words', 'aupr/individual'):
    os.makedirs(f'Output/GPT_Transformer/{_subdir}', exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/研究室/解析データ/M1/本実験
処理開始時刻: 2025-12-24 15:47:50


In [None]:
# ================ Settings ================
experiment_number = 'c04'  # Experiment ID (change for each run)
print(f"実験番号 {experiment_number} のGPT-4o + Transformer難易度評価分析を開始します...")

# Create the output directories (idempotent thanks to exist_ok=True)
os.makedirs('Output/GPT_Transformer/aupr', exist_ok=True)
os.makedirs('Output/GPT_Transformer/aupr/individual', exist_ok=True)
os.makedirs('Output/GPT_Transformer/selected_features', exist_ok=True)
os.makedirs('Output/GPT_Transformer/analysis', exist_ok=True)
os.makedirs('Output/GPT_Transformer/analysis/words', exist_ok=True)

# Initialise the OpenAI API client
try:
    api_key = userdata.get('API_KEY')
    # Validate the key itself. The previous check tested `client is None`,
    # which can never be true because a constructor call never returns None,
    # so the "key not set" message was unreachable.
    if not api_key:
        print("⚠️ OpenAI APIキーが未設定です。")
        exit()
    client = OpenAI(api_key=api_key)
    print("✅ OpenAI APIクライアントが正常に初期化されました。")
    print("使用モデル: gpt-4o + Transformer")
except Exception as e:
    # exit() raises SystemExit, which is not an Exception subclass, so the
    # early exit above is not swallowed by this handler.
    print(f"❌ OpenAI初期化エラー: {e}")
    exit()

# Start measuring execution time
start_time = time.time()
start_datetime = datetime.now()
print(f"処理開始時刻: {start_datetime.strftime('%Y-%m-%d %H:%M:%S')}")

# ================ Load data ================
print("データを読み込んでいます...")
# Read the per-word feature table from the LSTM feature directory
df = pd.read_json(f'Output/LSTM/features/{experiment_number}-features.json')
df = df.dropna()  # drop every row that has at least one missing value

print(f"データ形状: {df.shape}")
print(f"未知単語ラベルの分布: {Counter(df['unknownWordLabel'])}")

# ================ Take the context from the 'line' column ================
print("\nlineカラムから文脈データを取得中...")

# The 'line' column must already exist in the loaded table
if 'line' not in df.columns:
    print("❌ エラー: 'line'カラムがデータフレームに存在しません。")
    exit()

# Use the 'line' text as the context sentence handed to GPT-4o
df['sentence'] = df['line']
print(f"文脈取得完了: {len(df)} 単語")
# ================ GPT-4o難易度評価関数 ================
def get_gpt_difficulty_score(word, context_sentence, max_retries=3):
    """Rate the contextual difficulty of ``word`` with gpt-4o.

    The model is asked for an integer from 1 to 10. The first run of digits in
    the reply is parsed, clamped to the range 1-10 and divided by 10.

    Args:
        word: The target word to rate.
        context_sentence: The text in which the word appears.
        max_retries: Maximum number of API attempts.

    Returns:
        The normalised score as a float between 0.1 and 1.0.

    Raises:
        RuntimeError: If no attempt produced a parseable score. The last
            underlying error is chained as ``__cause__``. ``RuntimeError`` is an
            ``Exception`` subclass, so ``except Exception`` callers still work.
    """
    prompt = f"""
You are an expert English language difficulty assessor for second language learners.

Please evaluate the difficulty of the word "{word}" in this specific context for an intermediate English learner:

Context: "{context_sentence}"

Consider these factors:
- Semantic complexity and abstractness in this context
- Collocational patterns and usage constraints

Provide a precise integer score between 1 and 10:
- 1-2: Very easy (basic vocabulary, high frequency)
- 3-4: Easy (common words, straightforward usage)
- 5-6: Moderate (intermediate vocabulary, some complexity)
- 7-8: Difficult (advanced vocabulary, complex usage)
- 9-10: Very difficult (rare, highly technical, or complex)

Return ONLY the integer number (e.g., 2, 6, 9). Do not provide explanations.

Score:
"""

    last_error = None  # remembered so the final failure can report its cause
    for retry in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a precise language difficulty assessor. Respond ONLY with an integer number between 1 and 10."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=10,
                temperature=0.0,
            )

            # The message content may be None; treat that as an unparseable
            # reply instead of letting .strip() raise AttributeError and be
            # reported as an API error.
            score_text = (response.choices[0].message.content or "").strip()
            score_match = re.search(r'\d+', score_text)

            if score_match:
                # Clamp out-of-range answers into 1..10 before normalising.
                score_int = max(1, min(10, int(score_match.group())))
                return score_int / 10.0

            last_error = ValueError(f"no integer found in model reply: {score_text!r}")

        except Exception as e:
            last_error = e
            print(f"GPT-4o API エラー (試行 {retry + 1}): {str(e)}")
            if retry < max_retries - 1:
                time.sleep(2)  # brief pause before the next attempt

    raise RuntimeError(f"GPT-4o評価に失敗: {word}") from last_error

def add_gpt_difficulty_features(data):
    """Return a copy of ``data`` with an added ``gpt_difficulty`` column.

    Every row's ``word`` is scored in the context of its ``sentence`` by
    ``get_gpt_difficulty_score``. A row whose scoring raises is logged and
    given the fallback value 0.5. The input frame is not modified.
    """
    print("gpt-4oによる難易度評価を実行中（新規評価）...")

    scores = []
    word_context_pairs = zip(data['word'], data['sentence'])
    for word, sentence in tqdm(word_context_pairs, total=len(data)):
        try:
            score = get_gpt_difficulty_score(word, sentence)
        except Exception as e:
            print(f"評価失敗: {word} - {str(e)}")
            score = 0.5  # fallback so one failed word does not abort the run
        scores.append(score)

    enriched = data.copy()
    enriched['gpt_difficulty'] = scores
    return enriched

# ================ Transformerモデル定義 ================
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first tensor.

    A ``(1, max_len, d_model)`` table is precomputed and registered as the
    buffer ``pe``: even feature columns hold sines, odd columns hold cosines.

    Args:
        d_model: Size of the feature dimension (even or odd).
        max_len: Number of positions in the precomputed table. ``forward``
            slices the table to the input length, so inputs longer than
            ``max_len`` are not supported.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine
        # column, so trim div_term to match; for even d_model this slice is
        # the whole tensor and the result is unchanged.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)
        # A buffer follows the module across .to(device) but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1), :]

class TransformerModel(nn.Module):
    """Transformer encoder that classifies a sequence into two classes.

    The input is projected to ``d_model`` features, position-encoded, passed
    through the encoder stack and pooled over the sequence axis. ``pooling``
    selects ``'mean'`` or ``'max'``; any other value uses the last time step.
    """

    def __init__(self, input_dim=1, d_model=32, nhead=2, num_layers=1,
                 dim_feedforward=64, dropout=0.2, pooling='mean'):
        super().__init__()
        self.pooling = pooling

        self.input_projection = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model)

        layer_kwargs = dict(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        encoder_layer = nn.TransformerEncoderLayer(**layer_kwargs)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, 2)

    def forward(self, x):
        """Return ``(logits, pooled)`` for ``x`` of shape (batch, seq_len, input_dim).

        ``logits`` has shape (batch, 2) and ``pooled`` has shape (batch, d_model).
        """
        encoded = self.transformer_encoder(self.pos_encoder(self.input_projection(x)))

        # Collapse the sequence axis into one vector per sample.
        if self.pooling == 'mean':
            summary = encoded.mean(dim=1)
        elif self.pooling == 'max':
            summary, _ = encoded.max(dim=1)
        else:  # last time step
            summary = encoded[:, -1, :]

        return self.fc(summary), summary

# ================ 特徴量の定義 ================
# The 22 conventional feature columns.
baseline_features_original = [
    'length',
    'freq',
    'seven_character',
    'ContentWord',
    'syllables',
    'Num_Words',
    'Length_Word_Ave',
    'freq_Min',
    'freq_Max',
    'freq_Ave',
    'Num_7Characters',
    'Rate_7Characters',
    'Num_ContentWords',
    'Rate_ContentWords',
    'Num_FunctionWords',
    'Rate_FunctionWords',
    'Num_Monosyllable',
    'Num_Polysyllable',
    'Flesch_Reading_Ease',
    'ARI',
    'Readtime',
    'ReadBack',
]

# Baseline: conventional features + the GPT-4o difficulty score
gpt_features = ['gpt_difficulty']
baseline_features = [*baseline_features_original, *gpt_features]

# Proposed method: baseline + the 32 Transformer gaze features
transformer_features = ['transformer' + str(i) for i in range(32)]
proposed_features = [*baseline_features, *transformer_features]

print("\n特徴量構成:")
print(f"  従来特徴量: {len(baseline_features_original)}個")
print(f"  ベースライン (従来 + GPT-4o): {len(baseline_features)}個")
print(f"  提案手法 (ベースライン + Transformer): {len(proposed_features)}個")

# ================ ユーティリティ関数 ================
def extract_shap_features(X, y, feature_names, n_features=30):
    """Pick the ``n_features`` most important features by mean |SHAP| value.

    ``X`` is standardised, a class-balanced logistic regression is fitted on
    it, and the features are ranked by the mean absolute SHAP value over all
    rows. Returns the selected names, most important first. When fewer than
    ``n_features`` features exist, all of them are returned.
    """
    print(f"SHAP分析により上位{n_features}特徴量を選択中...")

    X_scaled = StandardScaler().fit_transform(X)

    model = LogisticRegression(class_weight="balanced", max_iter=1000, random_state=0)
    model.fit(X_scaled, y)

    shap_values = shap.Explainer(model, X_scaled)(X_scaled)

    # One importance value per feature, then indices in descending order.
    importance = np.abs(shap_values.values).mean(axis=0)
    ranked = np.argsort(importance)[::-1]
    selected_features = [feature_names[i] for i in ranked[:n_features]]

    print(f"SHAP選択完了: {len(selected_features)}個の特徴量")
    return selected_features
def extract_transformer_features(data, transformer_model, device):
    """Run every row's gaze sequence through the model and collect the pooled vectors.

    For each row, the sequence is looked up in ``x_coordinates_dict`` under
    the key ``str(id_line + 1)``. Rows without a non-empty sequence are
    skipped. Returns a dict mapping the row's index label to a 1-D NumPy
    array taken from the model's second output.
    """
    transformer_model.eval()
    line_features = {}

    for row_label, row in data.iterrows():
        x_seq = row['x_coordinates_dict'].get(str(row['id_line'] + 1))
        if not (x_seq and len(x_seq) > 0):
            continue

        # Shape (1, seq_len, 1): one batch, one scalar per step.
        x_tensor = torch.tensor([[[value] for value in x_seq]], dtype=torch.float32).to(device)
        with torch.no_grad():
            pooled = transformer_model(x_tensor)[1]
        line_features[row_label] = pooled.squeeze(0).cpu().numpy()

    return line_features

def add_transformer_features_to_data(data, line_features, n_features=32):
    """Append ``transformer0`` .. ``transformer{n_features-1}`` columns to ``data``.

    ``line_features`` maps row index labels to feature vectors. Rows without
    an entry get 0.0 in every new column. ``data`` is modified in place and
    the same object is returned.
    """
    row_labels = list(data.index)
    for dim in range(n_features):
        column = []
        for label in row_labels:
            if label in line_features:
                column.append(line_features[label][dim])
            else:
                column.append(0.0)
        data[f'transformer{dim}'] = column
    return data

def feature_selection_sbs(X, y, feature_names):
    """Select features with Sequential Backward Selection (SBS).

    A class-balanced SVC is scored by F1 over five fixed stratified folds,
    and the selector may keep any subset size from 1 to all features.
    Returns ``(X_selected, selected_features)``: the reduced matrix and the
    names of the kept columns.
    """
    print("Sequential Backward Selection (SBS)実行中...")

    X_df = pd.DataFrame(X, columns=feature_names)

    # Materialise the folds once so every candidate subset sees the same splits.
    folds = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=0).split(X, y))
    estimator = SVC(gamma='scale', probability=True, class_weight='balanced', random_state=0)

    selector = sfs(
        estimator,
        k_features=(1, len(feature_names)),
        forward=False,
        floating=False,  # non-floating backward search, i.e. plain SBS
        scoring='f1',
        cv=folds,
        n_jobs=-1,
    )
    selector = selector.fit(X_df, y)

    kept_positions = list(selector.k_feature_idx_)
    selected_features = X_df.columns[kept_positions].tolist()
    X_selected = X_df[selected_features].values

    print(f"選択された特徴量数: {len(selected_features)}")
    print(f"選択された特徴量: {selected_features}")

    return X_selected, selected_features

def train_and_predict(X_train, y_train, X_test, y_test):
    """Fit a class-balanced SVM on the training data and predict the test data.

    The scaler is fitted on the training data only and then applied to both
    sets. ``y_test`` is accepted but not used. Returns
    ``(predictions, probabilities)`` from ``predict`` and ``predict_proba``.
    """
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)

    svm = SVC(gamma='scale', probability=True, class_weight='balanced', random_state=0)
    svm.fit(train_scaled, y_train)

    return svm.predict(test_scaled), svm.predict_proba(test_scaled)

def calculate_metrics(y_true, y_pred):
    """Compute binary classification metrics for labels in {0, 1}.

    Returns a dict with the 2x2 ``confusion_matrix``, ``accuracy``,
    ``precision``, ``recall``, ``f1_score`` and the four cell counts
    ``tp``/``tn``/``fp``/``fn``. Undefined ratios are reported as 0.
    """
    # Pin the label set so the matrix is always 2x2. Without it, a fold whose
    # truth and predictions contain a single class yields a 1x1 matrix and the
    # four-way unpacking below raises ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    return {
        'confusion_matrix': cm,
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1_score': f1_score(y_true, y_pred, zero_division=0),
        'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn
    }

def analyze_improvement(baseline_results, proposed_results):
    """Compare per-sample correctness of the baseline and proposed methods.

    Both arguments are dicts with aligned ``'y_true'`` and ``'y_pred'``
    sequences (lists or NumPy arrays).

    Returns a dict with:
        newly_correct_count / newly_incorrect_count: samples only the proposed
            method / only the baseline classified correctly.
        net_improvement: the difference between those two counts.
        newly_correct_indices / newly_incorrect_indices: positions of those
            samples as NumPy index arrays.
    """
    # Coerce to arrays so `==` is element-wise; on plain lists it would yield
    # one bool and the mask arithmetic below would break.
    baseline_correct = (np.asarray(baseline_results['y_true'])
                        == np.asarray(baseline_results['y_pred']))
    proposed_correct = (np.asarray(proposed_results['y_true'])
                        == np.asarray(proposed_results['y_pred']))

    newly_correct = (~baseline_correct) & proposed_correct
    newly_incorrect = baseline_correct & (~proposed_correct)

    return {
        'newly_correct_count': newly_correct.sum(),
        'newly_incorrect_count': newly_incorrect.sum(),
        'net_improvement': newly_correct.sum() - newly_incorrect.sum(),
        'newly_correct_indices': np.where(newly_correct)[0],
        'newly_incorrect_indices': np.where(newly_incorrect)[0]
    }

def convert_numpy_to_list(obj):
    """Recursively replace NumPy values with plain Python equivalents.

    Arrays become (nested) lists; NumPy integer, float and bool scalars
    become ``int``/``float``/``bool``; dicts, lists and tuples are walked
    recursively (a tuple stays a tuple). Anything else is returned unchanged.
    The result of converting numeric data is safe to pass to ``json.dump``.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {key: convert_numpy_to_list(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_to_list(item) for item in obj]
    if isinstance(obj, tuple):
        # Previously tuples were returned untouched, NumPy members included.
        return tuple(convert_numpy_to_list(item) for item in obj)
    # np.bool_ is neither np.integer nor np.floating, so it needs its own entry;
    # json cannot serialise it otherwise.
    if isinstance(obj, (np.integer, np.floating, np.bool_)):
        return obj.item()
    return obj

# ================ Add the GPT-4o difficulty feature ================
# One API request per row; df_with_gpt is a copy of df with 'gpt_difficulty'.
print("\nGPT-4o難易度特徴量を追加中...")
df_with_gpt = add_gpt_difficulty_features(df)

# ================ Main process: leave-one-document-out cross-validation ================
# Each document in turn is held out as the test set; all others form the
# training set.
print("\n文書別クロスバリデーション開始...")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
documents = sorted(df_with_gpt["id_document"].unique())

# Predictions are accumulated across folds, in document order then row order.
baseline_results = {'y_true': [], 'y_pred': [], 'y_prob': [], 'selected_features': []}
proposed_results = {'y_true': [], 'y_pred': [], 'y_prob': [], 'selected_features': []}

for test_doc in documents:
    print(f"\n=== 文書 {test_doc} をテストデータとして使用 ===")

    test_data = df_with_gpt[df_with_gpt["id_document"] == test_doc].copy()
    train_data = df_with_gpt[df_with_gpt["id_document"] != test_doc].copy()

    print(f"訓練データ: {len(train_data)} サンプル")
    print(f"テストデータ: {len(test_data)} サンプル")

    # ================ Train the Transformer and extract its features ================
    # A fresh model is created for every fold, so nothing learned from the
    # held-out document carries over.
    # NOTE(review): no random seed is set in this block, so model
    # initialisation may differ between runs — confirm whether that matters.
    print("Transformerモデルを訓練中...")

    transformer_model = TransformerModel(
        input_dim=1,
        d_model=32,
        nhead=2,
        num_layers=1,
        dim_feedforward=64,
        dropout=0.2,
        pooling='mean'
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(transformer_model.parameters(), lr=0.001)

    # Prepare the training sequences
    train_lines = {}
    train_labels = {}

    print(f"訓練データの準備中...")

    for _, row in train_data.iterrows():
        line_id = str(row['id_line'] + 1)
        x_seq = row['x_coordinates_dict'].get(line_id)
        if x_seq and len(x_seq) > 0:
            # Keep only the first sequence found for each line_id; its label
            # is the unknownWordLabel of that first row.
            # NOTE(review): the key is line_id alone, without id_document, so
            # rows from different training documents that share an id_line
            # would collapse into one entry — confirm this is intended.
            if line_id not in train_lines:
                train_lines[line_id] = x_seq
                train_labels[line_id] = row['unknownWordLabel']

    print(f"  ユニークなline_id数: {len(train_lines)}")

    # Build the Transformer training set
    X_transformer, Y_transformer = [], []
    for lid, x_seq in train_lines.items():
        # Convert to the nested form [[[x1], [x2], [x3], ...]], i.e. a tensor
        # of shape (1, seq_len, 1)
        x_tensor = torch.tensor([[[x] for x in x_seq]], dtype=torch.float32).to(device)
        X_transformer.append(x_tensor)
        Y_transformer.append(train_labels[lid])

    # Handle the case where no training sequence was found
    if len(X_transformer) == 0:
        print("  ⚠️ 警告: 訓練データが見つかりませんでした。Transformer特徴量をスキップします。")
        # Empty dicts make every transformer column 0.0 further below.
        train_transformer_features = {}
        test_transformer_features = {}
    else:
        transformer_model.train()
        epochs = 10
        for epoch in range(epochs):
            total_loss = 0
            # One optimiser step per sequence (batch size 1).
            for x_tensor, label in zip(X_transformer, Y_transformer):
                optimizer.zero_grad()
                out, _ = transformer_model(x_tensor)
                loss = criterion(out, torch.tensor([label], dtype=torch.long).to(device))
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            # Report the mean loss every second epoch.
            if (epoch + 1) % 2 == 0:
                print(f"  Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(X_transformer):.4f}")

        # Extract the Transformer features
        print("Transformer特徴量を抽出中...")
        train_transformer_features = extract_transformer_features(train_data, transformer_model, device)
        test_transformer_features = extract_transformer_features(test_data, transformer_model, device)

    # Add the Transformer features to the data. The helper writes the columns
    # into the frame it is given and returns that same frame, so train_data
    # and test_data themselves gain the transformer columns here.
    train_data_with_features = add_transformer_features_to_data(train_data, train_transformer_features)
    test_data_with_features = add_transformer_features_to_data(test_data, test_transformer_features)

    # ================ Baseline method (including GPT-4o) ================
    print("ベースライン手法（従来特徴量 + GPT-4o）を実行中...")

    # Rank features with SHAP and keep at most 30. The baseline list is
    # shorter than 30, so this step only orders the features.
    X_baseline_all = train_data[baseline_features].fillna(0).values
    y_train = train_data["unknownWordLabel"].values

    baseline_shap_features = extract_shap_features(X_baseline_all, y_train, baseline_features, n_features=30)
    X_baseline = train_data[baseline_shap_features].fillna(0).values

    # SBS feature selection (the helper runs with floating=False)
    X_baseline_selected, baseline_selected_features = feature_selection_sbs(
        X_baseline, y_train, baseline_shap_features)

    X_test_baseline = test_data[baseline_selected_features].fillna(0).values
    y_test = test_data["unknownWordLabel"].values

    baseline_pred, baseline_prob = train_and_predict(
        X_baseline_selected, y_train, X_test_baseline, y_test)

    baseline_results['y_true'].extend(y_test)
    baseline_results['y_pred'].extend(baseline_pred)
    # Column 1 of predict_proba is stored as the positive-class probability.
    baseline_results['y_prob'].extend(baseline_prob[:, 1])
    baseline_results['selected_features'].append(baseline_selected_features)

    # ================ Proposed method (baseline + Transformer features) ================
    print("提案手法（ベースライン + Transformer視線特徴量）を実行中...")

    # Narrow the features down to the top 30 with SHAP
    X_proposed_all = train_data_with_features[proposed_features].fillna(0).values

    proposed_shap_features = extract_shap_features(X_proposed_all, y_train, proposed_features, n_features=30)
    X_proposed = train_data_with_features[proposed_shap_features].fillna(0).values

    # SBS feature selection (the helper runs with floating=False)
    X_proposed_selected, proposed_selected_features = feature_selection_sbs(
        X_proposed, y_train, proposed_shap_features)

    X_test_proposed = test_data_with_features[proposed_selected_features].fillna(0).values

    proposed_pred, proposed_prob = train_and_predict(
        X_proposed_selected, y_train, X_test_proposed, y_test)

    proposed_results['y_true'].extend(y_test)
    proposed_results['y_pred'].extend(proposed_pred)
    proposed_results['y_prob'].extend(proposed_prob[:, 1])
    proposed_results['selected_features'].append(proposed_selected_features)

    # Per-fold metrics, for progress reporting only
    baseline_metrics = calculate_metrics(y_test, baseline_pred)
    proposed_metrics = calculate_metrics(y_test, proposed_pred)

    print(f"ベースライン (GPT-4o) - F1: {baseline_metrics['f1_score']:.3f}")
    print(f"提案手法 (GPT-4o + Transformer) - F1: {proposed_metrics['f1_score']:.3f}")

# ================ Analysis of the pooled results ================
print("\n=== 全体結果の分析 ===")

# Turn the accumulated per-fold lists into arrays for vectorised comparison.
for results in (baseline_results, proposed_results):
    for key in ('y_true', 'y_pred', 'y_prob'):
        results[key] = np.array(results[key])

baseline_overall = calculate_metrics(baseline_results['y_true'], baseline_results['y_pred'])
proposed_overall = calculate_metrics(proposed_results['y_true'], proposed_results['y_pred'])

# Print the same four metrics for both methods.
for header, overall in (
    ("=== ベースライン手法（従来 + GPT-4o） ===", baseline_overall),
    ("\n=== 提案手法（ベースライン + Transformer視線情報） ===", proposed_overall),
):
    print(header)
    print(f"F1-score: {overall['f1_score']:.3f}")
    print(f"Accuracy: {overall['accuracy']:.3f}")
    print(f"Precision: {overall['precision']:.3f}")
    print(f"Recall: {overall['recall']:.3f}")

# Effect of adding the Transformer gaze features
f1_gain = proposed_overall['f1_score'] - baseline_overall['f1_score']
print(f"\n=== Transformer視線特徴量の効果 ===")
print(f"F1スコア改善: {f1_gain:+.3f}")

improvement_analysis = analyze_improvement(baseline_results, proposed_results)

print(f"\n新たに正しく分類された単語数: {improvement_analysis['newly_correct_count']}")
print(f"新たに間違って分類された単語数: {improvement_analysis['newly_incorrect_count']}")
print(f"純改善数: {improvement_analysis['net_improvement']}")

def get_difficulty_level(score):
    """Map a normalised difficulty score to its textual band.

    The bands have inclusive upper bounds 0.2, 0.4, 0.6 and 0.8; anything
    above 0.8 falls into the top band.
    """
    bands = (
        (0.2, "Very Easy (1-2)"),
        (0.4, "Easy (3-4)"),
        (0.6, "Moderate (5-6)"),
        (0.8, "Difficult (7-8)"),
    )
    for upper_bound, label in bands:
        if score <= upper_bound:
            return label
    return "Very Difficult (9-10)"

# ================ Save the word-level analysis ================
print("\n単語レベルの詳細分析を保存中...")


def _write_word_report(path, title, words):
    """Write a plain-text report listing each word_info dict in ``words``."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write(f"{title} (実験番号: {experiment_number})\n")
        f.write(f"総数: {len(words)} 単語\n")
        f.write("="*80 + "\n\n")

        for i, word_info in enumerate(words, 1):
            f.write(f"[{i}] 単語: {word_info['word']}\n")
            f.write(f"    文脈: {word_info['context']}\n")
            f.write(f"    GPT-4o難易度スコア: {word_info['gpt_difficulty_integer']}/10 ({word_info['difficulty_level']})\n")
            f.write(f"    真のラベル: {'未知' if word_info['true_label']==1 else '既知'}\n")
            f.write(f"    ベースライン予測: {'未知' if word_info['baseline_prediction']==1 else '既知'}\n")
            f.write(f"    提案手法予測: {'未知' if word_info['proposed_prediction']==1 else '既知'}\n")
            f.write("-"*80 + "\n")


word_level_analysis = []
word_index = 0  # position in the pooled result arrays

# Walk the rows in the same order the cross-validation loop appended its
# predictions (documents in sorted order, rows in frame order) so word_index
# lines up with the pooled arrays.
for test_doc in documents:
    test_data = df_with_gpt[df_with_gpt["id_document"] == test_doc].copy()

    for _, row in test_data.iterrows():
        word_info = {
            'word_index': word_index,
            'document_id': int(row['id_document']),
            'line_id': int(row['id_line']),
            'word_id': int(row['id_word']),
            'word': row['word'],
            'context': row['line'],  # the line text serves as the context
            'gpt_difficulty': float(row['gpt_difficulty']),
            # round() rather than bare int(): int() truncates, so a product
            # that lands just below a whole number would lose one point.
            'gpt_difficulty_integer': int(round(row['gpt_difficulty'] * 10)),
            'difficulty_level': get_difficulty_level(float(row['gpt_difficulty'])),
            'true_label': int(baseline_results['y_true'][word_index]),
            'baseline_prediction': int(baseline_results['y_pred'][word_index]),
            'proposed_prediction': int(proposed_results['y_pred'][word_index]),
            'baseline_probability': float(baseline_results['y_prob'][word_index]),
            'proposed_probability': float(proposed_results['y_prob'][word_index]),
            'baseline_correct': bool(baseline_results['y_true'][word_index] == baseline_results['y_pred'][word_index]),
            'proposed_correct': bool(proposed_results['y_true'][word_index] == proposed_results['y_pred'][word_index]),
        }

        # Classify how the proposed method changed the outcome for this word.
        if not word_info['baseline_correct'] and word_info['proposed_correct']:
            word_info['improvement_category'] = 'newly_correct'
        elif word_info['baseline_correct'] and not word_info['proposed_correct']:
            word_info['improvement_category'] = 'newly_incorrect'
        elif word_info['baseline_correct'] and word_info['proposed_correct']:
            word_info['improvement_category'] = 'both_correct'
        else:
            word_info['improvement_category'] = 'both_incorrect'

        word_level_analysis.append(word_info)
        word_index += 1

# Words that became correct, hardest first
newly_correct_words = [w for w in word_level_analysis if w['improvement_category'] == 'newly_correct']
newly_correct_words.sort(key=lambda x: x['gpt_difficulty'], reverse=True)

# Words that became incorrect, hardest first
newly_incorrect_words = [w for w in word_level_analysis if w['improvement_category'] == 'newly_incorrect']
newly_incorrect_words.sort(key=lambda x: x['gpt_difficulty'], reverse=True)

# Write both detail reports through the shared helper
_write_word_report(
    f'Output/GPT_Transformer/analysis/words/newly_correct_words_{experiment_number}.txt',
    "新しく正しく分類された単語一覧",
    newly_correct_words)
_write_word_report(
    f'Output/GPT_Transformer/analysis/words/newly_incorrect_words_{experiment_number}.txt',
    "新しく誤って分類された単語一覧",
    newly_incorrect_words)

print(f"単語詳細ファイルを保存しました:")
print(f"  - 新しく正解: Output/GPT_Transformer/analysis/words/newly_correct_words_{experiment_number}.txt")
print(f"  - 新しく誤判定: Output/GPT_Transformer/analysis/words/newly_incorrect_words_{experiment_number}.txt")

# ================ Visualise the results (one image per figure) ================
print("\n結果を可視化中...")

# Split the GPT-4o difficulty scores by the unknown-word label
gpt_scores = df_with_gpt['gpt_difficulty'].values
_label_values = df_with_gpt['unknownWordLabel'].values
unknown_scores = gpt_scores[_label_values == 1]
known_scores = gpt_scores[_label_values == 0]

# Precision-recall curves from the pooled predictions (thresholds are not needed)
baseline_precision, baseline_recall = precision_recall_curve(
    baseline_results['y_true'], baseline_results['y_prob'])[:2]
proposed_precision, proposed_recall = precision_recall_curve(
    proposed_results['y_true'], proposed_results['y_prob'])[:2]

def interpolate_precision_recall(precision, recall):
    """Return the 11-point interpolated precision curve.

    For each recall level 0.0, 0.1, ..., 1.0 the value is the highest
    precision among points whose recall is at least that level, or 0.0 when
    there is none. Returns ``(interpolated_precisions, recall_levels)``.
    """
    recall_levels = [step / 10.0 for step in range(11)]
    precision_interp = [
        # Seeding with 0.0 reproduces the "start at zero, keep strictly
        # greater values" scan.
        max([0.0] + [p for p, r in zip(precision, recall) if r >= level])
        for level in recall_levels
    ]
    return precision_interp, recall_levels

baseline_prec_interp, recall_interp = interpolate_precision_recall(baseline_precision, baseline_recall)
proposed_prec_interp, _ = interpolate_precision_recall(proposed_precision, proposed_recall)

# Figure 1: PR curves. The AUC in each legend entry is the area under the
# 11-point interpolated curve.
fig1, ax1 = plt.subplots(figsize=(8, 6))
for curve, line_format, method_name in (
    (proposed_prec_interp, 'b-o', 'Proposed'),
    (baseline_prec_interp, 'r-o', 'Baseline'),
):
    ax1.plot(recall_interp, curve, line_format,
             label=f'{method_name} (AUC={auc(recall_interp, curve):.3f})', markersize=4)
ax1.set_xlabel('Recall')
ax1.set_ylabel('Precision')
ax1.set_title(f'PR Curve (GPT-4o+Transformer) - Exp {experiment_number}')
ax1.legend()
ax1.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f'Output/GPT_Transformer/aupr/individual/PR_curve_{experiment_number}.png', dpi=300, bbox_inches='tight')
plt.close()

# Figure 2: F1-score comparison
fig2, ax2 = plt.subplots(figsize=(8, 6))
# ASCII-only tick labels. The recorded output of this cell shows warnings on
# tight_layout()/savefig() for this figure only, and it was the only figure
# with Japanese text — presumably the default font lacks those glyphs and
# would render them as empty boxes.
ax2.bar(['Baseline\n(Conventional+GPT-4o)', 'Proposed\n(+Transformer)'],
        [baseline_overall['f1_score'], proposed_overall['f1_score']],
        color=['red', 'blue'], alpha=0.7)
ax2.set_ylabel('F1-Score')
ax2.set_title(f'F1 Comparison - Exp {experiment_number}')
ax2.grid(True, alpha=0.3, axis='y')
# Annotate the proposed bar with the F1 gain (only when there is a gain)
improvement = proposed_overall['f1_score'] - baseline_overall['f1_score']
if improvement > 0:
    ax2.text(1, proposed_overall['f1_score'], f'+{improvement:.3f}',
             ha='center', va='bottom', fontsize=10, color='blue')
plt.tight_layout()
plt.savefig(f'Output/GPT_Transformer/aupr/individual/F1_comparison_{experiment_number}.png', dpi=300, bbox_inches='tight')
plt.close()

# Figure 3: improvement analysis (word counts gained / lost / net)
fig3, ax3 = plt.subplots(figsize=(8, 6))
categories = ['Newly Correct', 'Newly Incorrect', 'Net Improvement']
values = [improvement_analysis[count_key] for count_key in
          ('newly_correct_count', 'newly_incorrect_count', 'net_improvement')]
colors = ['green', 'red', 'blue']

bars = ax3.bar(categories, values, color=colors, alpha=0.7)
ax3.set_ylabel('Number of Words')
ax3.set_title(f'Impact (GPT-4o+Transformer) - Exp {experiment_number}')
ax3.grid(True, alpha=0.3, axis='y')

# Write each count slightly above the top of its bar.
for bar, value in zip(bars, values):
    label_x = bar.get_x() + bar.get_width()/2.
    label_y = bar.get_height() + 0.5
    ax3.text(label_x, label_y, f'{value}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(f'Output/GPT_Transformer/aupr/individual/improvement_analysis_{experiment_number}.png', dpi=300, bbox_inches='tight')
plt.close()

# Figure 4: GPT-4o difficulty distribution, known vs unknown words
fig4, ax4 = plt.subplots(figsize=(8, 6))
for scores, group_name, group_color in (
    (known_scores, 'Known Words', 'blue'),
    (unknown_scores, 'Unknown Words', 'red'),
):
    # density=True normalises each histogram separately, which keeps the two
    # differently sized groups comparable.
    ax4.hist(scores, bins=10, alpha=0.7,
             label=f'{group_name} (n={len(scores)})', color=group_color, density=True)
ax4.set_xlabel('GPT-4o Difficulty Score')
ax4.set_ylabel('Density')
ax4.set_title(f'Difficulty Distribution - Exp {experiment_number}')
ax4.legend()
ax4.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f'Output/GPT_Transformer/aupr/individual/difficulty_distribution_{experiment_number}.png', dpi=300, bbox_inches='tight')
plt.close()

# List the four image files written by the figure sections.
print("個別画像を保存しました:")
for _image_name in ('PR_curve', 'F1_comparison', 'improvement_analysis', 'difficulty_distribution'):
    print(f"  - Output/GPT_Transformer/aupr/individual/{_image_name}_{experiment_number}.png")

# ================ Save the results ================
print("\n結果を保存中...")

# Per-fold selected feature lists, one pickle per method
for _method, _method_results in (('baseline', baseline_results), ('proposed', proposed_results)):
    with open(f'Output/GPT_Transformer/selected_features/{_method}_features_{experiment_number}.pkl', 'wb') as f:
        pickle.dump(_method_results['selected_features'], f)

# NumPy values are converted first because json cannot serialise them.
analysis_results = {
    'experiment_number': experiment_number,
    'baseline_results': convert_numpy_to_list(baseline_overall),
    'proposed_results': convert_numpy_to_list(proposed_overall),
    'improvement_analysis': convert_numpy_to_list(improvement_analysis),
    'word_level_analysis': word_level_analysis
}

with open(f'Output/GPT_Transformer/analysis/complete_analysis_{experiment_number}.json', 'w') as f:
    json.dump(analysis_results, f, indent=2)

# Total run time, measured from the start_time taken at the top of the cell
end_time = time.time()
elapsed_time = end_time - start_time

_rule = "="*60
print("\n" + _rule)
print(f"処理完了！ 総実行時間: {elapsed_time/60:.2f} 分")
print(_rule)
print(f"\n結果保存先: Output/GPT_Transformer/")
for _summary_line in (
    "  - aupr/individual/ : 個別画像",
    "  - analysis/words/ : 単語詳細",
    "  - analysis/ : 分析結果JSON",
    "  - selected_features/ : 選択特徴量",
):
    print(_summary_line)
print(_rule)

実験番号 c04 のGPT-4o + Transformer難易度評価分析を開始します...
✅ OpenAI APIクライアントが正常に初期化されました。
使用モデル: gpt-4o + Transformer
処理開始時刻: 2025-12-24 15:47:51
データを読み込んでいます...
データ形状: (2452, 31)
未知単語ラベルの分布: Counter({0: 2335, 1: 117})

lineカラムから文脈データを取得中...
文脈取得完了: 2452 単語

特徴量構成:
  従来特徴量: 22個
  ベースライン (従来 + GPT-4o): 23個
  提案手法 (ベースライン + Transformer): 55個

GPT-4o難易度特徴量を追加中...
gpt-4oによる難易度評価を実行中（新規評価）...


100%|██████████| 2452/2452 [17:03<00:00,  2.40it/s]



文書別クロスバリデーション開始...

=== 文書 0 をテストデータとして使用 ===
訓練データ: 2185 サンプル
テストデータ: 267 サンプル
Transformerモデルを訓練中...
訓練データの準備中...
  ユニークなline_id数: 81
  Epoch 2/10, Loss: 0.4325
  Epoch 4/10, Loss: 0.4260
  Epoch 6/10, Loss: 0.4194
  Epoch 8/10, Loss: 0.4157
  Epoch 10/10, Loss: 0.4069
Transformer特徴量を抽出中...
ベースライン手法（従来特徴量 + GPT-4o）を実行中...
SHAP分析により上位30特徴量を選択中...
SHAP選択完了: 23個の特徴量
Sequential Backward Selection (SBS)実行中...
選択された特徴量数: 8
選択された特徴量: ['freq', 'gpt_difficulty', 'Num_Monosyllable', 'ContentWord', 'Num_ContentWords', 'Rate_FunctionWords', 'freq_Max', 'Num_Polysyllable']
提案手法（ベースライン + Transformer視線特徴量）を実行中...
SHAP分析により上位30特徴量を選択中...
SHAP選択完了: 30個の特徴量
Sequential Backward Selection (SBS)実行中...
選択された特徴量数: 9
選択された特徴量: ['freq', 'gpt_difficulty', 'transformer25', 'Num_Monosyllable', 'transformer26', 'transformer2', 'ContentWord', 'transformer15', 'Rate_7Characters']
ベースライン (GPT-4o) - F1: 0.161
提案手法 (GPT-4o + Transformer) - F1: 0.145

=== 文書 1 をテストデータとして使用 ===
訓練データ: 2189 サンプル
テストデータ: 263 サンプル
Transfo

  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(f'Output/GPT_Transformer/aupr/individual/F1_comparison_{experiment_number}.png', dpi=300, bbox_inches='tight')
  plt.savefig(f'Output/GPT_Transformer/aupr/individual/F1_comparison_{experiment_number}.png', dpi=300, bbox_inches='tight')


個別画像を保存しました:
  - Output/GPT_Transformer/aupr/individual/PR_curve_c04.png
  - Output/GPT_Transformer/aupr/individual/F1_comparison_c04.png
  - Output/GPT_Transformer/aupr/individual/improvement_analysis_c04.png
  - Output/GPT_Transformer/aupr/individual/difficulty_distribution_c04.png

結果を保存中...

処理完了！ 総実行時間: 203.45 分

結果保存先: Output/GPT_Transformer/
  - aupr/individual/ : 個別画像
  - analysis/words/ : 単語詳細
  - analysis/ : 分析結果JSON
  - selected_features/ : 選択特徴量


In [None]:
# ================ Settings ================
experiment_number = 'c07'  # Experiment ID (change for each run)
print(f"実験番号 {experiment_number} のGPT-4o + Transformer難易度評価分析を開始します...")

# Create the output directories (idempotent thanks to exist_ok=True)
os.makedirs('Output/GPT_Transformer/aupr', exist_ok=True)
os.makedirs('Output/GPT_Transformer/aupr/individual', exist_ok=True)
os.makedirs('Output/GPT_Transformer/selected_features', exist_ok=True)
os.makedirs('Output/GPT_Transformer/analysis', exist_ok=True)
os.makedirs('Output/GPT_Transformer/analysis/words', exist_ok=True)

# Initialise the OpenAI API client
try:
    api_key = userdata.get('API_KEY')
    # Validate the key itself. The previous check tested `client is None`,
    # which can never be true because a constructor call never returns None,
    # so the "key not set" message was unreachable.
    if not api_key:
        print("⚠️ OpenAI APIキーが未設定です。")
        exit()
    client = OpenAI(api_key=api_key)
    print("✅ OpenAI APIクライアントが正常に初期化されました。")
    print("使用モデル: gpt-4o + Transformer")
except Exception as e:
    # exit() raises SystemExit, which is not an Exception subclass, so the
    # early exit above is not swallowed by this handler.
    print(f"❌ OpenAI初期化エラー: {e}")
    exit()

# Start measuring execution time
start_time = time.time()
start_datetime = datetime.now()
print(f"処理開始時刻: {start_datetime.strftime('%Y-%m-%d %H:%M:%S')}")

# ================ Load data ================
print("データを読み込んでいます...")
# Read the per-word feature table from the LSTM feature directory
df = pd.read_json(f'Output/LSTM/features/{experiment_number}-features.json')
df = df.dropna()  # drop every row that has at least one missing value

print(f"データ形状: {df.shape}")
print(f"未知単語ラベルの分布: {Counter(df['unknownWordLabel'])}")

# ================ Take the context from the 'line' column ================
print("\nlineカラムから文脈データを取得中...")

# The 'line' column must already exist in the loaded table
if 'line' not in df.columns:
    print("❌ エラー: 'line'カラムがデータフレームに存在しません。")
    exit()

# Use the 'line' text as the context sentence handed to GPT-4o
df['sentence'] = df['line']
print(f"文脈取得完了: {len(df)} 単語")
# ================ GPT-4o難易度評価関数 ================
def get_gpt_difficulty_score(word, context_sentence, max_retries=3):
    """Ask gpt-4o to rate how difficult ``word`` is within ``context_sentence``.

    The model is asked for an integer from 1 to 10. The first run of digits in
    its reply is clamped to that range and divided by 10.

    Args:
        word: The target word to rate.
        context_sentence: The sentence in which the word appears.
        max_retries: Maximum number of requests to send.

    Returns:
        The clamped score divided by 10, i.e. a float between 0.1 and 1.0.

    Raises:
        Exception: If no attempt produced a reply containing digits.
    """
    prompt = f"""
You are an expert English language difficulty assessor for second language learners.

Please evaluate the difficulty of the word "{word}" in this specific context for an intermediate English learner:

Context: "{context_sentence}"

Consider these factors:
- Semantic complexity and abstractness in this context
- Collocational patterns and usage constraints

Provide a precise integer score between 1 and 10:
- 1-2: Very easy (basic vocabulary, high frequency)
- 3-4: Easy (common words, straightforward usage)
- 5-6: Moderate (intermediate vocabulary, some complexity)
- 7-8: Difficult (advanced vocabulary, complex usage)
- 9-10: Very difficult (rare, highly technical, or complex)

Return ONLY the integer number (e.g., 2, 6, 9). Do not provide explanations.

Score:
"""

    conversation = [
        {"role": "system", "content": "You are a precise language difficulty assessor. Respond ONLY with an integer number between 1 and 10."},
        {"role": "user", "content": prompt}
    ]
    last_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model="gpt-4o",
                messages=conversation,
                max_tokens=10,
                temperature=0.0,
            )

            reply = completion.choices[0].message.content.strip()
            digits = re.search(r'\d+', reply)

            # A reply without digits is retried immediately, without a pause
            if digits is None:
                continue

            clamped = min(max(int(digits.group()), 1), 10)
            return clamped / 10.0

        except Exception as e:
            print(f"GPT-4o API エラー (試行 {attempt + 1}): {str(e)}")
            # Pause before the next request, but not after the final one
            if attempt < last_attempt:
                time.sleep(2)

    raise Exception(f"GPT-4o評価に失敗: {word}")

def add_gpt_difficulty_features(data):
    """Return a copy of ``data`` with a ``gpt_difficulty`` column appended.

    Each row's ``word`` is scored in the context of its ``sentence`` by
    ``get_gpt_difficulty_score``. A row whose scoring raises is given the
    fallback value 0.5 so that the run can continue.

    Args:
        data: DataFrame providing the ``word`` and ``sentence`` columns.

    Returns:
        A copy of ``data`` with the extra column; ``data`` itself is not modified.
    """
    print("gpt-4oによる難易度評価を実行中（新規評価）...")

    scores = []
    for _, record in tqdm(data.iterrows(), total=len(data)):
        target_word = record['word']
        context = record['sentence']

        try:
            score = get_gpt_difficulty_score(target_word, context)
        except Exception as e:
            print(f"評価失敗: {target_word} - {str(e)}")
            score = 0.5

        scores.append(score)

    enriched = data.copy()
    enriched['gpt_difficulty'] = scores

    return enriched

# ================ Transformerモデル定義 ================
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first sequence.

    Even feature columns hold sines and odd columns hold cosines of the
    position scaled by geometrically decreasing frequencies.

    Args:
        d_model: Size of the feature dimension (even or odd).
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the frequency vector is truncated to the number of cosine columns.
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])
        pe = pe.unsqueeze(0)  # (1, max_len, d_model) so it broadcasts over the batch
        # Registered as a buffer: it follows the module across devices but is not trained
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1), :]

class TransformerModel(nn.Module):
    """Transformer encoder that classifies a sequence into two classes.

    The input is projected to ``d_model`` features, given positional
    encodings, passed through the encoder and pooled over the sequence axis
    before the final linear layer.

    Args:
        input_dim: Number of features per sequence element.
        d_model: Width of the encoder and of the pooled feature vector.
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of the encoder feed-forward part.
        dropout: Dropout rate used inside the encoder layer.
        pooling: ``'mean'`` or ``'max'``; any other value selects the last
            sequence position.
    """

    def __init__(self, input_dim=1, d_model=32, nhead=2, num_layers=1,
                 dim_feedforward=64, dropout=0.2, pooling='mean'):
        super().__init__()
        self.pooling = pooling

        # Sub-modules are created in a fixed order so that seeded
        # initialisation stays reproducible.
        self.input_projection = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True
            ),
            num_layers=num_layers
        )
        self.fc = nn.Linear(d_model, 2)

    def _pool(self, encoded):
        """Collapse the sequence axis of ``encoded`` according to ``self.pooling``."""
        if self.pooling == 'mean':
            return encoded.mean(dim=1)
        if self.pooling == 'max':
            return encoded.max(dim=1).values
        # Anything else: take the last sequence position
        return encoded[:, -1, :]

    def forward(self, x):
        """Return ``(logits, pooled_features)`` for ``x`` of shape (batch, seq_len, input_dim)."""
        hidden = self.input_projection(x)          # (batch, seq_len, d_model)
        hidden = self.pos_encoder(hidden)
        hidden = self.transformer_encoder(hidden)  # (batch, seq_len, d_model)

        x_pooled = self._pool(hidden)              # (batch, d_model)
        return self.fc(x_pooled), x_pooled

# ================ 特徴量の定義 ================
# Conventional features, i.e. everything except the GPT-4o score and the
# Transformer outputs
baseline_features_original = [
    'length',
    'freq',
    'seven_character',
    'ContentWord',
    'syllables',
    'Num_Words',
    'Length_Word_Ave',
    'freq_Min',
    'freq_Max',
    'freq_Ave',
    'Num_7Characters',
    'Rate_7Characters',
    'Num_ContentWords',
    'Rate_ContentWords',
    'Num_FunctionWords',
    'Rate_FunctionWords',
    'Num_Monosyllable',
    'Num_Polysyllable',
    'Flesch_Reading_Ease',
    'ARI',
    'Readtime',
    'ReadBack',
]

# Baseline: conventional features + GPT-4o difficulty
gpt_features = ['gpt_difficulty']
baseline_features = [*baseline_features_original, *gpt_features]

# Proposed method: baseline + the 32 Transformer gaze features
transformer_features = ['transformer' + str(i) for i in range(32)]
proposed_features = [*baseline_features, *transformer_features]

print("\n特徴量構成:")
print(f"  従来特徴量: {len(baseline_features_original)}個")
print(f"  ベースライン (従来 + GPT-4o): {len(baseline_features)}個")
print(f"  提案手法 (ベースライン + Transformer): {len(proposed_features)}個")

# ================ ユーティリティ関数 ================
def extract_shap_features(X, y, feature_names, n_features=30):
    """Rank features by mean absolute SHAP value and keep the strongest ones.

    The inputs are standardised and a class-balanced logistic regression is
    fitted on them. The SHAP values of that model are averaged in absolute
    value per feature to obtain the ranking.

    Args:
        X: Feature matrix whose columns follow ``feature_names``.
        y: Class labels for the rows of ``X``.
        feature_names: Column names, in the column order of ``X``.
        n_features: Maximum number of names to return.

    Returns:
        Up to ``n_features`` feature names, most important first. When fewer
        features exist, all of them are returned.
    """
    print(f"SHAP分析により上位{n_features}特徴量を選択中...")

    X_std = StandardScaler().fit_transform(X)

    surrogate = LogisticRegression(class_weight="balanced", max_iter=1000, random_state=0)
    surrogate.fit(X_std, y)

    explanation = shap.Explainer(surrogate, X_std)(X_std)

    importance = np.abs(explanation.values).mean(axis=0)
    ranking = np.argsort(importance)[::-1]
    selected_features = [feature_names[col] for col in ranking[:n_features]]

    print(f"SHAP選択完了: {len(selected_features)}個の特徴量")
    return selected_features
def extract_transformer_features(data, transformer_model, device):
    """Run the model over each row's gaze sequence and collect pooled features.

    For every row, the sequence stored in ``x_coordinates_dict`` under the key
    ``str(id_line + 1)`` is fed to the model as a batch of one. Rows without a
    non-empty sequence are skipped.

    Args:
        data: DataFrame with ``id_line`` and ``x_coordinates_dict`` columns.
        transformer_model: Module whose call returns ``(output, features)``.
        device: Device on which the input tensors are placed.

    Returns:
        Dict mapping the row's index label to its feature array.
    """
    transformer_model.eval()
    features_by_row = {}

    with torch.no_grad():
        for row_label, record in data.iterrows():
            key = str(record['id_line'] + 1)
            sequence = record['x_coordinates_dict'].get(key)

            if not sequence or len(sequence) == 0:
                continue

            # Same layout as used for the LSTM: (1, seq_len, 1)
            batch = torch.tensor([[[value] for value in sequence]], dtype=torch.float32).to(device)
            _, pooled = transformer_model(batch)
            features_by_row[row_label] = pooled.squeeze(0).cpu().numpy()

    return features_by_row

def add_transformer_features_to_data(data, line_features, n_features=32):
    """Write ``transformer0`` .. ``transformer{n_features-1}`` columns into ``data``.

    Args:
        data: DataFrame to extend; it is modified in place.
        line_features: Dict mapping an index label of ``data`` to a feature vector.
        n_features: Number of feature columns to create.

    Returns:
        The same ``data`` object. Rows whose index label is missing from
        ``line_features`` receive 0.0 in every new column.
    """
    row_labels = list(data.index)
    for dim in range(n_features):
        column = []
        for label in row_labels:
            if label in line_features:
                column.append(line_features[label][dim])
            else:
                column.append(0.0)
        data[f'transformer{dim}'] = column
    return data

def feature_selection_sbs(X, y, feature_names):
    """Select a feature subset with Sequential Backward Selection (SBS).

    A class-balanced SVC is scored with F1 over five stratified, shuffled
    folds. The search runs backwards from all features and may keep any
    number of features between one and all of them.

    Args:
        X: Feature matrix whose columns follow ``feature_names``.
        y: Class labels for the rows of ``X``.
        feature_names: Column names, in the column order of ``X``.

    Returns:
        Tuple ``(X_selected, selected_features)``: the matrix reduced to the
        chosen columns and the names of those columns.
    """
    print("Sequential Backward Selection (SBS)実行中...")

    # Materialise the folds once so the same splits are used throughout the search
    folds = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=0).split(X, y))
    estimator = SVC(gamma='scale', probability=True, class_weight='balanced', random_state=0)

    selector = sfs(
        estimator,
        k_features=(1, len(feature_names)),
        forward=False,
        floating=False,  # floating disabled, which makes this plain SBS
        scoring='f1',
        cv=folds,
        n_jobs=-1,
    )

    frame = pd.DataFrame(X, columns=feature_names)
    selector = selector.fit(frame, y)

    selected_features = frame.columns[list(selector.k_feature_idx_)].tolist()

    print(f"選択された特徴量数: {len(selected_features)}")
    print(f"選択された特徴量: {selected_features}")

    return frame[selected_features].values, selected_features

def train_and_predict(X_train, y_train, X_test, y_test):
    """Fit a class-balanced SVM on the training data and predict the test data.

    Features are standardised with statistics learned from the training set only.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix with the same columns as ``X_train``.
        y_test: Test labels; accepted for call compatibility but not used.

    Returns:
        Tuple ``(predictions, probabilities)`` as returned by the classifier's
        ``predict`` and ``predict_proba`` on the scaled test data.
    """
    scaler = StandardScaler().fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)

    classifier = SVC(gamma='scale', probability=True, class_weight='balanced', random_state=0)
    classifier.fit(train_scaled, y_train)

    return classifier.predict(test_scaled), classifier.predict_proba(test_scaled)

def calculate_metrics(y_true, y_pred):
    """Compute binary classification metrics for labels 0 and 1.

    Args:
        y_true: Ground-truth labels (0 or 1).
        y_pred: Predicted labels (0 or 1).

    Returns:
        Dict with the 2x2 ``confusion_matrix``, ``accuracy``, ``precision``,
        ``recall``, ``f1_score`` and the ``tp``/``tn``/``fp``/``fn`` counts.
        Precision, recall and F1 are 0 when they would be undefined.
    """
    # Pin both labels so the matrix is always 2x2. Without this, a fold whose
    # truth and predictions contain a single class yields a 1x1 matrix and the
    # four-way unpacking below raises ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    return {
        'confusion_matrix': cm,
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1_score': f1_score(y_true, y_pred, zero_division=0),
        'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn
    }

def analyze_improvement(baseline_results, proposed_results):
    """Compare per-sample correctness of the baseline and the proposed method.

    Args:
        baseline_results: Dict with equally long ``y_true`` and ``y_pred``
            sequences (lists or arrays).
        proposed_results: Same structure for the proposed method, aligned
            sample by sample with ``baseline_results``.

    Returns:
        Dict with the number of samples that became correct
        (``newly_correct_count``), became incorrect
        (``newly_incorrect_count``), their difference (``net_improvement``)
        and the positional indices of both groups.
    """
    # Coerce to arrays: comparing plain lists with == yields a single bool
    # instead of an element-wise mask, which breaks the logic below.
    baseline_correct = (np.asarray(baseline_results['y_true'])
                        == np.asarray(baseline_results['y_pred']))
    proposed_correct = (np.asarray(proposed_results['y_true'])
                        == np.asarray(proposed_results['y_pred']))

    newly_correct = (~baseline_correct) & proposed_correct
    newly_incorrect = baseline_correct & (~proposed_correct)

    gained = newly_correct.sum()
    lost = newly_incorrect.sum()

    return {
        'newly_correct_count': gained,
        'newly_incorrect_count': lost,
        'net_improvement': gained - lost,
        'newly_correct_indices': np.where(newly_correct)[0],
        'newly_incorrect_indices': np.where(newly_incorrect)[0]
    }

def convert_numpy_to_list(obj):
    """Recursively replace NumPy values with plain Python equivalents.

    Arrays become (nested) lists, and NumPy scalars of any kind (integers,
    floats, booleans, ...) become the corresponding Python scalar. Dicts,
    lists and tuples are traversed; tuples are returned as lists.

    Args:
        obj: Arbitrary, possibly nested object.

    Returns:
        The converted object; values of other types are returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {key: convert_numpy_to_list(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert_numpy_to_list(item) for item in obj]
    # np.generic is the base of every NumPy scalar type, so this also covers
    # np.bool_, which json cannot serialise
    if isinstance(obj, np.generic):
        return obj.item()
    return obj

# ================ Add the GPT-4o difficulty feature ================
print("\nGPT-4o難易度特徴量を追加中...")
df_with_gpt = add_gpt_difficulty_features(df)

# ================ Main processing: per-document cross-validation ================
print("\n文書別クロスバリデーション開始...")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Each distinct document id is used once as the held-out test set
documents = sorted(df_with_gpt["id_document"].unique())

# Accumulators filled fold by fold; 'selected_features' gets one list per fold
baseline_results = {'y_true': [], 'y_pred': [], 'y_prob': [], 'selected_features': []}
proposed_results = {'y_true': [], 'y_pred': [], 'y_prob': [], 'selected_features': []}

# Leave-one-document-out loop: train on all other documents, test on test_doc
for test_doc in documents:
    print(f"\n=== 文書 {test_doc} をテストデータとして使用 ===")

    test_data = df_with_gpt[df_with_gpt["id_document"] == test_doc].copy()
    train_data = df_with_gpt[df_with_gpt["id_document"] != test_doc].copy()

    print(f"訓練データ: {len(train_data)} サンプル")
    print(f"テストデータ: {len(test_data)} サンプル")

    # ================ Train the Transformer model and extract features ================
    print("Transformerモデルを訓練中...")

    # A fresh model is created for every fold
    transformer_model = TransformerModel(
        input_dim=1,
        d_model=32,
        nhead=2,
        num_layers=1,
        dim_feedforward=64,
        dropout=0.2,
        pooling='mean'
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(transformer_model.parameters(), lr=0.001)

    # Prepare the training data
    train_lines = {}
    train_labels = {}

    print(f"訓練データの準備中...")

    for _, row in train_data.iterrows():
        line_id = str(row['id_line'] + 1)
        x_seq = row['x_coordinates_dict'].get(line_id)
        if x_seq and len(x_seq) > 0:
            # Keep only the first sequence found for each line_id
            # NOTE(review): the key is line_id alone, so rows from different
            # documents that share a line number collapse into one training
            # sequence, labelled with the first row's unknownWordLabel --
            # confirm this is intended.
            if line_id not in train_lines:
                train_lines[line_id] = x_seq
                train_labels[line_id] = row['unknownWordLabel']

    print(f"  ユニークなline_id数: {len(train_lines)}")

    # Build the Transformer training tensors
    X_transformer, Y_transformer = [], []
    for lid, x_seq in train_lines.items():
        # Convert to the form [[[x1], [x2], [x3], ...]], i.e. a batch of one
        x_tensor = torch.tensor([[[x] for x in x_seq]], dtype=torch.float32).to(device)
        X_transformer.append(x_tensor)
        Y_transformer.append(train_labels[lid])

    # Handle the case where X_transformer is empty
    if len(X_transformer) == 0:
        print("  ⚠️ 警告: 訓練データが見つかりませんでした。Transformer特徴量をスキップします。")
        # Empty dicts make every transformer column 0.0 further below
        train_transformer_features = {}
        test_transformer_features = {}
    else:
        transformer_model.train()
        epochs = 10
        for epoch in range(epochs):
            total_loss = 0
            # One optimizer step per sequence (batch size 1)
            for x_tensor, label in zip(X_transformer, Y_transformer):
                optimizer.zero_grad()
                out, _ = transformer_model(x_tensor)
                loss = criterion(out, torch.tensor([label], dtype=torch.long).to(device))
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            # Report the mean loss every second epoch
            if (epoch + 1) % 2 == 0:
                print(f"  Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(X_transformer):.4f}")

        # Extract the Transformer features
        print("Transformer特徴量を抽出中...")
        train_transformer_features = extract_transformer_features(train_data, transformer_model, device)
        test_transformer_features = extract_transformer_features(test_data, transformer_model, device)

    # Add the Transformer features to the data
    train_data_with_features = add_transformer_features_to_data(train_data, train_transformer_features)
    test_data_with_features = add_transformer_features_to_data(test_data, test_transformer_features)

    # ================ Baseline method (including GPT-4o) ================
    print("ベースライン手法（従来特徴量 + GPT-4o）を実行中...")

    # Narrow down to at most 30 features with the SHAP analysis
    X_baseline_all = train_data[baseline_features].fillna(0).values
    y_train = train_data["unknownWordLabel"].values

    baseline_shap_features = extract_shap_features(X_baseline_all, y_train, baseline_features, n_features=30)
    X_baseline = train_data[baseline_shap_features].fillna(0).values

    # Feature selection (the original comment says SBFS; the function called is
    # named feature_selection_sbs)
    X_baseline_selected, baseline_selected_features = feature_selection_sbs(
        X_baseline, y_train, baseline_shap_features)

    X_test_baseline = test_data[baseline_selected_features].fillna(0).values
    y_test = test_data["unknownWordLabel"].values

    baseline_pred, baseline_prob = train_and_predict(
        X_baseline_selected, y_train, X_test_baseline, y_test)

    # Column 1 of the probabilities is kept as the score of the positive class
    baseline_results['y_true'].extend(y_test)
    baseline_results['y_pred'].extend(baseline_pred)
    baseline_results['y_prob'].extend(baseline_prob[:, 1])
    baseline_results['selected_features'].append(baseline_selected_features)

    # ================ Proposed method (baseline + Transformer features) ================
    print("提案手法（ベースライン + Transformer視線特徴量）を実行中...")

    # Narrow down to at most 30 features with the SHAP analysis
    X_proposed_all = train_data_with_features[proposed_features].fillna(0).values

    proposed_shap_features = extract_shap_features(X_proposed_all, y_train, proposed_features, n_features=30)
    X_proposed = train_data_with_features[proposed_shap_features].fillna(0).values

    # Feature selection (the original comment says SBFS; the function called is
    # named feature_selection_sbs)
    X_proposed_selected, proposed_selected_features = feature_selection_sbs(
        X_proposed, y_train, proposed_shap_features)

    X_test_proposed = test_data_with_features[proposed_selected_features].fillna(0).values

    proposed_pred, proposed_prob = train_and_predict(
        X_proposed_selected, y_train, X_test_proposed, y_test)

    proposed_results['y_true'].extend(y_test)
    proposed_results['y_pred'].extend(proposed_pred)
    proposed_results['y_prob'].extend(proposed_prob[:, 1])
    proposed_results['selected_features'].append(proposed_selected_features)

    # Per-fold metrics, printed for progress monitoring only
    baseline_metrics = calculate_metrics(y_test, baseline_pred)
    proposed_metrics = calculate_metrics(y_test, proposed_pred)

    print(f"ベースライン (GPT-4o) - F1: {baseline_metrics['f1_score']:.3f}")
    print(f"提案手法 (GPT-4o + Transformer) - F1: {proposed_metrics['f1_score']:.3f}")

# ================ Analysis of the overall results ================
print("\n=== 全体結果の分析 ===")

# Turn the accumulated lists into arrays so element-wise comparison works
for key in ['y_true', 'y_pred', 'y_prob']:
    baseline_results[key] = np.array(baseline_results[key])
    proposed_results[key] = np.array(proposed_results[key])

# Metrics over the predictions pooled from all folds
baseline_overall = calculate_metrics(baseline_results['y_true'], baseline_results['y_pred'])
proposed_overall = calculate_metrics(proposed_results['y_true'], proposed_results['y_pred'])

print("=== ベースライン手法（従来 + GPT-4o） ===")
print(f"F1-score: {baseline_overall['f1_score']:.3f}")
print(f"Accuracy: {baseline_overall['accuracy']:.3f}")
print(f"Precision: {baseline_overall['precision']:.3f}")
print(f"Recall: {baseline_overall['recall']:.3f}")

print("\n=== 提案手法（ベースライン + Transformer視線情報） ===")
print(f"F1-score: {proposed_overall['f1_score']:.3f}")
print(f"Accuracy: {proposed_overall['accuracy']:.3f}")
print(f"Precision: {proposed_overall['precision']:.3f}")
print(f"Recall: {proposed_overall['recall']:.3f}")

print(f"\n=== Transformer視線特徴量の効果 ===")
print(f"F1スコア改善: {proposed_overall['f1_score'] - baseline_overall['f1_score']:+.3f}")

# Count the samples whose correctness changed between the two methods
improvement_analysis = analyze_improvement(baseline_results, proposed_results)

print(f"\n新たに正しく分類された単語数: {improvement_analysis['newly_correct_count']}")
print(f"新たに間違って分類された単語数: {improvement_analysis['newly_incorrect_count']}")
print(f"純改善数: {improvement_analysis['net_improvement']}")

def get_difficulty_level(score):
    """Map a normalised difficulty score to its descriptive level label.

    Args:
        score: Difficulty score; the band limits are 0.2, 0.4, 0.6 and 0.8,
            each inclusive for the lower band.

    Returns:
        The label of the first band whose upper limit is not exceeded, or
        ``"Very Difficult (9-10)"`` when the score is above every limit.
    """
    bands = (
        (0.2, "Very Easy (1-2)"),
        (0.4, "Easy (3-4)"),
        (0.6, "Moderate (5-6)"),
        (0.8, "Difficult (7-8)"),
    )
    for upper_limit, label in bands:
        if score <= upper_limit:
            return label
    return "Very Difficult (9-10)"

# ================ Save the detailed word-level analysis ================
print("\n単語レベルの詳細分析を保存中...")

word_level_analysis = []
# Position inside the pooled result arrays. It stays aligned because the rows
# are visited in the same document order in which the results were appended.
word_index = 0

for test_doc in documents:
    test_data = df_with_gpt[df_with_gpt["id_document"] == test_doc].copy()

    for _, row in test_data.iterrows():
        word_info = {
            'word_index': word_index,
            'document_id': int(row['id_document']),
            'line_id': int(row['id_line']),
            'word_id': int(row['id_word']),
            'word': row['word'],
            'context': row['line'],  # use the line as the context
            'gpt_difficulty': float(row['gpt_difficulty']),
            'gpt_difficulty_integer': int(row['gpt_difficulty'] * 10),
            'difficulty_level': get_difficulty_level(float(row['gpt_difficulty'])),
            'true_label': int(baseline_results['y_true'][word_index]),
            'baseline_prediction': int(baseline_results['y_pred'][word_index]),
            'proposed_prediction': int(proposed_results['y_pred'][word_index]),
            'baseline_probability': float(baseline_results['y_prob'][word_index]),
            'proposed_probability': float(proposed_results['y_prob'][word_index]),
            'baseline_correct': bool(baseline_results['y_true'][word_index] == baseline_results['y_pred'][word_index]),
            'proposed_correct': bool(proposed_results['y_true'][word_index] == proposed_results['y_pred'][word_index]),
        }

        # Classify the word by how its correctness changed between the methods
        if not word_info['baseline_correct'] and word_info['proposed_correct']:
            word_info['improvement_category'] = 'newly_correct'
        elif word_info['baseline_correct'] and not word_info['proposed_correct']:
            word_info['improvement_category'] = 'newly_incorrect'
        elif word_info['baseline_correct'] and word_info['proposed_correct']:
            word_info['improvement_category'] = 'both_correct'
        else:
            word_info['improvement_category'] = 'both_incorrect'

        word_level_analysis.append(word_info)
        word_index += 1

# Build the list of newly correct words (most difficult first)
newly_correct_words = [w for w in word_level_analysis if w['improvement_category'] == 'newly_correct']
newly_correct_words.sort(key=lambda x: x['gpt_difficulty'], reverse=True)

# Build the list of newly misclassified words (most difficult first)
newly_incorrect_words = [w for w in word_level_analysis if w['improvement_category'] == 'newly_incorrect']
newly_incorrect_words.sort(key=lambda x: x['gpt_difficulty'], reverse=True)

# Save the detail file for the newly correct words
with open(f'Output/GPT_Transformer/analysis/words/newly_correct_words_{experiment_number}.txt', 'w', encoding='utf-8') as f:
    f.write(f"新しく正しく分類された単語一覧 (実験番号: {experiment_number})\n")
    f.write(f"総数: {len(newly_correct_words)} 単語\n")
    f.write("="*80 + "\n\n")

    # One numbered entry per word; label value 1 is written as unknown, anything else as known
    for i, word_info in enumerate(newly_correct_words, 1):
        f.write(f"[{i}] 単語: {word_info['word']}\n")
        f.write(f"    文脈: {word_info['context']}\n")
        f.write(f"    GPT-4o難易度スコア: {word_info['gpt_difficulty_integer']}/10 ({word_info['difficulty_level']})\n")
        f.write(f"    真のラベル: {'未知' if word_info['true_label']==1 else '既知'}\n")
        f.write(f"    ベースライン予測: {'未知' if word_info['baseline_prediction']==1 else '既知'}\n")
        f.write(f"    提案手法予測: {'未知' if word_info['proposed_prediction']==1 else '既知'}\n")
        f.write("-"*80 + "\n")

# Save the detail file for the newly misclassified words
with open(f'Output/GPT_Transformer/analysis/words/newly_incorrect_words_{experiment_number}.txt', 'w', encoding='utf-8') as f:
    f.write(f"新しく誤って分類された単語一覧 (実験番号: {experiment_number})\n")
    f.write(f"総数: {len(newly_incorrect_words)} 単語\n")
    f.write("="*80 + "\n\n")

    # Same entry layout as in the newly-correct report
    for i, word_info in enumerate(newly_incorrect_words, 1):
        f.write(f"[{i}] 単語: {word_info['word']}\n")
        f.write(f"    文脈: {word_info['context']}\n")
        f.write(f"    GPT-4o難易度スコア: {word_info['gpt_difficulty_integer']}/10 ({word_info['difficulty_level']})\n")
        f.write(f"    真のラベル: {'未知' if word_info['true_label']==1 else '既知'}\n")
        f.write(f"    ベースライン予測: {'未知' if word_info['baseline_prediction']==1 else '既知'}\n")
        f.write(f"    提案手法予測: {'未知' if word_info['proposed_prediction']==1 else '既知'}\n")
        f.write("-"*80 + "\n")

print(f"単語詳細ファイルを保存しました:")
print(f"  - 新しく正解: Output/GPT_Transformer/analysis/words/newly_correct_words_{experiment_number}.txt")
print(f"  - 新しく誤判定: Output/GPT_Transformer/analysis/words/newly_incorrect_words_{experiment_number}.txt")

# ================ Visualize the results (saved as individual images) ================
print("\n結果を可視化中...")

# Split the GPT-4o difficulty scores by label to inspect their distribution
gpt_scores = df_with_gpt['gpt_difficulty'].values
unknown_scores = gpt_scores[df_with_gpt['unknownWordLabel'] == 1]
known_scores = gpt_scores[df_with_gpt['unknownWordLabel'] == 0]

# Compute the PR curves from the pooled probabilities (thresholds are discarded)
baseline_precision, baseline_recall, _ = precision_recall_curve(
    baseline_results['y_true'], baseline_results['y_prob'])
proposed_precision, proposed_recall, _ = precision_recall_curve(
    proposed_results['y_true'], proposed_results['y_prob'])

def interpolate_precision_recall(precision, recall):
    """Compute the 11-point interpolated precision of a PR curve.

    For each recall level 0.0, 0.1, ..., 1.0 the result is the largest
    precision among the curve points whose recall is at least that level,
    or 0.0 when no point qualifies.

    Args:
        precision: Precision values of the curve.
        recall: Recall values of the curve, aligned with ``precision``.

    Returns:
        Tuple ``(interpolated_precisions, recall_levels)``, two lists of length 11.
    """
    levels = [step / 10.0 for step in range(11)]
    interpolated = []
    for level in levels:
        reachable = [precision[pos] for pos, rec in enumerate(recall) if rec >= level]
        # The leading 0.0 is the floor used when no point reaches this level
        interpolated.append(max([0.0] + reachable))
    return interpolated, levels

# Interpolate both curves onto the same 11 recall levels
baseline_prec_interp, recall_interp = interpolate_precision_recall(baseline_precision, baseline_recall)
proposed_prec_interp, _ = interpolate_precision_recall(proposed_precision, proposed_recall)

# Individual image 1: PR curve (the legend shows the area under each interpolated curve)
fig1, ax1 = plt.subplots(figsize=(8, 6))
ax1.plot(recall_interp, proposed_prec_interp, 'b-o',
         label=f'Proposed (AUC={auc(recall_interp, proposed_prec_interp):.3f})', markersize=4)
ax1.plot(recall_interp, baseline_prec_interp, 'r-o',
         label=f'Baseline (AUC={auc(recall_interp, baseline_prec_interp):.3f})', markersize=4)
ax1.set_xlabel('Recall')
ax1.set_ylabel('Precision')
ax1.set_title(f'PR Curve (GPT-4o+Transformer) - Exp {experiment_number}')
ax1.legend()
ax1.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f'Output/GPT_Transformer/aupr/individual/PR_curve_{experiment_number}.png', dpi=300, bbox_inches='tight')
plt.close()

# Individual image 2: F1-score comparison
fig2, ax2 = plt.subplots(figsize=(8, 6))
ax2.bar(['Baseline\n(従来+GPT-4o)', 'Proposed\n(+Transformer)'],
        [baseline_overall['f1_score'], proposed_overall['f1_score']],
        color=['red', 'blue'], alpha=0.7)
ax2.set_ylabel('F1-Score')
ax2.set_title(f'F1 Comparison - Exp {experiment_number}')
ax2.grid(True, alpha=0.3, axis='y')
# Annotate the F1 improvement (only when it is positive)
improvement = proposed_overall['f1_score'] - baseline_overall['f1_score']
if improvement > 0:
    ax2.text(1, proposed_overall['f1_score'], f'+{improvement:.3f}',
             ha='center', va='bottom', fontsize=10, color='blue')
plt.tight_layout()
plt.savefig(f'Output/GPT_Transformer/aupr/individual/F1_comparison_{experiment_number}.png', dpi=300, bbox_inches='tight')
plt.close()

# Individual image 3: improvement analysis
fig3, ax3 = plt.subplots(figsize=(8, 6))
categories = ['Newly Correct', 'Newly Incorrect', 'Net Improvement']
values = [improvement_analysis['newly_correct_count'],
          improvement_analysis['newly_incorrect_count'],
          improvement_analysis['net_improvement']]
colors = ['green', 'red', 'blue']

bars = ax3.bar(categories, values, color=colors, alpha=0.7)
ax3.set_ylabel('Number of Words')
ax3.set_title(f'Impact (GPT-4o+Transformer) - Exp {experiment_number}')
ax3.grid(True, alpha=0.3, axis='y')

# Write each bar's value just above the bar
for bar, value in zip(bars, values):
    height = bar.get_height()
    ax3.text(bar.get_x() + bar.get_width()/2., height + 0.5,
             f'{value}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(f'Output/GPT_Transformer/aupr/individual/improvement_analysis_{experiment_number}.png', dpi=300, bbox_inches='tight')
plt.close()

# Individual image 4: comparison of the GPT-4o difficulty distributions
fig4, ax4 = plt.subplots(figsize=(8, 6))
# density=True normalises each histogram separately, so the two groups are
# comparable despite their different sizes
ax4.hist(known_scores, bins=10, alpha=0.7,
         label=f'Known Words (n={len(known_scores)})', color='blue', density=True)
ax4.hist(unknown_scores, bins=10, alpha=0.7,
         label=f'Unknown Words (n={len(unknown_scores)})', color='red', density=True)
ax4.set_xlabel('GPT-4o Difficulty Score')
ax4.set_ylabel('Density')
ax4.set_title(f'Difficulty Distribution - Exp {experiment_number}')
ax4.legend()
ax4.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f'Output/GPT_Transformer/aupr/individual/difficulty_distribution_{experiment_number}.png', dpi=300, bbox_inches='tight')
plt.close()

print("個別画像を保存しました:")
print(f"  - Output/GPT_Transformer/aupr/individual/PR_curve_{experiment_number}.png")
print(f"  - Output/GPT_Transformer/aupr/individual/F1_comparison_{experiment_number}.png")
print(f"  - Output/GPT_Transformer/aupr/individual/improvement_analysis_{experiment_number}.png")
print(f"  - Output/GPT_Transformer/aupr/individual/difficulty_distribution_{experiment_number}.png")

# ================ Save the results ================
print("\n結果を保存中...")

# Per-fold selected feature lists, pickled separately for each method
with open(f'Output/GPT_Transformer/selected_features/baseline_features_{experiment_number}.pkl', 'wb') as f:
    pickle.dump(baseline_results['selected_features'], f)

with open(f'Output/GPT_Transformer/selected_features/proposed_features_{experiment_number}.pkl', 'wb') as f:
    pickle.dump(proposed_results['selected_features'], f)

# NumPy values are converted first because json cannot serialise them
analysis_results = {
    'experiment_number': experiment_number,
    'baseline_results': convert_numpy_to_list(baseline_overall),
    'proposed_results': convert_numpy_to_list(proposed_overall),
    'improvement_analysis': convert_numpy_to_list(improvement_analysis),
    'word_level_analysis': word_level_analysis
}

with open(f'Output/GPT_Transformer/analysis/complete_analysis_{experiment_number}.json', 'w') as f:
    json.dump(analysis_results, f, indent=2)

# Elapsed time since start_time was set at the top of this cell
end_time = time.time()
elapsed_time = end_time - start_time

print("\n" + "="*60)
print(f"処理完了！ 総実行時間: {elapsed_time/60:.2f} 分")
print("="*60)
print(f"\n結果保存先: Output/GPT_Transformer/")
print(f"  - aupr/individual/ : 個別画像")
print(f"  - analysis/words/ : 単語詳細")
print(f"  - analysis/ : 分析結果JSON")
print(f"  - selected_features/ : 選択特徴量")
print("="*60)

実験番号 c07 のGPT-4o + Transformer難易度評価分析を開始します...
✅ OpenAI APIクライアントが正常に初期化されました。
使用モデル: gpt-4o + Transformer
処理開始時刻: 2025-12-24 19:11:19
データを読み込んでいます...
データ形状: (2202, 31)
未知単語ラベルの分布: Counter({0: 2117, 1: 85})

lineカラムから文脈データを取得中...
文脈取得完了: 2202 単語

特徴量構成:
  従来特徴量: 22個
  ベースライン (従来 + GPT-4o): 23個
  提案手法 (ベースライン + Transformer): 55個

GPT-4o難易度特徴量を追加中...
gpt-4oによる難易度評価を実行中（新規評価）...


100%|██████████| 2202/2202 [15:10<00:00,  2.42it/s]



文書別クロスバリデーション開始...

=== 文書 0 をテストデータとして使用 ===
訓練データ: 1953 サンプル
テストデータ: 249 サンプル
Transformerモデルを訓練中...
訓練データの準備中...
  ユニークなline_id数: 77
  Epoch 2/10, Loss: 0.2133
  Epoch 4/10, Loss: 0.2114
  Epoch 6/10, Loss: 0.2090
  Epoch 8/10, Loss: 0.2074
  Epoch 10/10, Loss: 0.2049
Transformer特徴量を抽出中...
ベースライン手法（従来特徴量 + GPT-4o）を実行中...
SHAP分析により上位30特徴量を選択中...
SHAP選択完了: 23個の特徴量
Sequential Backward Selection (SBS)実行中...
選択された特徴量数: 11
選択された特徴量: ['gpt_difficulty', 'Num_7Characters', 'Rate_7Characters', 'freq', 'Num_Polysyllable', 'Num_ContentWords', 'Length_Word_Ave', 'Rate_ContentWords', 'Num_Monosyllable', 'Num_FunctionWords', 'freq_Min']
提案手法（ベースライン + Transformer視線特徴量）を実行中...
SHAP分析により上位30特徴量を選択中...
SHAP選択完了: 30個の特徴量
Sequential Backward Selection (SBS)実行中...
選択された特徴量数: 12
選択された特徴量: ['transformer30', 'transformer19', 'freq', 'transformer20', 'transformer21', 'transformer23', 'Num_7Characters', 'transformer24', 'Num_Polysyllable', 'transformer28', 'Num_Words', 'Length_Word_Ave']
ベースライン (GPT-4o) - F1:

  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(f'Output/GPT_Transformer/aupr/individual/F1_comparison_{experiment_number}.png', dpi=300, bbox_inches='tight')
  plt.savefig(f'Output/GPT_Transformer/aupr/individual/F1_comparison_{experiment_number}.png', dpi=300, bbox_inches='tight')


個別画像を保存しました:
  - Output/GPT_Transformer/aupr/individual/PR_curve_c07.png
  - Output/GPT_Transformer/aupr/individual/F1_comparison_c07.png
  - Output/GPT_Transformer/aupr/individual/improvement_analysis_c07.png
  - Output/GPT_Transformer/aupr/individual/difficulty_distribution_c07.png

結果を保存中...

処理完了！ 総実行時間: 154.81 分

結果保存先: Output/GPT_Transformer/
  - aupr/individual/ : 個別画像
  - analysis/words/ : 単語詳細
  - analysis/ : 分析結果JSON
  - selected_features/ : 選択特徴量
