In [None]:
import pandas as pd
import numpy as np

In [None]:
# Display up to 100 columns and 100 rows when a DataFrame is rendered.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

## データの読み込み

In [None]:
# Load train.csv, print its shape and preview the first rows.
df = pd.read_csv("../../kaggle/input/map-charting-student-math-misunderstandings/train.csv")
print(df.shape)
df.head()

In [None]:
# Build the `target` label as "<Category>:<Misconception>".
# Missing misconceptions are replaced by the literal string "NA" first so the
# concatenation never produces NaN.
df["Misconception"] = df["Misconception"].fillna("NA")
df["target"] = df["Category"] + ":" + df["Misconception"]

## 重複データの確認

In [None]:
# Rows whose (QuestionId, MC_Answer, StudentExplanation) occurs more than once.
duplicate_key = ["QuestionId", "MC_Answer", "StudentExplanation"]
is_duplicated = df.duplicated(subset=duplicate_key, keep=False)
duplicated_df = df[is_duplicated].sort_values(by=["QuestionId", "MC_Answer"])
print(duplicated_df.shape)

# Among those duplicates, keep the groups that carry more than one distinct `target`.
conflict_df = duplicated_df.groupby(duplicate_key).filter(
    lambda group: group["target"].nunique() > 1
)
conflict_df

In [None]:
# Share of conflicting rows per QuestionId.
# The count columns are named explicitly instead of relying on the column names
# that `value_counts().to_frame().reset_index()` happens to produce, because
# those names differ between pandas versions.
all_count = (
    df["QuestionId"]
    .value_counts()
    .rename_axis("QuestionId")
    .reset_index(name="count_all")
)
conflict_count = (
    conflict_df["QuestionId"]
    .value_counts()
    .rename_axis("QuestionId")
    .reset_index(name="count_conflict")
)
merged_count = all_count.merge(conflict_count, on="QuestionId", how="left")
merged_count["conflict_ratio"] = merged_count["count_conflict"] / merged_count["count_all"]
# Questions without any conflicting row have no match in the left join; report them as 0.
merged_count = merged_count.fillna(0)
merged_count.sort_values(by="conflict_ratio", ascending=False)

In [None]:
# Write the conflicting rows to CSV without the index column.
conflict_df.to_csv("../../output/conflict_data.csv", index=False)

## 回答選択肢ごとの値を確認

In [None]:
# Allow up to 200 characters per cell when pandas renders a column.
pd.set_option("display.max_colwidth", 200)

In [None]:
# For every (QuestionId, MC_Answer): number of rows, number of distinct
# Misconception values, and the distinct Misconception values themselves.
groupby_df = (
    df.groupby(["QuestionId", "MC_Answer"])
    .agg(
        count=("Misconception", "size"),
        misconception_nunique=("Misconception", "nunique"),
        misconception_unique=("Misconception", "unique"),
    )
    .reset_index()
)
groupby_df

## Embeddingの分析

In [None]:
# Load the precomputed similarity matrix and show its shape.
# NOTE(review): later cells index it by the row position of `df`, so it is
# presumably len(df) x len(df) and built in the same row order — confirm.
similarity_matrix = np.load("similarity_matrix.npy")
similarity_matrix.shape

In [None]:
# 類似度行列から直接top3の同一QuestionId・MC_Answerデータを取得してデータフレームに追加
def add_filtered_top_similarities(df, similarity_matrix, top_n=3):
    """Attach each row's most similar same-question, same-answer neighbours.

    Candidates for a row are the *other* rows that share its ``QuestionId`` and
    ``MC_Answer``. The ``top_n`` candidates with the highest similarity are
    written to ``top{k}_row_id``, ``top{k}_student_explanation``,
    ``top{k}_target`` and ``top{k}_similarity_score`` for k = 1..top_n.
    Slots that cannot be filled because the group has too few other members
    stay ``None``; a row is never reported as its own neighbour.

    Args:
        df: Frame with ``row_id``, ``QuestionId``, ``MC_Answer``,
            ``StudentExplanation`` and ``target`` columns. Modified in place.
        similarity_matrix: Square array-like; entry ``[i, j]`` is the similarity
            between the i-th and j-th rows of ``df`` (by position, not label).
        top_n: Number of neighbours to keep per row.

    Returns:
        The same ``df`` object with the ``top{k}_*`` columns added.

    Raises:
        ValueError: If ``similarity_matrix`` is not ``len(df) x len(df)``.
    """
    n_samples = len(df)
    similarity_matrix = np.asarray(similarity_matrix)
    if similarity_matrix.shape != (n_samples, n_samples):
        raise ValueError(
            f"similarity_matrix must have shape ({n_samples}, {n_samples}), "
            f"got {similarity_matrix.shape}"
        )

    # Values copied from the neighbour row, keyed by the output column suffix.
    source_values = {
        "row_id": df["row_id"].to_numpy(),
        "student_explanation": df["StudentExplanation"].to_numpy(),
        "target": df["target"].to_numpy(),
    }

    # Result buffers, pre-filled with None, created in the final column order.
    results = {}
    for rank in range(1, top_n + 1):
        for suffix in ("row_id", "student_explanation", "target", "similarity_score"):
            results[f"top{rank}_{suffix}"] = np.full(n_samples, None, dtype=object)

    # Only rows of the same (QuestionId, MC_Answer) group can be neighbours, so
    # rank inside each group instead of sorting the full similarity row.
    group_positions = df.groupby(["QuestionId", "MC_Answer"], sort=False).indices
    for positions in group_positions.values():
        positions = np.asarray(positions)
        group_size = len(positions)
        if group_size < 2:
            # A lone row has no candidate other than itself.
            continue

        block = similarity_matrix[np.ix_(positions, positions)].astype(float)
        # -inf on the diagonal guarantees a row never selects itself.
        np.fill_diagonal(block, -np.inf)

        n_keep = min(top_n, group_size - 1)
        # Descending similarity; the stable sort breaks ties by row position.
        order = np.argsort(-block, axis=1, kind="stable")[:, :n_keep]
        row_selector = np.arange(group_size)

        for rank in range(n_keep):
            local = order[:, rank]
            neighbours = positions[local]
            for suffix, values in source_values.items():
                results[f"top{rank + 1}_{suffix}"][positions] = values[neighbours]
            results[f"top{rank + 1}_similarity_score"][positions] = block[row_selector, local]

    for column, values in results.items():
        df[column] = values

    return df

# Run the neighbour search on a copy so `df` itself keeps its original columns.
df_with_filtered_similarity = add_filtered_top_similarities(
    df.copy(),
    similarity_matrix,
    top_n=3,
)

print("処理後のデータフレーム形状:", df_with_filtered_similarity.shape)

In [None]:
# Preview the first two rows of the result.
df_with_filtered_similarity.head(2)

In [None]:
# top1, top2, top3の各条件での分析
def analyze_target_consistency(df, condition_name, filter_condition):
    """Tabulate, per similarity band, how often `target` equals `top1_target`.

    Args:
        df: Frame with `target`, `top1_target` and `top1_similarity_score`.
        condition_name: Label printed in the section header.
        filter_condition: Boolean mask selecting the rows of `df` to analyse.

    Returns:
        A frame indexed by similarity band with `total_count`, `match_count`
        and `match_rate` columns, or None when the mask selects no rows.
        The table is also shown via `display`.
    """
    subset = df[filter_condition].copy()

    print(f"\n=== {condition_name} ===")
    print(f"元データ数: {len(df)}")
    print(f"フィルタ後データ数: {len(subset)}")

    if not len(subset):
        print("データが存在しません")
        return None

    # Band edges get finer towards 1.0; each band is labelled "low-high".
    band_edges = [0, 0.5, 0.7, 0.8, 0.85, 0.90, 0.92, 0.94, 0.96, 0.98, 0.99, 1.0]
    band_labels = [
        "0.0-0.5", "0.5-0.7", "0.7-0.8", "0.8-0.85", "0.85-0.90", "0.90-0.92",
        "0.92-0.94", "0.94-0.96", "0.96-0.98", "0.98-0.99", "0.99-1.0",
    ]

    subset["target_match"] = subset["target"] == subset["top1_target"]
    subset["similarity_range"] = pd.cut(
        subset["top1_similarity_score"],
        bins=band_edges,
        labels=band_labels,
    )

    # observed=False keeps bands that contain no rows in the table.
    per_band = subset.groupby("similarity_range", observed=False)["target_match"]
    match_rate = per_band.agg(["count", "sum", "mean"]).round(4)
    match_rate.columns = ["total_count", "match_count", "match_rate"]
    display(match_rate)

    return match_rate

# Run the consistency analysis under three progressively stricter filters.
top1_labels = df_with_filtered_similarity["top1_target"]
top2_labels = df_with_filtered_similarity["top2_target"]
top3_labels = df_with_filtered_similarity["top3_target"]

# Baseline: top1_target compared with itself.
condition0 = top1_labels == top1_labels
result0 = analyze_target_consistency(df_with_filtered_similarity, "top1_target == top1_target", condition0)

# Condition 1: the two nearest neighbours share a label.
condition1 = top1_labels == top2_labels
result1 = analyze_target_consistency(df_with_filtered_similarity, "top1_target == top2_target", condition1)

# Condition 2: the three nearest neighbours share a label.
condition2 = (top1_labels == top2_labels) & (top2_labels == top3_labels)
result2 = analyze_target_consistency(df_with_filtered_similarity, "top1_target == top2_target == top3_target", condition2)

In [None]:
# top1~top2が同一かつ平均スコアが閾値以上という条件での分析
def analyze_with_avg_score_threshold(df, threshold_list=[0.9, 0.92, 0.94, 0.96, 0.98]):
    """Match rate of the top-1 label for rows whose top-1 and top-2 labels agree.

    Rows are kept when `top1_target == top2_target`; their score is the mean of
    the top-1 and top-2 similarity scores. For each threshold, the rows scoring
    at or above it are summarised.

    Args:
        df: Frame with `target`, `top{1,2}_target`, `top{1,2}_similarity_score`.
        threshold_list: Minimum average scores to evaluate.

    Returns:
        One row per threshold that selected at least one row, with
        `threshold`, `total_count`, `match_count`, `match_rate` and
        `avg_similarity_score`. Thresholds selecting nothing are reported on
        stdout and omitted.
    """
    agreeing = df[df["top1_target"] == df["top2_target"]].copy()
    agreeing["avg_similarity_score"] = (
        agreeing["top1_similarity_score"] + agreeing["top2_similarity_score"]
    ) / 2

    summary_rows = []
    for cutoff in threshold_list:
        subset = agreeing[agreeing["avg_similarity_score"] >= cutoff].copy()
        n_rows = len(subset)
        if n_rows == 0:
            print(f"\n閾値 {cutoff}: データなし")
            continue

        # A row "matches" when its own label equals its nearest neighbour's label.
        subset["target_match"] = subset["target"] == subset["top1_target"]
        n_match = subset["target_match"].sum()

        summary_rows.append({
            "threshold": cutoff,
            "total_count": n_rows,
            "match_count": n_match,
            "match_rate": round(n_match / n_rows, 4),
            "avg_similarity_score": round(subset["avg_similarity_score"].mean(), 4),
        })

    return pd.DataFrame(summary_rows)

# Run the analysis and show the per-threshold summary.
result_df = analyze_with_avg_score_threshold(df_with_filtered_similarity)
result_df

## 予測データとの比較

In [None]:
# Load the prediction results and preview the first two rows.
score_df = pd.read_csv("merged_predictions_with_fold.csv")
score_df.head(2)

In [None]:
# Attach the per-row `is_correct` flag from the prediction file.
# An `is_correct` column left over from an earlier run of this cell is dropped
# first; otherwise a re-run would produce `is_correct_x` / `is_correct_y` and
# the later cells that read `is_correct` would fail.
score_result_df = score_df[["row_id", "is_correct"]].sort_values(by="row_id").reset_index(drop=True)
df_with_filtered_similarity = (
    df_with_filtered_similarity
    .drop(columns="is_correct", errors="ignore")
    .merge(score_result_df, on="row_id", how="left")
)
df_with_filtered_similarity.head(2)

In [None]:
# Nearest-neighbour label match rate and prediction accuracy above each similarity threshold
def analyze_with_avg_score_threshold(df, threshold_list=[0.9, 0.92, 0.94, 0.96, 0.98, 0.99], n_neighbors=1):
    """Summarise neighbour-label agreement and prediction accuracy per threshold.

    Rows are kept when they have a top-1 neighbour and, if ``n_neighbors > 1``,
    when the labels of the top ``n_neighbors`` neighbours are all equal. Their
    score is the mean similarity of those neighbours. With the default
    ``n_neighbors=1`` this is simply the top-1 similarity, which is what this
    definition computed before (its old comments described a top-1/top-2 check
    that the code did not perform); ``n_neighbors=2`` gives that top-1/top-2
    variant.

    NOTE(review): this re-uses the name of an earlier definition in the
    notebook and replaces it once this cell has run.

    Args:
        df: Frame with ``target``, ``top{k}_target`` and
            ``top{k}_similarity_score`` for k = 1..n_neighbors, and optionally
            ``is_correct``.
        threshold_list: Minimum average scores to evaluate.
        n_neighbors: Number of nearest neighbours that must agree and whose
            scores are averaged.

    Returns:
        One row per threshold that selected at least one row, with
        ``threshold``, ``total_count``, ``match_count``, ``match_rate``,
        ``pred_rate`` (only when ``df`` has ``is_correct``) and
        ``avg_similarity_score``. Thresholds selecting nothing are reported on
        stdout and omitted.

    Raises:
        ValueError: If ``n_neighbors`` is smaller than 1.
    """
    if n_neighbors < 1:
        raise ValueError("n_neighbors must be >= 1")

    target_cols = [f"top{k}_target" for k in range(1, n_neighbors + 1)]
    score_cols = [f"top{k}_similarity_score" for k in range(1, n_neighbors + 1)]

    # Rows without any neighbour carry a null top1_target and are excluded.
    selected = df["top1_target"].notna()
    for col in target_cols[1:]:
        selected = selected & (df["top1_target"] == df[col])

    df_filtered = df[selected].copy()
    # Score columns may be object dtype; cast so the mean is numeric.
    df_filtered["avg_similarity_score"] = df_filtered[score_cols].astype(float).mean(axis=1)

    has_predictions = "is_correct" in df_filtered.columns
    results = []

    for threshold in threshold_list:
        filtered_data = df_filtered[df_filtered["avg_similarity_score"] >= threshold]
        total_count = len(filtered_data)

        if total_count == 0:
            print(f"\n閾値 {threshold}: データなし")
            continue

        # A row "matches" when its own label equals its nearest neighbour's label.
        match_count = (filtered_data["target"] == filtered_data["top1_target"]).sum()

        result = {
            "threshold": threshold,
            "total_count": total_count,
            "match_count": match_count,
            "match_rate": round(match_count / total_count, 4),
        }
        if has_predictions:
            pred_count = filtered_data["is_correct"].sum()
            result["pred_rate"] = round(pred_count / total_count, 4)
        result["avg_similarity_score"] = round(filtered_data["avg_similarity_score"].mean(), 4)
        results.append(result)

    return pd.DataFrame(results)

# Run the analysis on the frame that now includes `is_correct` and show the summary.
result_df = analyze_with_avg_score_threshold(df_with_filtered_similarity)
result_df

## 予測データとembeddingラベルの不一致分析

In [None]:
# Extract the rows where the prediction outcome and the nearest-neighbour label disagree
def analyze_prediction_embedding_mismatch(df, min_avg_similarity=0.99, n_neighbors=1):
    """Compare prediction correctness with nearest-neighbour label agreement.

    Rows are kept when they have a top-1 neighbour and, if ``n_neighbors > 1``,
    when the labels of the top ``n_neighbors`` neighbours are all equal. Their
    score is the mean similarity of those neighbours; only rows scoring at
    least ``min_avg_similarity`` are analysed. With the default
    ``n_neighbors=1`` the score is the top-1 similarity, which is what the
    function computed before (its old comments described a top-1/top-2 check
    that the code did not perform).

    Two columns are added to the analysed rows:
    ``embedding_correct`` (``target == top1_target``) and
    ``pred_embedding_match`` (``is_correct == embedding_correct``).

    Args:
        df: Frame with ``target``, ``is_correct``, ``top{k}_target`` and
            ``top{k}_similarity_score`` for k = 1..n_neighbors.
        min_avg_similarity: Minimum average similarity for a row to be analysed.
        n_neighbors: Number of nearest neighbours that must agree and whose
            scores are averaged.

    Returns:
        ``(df_high_sim, mismatch_data)``: the analysed rows, and the subset of
        them where ``pred_embedding_match`` is False.

    Raises:
        ValueError: If ``n_neighbors`` is smaller than 1.
    """
    if n_neighbors < 1:
        raise ValueError("n_neighbors must be >= 1")

    # Rows without any neighbour carry a null top1_target and are excluded.
    selected = df["top1_target"].notna()
    for k in range(2, n_neighbors + 1):
        selected = selected & (df["top1_target"] == df[f"top{k}_target"])
    df_filtered = df[selected].copy()

    # Score columns may be object dtype; cast so the mean is numeric.
    score_cols = [f"top{k}_similarity_score" for k in range(1, n_neighbors + 1)]
    df_filtered["avg_similarity_score"] = df_filtered[score_cols].astype(float).mean(axis=1)

    df_high_sim = df_filtered[df_filtered["avg_similarity_score"] >= min_avg_similarity].copy()

    # "Embedding correct" means the nearest neighbour carries the row's own label.
    df_high_sim["embedding_correct"] = df_high_sim["target"] == df_high_sim["top1_target"]
    df_high_sim["pred_embedding_match"] = df_high_sim["is_correct"] == df_high_sim["embedding_correct"]

    print(f"高類似度データ数（avg_similarity >= {min_avg_similarity}）: {len(df_high_sim)}")
    print(f"予測とembeddingラベル一致数: {df_high_sim['pred_embedding_match'].sum()}")
    print(f"予測とembeddingラベル不一致数: {(~df_high_sim['pred_embedding_match']).sum()}")
    print(f"一致率: {df_high_sim['pred_embedding_match'].mean():.4f}")

    mismatch_data = df_high_sim[~df_high_sim["pred_embedding_match"]].copy()

    return df_high_sim, mismatch_data

# Run the analysis and report how many rows disagree.
df_analysis, mismatch_df = analyze_prediction_embedding_mismatch(df_with_filtered_similarity)
print(f"\n不一致データ数: {len(mismatch_df)}")

In [None]:
# 不一致パターンの詳細分析
def analyze_mismatch_patterns(mismatch_df):
    """Split mismatching rows into the two disagreement patterns and print a summary.

    Pattern 1 is "prediction correct, neighbour label wrong"; pattern 2 is
    "prediction wrong, neighbour label correct". For each pattern the row count
    and, when non-empty, the distribution of `target` are printed.

    Args:
        mismatch_df: Frame with `is_correct`, `embedding_correct` and `target`.

    Returns:
        `(pattern1_rows, pattern2_rows)` as two frames.
    """
    print("=== 不一致パターン分析 ===")

    prediction_right = mismatch_df["is_correct"] == True
    prediction_wrong = mismatch_df["is_correct"] == False
    neighbour_right = mismatch_df["embedding_correct"] == True
    neighbour_wrong = mismatch_df["embedding_correct"] == False

    pred_correct_embed_wrong = mismatch_df[prediction_right & neighbour_wrong]
    pred_wrong_embed_correct = mismatch_df[prediction_wrong & neighbour_right]

    print(f"パターン1 (予測正解 & embedding不正解): {len(pred_correct_embed_wrong)}件")
    print(f"パターン2 (予測不正解 & embedding正解): {len(pred_wrong_embed_correct)}件")

    # Label distribution per pattern; the header is always printed, the counts
    # only when the pattern has rows.
    sections = (
        ("\n=== パターン1のターゲット分布 ===", pred_correct_embed_wrong),
        ("\n=== パターン2のターゲット分布 ===", pred_wrong_embed_correct),
    )
    for header, rows in sections:
        print(header)
        if len(rows) > 0:
            print(rows["target"].value_counts())

    return pred_correct_embed_wrong, pred_wrong_embed_correct

# Split the mismatching rows into the two patterns.
pattern1_df, pattern2_df = analyze_mismatch_patterns(mismatch_df)

In [None]:
# 具体的な不一致事例を確認
def show_mismatch_examples(pattern_df, pattern_name, n_examples=5):
    """Print the `n_examples` rows of `pattern_df` with the highest average similarity.

    Args:
        pattern_df: Frame with the columns printed below plus
            `avg_similarity_score`.
        pattern_name: Label used in the section header.
        n_examples: Maximum number of rows to print.

    Returns:
        None. Output goes to stdout only.
    """
    print(f"\n=== {pattern_name} 具体例 (上位{n_examples}件) ===")

    if len(pattern_df) == 0:
        print("該当データなし")
        return

    # (caption, column) pairs printed for every example, in this order.
    printed_fields = (
        ("QuestionId", "QuestionId"),
        ("MC_Answer", "MC_Answer"),
        ("StudentExplanation", "StudentExplanation"),
        ("実際のTarget", "target"),
        ("Top1 Target", "top1_target"),
        ("Top1 Explanation", "top1_student_explanation"),
        ("予測結果 (is_correct)", "is_correct"),
        ("Embedding一致 (embedding_correct)", "embedding_correct"),
    )

    best_first = pattern_df.sort_values("avg_similarity_score", ascending=False)
    for label, example in best_first.head(n_examples).iterrows():
        # `label` is the frame's index label, not a running counter.
        print(f"\n--- 事例 {label} ---")
        for caption, column in printed_fields:
            print(f"{caption}: {example[column]}")
        print(f"平均類似度: {example['avg_similarity_score']:.4f}")
        print("-" * 50)

# Examples for pattern 1
show_mismatch_examples(pattern1_df, "パターン1: 予測正解 & embedding不正解")

# Examples for pattern 2
show_mismatch_examples(pattern2_df, "パターン2: 予測不正解 & embedding正解")

## 類似度0.98以上でラベルが異なるデータの抽出

In [None]:
# 類似度0.98以上でラベルが異なるデータを抽出
def extract_high_similarity_different_labels(df, similarity_threshold=0.98):
    """Return rows whose nearest neighbour is very similar but labelled differently.

    A row is selected when ``top1_similarity_score >= similarity_threshold``
    and ``target != top1_target``. Counts and the share of differing labels are
    printed; when no row reaches the threshold the share is reported as 0.00%
    instead of raising a ZeroDivisionError.

    Args:
        df: Frame with ``target``, ``top1_target`` and ``top1_similarity_score``.
        similarity_threshold: Minimum top-1 similarity for a row to be considered.

    Returns:
        A copy of the selected rows.
    """
    high_sim_df = df[df["top1_similarity_score"] >= similarity_threshold].copy()
    different_labels_df = high_sim_df[high_sim_df["target"] != high_sim_df["top1_target"]].copy()

    n_high = len(high_sim_df)
    n_different = len(different_labels_df)
    # Guard the ratio: an empty high-similarity set has no meaningful share.
    different_ratio = n_different / n_high * 100 if n_high else 0.0

    print(f"類似度{similarity_threshold}以上のデータ数: {n_high}")
    print(f"そのうちラベルが異なるデータ数: {n_different}")
    print(f"異なるラベルの割合: {different_ratio:.2f}%")

    return different_labels_df

# Extract the rows with a near-identical neighbour that carries a different label.
high_sim_diff_labels_df = extract_high_similarity_different_labels(df_with_filtered_similarity, 0.98)

# Inspect the extracted rows: shape, then the ten most frequent (target, top1_target) pairs.
print(f"\n抽出されたデータの形状: {high_sim_diff_labels_df.shape}")
if len(high_sim_diff_labels_df) > 0:
    print(f"\nラベルの組み合わせ:")
    label_combinations = high_sim_diff_labels_df[["target", "top1_target"]].value_counts()
    print(label_combinations.head(10))

In [None]:
# trainデータフレーム形式で近しいレコードを順に並べて出力
def create_paired_output(df, similarity_threshold=0.98):
    """List highly similar, differently labelled rows as adjacent pairs.

    For every row with ``top1_similarity_score >= similarity_threshold`` and
    ``target != top1_target``, two output rows are produced: the row itself
    (``pair_type == "original"``) followed by its top-1 neighbour
    (``pair_type == "similar"``). A pair is emitted once even when both
    members select each other.

    Args:
        df: Frame with the train columns (``row_id``, ``QuestionId``,
            ``QuestionText``, ``MC_Answer``, ``StudentExplanation``,
            ``Category``, ``Misconception``) plus ``target``, ``top1_row_id``,
            ``top1_target`` and ``top1_similarity_score``.
        similarity_threshold: Minimum top-1 similarity for a pair to be listed.

    Returns:
        A frame with the train columns, ``similarity_score`` and ``pair_type``.
        The columns are present even when no pair qualifies.
    """
    copied_columns = [
        "QuestionId", "QuestionText", "MC_Answer",
        "StudentExplanation", "Category", "Misconception",
    ]
    output_columns = ["row_id"] + copied_columns + ["similarity_score", "pair_type"]

    high_similarity_condition = df["top1_similarity_score"] >= similarity_threshold
    different_labels_condition = df["target"] != df["top1_target"]
    target_df = df[high_similarity_condition & different_labels_condition]

    print(f"類似度{similarity_threshold}以上でラベルが異なるデータ数: {len(target_df)}")

    # One lookup table keyed by row_id replaces a full-frame scan per record.
    # keep="first" selects the first row for a row_id, as the scan did.
    records_by_id = df.drop_duplicates(subset="row_id", keep="first").set_index("row_id")

    output_rows = []
    processed_pairs = set()

    for _, row in target_df.iterrows():
        # Order-independent key so (a, b) and (b, a) count as the same pair.
        pair_key = tuple(sorted([row["row_id"], row["top1_row_id"]]))
        if pair_key in processed_pairs:
            continue
        processed_pairs.add(pair_key)

        members = ((row["row_id"], "original"), (row["top1_row_id"], "similar"))
        for member_id, pair_type in members:
            record = records_by_id.loc[member_id]
            output_row = {"row_id": member_id}
            for column in copied_columns:
                output_row[column] = record[column]
            output_row["similarity_score"] = row["top1_similarity_score"]
            output_row["pair_type"] = pair_type
            output_rows.append(output_row)

    # Passing the columns keeps the schema (and the CSV header) for an empty result.
    return pd.DataFrame(output_rows, columns=output_columns)

# Build the paired (original / similar) frame.
paired_output_df = create_paired_output(df_with_filtered_similarity, 0.98)

# Output file name.
output_filename = "../../output/paired_high_similarity_different_labels_0.98.csv"

# Write the pairs to CSV.
paired_output_df.to_csv(output_filename, index=False, encoding='utf-8')

print(f"CSVファイルを出力しました: {output_filename}")
print(f"出力データ数: {len(paired_output_df)}")
print(f"ペア数: {len(paired_output_df)//2}")

# Show sample pairs. Rows come in (original, similar) order, so step by two.
if len(paired_output_df) > 0:
    print(f"\n--- サンプルデータ (最初の1ペア) ---")
    for i in range(0, min(4, len(paired_output_df)), 2):
        # The score column is "similarity_score"; the previous key '  _score'
        # does not exist in the frame and raised a KeyError.
        print(f"\n=== ペア {i//2 + 1} (類似度: {paired_output_df.iloc[i]['similarity_score']:.4f}) ===")

        # Original record
        orig = paired_output_df.iloc[i]
        print(f"[元レコード] row_id: {orig['row_id']}")
        print(f"StudentExplanation: {orig['StudentExplanation']}")
        print(f"Category: {orig['Category']}, Misconception: {orig['Misconception']}")

        # Similar record
        sim = paired_output_df.iloc[i+1]
        print(f"[類似レコード] row_id: {sim['row_id']}")
        print(f"StudentExplanation: {sim['StudentExplanation']}")
        print(f"Category: {sim['Category']}, Misconception: {sim['Misconception']}")
        print("-" * 80)