# Import packages

In [None]:
# Standard library
import re
import os
import json
import gc
# Third-party libraries
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt

# Machine learning and NLP
import evaluate
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer, pipeline

# API clients
from openai import OpenAI


# The main scoring section

## Set up GPT as a judge for human-like scoring

In [None]:
# Chat model used as the judge in the scoring functions below.
GPT_MODEL = "gpt-4o"

# Read the API key from a local text file; surrounding whitespace is stripped.
with open("/home/NE6131039/Desktop/Confidential_Key.txt", "r") as key_file:
    api_key = key_file.read().strip()

# Initialise the (new-style) OpenAI client with that key.
client = OpenAI(api_key=api_key)

In [None]:
def build_gpt_score_prompt(prediction: str, reference: str, question: str) -> str:
    """Build the judge prompt used to grade one predicted answer.

    The prompt describes a weighted rubric (factual accuracy, semantic
    completeness, technical precision), a score-band guide, and then embeds
    the question, the reference answer and the prediction verbatim.

    Args:
        prediction: Model answer to be graded.
        reference: Ground-truth answer.
        question: Question both answers respond to.

    Returns:
        The full prompt text, ending with an instruction to reply with a
        single float in the range 0.00-1.00.
    """
    return f"""
You are an expert in Transmission Electron Microscopy (TEM) image analysis.
Evaluate how well the predicted answer matches the reference answer for the given question.

### Evaluation Criteria:
1. **Factual Accuracy (50%)**: Is the core information scientifically correct?
   - For numerical values: Allow reasonable measurement tolerances (±10-20%)
   - For counts: Accept reasonable approximations within scientific context
   - For descriptions: Focus on scientific validity over exact phrasing
   
2. **Semantic Completeness (30%)**: Does the prediction fully address the question?
   - All key aspects mentioned in reference should be covered
   - Partial answers receive proportional scores
   
3. **Technical Precision (20%)**: Are scientific terms and concepts used correctly?
   - Proper TEM terminology and scientific language
   - Accurate use of materials science concepts

### Scoring Guidelines:
- **0.90-1.00**: Scientifically accurate, complete, and technically precise
- **0.70-0.89**: Mostly correct with minor inaccuracies or omissions
- **0.50-0.69**: Partially correct but missing important information
- **0.30-0.49**: Some relevant content but significant errors or gaps
- **0.00-0.29**: Largely incorrect, irrelevant, or completely missing key information

### Important Notes:
- Focus on scientific substance over linguistic similarity
- Consider context-appropriate terminology variations
- Evaluate based on TEM domain expertise

**Question:** {question}
**Reference:** {reference}
**Prediction:** {prediction}

**Provide a numerical score with exactly 2 decimal places (format: X.XX, range 0.00-1.00):**
**Examples: 0.85, 0.73, 0.91, 0.42**

Return only a float score
"""

def parse_gpt_score(content):
    """Extract a score in [0, 1] from the judge's raw reply.

    The prompt asks for a bare ``X.XX`` value, but replies sometimes carry
    extra text. Taking the first number blindly would turn
    "Criterion 1 ... score 0.42" into 1.0, so decimal-formatted values that
    already lie inside [0, 1] are preferred. If none exists, the first number
    found is clamped into [0, 1].

    Args:
        content: Reply text returned by the model.

    Returns:
        The score rounded to 2 decimals, or ``None`` when the text contains
        no number at all.
    """
    numbers = re.findall(r"\d+(?:\.\d+)?", content)
    if not numbers:
        return None

    # Preferred: something that looks like the requested X.XX format.
    for token in numbers:
        if "." in token and float(token) <= 1.0:
            return round(float(token), 2)

    # Fallback: first number, clamped into the valid range.
    return round(min(1.0, max(0.0, float(numbers[0]))), 2)


def gpt_score(prediction, reference, question):
    """Ask the GPT judge to grade ``prediction`` against ``reference``.

    Sends the rubric prompt to ``GPT_MODEL`` through the module-level
    ``client`` with ``temperature=0`` and parses the numeric reply.

    Args:
        prediction: Model answer to be graded.
        reference: Ground-truth answer.
        question: Question both answers respond to.

    Returns:
        A float in [0, 1] rounded to 2 decimals. Returns 0.0 when the request
        fails or no number can be parsed; a message is printed in both cases,
        so such rows can be told apart from genuine zero scores in the log.
    """
    prompt = build_gpt_score_prompt(prediction, reference, question)
    try:
        response = client.chat.completions.create(
            model=GPT_MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,
        )
        # `content` can be None; treat that as an unparsable empty reply
        # instead of letting `.strip()` fail with an opaque AttributeError.
        content = (response.choices[0].message.content or "").strip()

        score = parse_gpt_score(content)
        if score is not None:
            return score

        print(f"[GPT PARSE WARNING] Could not parse score from: {content}")

    except Exception as e:
        # Best effort: one failed request must not abort a long scoring run.
        print(f"[GPT ERROR] {e}")
    return 0.0

## Load in files & scoring metric

In [None]:
# Alternative input list (per-task files, judging by their names); currently disabled.
# answers = [
#     "train_Classification_pretrain_7b_predict_final_scored.csv",
#     "train_Recognition_pretrain_7b_predict_final_scored.csv",
#     "train_Reasoning_pretrain_7b_predict_final_scored.csv",
#     "train_Summary_pretrain_7b_predict_final_scored.csv"
# ]
# Prediction CSVs processed by the cells below. The scoring loop reads each one
# with pd.read_csv and uses its `type`, `question`, `expected` and `predicted` columns.
answers = [
    "pretrain_val_predict.csv",
    "finetune_no_curriculum_val_predict.csv",
    # "finetune_curriculum_val_predict.csv",
    "finetune_curriculum_final_val_predict.csv"
]

In [None]:
# GPU that hosts the SBERT encoder (CUDA device index 2).
device = torch.device("cuda:2")

# Lexical-overlap metrics, loaded through `evaluate.load`.
bleu, meteor, rouge = (
    evaluate.load(metric_name) for metric_name in ("bleu", "meteor", "rouge")
)

# Sentence-embedding model for cosine similarity, moved onto `device`.
# sbert_A = SentenceTransformer("all-MiniLM-L6-v2")
# sbert_A = sbert_A.to(device)
sbert = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
sbert = sbert.to(device)

# Token-level semantic metric.
bertscore = evaluate.load("bertscore")

## all data scoring

In [None]:
# ==== Main Loop ====
# Rows drawn per task type before scoring.
SAMPLES_PER_TYPE = 2500


def _clip_round(value):
    """Clamp ``value`` into [0, 1] and round it to 4 decimal places."""
    return round(min(max(value, 0.0), 1.0), 4)


# For every prediction file: sample rows per task type, then compute the GPT
# judge score, the lexical metrics (BLEU / ROUGE / METEOR), the semantic
# metrics (BERTScore / SBERT cosine) and a combined final score, and write the
# result next to the input as `<name>_scored.csv`.
for answer in answers:
    df = pd.read_csv(answer)

    # Stratified sample by `type`. `groupby.sample(n=...)` raises when a group
    # has fewer than n rows, so undersized types are kept whole instead.
    type_counts = df["type"].value_counts()
    small_types = type_counts[type_counts < SAMPLES_PER_TYPE]
    if small_types.empty:
        sample_df = df.groupby('type').sample(n=SAMPLES_PER_TYPE, random_state=42)
    else:
        print(
            f"[SAMPLING WARNING] {answer}: types with fewer than "
            f"{SAMPLES_PER_TYPE} rows are kept whole: {small_types.to_dict()}"
        )
        sample_df = pd.concat(
            [
                group.sample(n=min(len(group), SAMPLES_PER_TYPE), random_state=42)
                for _, group in df.groupby('type')
            ]
        )
    df = sample_df.reset_index(drop=True)

    references = df["expected"].astype(str).tolist()
    predictions = df["predicted"].astype(str).tolist()
    questions = df["question"].astype(str).tolist()

    # =======================
    #        GPT_SCORE
    # =======================
    # Stored as its own column; it is not part of `final_scores` below.
    gpt_scores = []
    for pred, ref, que in tqdm(zip(predictions, references, questions),
                               total=len(predictions), desc="GPT Scoring"):
        gpt_scores.append(gpt_score(pred, ref, que))

    df["gpt_score"] = gpt_scores

    # =======================
    #        LEXICAL
    # =======================
    lexical_scores = []
    bleu_combined_scores = []
    rouge_combined_scores = []
    meteor_scores = []

    for pred, ref in tqdm(zip(predictions, references), total=len(predictions)):
        try:
            # n-gram precisions for n = 1..4, weighted towards lower orders.
            bleu_scores_all = bleu.compute(predictions=[pred], references=[[ref]], max_order=4, smooth=True)
            precisions = bleu_scores_all.get("precisions", [0, 0, 0, 0])
            bleu_combined = (
                0.4 * precisions[0]
                + 0.3 * precisions[1]
                + 0.2 * precisions[2]
                + 0.1 * precisions[3]
            )

            # METEOR
            meteor_score = meteor.compute(predictions=[pred], references=[ref])["meteor"]

            # ROUGE: mean of ROUGE-1, ROUGE-2 and the average of ROUGE-L / ROUGE-Lsum.
            rouge_score = rouge.compute(predictions=[pred], references=[ref])
            rouge_1 = rouge_score.get("rouge1", 0.0)
            rouge_2 = rouge_score.get("rouge2", 0.0)
            rouge_l = rouge_score.get("rougeL", 0.0)
            rouge_lsum = rouge_score.get("rougeLsum", 0.0)
            rouge_combined = (rouge_1 + rouge_2 + ((rouge_l + rouge_lsum) / 2)) / 3

            score = (
                0.3 * rouge_combined +
                0.2 * bleu_combined +
                0.5 * meteor_score
            )
        except Exception as e:
            print(f"Error on sample: {e}")
            # Reset every per-row metric, not just `score`; otherwise the
            # values appended below would be left over from the previous row
            # (or undefined on the very first row).
            bleu_combined = 0.0
            rouge_combined = 0.0
            meteor_score = 0.0
            score = 0.0

        # Append metrics
        bleu_combined_scores.append(_clip_round(bleu_combined))
        rouge_combined_scores.append(_clip_round(rouge_combined))
        meteor_scores.append(_clip_round(meteor_score))
        lexical_scores.append(_clip_round(score))

    df["bleu_scores"] = bleu_combined_scores
    df["rouge_scores"] = rouge_combined_scores
    df["meteor_scores"] = meteor_scores
    df["lexical_scores"] = lexical_scores

    # =======================
    #        SEMANTIC
    # =======================
    bert_scores = []
    sbert_scores = []
    semantic_scores = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing {answer}"):
        pred = str(row["predicted"])
        ref = str(row["expected"])

        # --- BERTScore ---
        try:
            bert_result = bertscore.compute(
                predictions=[pred],
                references=[ref],
                lang="en",
                model_type="microsoft/deberta-xlarge-mnli",
                device="cuda:3"
            )
            bert_f1 = bert_result["f1"][0]
        except Exception as e:
            print(f"BERTScore error: {e}")
            bert_f1 = 0.0

        # --- SBERT cosine similarity ---
        # Pre-bind the names so the cleanup below cannot raise NameError when
        # encoding fails.
        emb_pred = emb_ref = None
        try:
            emb_pred = sbert.encode(pred, convert_to_tensor=True, device="cuda:2")
            emb_ref = sbert.encode(ref, convert_to_tensor=True, device="cuda:2")
            sim = util.cos_sim(emb_pred, emb_ref).item()
        except Exception as e:
            print(f"SBERT error: {e}")
            sim = 0.0

        # Average semantic score
        avg = (bert_f1 + sim) / 2

        # Accumulate
        bert_scores.append(_clip_round(bert_f1))
        sbert_scores.append(_clip_round(sim))
        semantic_scores.append(_clip_round(avg))

        # Optional: cleanup per-row
        del emb_pred, emb_ref
        torch.cuda.empty_cache()
        gc.collect()

    # Save
    df["bert_scores"] = bert_scores
    df["sbert_scores"] = sbert_scores
    df["semantic_scores"] = semantic_scores

    # =======================
    #      FINAL SCORE
    # =======================
    lexical_np = np.array(lexical_scores)
    semantic_np = np.array(semantic_scores)

    # Equal-weight average of the lexical and semantic scores.
    # NOTE(review): later cells in this file read a `final_score_average`
    # column from differently named files; confirm which column/file naming
    # is the intended one.
    final_scores = 0.5 * lexical_np + 0.5 * semantic_np
    df["final_scores"] = final_scores

    # df.drop(columns=["image", "question", "expected", "predicted"], inplace=True)
    df.to_csv(answer.replace(".csv", "_scored.csv"), index=False)

# Deal with difficulty

In [None]:
# Collect one array of final scores per file in `answers`.
# NOTE(review): this reads `final_score_average` from the files exactly as
# named in `answers`; confirm that column exists there (the scoring loop in
# this file writes `final_scores` to `*_scored.csv`).
all_final_average_scores = []
for answer in answers:
    df = pd.read_csv(answer)
    average_scores = np.array(df['final_score_average'])
    all_final_average_scores.append(average_scores)

# Labels and colours, one per group; also reused by the following cells.
colors = ["steelblue", "orange", "green", "purple"]
labels = ["Image Classification", "Microstructural Feature Detection", 
          "Feature Analysis & Interpretation", "Comprehensive Image Description"]

# Number of samples in each group.
sizes = [len(scores) for scores in all_final_average_scores]

# Everything below is sized from the data instead of assuming exactly four
# groups, so the cell also works when `answers` lists fewer files.
n_groups = len(sizes)
if n_groups > len(labels):
    raise ValueError(
        f"Got {n_groups} score files but only {len(labels)} task labels are defined"
    )

# Pie chart (left) + summary table (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Line-broken labels so they fit around the pie.
labels_short = ["Image Classification", "Microstructural\nFeature Detection",
               "Feature Analysis\n& Interpretation", "Comprehensive\nImage Description"]

# Pull every wedge slightly away from the centre.
explode = (0.05,) * n_groups
wedges, texts, autotexts = ax1.pie(sizes,
                                  labels=labels_short[:n_groups],
                                  colors=colors[:n_groups],
                                  autopct='%1.1f%%',
                                  startangle=90,
                                  textprops={'fontsize': 14},
                                  explode=explode,
                                  wedgeprops=dict(linewidth=2, edgecolor='white'))  # white wedge borders

ax1.set_title("Dataset Composition by Task Type", fontsize=16, fontweight='bold')

# Summary table: one row per group plus a total row.
ax2.axis('off')
total = sum(sizes)
table_data = [
    [label, f"{size:,}", f"{size/total*100:.1f}%"]
    for label, size in zip(labels, sizes)
]
table_data.append(["Total", f"{total:,}", "100.0%"])

table = ax2.table(cellText=table_data, 
                 colLabels=["Task Type", "Sample Count", "Percentage"],
                 cellLoc='left', loc='center',
                 colWidths=[0.5, 0.25, 0.25])  # relative column widths
table.auto_set_font_size(False)
table.set_fontsize(12)
table.scale(1.0, 2.0)  # taller rows, unchanged width

# Highlight the header row (table row 0) and the total row (last table row).
total_row = len(table_data)
for i in (0, total_row):
    for j in range(3):
        cell = table[(i, j)]
        cell.set_facecolor('#E6E6E6' if i == 0 else '#F0F0F0')
        cell.set_text_props(weight='bold')

plt.tight_layout()
plt.savefig("Dataset_composition.png", dpi=500, bbox_inches='tight')
plt.show()

In [None]:
# Histogram of final scores (as percentages) per group, drawn on a shared
# bin grid and a shared y-axis so the panels are directly comparable.
bin_edges = np.linspace(0, 100, 31)  # 30 equal-width bins over 0-100

n_groups = len(all_final_average_scores)
if n_groups > len(labels):
    raise ValueError(
        f"Got {n_groups} score arrays but only {len(labels)} task labels are defined"
    )

# First pass: find the tallest bin over all groups to size the common y-axis.
max_count = 0
all_counts_for_range = []
for scores in all_final_average_scores:
    counts, _ = np.histogram(scores * 100, bins=bin_edges)
    all_counts_for_range.append(counts)
    max_count = max(max_count, int(counts.max()))

# Common y-limit: tallest bin rounded up to the next thousand (at least 1000
# so the axis never collapses to zero height).
y_max = max(1000, int(np.ceil(max_count / 1000) * 1000))

# One panel per group; squeeze=False keeps `axes` an array even for one panel.
fig, axes = plt.subplots(1, n_groups, figsize=(20, 6), squeeze=False)
axes = axes.flatten()

# Per-group summary statistics.
stats_summary = []

for i in range(n_groups):
    percent = all_final_average_scores[i] * 100

    # Plot on the same bin edges that were used to compute `y_max`.
    # NOTE(review): values outside 0-100 fall outside these bins and are not
    # drawn; this assumes the scores lie in [0, 1].
    n, bins, patches = axes[i].hist(percent, bins=bin_edges, color=colors[i],
                                   alpha=0.7, edgecolor='black', linewidth=0.5)
    mean_val = np.mean(percent)
    std_val = np.std(percent)

    # Title carries the group name plus mean and standard deviation.
    axes[i].set_title(f'{labels[i]}\nμ={mean_val:.1f}, σ={std_val:.1f}', 
                     fontsize=16, fontweight='bold', pad=10)

    # Mark the mean.
    axes[i].axvline(mean_val, color='red', linestyle='--', linewidth=2, label='Mean')
    axes[i].legend(fontsize=8, loc='upper right')

    axes[i].set_xlabel('Score', fontsize=14, fontweight='bold')
    axes[i].set_ylabel('Count', fontsize=14, fontweight='bold')

    # Identical axis ranges on every panel.
    axes[i].set_xlim(0, 100)
    axes[i].set_ylim(0, y_max)

    axes[i].tick_params(axis='x', labelsize=12)
    axes[i].tick_params(axis='y', labelsize=12)

    axes[i].grid(True, alpha=0.3, linestyle='--')

    stats_summary.append({
        'Task Type': labels[i],
        'Sample Size': len(percent),
        'Mean': mean_val,
        'Std Dev': std_val,
        'Max Count': max(n)
    })

fig.suptitle('Task Difficulty Distribution Analysis\n(Unified Y-axis for quantity comparison)', 
             fontsize=20, fontweight='bold', y=0.95)

plt.tight_layout()
plt.subplots_adjust(top=0.65, hspace=0.3, wspace=0.3)  # leave room for the suptitle
plt.savefig("Difficulty_Distributions_with.png", dpi=500, bbox_inches='tight')
plt.show()

In [None]:
# Pool the scores of all groups (as percentages), split them into three
# difficulty levels by rank, and plot how the task types are distributed
# across those levels (stacked bar chart + count table).
all_scores = np.concatenate([scores * 100 for scores in all_final_average_scores])
all_task_types = []

# One task-type label per pooled score, in the same order as `all_scores`.
for i, scores in enumerate(all_final_average_scores):
    all_task_types.extend([labels[i]] * len(scores))

# Tertile split based on the sorted scores.
sorted_scores = np.sort(all_scores)
total_samples = len(sorted_scores)

# Cut-offs: the scores found at one third and two thirds of the sorted array.
hard_threshold = sorted_scores[total_samples // 3]
medium_threshold = sorted_scores[2 * total_samples // 3]

def classify_difficulty_tertile(score):
    """Return "Hard", "Medium" or "Easy" for a score; lower scores are harder."""
    if score <= hard_threshold:
        return "Hard"
    elif score <= medium_threshold:
        return "Medium"
    else:
        return "Easy"

# Label every pooled score.
difficulties = [classify_difficulty_tertile(score) for score in all_scores]

# One row per sample: score, task type, difficulty.
df_curriculum = pd.DataFrame({
    'score': all_scores,
    'task_type': all_task_types,
    'difficulty': difficulties
})

# Count table: rows are difficulty levels, columns are task types.
task_difficulty_counts = df_curriculum.groupby(['difficulty', 'task_type']).size().unstack(fill_value=0)

# Figure: stacked bar chart plus count table.
fig = plt.figure(figsize=(18, 10))

# Axes placed explicitly as [left, bottom, width, height] in figure fractions.
ax1 = fig.add_axes([0.1, 0.2, 0.4, 0.6])   # bar chart: starts at 10%, 40% wide
ax2 = fig.add_axes([0.6, 0.1, 0.2, 0.6])  # table: starts at 60%, 20% wide

difficulty_order = ['Easy', 'Medium', 'Hard']

# Stacked bars: `bottom` holds the running height of each difficulty column.
bottom = np.zeros(len(difficulty_order))
bar_width = 0.6  # bar width

for i, task_type in enumerate(labels):
    if task_type in task_difficulty_counts.columns:
        # Counts of this task type per difficulty (0 when a level is absent).
        values = [task_difficulty_counts.loc[diff, task_type] if diff in task_difficulty_counts.index else 0 
                 for diff in difficulty_order]
        
        bars = ax1.bar(difficulty_order, values, bottom=bottom, width=bar_width,
                      color=colors[i], alpha=0.8, label=task_type, 
                      edgecolor='white', linewidth=1)
        
        # Annotate each segment with its share of that difficulty column.
        for j, (bar, value) in enumerate(zip(bars, values)):
            if value > 0:
                total_for_difficulty = sum([task_difficulty_counts.loc[difficulty_order[j], task] 
                                          if task in task_difficulty_counts.columns and difficulty_order[j] in task_difficulty_counts.index else 0 
                                          for task in labels])
                percentage = (value / total_for_difficulty) * 100 if total_for_difficulty > 0 else 0
                
                # Skip the text on segments of 1% or less.
                if percentage > 1:
                    y_pos = bottom[j] + value / 2
                    ax1.text(j, y_pos, f'{percentage:.1f}%', 
                            ha='center', va='center', fontsize=10, fontweight='bold',
                            color='white',
                            bbox=dict(boxstyle='round,pad=0.2', facecolor='black', alpha=0.7))
        
        # Must happen after the annotations above, which use the old `bottom`.
        bottom += values

# Bar chart cosmetics.
ax1.set_xlabel('Difficulty Level', fontsize=12, fontweight='bold')
ax1.set_ylabel('Sample Count', fontsize=12, fontweight='bold')
ax1.legend(title='Task Types', fontsize=10, title_fontsize=11, 
          bbox_to_anchor=(1.02, 1), loc='upper left')
ax1.grid(True, alpha=0.3, axis='y')

# Total count above each stacked column.
for i, diff in enumerate(difficulty_order):
    if diff in task_difficulty_counts.index:
        total_for_diff = sum([task_difficulty_counts.loc[diff, task] 
                             if task in task_difficulty_counts.columns else 0 
                             for task in labels])
        ax1.text(i, total_for_diff + max(bottom) * 0.02, f'Total: {total_for_diff:,}', 
                ha='center', va='bottom', fontsize=11, fontweight='bold')

# Right-hand panel: detailed count table.
ax2.axis('off')

# Table contents: header has a difficulty column, four task columns and a total column.
table_data = []
headers = ['Difficulty\nLevel'] + ['Image\nClassification', 'Feature\nDetection', 'Feature\nAnalysis', 'Image\nDescription'] + ['Total']

for diff in difficulty_order:
    row_data = [diff]
    total_for_diff = 0
    for task_type in labels:
        if task_type in task_difficulty_counts.columns and diff in task_difficulty_counts.index:
            count = task_difficulty_counts.loc[diff, task_type]
        else:
            count = 0
        row_data.append(f"{count:,}")
        total_for_diff += count
    row_data.append(f"{total_for_diff:,}")
    table_data.append(row_data)

# Final row: totals per task type and the grand total.
total_row = ['Total']
for task_type in labels:
    if task_type in task_difficulty_counts.columns:
        total_for_task = sum(task_difficulty_counts[task_type])
    else:
        total_for_task = 0
    total_row.append(f"{total_for_task:,}")
total_row.append(f"{len(all_scores):,}")
table_data.append(total_row)

# Build the table.
table = ax2.table(cellText=table_data, colLabels=headers,
                 cellLoc='center', loc='center',
                 colWidths=[0.2, 0.2, 0.2, 0.2, 0.2, 0.2])
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1.5, 3)

# Table styling: table row 0 is the header, the last row is the totals row.
for i in range(len(table_data) + 1):
    for j in range(len(headers)):
        cell = table[(i, j)]
        if i == 0:  # header row
            if j == 0:  # difficulty-level column
                cell.set_facecolor('#E6E6E6')
            elif 1 <= j <= 4:  # task columns use the same colours as the bars
                cell.set_facecolor(colors[j-1])
            else:  # total column
                cell.set_facecolor('#E6E6E6')
            cell.set_text_props(weight='bold', fontsize=10)
        elif i == len(table_data):  # totals row
            cell.set_facecolor('#F0F0F0')
            cell.set_text_props(weight='bold', fontsize=10)

# Figure title.
fig.suptitle('Curriculum Learning Analysis: Task Type Distribution Across Difficulty Levels', 
             fontsize=16, fontweight='bold', y=0.9)

plt.tight_layout()
plt.subplots_adjust(top=0.85)
plt.savefig("curriculum_dataset.png", dpi=500, bbox_inches='tight')
plt.show()

# For 3 level training data

In [None]:
# Read every "<name>_final_scored.csv" derived from `answers` and stack them
# into a single frame with a fresh 0..N-1 index.
scored_paths = [name.replace(".csv", "_final_scored.csv") for name in answers]
df_list = list(map(pd.read_csv, scored_paths))
df = pd.concat(df_list, ignore_index=True)

# Make sure the columns needed for the export are present.
required_columns = {"image", "question", "expected", "final_score_average"}
assert required_columns <= set(df.columns), f"缺少欄位：{required_columns - set(df.columns)}"

# Express the score as a percentage and take its 1/3 and 2/3 quantiles as the
# difficulty cut-offs.
df["score"] = df["final_score_average"].mul(100)
score_column = df["score"]
q1 = score_column.quantile(1/3)
q2 = score_column.quantile(2/3)

def assign_difficulty(score):
    """Label a percentage score using the tertile cut-offs ``q1`` and ``q2``.

    Scores up to ``q1`` are "Hard", scores up to ``q2`` are "Medium", and
    everything else is "Easy".
    """
    if score <= q1:
        return "Hard"
    if score <= q2:
        return "Medium"
    return "Easy"

# Label every row.
df["difficulty"] = df["score"].map(assign_difficulty)

# Builder for one LLaVA-format training record.
def make_convo(row):
    """Convert a dataframe row into a LLaVA conversation dict.

    The id combines the upper-cased difficulty with the row label zero-padded
    to six digits; the human turn is the question prefixed by an ``<image>``
    line, and the gpt turn is the expected answer.
    """
    sample_id = f"{row['difficulty'].upper()}_{row.name:06d}"
    image_ref = f"{row['image']}"
    human_turn = {"from": "human", "value": f"<image>\n{row['question']}"}
    gpt_turn = {"from": "gpt", "value": row["expected"]}
    return {
        "id": sample_id,
        "image": image_ref,
        "conversations": [human_turn, gpt_turn],
    }

# Write one file per difficulty level. Each file is a single JSON array
# (json.dump of a list), not JSON Lines, hence the ".json" extension.
for level in ["Hard", "Medium", "Easy"]:
    subset = df[df["difficulty"] == level]
    items = [make_convo(row) for _, row in subset.iterrows()]

    with open(f"llava_train_{level.lower()}.json", "w", encoding="utf-8") as f:
        json.dump(items, f, ensure_ascii=False, indent=2)

# The message names the files as they are actually written (.json, not .jsonl).
print("✅ 已輸出為 LLaVA 格式：llava_train_hard.json / medium / easy")