In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Hugging Face Hub id of the checkpoint used for annotation.
model_name = "Qwen/Qwen3-14B"

# Tokenizer and model are loaded from the same checkpoint id so they stay in sync.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda:0",  # place the whole model on the first CUDA device
    dtype=torch.float16,  # load the weights in half precision
)


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [2]:
import pandas as pd

# Load the transcript from a CSV file in the current working directory.
df = pd.read_csv("t1.csv")
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,Speaker,Timestamp_Minutes,Unnamed: 2,Utterance,Proposing Strategies and Plans
0,Chris,0.0,3.0,if you guys can put this microphone on your wh...,
1,Natalie,0.0,20.0,Shall we .. Did you guys read it or-,
2,Henry,0.0,22.0,Yeah,
3,Ryan,0.0,23.0,Yeah,
4,TA,,3.0,Don't forgot to put in your name,


In [3]:
# Keep only the four named columns; the "Unnamed: ..." columns visible in the
# preview of the raw frame are left out.
df_clean = df[[
    "Speaker",
    "Timestamp_Minutes",
    "Utterance",
    "Proposing Strategies and Plans"
]]

In [4]:
# Rebind `df` to the trimmed frame; later cells read utterances from `df`.
# NOTE(review): after this cell the raw CSV frame is no longer reachable by name.
df= df_clean

In [5]:
def build_prompt(utterance):
    """Render the zero-shot annotation prompt for one utterance.

    The prompt describes the classroom context and the definition of the
    "proposing strategies or plans" code, embeds the utterance between
    triple double-quotes, and asks for a JSON answer of the form
    {"label": "YES"} or {"label": "NO"}.

    Args:
        utterance: The utterance to annotate; it is inserted using its
            default string formatting.

    Returns:
        The full prompt text as a single string.
    """
    # Doubled braces are literal braces once the template is formatted.
    template = """
You are annotating classroom discussion transcripts.

Task:
Decide whether the following utterance involves *proposing strategies or plans*.

Context: 
Students are working in groups on activities to learn about kepler's first law of planetary motion. There is a pen and paper activity (that uses pins, paper, pencil, string) for them to understand how draw an elliptical orbit and then a computer aspect where they work on various immersive computer simulation activities to develop a final claim that orbits are elliptical. The learning objective is for them to work collaboratively to discover this new knowledge through hands on activities.

Definition:
- Articulating specific steps, strategies, or procedures required to organize or accomplish the group's task.
- Look for utterances that set direction or specify how to complete an activity (often using procedural or sequential language). Exclude cases where the speaker is merely following instructions read aloud after being prompted by a peer 

Utterance:
\"\"\"{utterance}\"\"\"

Respond ONLY in valid JSON.
Do NOT include any explanation or extra text.

Format:
{{"label": "YES"}} or {{"label": "NO"}}

"""
    return template.format(utterance=utterance)


In [6]:
# Keep just the utterance text and drop rows where it is missing.
df_text = df[["Utterance"]].dropna()


In [7]:
# Smoke test: run the prompt on the first utterance and print the raw
# generation before annotating the whole transcript.
utterance = df_text.iloc[0]["Utterance"]

# Build the prompt once and reuse it in the chat messages.
prompt = build_prompt(utterance)

messages = [
    {"role": "system", "content": "You are an annotation assistant."},
    {"role": "user", "content": prompt},
]

text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # key: end the prompt with the assistant turn so the model answers
)

# Send the inputs to whichever device the model was loaded on.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

out = model.generate(
    **inputs,
    max_new_tokens=1000,
    do_sample=True,  # sampling is on, so reruns can give different text
    temperature=0.6,
)

# Decodes the whole sequence, i.e. the prompt followed by the generated text.
print(tokenizer.decode(out[0], skip_special_tokens=True))



system
You are an annotation assistant.
user

You are annotating classroom discussion transcripts.

Task:
Decide whether the following utterance involves *proposing strategies or plans*.

Context: 
Students are working in groups on activities to learn about kepler's first law of planetary motion. There is a pen and paper activity (that uses pins, paper, pencil, string) for them to understand how draw an elliptical orbit and then a computer aspect where they work on various immersive computer simulation activities to develop a final claim that orbits are elliptical. The learning objective is for them to work collaboratively to discover this new knowledge through hands on activities.

Definition:
- Articulating specific steps, strategies, or procedures required to organize or accomplish the group's task.
- Look for utterances that set direction or specify how to complete an activity (often using procedural or sequential language). Exclude cases where the speaker is merely following instr

In [8]:
import re
import json
import torch

# Matches a flat JSON object containing "label": "YES" or "label": "NO";
# group 1 is the label value itself.
_LABEL_RE = re.compile(r'\{[^{}]*"label"\s*:\s*"(YES|NO)"[^{}]*\}')


def _split_think_and_label(gen_text):
    """Split generated text into (reasoning, label).

    The label is only read from the text that follows the reasoning block,
    so a JSON snippet quoted while reasoning cannot be mistaken for the
    final answer.
    """
    head, closing_tag, tail = gen_text.partition("</think>")

    if closing_tag:
        # Reasoning finished: keep what lies between the tags (the opening
        # tag may be absent from the generated part) and answer from the rest.
        reasoning = head.partition("<think>")[2] if "<think>" in head else head
        think_text = reasoning.strip()
        answer_text = tail
    elif "<think>" in gen_text:
        # Reasoning was opened but never closed (e.g. the token budget ran
        # out), so no final answer was produced.
        think_text = None
        answer_text = ""
    else:
        # No reasoning block at all: the whole generation is the answer.
        think_text = None
        answer_text = gen_text

    label_match = _LABEL_RE.search(answer_text)
    # The regex already captured the label, so there is no need to re-parse
    # the span as JSON (which could raise on slightly malformed output).
    label = label_match.group(1) if label_match else None
    return think_text, label


def annotate_with_think(utterance, tokenizer, model, max_new_tokens=1000, verbose=True):
    """Annotate one utterance and return the model's reasoning and label.

    Builds a chat prompt from `build_prompt(utterance)`, generates a
    completion with sampling (temperature 0.6, so repeated calls can
    differ), and parses the newly generated text.

    Args:
        utterance: Text of the utterance to annotate.
        tokenizer: Tokenizer providing `apply_chat_template`, `__call__`
            and `decode`.
        model: Causal LM providing `generate` and a `device` attribute.
        max_new_tokens: Maximum number of tokens to generate.
        verbose: When true (the default), print the full decoded sequence
            (prompt plus generation).

    Returns:
        A `(think_text, label)` tuple. `think_text` is the stripped content
        of the reasoning block, or None when there is no completed block.
        `label` is "YES" or "NO" taken from the answer after the reasoning
        block, or None when no label was produced.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are an annotation assistant. "
            ),
        },
        {"role": "user", "content": build_prompt(utterance)},
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Send the inputs to whichever device the model lives on.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.6,
    )
    if verbose:
        print(tokenizer.decode(out[0], skip_special_tokens=True))

    # Keep only the newly generated part (drop the prompt tokens).
    gen_ids = out[0][inputs["input_ids"].shape[-1]:]
    gen_text = tokenizer.decode(gen_ids, skip_special_tokens=True)

    return _split_think_and_label(gen_text)


In [9]:
# Rebuild the utterance-only frame (rows with a missing utterance dropped) as
# an explicit copy, so it is independent of `df`.
df_text = df[["Utterance"]].dropna().copy()


In [10]:
import os
import pandas as pd

def append_to_csv(rows, out_path):
    """Append records to a CSV checkpoint, writing the header only once.

    Args:
        rows: Records accepted by `pd.DataFrame` (e.g. a list of dicts).
        out_path: Path of the CSV file to create or extend.

    An empty `rows` is a no-op, so an empty batch never creates a file
    without a header. The header is written when the file does not exist
    yet or exists but is empty; otherwise rows are appended below the
    existing content.
    """
    df_new = pd.DataFrame(rows)
    if df_new.empty:
        return

    # A zero-byte file has no header yet, so treat it like a missing file.
    needs_header = not os.path.exists(out_path) or os.path.getsize(out_path) == 0
    df_new.to_csv(out_path, mode="a", header=needs_header, index=False)

In [11]:
from tqdm import tqdm

def run_annotation_in_batches(
    df_text,
    tokenizer,
    model,
    start_idx=0,
    batch_size=10,
    out_path="qwen3_annotations_checkpoint.csv",
):
    """Annotate utterances in batches, checkpointing each batch to CSV.

    Args:
        df_text: DataFrame with an "Utterance" column; rows are addressed
            by position (`iloc`).
        tokenizer: Passed through to `annotate_with_think`.
        model: Passed through to `annotate_with_think`.
        start_idx: Position in `df_text` at which to start.
        batch_size: Number of utterances annotated between two saves.
        out_path: CSV file that receives the results via `append_to_csv`.

    Each saved row has the keys "index", "Utterance", "think" and "label".
    "index" is the row's position within `df_text`, not its DataFrame index
    label. If annotation is interrupted or fails part-way through a batch,
    the rows already completed in that batch are saved before the error
    propagates.
    """
    n = len(df_text)

    for batch_start in range(start_idx, n, batch_size):
        batch_end = min(batch_start + batch_size, n)
        batch_rows = []

        print(f"\n▶ Processing rows {batch_start}–{batch_end - 1}")

        try:
            for i in range(batch_start, batch_end):
                utt = df_text.iloc[i]["Utterance"]

                think, label = annotate_with_think(
                    utt,
                    tokenizer,
                    model,
                )

                batch_rows.append({
                    "index": i,
                    "Utterance": utt,
                    "think": think,
                    "label": label,
                })
        except BaseException:
            # Includes KeyboardInterrupt: keep the rows finished so far in
            # this batch instead of losing them, then let the error surface.
            if batch_rows:
                append_to_csv(batch_rows, out_path)
                print(f"⚠ Interrupted; saved partial batch ending at index {batch_rows[-1]['index']}")
            raise

        # Save right after every batch so at most one batch is ever at risk.
        append_to_csv(batch_rows, out_path)
        print(f"✔ Saved batch ending at index {batch_end - 1}")

    print("\n✅ All done.")


In [12]:

# Annotate from position 450 of df_text onwards, saving every 10 rows to a
# dedicated output file.
# NOTE(review): start_idx=450 presumably resumes an earlier partial run — confirm.
run_annotation_in_batches(
    df_text,
    tokenizer,
    model,
    start_idx=450,
    batch_size=10,
    out_path="zeroshot_qwen3_annotations450.csv",
)



▶ Processing rows 450–459
system
You are an annotation assistant. 
user

You are annotating classroom discussion transcripts.

Task:
Decide whether the following utterance involves *proposing strategies or plans*.

Context: 
Students are working in groups on activities to learn about kepler's first law of planetary motion. There is a pen and paper activity (that uses pins, paper, pencil, string) for them to understand how draw an elliptical orbit and then a computer aspect where they work on various immersive computer simulation activities to develop a final claim that orbits are elliptical. The learning objective is for them to work collaboratively to discover this new knowledge through hands on activities.

Definition:
- Articulating specific steps, strategies, or procedures required to organize or accomplish the group's task.
- Look for utterances that set direction or specify how to complete an activity (often using procedural or sequential language). Exclude cases where the speak