# AP Study Assistant — Bilingual Notes & Mind Map (Colab)

本 Notebook 版**可直接在 Colab 运行**：从输入文本生成**中英对照笔记**（英文原句 + 中文术语注释 gloss）、**Mermaid 思维导图**，并输出**词频图表**与可直接上传到 GitHub 的项目压缩包。

### 你能得到
- `output/notes_bilingual.md`：中英对照（演示：中文为术语注释）。
- `output/mindmap.mmd`：Mermaid 思维导图（GitHub 可直接渲染）。
- `assets/wordfreq.png`：词频柱状图（效果图）。
- `assets/notes_preview.png`：笔记预览图（效果图）。
- `ap-note-gen.zip`：含 README、源码、assets、sample_data 和 output 的**完整项目压缩包**，可直接上传到 GitHub。

> 默认不需要外网依赖；如需“真正的机器翻译”和“PDF 文本提取”，可在后面的**可选进阶**单元中安装 `transformers` / `pymupdf`。


In [ ]:
#@title 1) 基础导入 & 目录准备（必须先运行）
from pathlib import Path
import re, os
from collections import Counter
import matplotlib.pyplot as plt

# Project layout; every path is relative to the current working directory.
ROOT = Path('.')
ASSETS, OUTPUT, SAMPLE = (ROOT / name for name in ('assets', 'output', 'sample_data'))
SRC = ROOT / 'src' / 'ap_note_gen'

# Create each directory together with any missing parents; directories that
# already exist are left untouched.
for directory in (ASSETS, OUTPUT, SRC, SAMPLE):
    directory.mkdir(parents=True, exist_ok=True)
print('Dirs ready:', ASSETS, OUTPUT, SRC, SAMPLE)


In [ ]:
#@title 2) 核心函数：分句、摘要、术语注释、导出
# English function words that are skipped when counting and scoring tokens.
STOP = set(
    """
    a an the and or but if while to for of in on by with from as
    about across after against among around at before behind below beneath
    beside between beyond during except inside into like near off onto
    outside over past since through toward under until up upon within without
    is are was were be been being
    this that these those not no nor so such than then too very
    can could should would may might must do does did having have has
    it its it's they them he she we you i our your their
    """.split()
)

def simple_sentence_split(text: str):
    """Split ``text`` into sentences using a punctuation heuristic.

    Runs of whitespace (including newlines) are first collapsed to a single
    space. The text is then cut at every whitespace run that directly follows
    ``.``, ``!`` or ``?``; the punctuation stays with the preceding sentence.

    Returns a list of non-empty, stripped sentence strings.
    """
    flattened = re.sub(r"\s+", " ", text.strip())
    sentences = []
    for piece in re.split(r"(?<=[\.!\?])\s+", flattened):
        piece = piece.strip()
        if piece:
            sentences.append(piece)
    return sentences

def tokenize(text: str):
    """Lower-case ``text`` and return its whitespace-separated tokens.

    Every character other than ``a-z``, ``0-9``, whitespace and ``-`` is
    replaced by a space before splitting, so punctuation acts as a separator
    while hyphenated words stay in one piece.
    """
    cleaned = re.sub(r"[^a-z0-9\s\-]", " ", text.lower())
    # str.split() with no argument never yields empty strings.
    return cleaned.split()

def score_sentences(sentences):
    """Score each sentence by the average weight of its non-stop-word tokens.

    A token's weight is its frequency across all sentences divided by the
    frequency of the most common token, so weights lie in (0, 1]. A sentence's
    score is the sum of its token weights divided by its token count.

    Returns a list of floats aligned with ``sentences``; all zeros when no
    sentence contains a token outside ``STOP``.
    """
    # Tokenise once and reuse the result for both counting and scoring.
    content = [[tok for tok in tokenize(s) if tok not in STOP] for s in sentences]
    freq = Counter(tok for toks in content for tok in toks)
    if not freq:
        return [0.0] * len(sentences)

    peak = max(freq.values())
    weight = {tok: count / peak for tok, count in freq.items()}

    # The 1e-9 term keeps the division defined for sentences with no tokens.
    return [
        sum(weight.get(tok, 0) for tok in toks) / (len(toks) + 1e-9)
        for toks in content
    ]

def top_k_summary(sentences, k=6):
    """Pick the ``k`` highest-scoring sentences, returned in document order.

    Sentences are ranked with ``score_sentences``; the selected ones are then
    re-sorted by their original position so the summary reads in the same
    order as the source text. An empty input yields an empty list.
    """
    if not sentences:
        return []
    scores = score_sentences(sentences)
    ranked = sorted(range(len(sentences)), key=scores.__getitem__, reverse=True)
    return [sentences[pos] for pos in sorted(ranked[:k])]

# Demo glossary: lower-case English statistics term -> Chinese gloss.
CN_GLOSS = {
    "experiment": "实验", "experimental": "实验的",
    "design": "设计", "random": "随机",
    "randomized": "随机化", "assignment": "分配",
    "comparative": "对比", "control": "控制",
    "confounding": "混杂", "confounders": "混杂因素",
    "variable": "变量", "variables": "变量",
    "response": "应变量", "explanatory": "自变量",
    "blocking": "分组(区组)", "block": "区组",
    "matched": "匹配", "pairs": "配对",
    "replication": "重复(样本量)", "bias": "偏差",
    "blinding": "盲法", "double-blind": "双盲",
    "single-blind": "单盲", "placebo": "安慰剂",
    "precision": "精度", "variability": "变异",
    "population": "总体", "sample": "样本",
    "generalization": "外推(泛化)", "causal": "因果的",
    "inference": "推断", "scope": "适用范围",
}


def gloss_translate(en_sentence: str) -> str:
    """Annotate glossary terms in an English sentence with their Chinese gloss.

    The sentence is split on whitespace. Each word is looked up in
    ``CN_GLOSS`` after removing everything except ASCII letters and hyphens
    and lower-casing it. A hit is rendered as ``word(gloss)`` with the original
    word (punctuation included) kept verbatim; other words pass through
    unchanged. Words are re-joined with single spaces.
    """
    def annotate(word: str) -> str:
        key = re.sub(r"[^a-zA-Z\-]", "", word).lower()
        gloss = CN_GLOSS.get(key)
        return f"{word}({gloss})" if gloss else word

    return " ".join(annotate(word) for word in en_sentence.split())

def build_bilingual_notes(summary_sentences):
    """Pair every summary sentence with its glossed version.

    Returns a list of dicts, one per sentence and in the same order, with the
    original sentence under ``"en"`` and the ``gloss_translate`` output under
    ``"zh"``.
    """
    return [
        {"en": sentence, "zh": gloss_translate(sentence)}
        for sentence in summary_sentences
    ]

def export_markdown(notes, out_path: str):
    """Write the bilingual notes to ``out_path`` as a UTF-8 Markdown file.

    The file starts with a title and a short blockquote, followed by one
    numbered section per note holding its ``"en"`` and ``"zh"`` text. Each
    section ends with a blank line.
    """
    header = [
        "# AP Bilingual Notes (Auto)",
        "",
        "> 本文件由脚本自动生成：上为原文句子，下为带中文术语提示的对照（示范模式）。",
        "",
    ]
    sections = []
    for number, note in enumerate(notes, start=1):
        sections += [
            f"## {number}.",
            "**EN**: " + note["en"],
            "**ZH(Gloss)**: " + note["zh"],
            "",
        ]
    Path(out_path).write_text("\n".join(header + sections), encoding="utf-8")

def export_mermaid_mindmap(notes, out_path: str, title="AP Experimental Design"):
    """Write a one-level Mermaid mindmap of the notes' most frequent words.

    The English text of all notes is lower-cased and scanned for runs of
    letters and hyphens; stop words are dropped and the ten most common
    remaining tokens become child nodes of a root node labelled ``title``.
    The diagram is wrapped in a ```` ```mermaid ```` code fence and saved to
    ``out_path`` as UTF-8.
    """
    corpus = " ".join(note["en"] for note in notes).lower()
    freq = Counter(tok for tok in re.findall(r"[a-z\-]+", corpus) if tok not in STOP)

    diagram = ["```mermaid", "mindmap", f"  root(({title}))"]
    diagram.extend(f"    {word}" for word, _count in freq.most_common(10))
    diagram.append("```")
    Path(out_path).write_text("\n".join(diagram), encoding="utf-8")

def word_frequencies(text: str, topn=12):
    """Return the ``topn`` most common non-stop-word tokens of ``text``.

    Tokens are runs of letters and hyphens taken from the lower-cased text.
    The result is a list of ``(word, count)`` pairs, most frequent first.
    """
    words = re.findall(r"[a-zA-Z\-]+", text.lower())
    return Counter(w for w in words if w not in STOP).most_common(topn)


In [ ]:
#@title 3) 示例文本（可替换为你自己的 .txt 文本）
# Author-made demo passage. Leading and trailing whitespace is stripped so the
# saved file starts at the title line and ends at the last sentence.
sample_text = """
AP Statistics – Experimental Design (author-made demo text)

In AP Statistics, experimental design focuses on how to gather convincing evidence about cause-and-effect.
Key ideas include: explanatory vs. response variables, control of confounding, random assignment, and replication.

A good randomized comparative experiment assigns subjects to treatments by chance. Random assignment helps balance
unknown confounders between groups, so differences in responses can more credibly be attributed to the treatments.
Controlling conditions (like environment or instructions) reduces variability unrelated to the treatments.

Blocking groups similar subjects together on a known factor (e.g., prior achievement) can increase precision by
reducing within-block variability. Matched-pairs designs are a special case of blocking, where each pair is closely
matched (or each subject serves as their own pair across two conditions). Replication—using enough subjects—stabilizes
estimates and makes results more generalizable.

Blinding helps mitigate bias: single-blind means participants do not know which treatment they receive; double-blind
means both participants and those measuring responses do not know the treatment assignments. Placebo controls help
separate the true treatment effect from psychological or expectation effects.

Finally, scope of inference depends on the design: random assignment supports causal inference; random sampling supports
population generalization. An experiment with random assignment but convenience sampling allows strong causal claims for
the studied subjects, but generalization to a wider population remains limited.
""".strip()

# Save the passage where the pipeline cell expects to find it.
sample_path = SAMPLE / 'demo_experimental_design.txt'
sample_path.write_text(sample_text, encoding='utf-8')
print('Sample saved to', sample_path)


In [ ]:
#@title 4) 运行管线（生成笔记 + 思维导图 + 词频图）
# Text -> sentences -> top-6 summary -> glossed bilingual notes.
sample_file = SAMPLE / 'demo_experimental_design.txt'
text = sample_file.read_text(encoding='utf-8')
sentences = simple_sentence_split(text)
summary = top_k_summary(sentences, k=6)
notes = build_bilingual_notes(summary)

# Write the Markdown notes and the Mermaid mindmap.
notes_md = OUTPUT / 'notes_bilingual.md'
export_markdown(notes, str(notes_md))
export_mermaid_mindmap(notes, str(OUTPUT / 'mindmap.mmd'), title='AP Experimental Design')

# Word-frequency bar chart
wf = word_frequencies(text, topn=12)
if wf:
    words, counts = zip(*wf)
else:
    # Nothing to plot: fall back to empty sequences so the calls below still work.
    words, counts = [], []
positions = range(len(words))
plt.figure(figsize=(8, 4.5))
plt.bar(positions, counts)
plt.xticks(positions, words, rotation=30, ha='right')
plt.title('Word Frequency (Top 12) – Demo')
plt.tight_layout()
plt.savefig(ASSETS / 'wordfreq.png')
plt.show()

# Notes preview image: the first 22 lines of the Markdown drawn as plain text.
# NOTE(review): these lines contain Chinese characters; confirm the active
# matplotlib font provides CJK glyphs, otherwise they may not render.
preview_lines = notes_md.read_text(encoding='utf-8').splitlines()[:22]
preview_text = "\n".join(preview_lines)
plt.figure(figsize=(9, 6))
plt.axis('off')
plt.text(0.01, 0.98, preview_text, va='top', ha='left', wrap=True, fontsize=9, family='monospace')
plt.tight_layout()
plt.savefig(ASSETS / 'notes_preview.png')
plt.show()

print('Generated files:', list(OUTPUT.iterdir()))


### （可选）进阶：真实翻译 / PDF 文本提取

如果你需要**真实机器翻译**或直接从 **PDF** 提取文本：

```python
# 安装
!pip -q install transformers sentencepiece
!pip -q install pymupdf  # 可选：PDF 文本提取

from transformers import MarianTokenizer, MarianMTModel
# Tokenizer and model are both loaded from the same English->Chinese checkpoint.
tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-zh')
model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-zh')

def true_translate(en_sentence):
    """Translate one English sentence with the loaded Marian model.

    The sentence is encoded as a batch of one, at most 128 new tokens are
    generated, and the decoded text (special tokens removed) is returned.
    """
    batch = tok([en_sentence], return_tensors='pt', padding=True)
    generated = model.generate(**batch, max_new_tokens=128)
    decoded = tok.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]
```

PDF 提取（用 `pymupdf`）示例：

```python
import fitz  # pymupdf
def pdf_to_text(path):
    """Return the text of every page of the PDF at ``path``, joined by newlines.

    The document is opened in a ``with`` block so it is closed once extraction
    finishes, including when reading a page raises.
    """
    with fitz.open(path) as doc:
        return "\n".join(page.get_text() for page in doc)
```


In [ ]:
#@title 5) 一键导出为可传 GitHub 的压缩包（含 README、源码、assets、sample、output）
# README text for the exported project. This is a plain (non-f) string: it has
# no placeholders, and as an f-string any literal "{" or "}" added to the text
# later would be parsed as a format field.
readme = """
# AP Study Assistant — Bilingual Notes & Mind Map Generator (Colab Export)

> 一键生成 **中英文对照笔记** + **Mermaid 思维导图**（GitHub 可直接渲染）。本导出包来自 Colab 运行结果，包含示例与效果图。

## 功能
- 句子级提取摘要（轻量频次模型）。
- 自动输出**中英对照**（英文原句 + 中文术语提示 gloss）。
- 输出 **Mermaid 思维导图**（`output/mindmap.mmd`）。
- 输出**词频统计**与效果图：`assets/wordfreq.png`、`assets/notes_preview.png`。

## 创意点
1. 面向 AP 学科的**术语敏感**笔记生成，不是单纯机翻。
2. 使用 Mermaid 生成轻量思维导图，便于版本管理。
3. 结构清晰，便于扩展接入更强的翻译与摘要模型。

## 快速开始
```bash
python -m src.ap_note_gen.cli --input sample_data/demo_experimental_design.txt --outdir output --k 6
```

## 待改进
- 扩充术语词典；引入 transformers 做真实翻译。
- 支持 PDF/批量处理/前端上传。

## 效果图
![Word Frequency](assets/wordfreq.png)

笔记预览：
![Notes Preview](assets/notes_preview.png)
        
## 目录结构
```
.
├── README.md
├── assets/
│   ├── wordfreq.png
│   └── notes_preview.png
├── output/
│   ├── notes_bilingual.md
│   └── mindmap.mmd
├── sample_data/
│   └── demo_experimental_design.txt
└── src/
    └── ap_note_gen/
        ├── __init__.py
        ├── pipeline.py
        └── cli.py
```
        
## 许可
MIT
        
"""

# Write the package sources (a lightweight copy kept in sync with the notebook).
(SRC / '__init__.py').write_text("__version__='0.1.0'\n", encoding='utf-8')

# Source text of pipeline.py, written to disk verbatim.
# The literal is delimited with r"""...""" because the embedded code itself
# contains a '''...''' string (the STOP word list); delimiting with ''' would
# end this literal at that point and make the whole cell a SyntaxError.
# The raw prefix keeps the regex and "\n" escapes in the emitted file intact.
pipeline_py = r"""
import re
from pathlib import Path
from collections import Counter

STOP = set('''a an the and or but if while to for of in on by with from as about across after against among around at before behind below beneath beside between beyond during except inside into like near off onto outside over past since through toward under until up upon within without is are was were be been being this that these those not no nor so such than then too very can could should would may might must do does did having have has it its it's they them he she we you i our your their'''.split())

def simple_sentence_split(text: str):
    text = re.sub(r"\s+", " ", text.strip())
    parts = re.split(r"(?<=[\.!\?])\s+", text)
    return [s.strip() for s in parts if s.strip()]

def tokenize(text: str):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s\-]", " ", text)
    return [t for t in text.split() if t]

def score_sentences(sentences):
    tokens = []
    for s in sentences:
        tokens.extend([t for t in tokenize(s) if t not in STOP])
    c = Counter(tokens)
    if not c:
        return [0.0]*len(sentences)
    maxf = max(c.values())
    for k in list(c.keys()):
        c[k] = c[k]/maxf
    scores = []
    for s in sentences:
        ts = [t for t in tokenize(s) if t not in STOP]
        score = sum(c.get(t,0) for t in ts) / (len(ts)+1e-9)
        scores.append(score)
    return scores

def top_k_summary(sentences, k=6):
    if not sentences:
        return []
    scores = score_sentences(sentences)
    idx = list(range(len(sentences)))
    idx.sort(key=lambda i: scores[i], reverse=True)
    chosen = sorted(idx[:k])
    return [sentences[i] for i in chosen]

CN_GLOSS = {
    "experiment":"实验","experimental":"实验的","design":"设计","random":"随机",
    "randomized":"随机化","assignment":"分配","comparative":"对比","control":"控制",
    "confounding":"混杂","confounders":"混杂因素","variable":"变量","variables":"变量",
    "response":"应变量","explanatory":"自变量","blocking":"分组(区组)","block":"区组",
    "matched":"匹配","pairs":"配对","replication":"重复(样本量)","bias":"偏差",
    "blinding":"盲法","double-blind":"双盲","single-blind":"单盲","placebo":"安慰剂",
    "precision":"精度","variability":"变异","population":"总体","sample":"样本",
    "generalization":"外推(泛化)","causal":"因果的","inference":"推断","scope":"适用范围"
}

def gloss_translate(en_sentence: str) -> str:
    words = en_sentence.split()
    out = []
    for w in words:
        base = re.sub(r"[^a-zA-Z\-]", "", w).lower()
        cn = CN_GLOSS.get(base)
        out.append(f"{w}({cn})" if cn else w)
    return " ".join(out)

def build_bilingual_notes(summary_sentences):
    notes = []
    for s in summary_sentences:
        zh = gloss_translate(s)
        notes.append({"en": s, "zh": zh})
    return notes

def export_markdown(notes, out_path: str):
    md = ["# AP Bilingual Notes (Auto)",
          "",
          "> 自动生成：上为原文句子，下为带中文术语提示的对照（示范模式）。",
          ""]
    for i, item in enumerate(notes, 1):
        md.append(f"## {i}.")
        md.append("**EN**: " + item["en"])
        md.append("**ZH(Gloss)**: " + item["zh"])
        md.append("")
    Path(out_path).write_text("\n".join(md), encoding="utf-8")

def export_mermaid_mindmap(notes, out_path: str, title="AP Experimental Design"):
    all_text = " ".join(n["en"] for n in notes).lower()
    tokens = [t for t in re.findall(r"[a-z\-]+", all_text) if t not in STOP]
    c = Counter(tokens)
    core = [w for w,_ in c.most_common(10)]
    lines = ["```mermaid","mindmap",f"  root(({title}))"]
    for w in core:
        lines.append(f"    {w}")
    lines.append("```")
    Path(out_path).write_text("\n".join(lines), encoding="utf-8")
"""
(SRC / 'pipeline.py').write_text(pipeline_py, encoding='utf-8')

# Source text of the package's command-line entry point (cli.py).
# It reads a plain-text file, runs split -> summary -> gloss, and writes
# notes_bilingual.md and mindmap.mmd into the chosen output directory.
cli_py = r'''
import argparse
from pathlib import Path
from .pipeline import simple_sentence_split, top_k_summary, build_bilingual_notes, export_markdown, export_mermaid_mindmap

def main():
    ap = argparse.ArgumentParser(description='AP bilingual note generator (demo)')
    ap.add_argument('--input', required=True, help='Path to plain .txt')
    ap.add_argument('--outdir', default='output', help='Directory to save outputs')
    ap.add_argument('--k', type=int, default=6, help='Number of summary sentences')
    args = ap.parse_args()
    Path(args.outdir).mkdir(parents=True, exist_ok=True)
    text = Path(args.input).read_text(encoding='utf-8', errors='ignore')
    sentences = simple_sentence_split(text)
    summary = top_k_summary(sentences, k=args.k)
    notes = build_bilingual_notes(summary)
    export_markdown(notes, str(Path(args.outdir)/'notes_bilingual.md'))
    export_mermaid_mindmap(notes, str(Path(args.outdir)/'mindmap.mmd'))
    print('Saved to', args.outdir)
if __name__ == '__main__':
    main()
'''
# Write the CLI module next to pipeline.py in the package directory.
(SRC / 'cli.py').write_text(cli_py, encoding='utf-8')

# Write the README into the current working directory.
Path('README.md').write_text(readme, encoding='utf-8')

# Run results and sample data.
# The figures, notes and sample text were already written into assets/,
# output/ and sample_data/ by the earlier cells, so nothing has to be copied.
# The previous shutil.copyfile(src, src) self-copies raised SameFileError.

# Zip the working directory.
import zipfile

zip_path = Path('ap-note-gen.zip')
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as z:
    for folder, dirs, files in os.walk('.'):
        # Prune notebook checkpoint directories in place so os.walk never
        # descends into them; sorting keeps the archive order stable.
        dirs[:] = sorted(d for d in dirs if '.ipynb_checkpoints' not in d)
        for f in sorted(files):
            # Leave the notebook itself out of the archive.
            if f.endswith('.ipynb'):
                continue
            fp = Path(folder) / f
            # The archive is created in the directory being walked; skip it so
            # it does not try to add its own half-written contents.
            if fp == zip_path:
                continue
            z.write(fp, fp.relative_to('.'))
print('Exported zip ->', zip_path.resolve())
