# 准备数据

In [1]:
def per_process(text):
    liens = text.splitlines()
    liens = map(lambda x: x.replace("> ","").replace(">","").replace("*", ""), liens)
    liens = filter(lambda x: x.strip() != "", liens)
   
    return "\n".join(liens)

In [2]:
import os
import glob
from pathlib import Path

examination_paper_list = []

path = Path("./answer_markdown")

for file in path.glob("*.md"):
    with open(file, "r", encoding="utf-8") as f:
        examination_paper_list.append({
            "file_path": file,
            "text": per_process(f.read())
        })

# 创建寻找题目（题目以及对应的选项等）的方法

In [3]:
import bisect

def extract_and_combine_numbers(str_val):
    first_two_chars = str_val[:2]
    numbers = ''.join(c for c in first_two_chars if c.isdigit())
    
    return int(numbers) if numbers else 0


def max_consecutive_increasing_subsequence(topics):
    lst = [extract_and_combine_numbers(topic) for topic in topics]
    if not lst:
        return None, None

    max_length = 1
    curr_length = 1

    start_pos = end_pos = 0

    curr_start_pos = 0

    for i in range(1, len(lst)):
        if lst[i] > lst[i - 1]:
            curr_length += 1
            if curr_length > max_length:
                max_length = curr_length
                start_pos = curr_start_pos
                end_pos = i
        else:
            curr_length = 1
            curr_start_pos = i

    return start_pos, end_pos

In [4]:
import re

def find_next_question_index(start, lines):
    pattern = re.compile(r'^(\d+)[．.|\(]|^[（(]\d+[）)]|^\d+\\\.')
    
    for index in range(start+1, len(lines)):
        match = pattern.match(lines[index])
        
        if match:
            return index

    return len(lines)
    

def get_all_question(text):
    lines = text.splitlines()
    
    all_question = []
    
    index = find_next_question_index(0, lines)
    while index < len(lines):
        next_index = find_next_question_index(index, lines)
       
        all_question.append("\n".join(lines[index: next_index]))
        index = next_index
   
    start, end = max_consecutive_increasing_subsequence(all_question)
        
#     print(f"{start} {end}")
    
    if start is None:
        return None, None

    return all_question[start: end+1], all_question[end]

# 创建测试样本

In [50]:
test_text = examination_paper_list[44]["text"]

In [51]:
find_next_question_index(0, test_text.splitlines())

3

In [52]:
# split_str 用于标记答题区和答案区的分隔
all_question, split_str = get_all_question(test_text)
all_question

['1．若tg=,则tg(+)= [ ]{.underline} .',
 '2．设抛物线的顶点坐标为(2,0),准线方程为x=－1,则它的焦点坐标为 [ ]{.underline} .',
 '3．设集合A={5,log~2~(a+3)},集合B={a,b}.若A∩B={2},则A∪B= [ ]{.underline} .',
 '4．设等比数列{a~n~}(n∈N)的公比q=－,且(a~1~+a~3~+a~5~+...+a~2n-1~)=,则a~1~= [ ]{.underline} .\n![](./notebook/image/media/image6.jpeg)5．设奇函数f(x)的定义域为\\[－5,5\\].若当x∈\\[0,5\\]时,\nf(x)的图象如右图,则不等式f(x)\\<0的\n解是 [ ]{.underline} .',
 '6．已知点A(1, －2),若向量与={2,3}同向,\n=2,则点B的坐标为 [ ]{.underline} .',
 '7．在极坐标系中,点M(4,)到直线l: (2cos+sin)=4的距离d= [ ]{.underline} .',
 '8．圆心在直线2x－y－7=0上的圆C与y轴交于两点A(0, －4),B(0, －2),则圆C的方程\n为 [ ]{.underline} .',
 '9．若在二项式(x+1)^10^的展开式中任取一项,则该项的系数为奇数的概率是 [ ]{.underline} . (结果用分数表示)',
 '10．若函数f(x)=a在\\[0,+∞\\]上为增函数,则实数a、b的取值范围是 [ ]{.underline} .',
 '11．教材中"坐标平面上的直线"与"圆锥曲线"两章内容体现出解析几何的本质是 [ ]{.underline}\n[ ]{.underline} .',
 '12．若干个能唯一确定一个数列的量称为该数列的"基本量".设{a~n~}是公比为q的无穷等比数列,下列{a~n~}的四组量中,一定能成为该数列"基本量"的是第 [ ]{.underline} 组.(写出所有符合要求的组号)\n①S~1~与S~2~; ②a~2~与S~3~; ③a~1~与a~n~; ④q与a~n~.\n其中n为大于1的整数, S~n~为{a~n~}的前n项和.\n二、选择题(本大题满分16分

In [33]:
answer_words = ["解析", "答案", "分析", "详解", "试题解析"]

def number_of_judgments(all_question):
    occurrence_number = 0
    for question in all_question:
        if any(question.find(answer_word) != -1 for answer_word in answer_words):
            occurrence_number += 1
    if occurrence_number < len(all_question) / 2:
        if occurrence_number != 0: 
            occurrence_number = -1  # 答案少于题目的一半则认为答案在下方
        else:
            occurrence_number = 0  # 完全没答案返回-1
    else:
        occurrence_number = 1 #有答案在题目下方
    return occurrence_number

In [34]:
for index, examination_paper in enumerate(examination_paper_list):
    all_question, split_str = get_all_question(examination_paper['text'])
    if all_question is None:
        continue
    examination_paper_list[index]['occurrence_number'] = number_of_judgments(all_question)

In [36]:
examination_paper_list[1]['occurrence_number']

0

# 准备对齐答案的方法

In [45]:
def generate_answer_area_string(text, split_str):
    return text.split(split_str)[1]

In [46]:
answer_str = generate_answer_area_string(test_text, split_str)
answer_str

'\n1．D 2．D 3．A 4．B 5．C 6．C 7．B 8．D 9．C 10．D 11．B 12．A\n二、填空题：本大题共4小题，每小题4分，共16分.把答案填在题中横线上.\n13． 14．35 15．192 16．④\n17．本小题考三角函数的基本公式以及三角函数式的恒等基础知识和基本运算技能，满分12分.\n解法一：由已知得：\n由已知条件可知\n解法二：由已知条件可知\n18．本小题主要考查线面关系和正方体等基础知识，考查空间想象能力和推理能力.满分12分.\n解法一：（Ⅰ）∵A~1~A⊥底面ABCD，则AC是A~1~C在底面ABCD的射影.\n∵AC⊥BD.∴A~1~C⊥BD.\n同理A~1~C⊥DC~1~,又BD∩DC~1~=D,\n∴A~1~C⊥平面BDC~1.~\n（Ⅱ）取EF的中点H，连结BH、CH，\n又E、F分别是AC、B~1~C的中点，\n解法二：（Ⅰ）以点C为坐标原点建立如图所示的空间直角坐标系,则C(0,0,0).\nD(1,0,0),B(0,1,0),A~1~(1,1,1),C~1~(0,0,1),D~1~(1,0,1)\n（Ⅱ）同（I）可证，BD~1~⊥平面AB~1~C.\n19．本小题主要考查向量的概念，平面向量的运算法则，考查运用向量及函数知识的能力，满分12分.\n解法二：以直角顶点A为坐标原点，两直角边所在直线为坐标轴建立如图所示的平面直角坐标系.\n20．本小题主要考查直线、双曲线的方程和性质，曲线与方程的关系，及其综合应用能力，满分12分.\n解：（Ⅰ）将直线\n......①\n依题意，直线l与双曲线C的右支交于不同两点，故\n（Ⅱ）设A、B两点的坐标分别为、，则由①式得\n......②\n假设存在实数k，使得以线段AB为直径的圆经过双曲线C的右焦点F（c,0）.\n则由FA⊥FB得：\n整理得\n......③\n把②式及代入③式化简得\n21．本小题考查概率的基础知识以及运用概率知识解决 实际问题的能力，满分12分.\n解：方案1：单独采用一种预防措施的费用均不超过120万元.由表可知，采用甲措施，可使此突发事件不发生的概率最大，其概率为0.9.\n方案2：联合采用两种预防措施，费用不超过120万元，由表可知.联合甲、丙两种预防措施可使此突发事件不发生的概率最大，其概率为\n1---(1---0.9)(1-

In [47]:
def alignment_answer(all_question, answer_str):
    questions_with_answer = []
    
    numbers = [int(''.join(filter(str.isdigit, item[:2]))) for item in all_question]
    answer_str = answer_str[answer_str.find(str(numbers[0])):][1:]
    
    for index, sequence_number in enumerate(numbers):
        if index >= len(numbers) - 1:
            questions_with_answer.append({
                "answer": answer_str,
                "question": all_question[index]
            })
                
            break
            
        next_sequence_number = numbers[index+1]
        split_str = answer_str.split(str(next_sequence_number))
        answer_str = f"{str(next_sequence_number)}".join(split_str[1:])
        
        questions_with_answer.append({
            "answer": str(next_sequence_number-1) + split_str[0],
            "question": all_question[index]
        })
        
    return questions_with_answer

In [53]:
for row in alignment_answer(all_question, answer_str):
    print()
    print(row["question"])
    print("------------------------------------------------------------")
    print(f"{row['answer']}")
    print()
    print("============================================================")


1．若tg=,则tg(+)= [ ]{.underline} .
------------------------------------------------------------
1．D 


2．设抛物线的顶点坐标为(2,0),准线方程为x=－1,则它的焦点坐标为 [ ]{.underline} .
------------------------------------------------------------
2．D 


3．设集合A={5,log~2~(a+3)},集合B={a,b}.若A∩B={2},则A∪B= [ ]{.underline} .
------------------------------------------------------------
3．A 


4．设等比数列{a~n~}(n∈N)的公比q=－,且(a~1~+a~3~+a~5~+...+a~2n-1~)=,则a~1~= [ ]{.underline} .
![](./notebook/image/media/image6.jpeg)5．设奇函数f(x)的定义域为\[－5,5\].若当x∈\[0,5\]时,
f(x)的图象如右图,则不等式f(x)\<0的
解是 [ ]{.underline} .
------------------------------------------------------------
5．B 5．C 


6．已知点A(1, －2),若向量与={2,3}同向,
=2,则点B的坐标为 [ ]{.underline} .
------------------------------------------------------------
6．C 


7．在极坐标系中,点M(4,)到直线l: (2cos+sin)=4的距离d= [ ]{.underline} .
------------------------------------------------------------
7．B 


8．圆心在直线2x－y－7=0上的圆C与y轴交于两点A(0, －4),B(0, －2),则圆C的方程
为 [ ]{.underline} .
---------------------------------------------

In [54]:
data = []
for examination_paper in examination_paper_list:
   
    text = examination_paper["text"]
    all_question, split_str = get_all_question(text)
    if not all_question:
        continue
    answer_str = generate_answer_area_string(text, split_str)
    question_with_answer = alignment_answer(all_question, answer_str)
    data.append(question_with_answer)
    
len(data) / len(examination_paper_list)

0.9287925696594427