In [15]:
import sys

sys.path.append('../')

from exam_alignment.utils.alignment_utils import one_file_per_process
from exam_alignment.type_exam_parser.abstract_exam_parser import AbstractExamParser
from exam_alignment.type_exam_parser.annotated_exam_parser import AnnotatedExamParser
from exam_alignment.type_exam_parser.standard_exam_parser import StandardExamParser

from exam_alignment.exam_parser_container import ExamParserContainer

from exam_alignment.utils.new_alignment_utils import longest_increasing_subsequence_index

In [16]:
import os
import glob
from pathlib import Path
import re

# Load every exported markdown paper and pre-process its text.
examination_paper_list = []

path = Path("./docx_markdowns")

# Iterate in sorted order: Path.glob makes no ordering guarantee, and later
# cells pick samples out of lists derived from this one by position, so an
# unsorted walk makes those samples differ between machines and runs.
for file in sorted(path.glob("*.md")):
    examination_paper_list.append({
        "file_path": str(file),
        "text": one_file_per_process(file.read_text(encoding="utf-8")),
    })
len(examination_paper_list)

2230

In [17]:
# Sort every paper into exactly one bucket. The checks run in priority order:
# no answer section -> standard layout -> annotated layout -> unrecognised.
no_answer_exams, standard_exams, annotated_exams, remaining_exams = [], [], [], []

for exam_info in examination_paper_list:
    paper_text = exam_info["text"]
    if not AbstractExamParser.check_contains_answers(paper_text):
        bucket = no_answer_exams
    elif StandardExamParser.detect_this_exam_type(paper_text):
        bucket = standard_exams
    elif AnnotatedExamParser.detect_this_exam_type(paper_text):
        bucket = annotated_exams
    else:
        bucket = remaining_exams
    bucket.append(exam_info)

In [30]:
# Share (in %) of the answer-bearing papers that were recognised as either
# the standard or the annotated type.
recognised_count = len(standard_exams) + len(annotated_exams)
with_answer_count = len(examination_paper_list) - len(no_answer_exams)
percentage = recognised_count / with_answer_count * 100
print(percentage)
# Papers that contain answers but matched neither parser type.
len(remaining_exams)

53.58255451713395


894

In [29]:
# Ratio of all loaded papers to the papers that have no answer section.
total_paper_count = len(examination_paper_list)
total_paper_count / len(no_answer_exams)

7.3355263157894735

In [28]:
# Take one unrecognised paper as the manual test sample and show its text.
sample_exam = remaining_exams[100]
test_text = sample_exam["text"]
print(test_text)


北师大版小学五年级下册数学第二单元《长方体（一）------露在外面的面》同步检测2（附答案）
1．有5个棱长为40厘米的正方体纸箱放在墙角处(如图)，有几个面露在外面？
![](./docx_images/media/image1.jpeg) 露在外面的面积一共有多少平方厘米？
2．下列各个图形中分别有几个面露在外面？露在外面的面积是多少？(图中小正
方体的棱长为2dm) 来源：www.bcjy123.com/tiku/
![](./docx_images/media/image2.jpeg)（1）
\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_
（2）
![](./docx_images/media/image3.jpeg)
\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_
![](./docx_images/media/image4.jpeg)3．将棱长是2cm的小正方体按下图方式摆放在桌面上，露在外面的面数发生了怎样的变化？
(1)观察上图，把下表填写完整。
  ---------------- --- --- --- --- ---
  小正方形的个数   1   2   3   4   5
  露在外面的面数                   
  ---------------- --- --- --- --- ---
(2)如果摆5个小正方体，露在外面的面的面积是多少平方厘米？
来源：www.bcjy123.com/tiku/
4．在正确答案后面的口里打√。
![](./docx_images/media/image5.jpeg) 一个棱长为2厘米的正方体，在它的一个角上挖掉一块棱长是1厘米的小正方体(如右图)，它的表面积与原来的正方体相比较，结果会怎样？
（1）减少了 （2）相等 （3）增加了
参考答案
1．10个面 16000平方厘米
2．(1)15个面 2×2×15＝60(d㎡)
(2)14个面 2×21×14＝56(d㎡)
3．(1)5 8 11 14 17 (2)2×2×17＝68(c㎡)
4．(2)√
来源：www.bcjy123.com/tiku/


In [55]:
class StandardExamParser(AbstractExamParser):
    """Parser for "standard" papers: numbered questions followed by one answer section.

    The answer section is located by a marker line (for example a line that
    contains "参考答案"). Question numbers are recognised as digits followed by
    ``.``, ``．`` or ``、``.

    This notebook-local class shadows the ``StandardExamParser`` imported from
    ``exam_alignment.type_exam_parser.standard_exam_parser`` once its cell runs.
    """

    # A line starts a question/answer when it begins with digits followed by
    # one of ". ． 、". (The earlier pattern "[\.|\．|、]" also accepted a
    # literal "|", because "|" is not an alternation inside a character class.)
    _LEADING_NUMBER_RE = re.compile(r"^(\d+)[.．、]")

    # Number of leading lines ignored while looking for the answer marker.
    _HEADER_LINES = 5

    def __init__(self, content):
        super().__init__(content)

    @staticmethod
    def _find_answer_split_index(lines):
        """Return the index of the answer-marker line, or None when there is none.

        The first ``_HEADER_LINES`` lines are skipped both when choosing the
        marker and when locating it, so an identical line inside the header
        can no longer be picked as the split point.
        """
        skip = StandardExamParser._HEADER_LINES
        marker = StandardExamParser.get_answer_split_str(lines[skip:])
        if marker is None:
            return None
        # The marker was taken from lines[skip:], so this lookup always succeeds.
        return lines.index(marker, skip)

    @staticmethod
    def detect_this_exam_type(content):
        """Tell whether ``content`` looks like a standard paper.

        A paper qualifies when it has an answer marker and the question
        numbers found before the marker equal those found from the marker on.

        Args:
            content: Full text of the paper.

        Returns:
            True when both number lists are equal, False otherwise (including
            when no answer marker exists).
        """
        lines = content.splitlines()
        split_index = StandardExamParser._find_answer_split_index(lines)
        if split_index is None:
            return False
        question_list = StandardExamParser.get_all_question_number(lines[:split_index])
        answer_list = StandardExamParser.get_all_question_number(lines[split_index:])
        return question_list == answer_list

    def extract_questions(self):
        """Return the text of every question that precedes the answer section.

        Returns:
            A list of strings, one per question, each made of the question's
            lines joined with newlines. Empty when no answer marker is found.
        """
        lines = self.content.splitlines()
        split_index = StandardExamParser._find_answer_split_index(lines)
        if split_index is None:
            return []
        question_lines = lines[:split_index]

        # Prefer a method of that name if the base class provides one; otherwise
        # use the module-level function this class already relies on elsewhere.
        # NOTE(review): the base class is not visible here — confirm which one is intended.
        pick_indexes = getattr(
            self,
            "longest_increasing_subsequence_index",
            longest_increasing_subsequence_index,
        )
        starts = list(pick_indexes(
            StandardExamParser.find_questions_and_answer_indexes(question_lines)
        ))

        # Each question runs up to the start of the next one; the last question
        # runs to the end of the question area.
        ends = starts[1:] + [len(question_lines)]
        return ['\n'.join(question_lines[start:end]) for start, end in zip(starts, ends)]

    def extract_topic_details(self):
        """Return the topic details of this paper.

        The previous version called ``find_all_topic_numbers_with_content``
        without its required ``lines`` argument and did not unpack the returned
        tuple, so it always raised ``TypeError``.

        NOTE(review): ``find_all_topic_numbers_with_content`` already applies
        ``construct_complete_topic_details``, so it is not applied a second
        time here — confirm that this matches the intended pipeline.
        """
        topic_details, _ = StandardExamParser.find_all_topic_numbers_with_content(
            self.content.splitlines()
        )
        return self.find_most_concentrated_increasing_subsequence(topic_details)

    def extract_answers(self):
        """Return the answers found after the answer marker, in number order.

        The answer area is everything to the right of the marker line inside
        the content of the last detected topic. Answers are collected for
        numbers 1, 2, 3, ... until a number cannot be found.

        Returns:
            A list of answer strings; empty when no topic or no answer marker
            was detected, or when the marker is not part of the last topic.
        """
        lines = self.content.splitlines()
        question_list, answer_str = StandardExamParser.find_all_topic_numbers_with_content(lines)
        # No detected topic, or no marker line (None / ""): nothing to extract.
        if not question_list or not answer_str:
            return []

        joined_questions = "".join(question_list[-1]['content'])
        if answer_str not in joined_questions:
            return []

        # Keep only what follows the first occurrence of the marker.
        answer_right_side = joined_questions.split(answer_str, 1)[1]
        answer_list = []
        answer_number = 1
        while True:
            answer, _ = StandardExamParser.find_answer_by_number(answer_right_side, answer_number)
            if answer is None:
                return answer_list
            answer_list.append(answer)
            answer_number += 1

    def align(self):
        pass

    @staticmethod
    def get_all_question_number(lines, number_limit=24):
        """Collect the question numbers that occur, in order, in ``lines``.

        The lines are concatenated (backslashes removed) and searched for
        1, 2, 3, ... in turn; a number does not have to start a line. After a
        hit, the search for the next number continues in the text that
        ``find_answer_by_number`` hands back.

        Args:
            lines: The lines of (a part of) the paper.
            number_limit: Exclusive upper bound of the numbers tried.

        Returns:
            The list of numbers that were found.
        """
        text = ''.join(lines).replace('\\', '')
        number_list = []
        for number in range(1, number_limit):
            matched, remaining = StandardExamParser.find_answer_by_number(text, number)
            if matched is None:
                continue
            text = remaining
            number_list.append(number)
        return number_list

    @staticmethod
    def find_answer_by_number(text, number, isAdaptationSymbol=True):
        """Find the text that starts at ``number`` and ends before ``number + 1``.

        Args:
            text: Text to search.
            number: The question/answer number to look for.
            isAdaptationSymbol: When True the number (and the next number) must
                be followed by ``．`` or ``.``; when False the bare digits are
                enough.

        Returns:
            ``(matched_text, rest_text)`` on success, ``(None, text)`` otherwise.
            The pattern has no digit boundary, so a match may start inside a
            larger number.
        """
        number_str = str(number)
        next_number_str = str(number + 1)

        # Lazy match up to the next number (look-ahead) or the end of the text.
        if isAdaptationSymbol:
            pattern = re.compile(rf'({number_str}[．.]).*?(?={next_number_str}[．.]|\Z)', re.DOTALL)
        else:
            pattern = re.compile(rf'({number_str}).*?(?={next_number_str}|\Z)', re.DOTALL)

        match = pattern.search(text)
        if match is None:
            return None, text

        matched_text = match.group(0)
        rest_text = text[match.end():]
        # When the match ran to the very end, the full text is handed back
        # instead of an empty remainder.
        # NOTE(review): presumably so callers can keep searching — confirm.
        if rest_text == "":
            rest_text = text
        return matched_text, rest_text

    @staticmethod
    def extract_number_from_string(input_string):
        """Return the first run of digits in ``input_string`` as an int, or None.

        Only the integer part is used, so "12.5" gives 12. Going through
        ``float`` is avoided because it raised ``ValueError`` on dotted
        numbers such as "1.2.3".
        """
        match = re.search(r"\d+", input_string)
        return int(match.group(0)) if match else None

    @staticmethod
    def get_paper_question_by_number(question_indexes, lines):
        """Cut ``lines`` into one chunk per question start index.

        Args:
            question_indexes: Line indexes at which a question starts.
            lines: All lines of the paper.

        Returns:
            ``(question_list, answer_area_str)``. Each chunk is the lines from
            one start index up to the next, joined without a separator; the
            last chunk runs to the end of ``lines``. ``answer_area_str`` is the
            answer-marker line found in the last chunk (None if it has none),
            or "" when ``question_indexes`` is empty.
        """
        question_list = []
        answer_area_str = ""
        last_position = len(question_indexes) - 1
        for position, start in enumerate(question_indexes):
            if position == last_position:
                tail = lines[start:]
                question_list.append("".join(tail))
                answer_area_str = StandardExamParser.get_answer_split_str(tail)
            else:
                question_list.append("".join(lines[start:question_indexes[position + 1]]))

        return question_list, answer_area_str

    @staticmethod
    def get_answer_split_str(lines, answer_words = ["参考答案", "试题解析", "参考解答"]):
        """Return the first line containing one of ``answer_words``, or None.

        Args:
            lines: Lines of the paper to scan.
            answer_words: Keywords that mark the start of the answer section.
        """
        for line in lines:
            if any(answer_word in line for answer_word in answer_words):
                return line
        return None

    @staticmethod
    def extract_number(s):
        """Return the first run of digits in ``str(s)`` as an int, or None."""
        match = re.search(r'(\d+)', str(s))
        return int(match.group(1)) if match else None

    @staticmethod
    def has_equal_subsequences(lst):
        """Tell whether ``lst`` splits into exactly two equal consecutive runs.

        ``lst`` is walked left to right and grouped into runs of consecutive
        numbers; a missing successor may be picked up from the next five
        elements. Elements are compared as strings and must contain a number.

        Returns:
            True when exactly two runs are produced and they are equal, False
            otherwise (including for an empty list).
        """
        if not lst:
            return False

        def get_subsequences(items):
            subs = []
            temp = [items[0]]

            i = 1
            while i < len(items):
                expected_next = str(StandardExamParser.extract_number(temp[-1]) + 1)
                if items[i] == expected_next:
                    # Direct continuation of the current run.
                    temp.append(items[i])
                    i += 1
                    continue

                # Look ahead up to 5 elements for the expected successor.
                window = items[i:i + 5]
                if expected_next in window:
                    next_index = window.index(expected_next) + i
                    temp.append(items[next_index])
                    i = next_index + 1
                else:
                    # No continuation nearby: close this run and start a new one.
                    subs.append(temp)
                    temp = [items[i]]
                    i += 1
            subs.append(temp)

            subs.sort(key=len, reverse=True)
            return subs

        subsequences = get_subsequences(lst)
        if len(subsequences) != 2:
            return False
        return subsequences[0] == subsequences[1]

    @staticmethod
    def extract_leading_number(line):
        """Return the number that starts ``line`` as a string of digits, or None.

        Backslashes are removed before matching. (The previous version called
        the misspelled ``str.replaqce`` and raised ``AttributeError``.)
        """
        match = StandardExamParser._LEADING_NUMBER_RE.search(line.replace("\\", ""))
        return match.group(1) if match else None

    @staticmethod
    def find_all_topic_number(lines: list[str]) -> list[int]:
        """Return the indexes of the lines that start with a question number."""
        return [
            index
            for index, line in enumerate(lines)
            if StandardExamParser._LEADING_NUMBER_RE.search(line.replace("\\", ""))
        ]

    @staticmethod
    def find_questions_and_answer_indexes(lines: list[str]) -> list[tuple[int, int]]:
        """Return ``(number, line_index)`` for every line starting with a number.

        A line qualifies when, after removing backslashes, it begins with
        digits followed by ``.``, ``．`` or ``、``.
        """
        found = []
        for index, line in enumerate(lines):
            match = StandardExamParser._LEADING_NUMBER_RE.search(line.replace("\\", ""))
            if match:
                found.append((int(match.group(1)), index))
        return found

    @staticmethod
    def find_all_topic_numbers_with_content(lines):
        """Split the paper into numbered topics and locate the answer marker.

        Returns:
            ``(topic_details, answer_area_str)``: the result of
            ``construct_complete_topic_details`` applied to a list of
            ``{"topic_number", "content"}`` dicts, and the answer-marker line
            reported by ``get_paper_question_by_number``.
        """
        question_number_indexs = longest_increasing_subsequence_index(
            StandardExamParser.find_questions_and_answer_indexes(lines)
        )
        question_list, answer_area_str = StandardExamParser.get_paper_question_by_number(
            question_number_indexs, lines
        )
        new_question_list = [
            {
                "topic_number": StandardExamParser.extract_number_from_string(question),
                "content": question,
            }
            for question in question_list
        ]
        return StandardExamParser.construct_complete_topic_details(new_question_list), answer_area_str

In [56]:
# Smoke-test answer extraction on the sample paper.
# NOTE(review): the recorded output below does not match the sample paper
# printed earlier in this notebook, so it looks stale — re-run to confirm.
StandardExamParser(test_text).extract_answers()

['1．（5分）设i是虚数单位，则复数![](./docx_images/media/image31.png)在复平面内对应的点位于（）A．第一象限 B．第二象限 C．第三象限 D．第四象限【分析】先化简复数，再得出点的坐标，即可得出结论．【解答】解：![](./docx_images/media/image31.png)=i（1+i）=﹣1+i，对应复平面上的点为（﹣1，1），在第二象限，故选：B．【点评】本题考查复数的运算，考查复数的几何意义，考查学生的计算能力，比较基础．',
 '2．（5分）下列函数中，既是偶函数又存在零点的是（）A．y=cosx B．y=sinx C．y=lnx D．y=x^2^+1【分析】利用函数奇偶性的判断方法以及零点的判断方法对选项分别分析选择．【解答】解：对于A，定义域为R，并且cos（﹣x）=cosx，是偶函数并且有无数个零点；对于B，sin（﹣x）=﹣sinx，是奇函数，由无数个零点；对于C，定义域为（0，+∞），所以是非奇非偶的函数，有一个零点；对于D，定义域为R，为偶函数，都是没有零点；故选：A．【点评】本题考查了函数的奇偶性和零点的判断．①求函数的定义域；②如果定义域关于原点不对称，函数是非奇非偶的函数；如果关于原点对称，再判断f（﹣x）与f（x）的关系；相等是偶函数，相反是奇函数；函数的零点与函数图象与x轴的交点以及与对应方程的解的个数是一致的．',
 '3．（5分）设p：1＜x＜2，q：2^x^＞1，则p是q成立的（）A．充分不必要条件 B．必要不充分条件C．充分必要条件 D．既不充分也不必要条件【分析】运用指数函数的单调性，结合充分必要条件的定义，即可判断．【解答】解：由1＜x＜2可得2＜2^x^＜4，则由p推得q成立，若2^x^＞1可得x＞0，推不出1＜x＜2．由充分必要条件的定义可得p是q成立的充分不必要条件．故选：A．【点评】本题考查充分必要条件的判断，同时考查指数函数的单调性的运用，属于基础题．',
 '4．（5分）下列双曲线中，焦点在y轴上且渐近线方程为y=±2x的是（）A．x^2^﹣![](./docx_images/media/image32.png)=1 B．![](./docx_images/media/image33.png)﹣y^2^=1 C．![](./docx_images/media/im

In [33]:
# Show the fifth line (index 4) of the sample paper.
# NOTE(review): the recorded output below does not appear in the sample paper
# printed earlier in this notebook, so it looks stale — re-run to confirm.
print(test_text.splitlines()[4])

2．（5分）下列函数中，既是偶函数又存在零点的是（）


In [19]:
# Check whether the sample paper is detected as a standard exam.
StandardExamParser.detect_this_exam_type(test_text)

True