In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import joblib
import datasets

# Train

In [6]:
import re

def remove_image_string(input_string):
    pattern = r"!\[(.*?)\]\(.*?\)\{width=\".*?\" height=\".*?\"\}|!\[.*?\]\(.*?\)|\[.*?\]\{.*?\}"
    result = re.sub(pattern, "", input_string)
    return result

def remove_noise_character(input_string):
    pattern = r"[>*|image|data|media|png]"
    
    return re.sub(pattern, "", input_string)

def one_text_pre_process(text):
    precessed_text_split_lines = []
    
    for line in text.splitlines():
        remove_image_line = remove_image_string(line)

        if remove_image_line.strip() in ["", ">"]:
            continue

        remove_image_line = remove_noise_character(remove_image_line)
        precessed_text_split_lines.append(remove_image_line)
        
    return "\n".join(precessed_text_split_lines)

def pre_process(text_list):
    precessed_text_list = [one_text_pre_process(text) for text in text_list]
    return precessed_text_list

def dataset_map_pre_process(row):
    row["text"] = one_text_pre_process(row["text"])
    return row

In [7]:
import jieba

def chinese_tokenizer(text):
    tokens = jieba.cut(text)
    return list(tokens)

vocabulary = {
    "考试",
    "选择题",
    "判断题",
    "填空题",
    "试卷",
    "单元",
    "单元测试",
    "答案",
    "分数",
    "简答题",
    "试题",
    "高考",
    "考试真题",
    "真题"
}


def train(dataset):
#     不建议手动特征
#     vectorizer = CountVectorizer(vocabulary=vocabulary, tokenizer=chinese_tokenizer, ngram_range=(1, 5))
    vectorizer = CountVectorizer(tokenizer=chinese_tokenizer, ngram_range=(1, 10))
    classifier = LogisticRegression(max_iter=1500, verbose=1)
    model = make_pipeline(vectorizer, classifier)
    
    model.fit(dataset["text"], dataset["label"])
    return model

In [8]:
datadict = datasets.load_from_disk("./dataset")

In [9]:
train_dataset = datadict["train"]

In [10]:
train_dataset = train_dataset.map(dataset_map_pre_process)

Loading cached processed dataset at /home/ran/WorkSpace/my_Exam-Question-Bank-Dataset-zh_mnbvc/notebook/dataset/train/cache-fdb59d6d907c2395.arrow


In [11]:
model = train(train_dataset)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.300 seconds.
Prefix dict has been built successfully.
[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =     63301645     M =           10

At X0         0 variables are exactly at the bounds


 This problem is unconstrained.



At iterate    0    f=  1.92626D+03    |proj g|=  3.63434D+05

At iterate   50    f=  2.12263D+02    |proj g|=  1.73208D+04

At iterate  100    f=  9.04706D+01    |proj g|=  2.90586D+03

At iterate  150    f=  4.50236D+01    |proj g|=  8.43264D+01

At iterate  200    f=  3.28981D+01    |proj g|=  9.14091D+00

At iterate  250    f=  2.79421D+01    |proj g|=  4.21906D+01

At iterate  300    f=  2.52349D+01    |proj g|=  6.07052D+00

At iterate  350    f=  2.40528D+01    |proj g|=  9.27223D+01

At iterate  400    f=  2.37315D+01    |proj g|=  4.99852D+01

At iterate  450    f=  2.32382D+01    |proj g|=  2.81430D+01

At iterate  500    f=  2.30191D+01    |proj g|=  5.25895D+00



 Bad direction in the line search;
   refresh the lbfgs memory and restart the iteration.



At iterate  550    f=  2.19636D+01    |proj g|=  2.12715D+01

At iterate  600    f=  2.13172D+01    |proj g|=  5.21387D+00

At iterate  650    f=  2.09607D+01    |proj g|=  3.10798D+00

At iterate  700    f=  2.07168D+01    |proj g|=  1.22201D+01

At iterate  750    f=  2.05378D+01    |proj g|=  4.68612D+00

At iterate  800    f=  2.04890D+01    |proj g|=  5.96808D+00

At iterate  850    f=  2.04561D+01    |proj g|=  2.75782D-01

At iterate  900    f=  2.04441D+01    |proj g|=  2.57272D-01

At iterate  950    f=  2.04385D+01    |proj g|=  9.24879D-01

At iterate 1000    f=  2.04332D+01    |proj g|=  7.97755D-01

At iterate 1050    f=  2.04321D+01    |proj g|=  1.46332D-01

At iterate 1100    f=  2.04307D+01    |proj g|=  5.60908D-01

At iterate 1150    f=  2.04298D+01    |proj g|=  3.29849D-01

At iterate 1200    f=  2.04293D+01    |proj g|=  3.31572D-01

At iterate 1250    f=  2.04291D+01    |proj g|=  6.40701D-01

At iterate 1300    f=  2.04290D+01    |proj g|=  2.19112D+00

At iter

[Parallel(n_jobs=15)]: Done   1 out of   1 | elapsed: 168.6min finished


In [36]:
joblib.dump(model, 'TextClassifier-new-63m.pkl')

['TextClassifier-new-63m.pkl']

# 模型数据

In [12]:
print("模型方程: y = ", model.named_steps['logisticregression'].coef_, " * x + ", model.named_steps['logisticregression'].intercept_)

print("参数量: ", model.named_steps['logisticregression'].coef_.size + 1)

模型方程: y =  [[ 6.42874349e-02 -6.07271510e-04  4.35482550e-07 ...  1.78620692e-08
   1.78620692e-08  1.78620692e-08]]  * x +  [-2.08611946]
参数量:  63301645


In [13]:
count_vectorizer = model.steps[0][1]

In [14]:
count_vectorizer.vocabulary_

{'北师大': 29165570,
 '版': 44338147,
 '小学': 34922410,
 '三年级': 20997501,
 '下册': 21346892,
 '数学': 39892831,
 '第六': 48709930,
 '单元': 29386602,
 '《': 19861441,
 '认识': 51865032,
 '分数': 28058549,
 '》': 19894146,
 '单元测试': 29388936,
 '3': 9261980,
 '（': 55829287,
 '附': 54713684,
 '答案': 48917215,
 '）': 56486272,
 '\n': 0,
 '一': 20342601,
 '、': 18039995,
 '用': 44773335,
 '表示': 51021178,
 '下面': 21394465,
 '每个': 42972104,
 '图里': 32105772,
 '的': 45209542,
 '阴影': 54690534,
 '部分': 53967706,
 '。': 19007634,
 '每空': 43028459,
 '2': 8403637,
 '分': 27846262,
 '，': 57369775,
 '共': 26962411,
 '12': 8108522,
 '二': 24110146,
 '判断题': 28554433,
 '对': 34553114,
 '在': 32187508,
 '括号': 38893746,
 '里': 54130366,
 '打': 38435451,
 '"': 4582718,
 '√': 17123744,
 '错': 54373799,
 '×': 16707845,
 '10': 7962922,
 '1': 7448140,
 ' ': 1953046,
 '是': 40813791,
 '同': 30815750,
 '分母': 28160591,
 '相加': 47732440,
 '减': 27552097,
 '不变': 21562281,
 '分子': 28024464,
 '＋': 57286434,
 '=': 10736687,
 '4': 9667429,
 '如': 33649517,
 '左图': 

# Test

In [21]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from tabulate import tabulate

In [28]:
def predict_with_threshold(model, X, threshold=0.98):
    probabilities = model.predict_proba(X)
    
    positive_probabilities = probabilities[:, 1]
    
    predictions = (positive_probabilities > threshold).astype(int)
    
    return predictions

In [16]:
test_dataset = datadict["test"]

In [17]:
test_dataset = test_dataset.map(dataset_map_pre_process)

Loading cached processed dataset at /home/ran/WorkSpace/my_Exam-Question-Bank-Dataset-zh_mnbvc/notebook/dataset/test/cache-50aac81089b34e9b.arrow


In [29]:
import time

start_time = time.time()

predictions = predict_with_threshold(model, test_dataset["text"])

print(f"time { time.time() - start_time}")

time 23.28389811515808


In [30]:
accuracy = accuracy_score(test_dataset["label"], predictions)
recall = recall_score(test_dataset["label"], predictions)
precision = precision_score(test_dataset["label"], predictions)
f1 = f1_score(test_dataset["label"], predictions)

In [31]:
data = [["Accuracy", accuracy], ["Recall", recall], ["Precision", precision], ["F1 Score", f1]]

print(tabulate(data, headers=["Metric", "Score"]))

Metric        Score
---------  --------
Accuracy   0.992248
Recall     0.990099
Precision  0.995025
F1 Score   0.992556


In [626]:
# threshold=0.5 and threshold=0.9
Metric        Score
---------  --------
Accuracy   0.989664
Recall     0.990099
Precision  0.990099
F1 Score   0.990099


# threshold=0.95（错误3个，全都是英语且没有明显特征或特征单一）
Metric        Score
---------  --------
Accuracy   0.992248
Recall     0.990099
Precision  0.995025
F1 Score   0.992556

SyntaxError: invalid syntax (4077443080.py, line 1)

# Check

In [32]:
# 1："试卷" 0:“其他”
for index, prediction in enumerate(predictions):
    if test_dataset['label'][index] != prediction:
        print(f"{index}  预期:{test_dataset['label'][index]} -- 结果:{prediction}")

11  预期:1 -- 结果:0
192  预期:1 -- 结果:0
386  预期:0 -- 结果:1


In [35]:
for line in test_dataset["text"][386].splitlines():
    print(line)

FREE高考英语词汇班
151 vr 曾经 h bs fl I hv vr s
强调极端概念  fr 
   o I w o s you ! /   
152 uss 猜 随意 错误率高
153 hr 听到 状态
ls o 听 动作
154 hur v 伤害 hur o's fl
v 疼痛 
My h s . 受伤 hur / jur j 受伤的
My h . 疼
sh jur sb = sb s jur jury / hur  伤口
155  o o sh 打算做某事
o 打算 意图 未发生
156 sk 任务 一次性
uy 职责 长期性
job / work 工作 该做的事 责任
crr 职业 rofsso / jor 专业
157 rch v 到达 触及  能够到达/触及的地方
wh y rch 力所能及
byo y rch 力所不能及
rch ou o ohrs = v sb  h 伸出援手
158 原因 rsul  结果 My ffors rsul  y succss.
结果 rsul fro 原因 My succss rsul fro y ffors.
rso / rsul因果 cus / ffc 因果
159 s 台阶 脚步 步骤 s by s
160 sb suffr fro sh 某人遭受某事 负 无被动
 fro  srous sochch, h w o h clc.
161 聪明 clvr / sr / brh / brll / ws / ll
Arfcl Illc / AI 人工智能
162 buy / urchs / cosu / s oy 购买
cusor 顾客 cosur 消费者
cosu -- cosuo 消费
ssu -- ssuo 假想
rsu -- rsuo 继续
sll -- sol -- sol 出售
163 h -- h -- h 隐藏 看不见 h  sk
bh r 在...后面 看不见 bh h sc
164 ju 跳 高度
l 跃 远  b l  cooy
b 节奏 hr b vry fs
sk 跳过 略过 sk h ls
165 lo -- lh 长度 w -- wh 宽度
hh -- hh 高度 wh -- wh 重量
sur  寸法 规格
166 r -- whol 部分---整体
Th cov    ws w 