In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import joblib
import datasets

# Train

In [2]:
import re

def remove_image_string(input_string):
    pattern = r"!\[(.*?)\]\(.*?\)\{width=\".*?\" height=\".*?\"\}|!\[.*?\]\(.*?\)|\[.*?\]\{.*?\}"
    result = re.sub(pattern, "", input_string)
    return result

def is_all_chinese_or_english(text):
    for char in text:
        if not is_chinese_or_english(char):
            return False
    return True

def is_chinese_or_english(char):
    # 判断是否为中文
    if '\u4e00' <= char <= '\u9fff':
        return True
    # 判断是否为英文
    elif 'a' <= char.lower() <= 'z':
        return True
    else:
        return False

def remove_noise_character(input_string):
    pattern = r"[>*|image|data|media|png]"
    
    return re.sub(pattern, "", input_string)

def one_text_pre_process(text):
    precessed_text_split_lines = []
    
    for line in text.splitlines():
        remove_image_line = remove_image_string(line)

        if remove_image_line.strip() in ["", ">"]:
            continue

        remove_image_line = remove_noise_character(remove_image_line)
        precessed_text_split_lines.append(remove_image_line)
        
    return "\n".join(precessed_text_split_lines)

def pre_process(text_list):
    precessed_text_list = [one_text_pre_process(text) for text in text_list]
    return precessed_text_list

def dataset_map_pre_process(row):
    row["text"] = one_text_pre_process(row["text"])
    return row

In [381]:
import jieba


def chinese_tokenizer(text):
    tokens = jieba.cut(text)
    return list(filter(lambda x:is_all_chinese_or_english(x), tokens))


def train(dataset):
    vectorizer = CountVectorizer(tokenizer=chinese_tokenizer, ngram_range=(1, 2))
    classifier = LogisticRegression(max_iter=100, verbose=1)
    model = make_pipeline(vectorizer, classifier)
    
    model.fit(dataset["text"], dataset["label"])
    return model

In [406]:
# datadict = datasets.load_dataset("ranWang/test_paper_textClassifier")
datadict = datasets.load_from_disk("./new_classification_datasets")
datadict

DatasetDict({
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 387
    })
    train: Dataset({
        features: ['text', 'label', 'file_path'],
        num_rows: 9536
    })
})

In [407]:
train_dataset = datadict["train"]
train_dataset = train_dataset.map(dataset_map_pre_process)

Loading cached processed dataset at /home/ran/WorkSpace/my_Exam-Question-Bank-Dataset-zh_mnbvc/notebook/new_classification_datasets/train/cache-8919a8cbc5b95eb2.arrow


In [408]:
# 随机打乱
train_dataset = train_dataset.shuffle(42)
# 验证集大小
validation_size = int(len(train_dataset)/10)
validation_dataset = train_dataset.select(range(validation_size))
train_dataset = train_dataset.select(range(validation_size, len(train_dataset)))

Loading cached shuffled indices for dataset at /home/ran/WorkSpace/my_Exam-Question-Bank-Dataset-zh_mnbvc/notebook/new_classification_datasets/train/cache-677c5f9a11a51a13.arrow


In [409]:
train_dataset

Dataset({
    features: ['text', 'label', 'file_path'],
    num_rows: 8583
})

In [410]:
model = train(train_dataset)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =      6398638     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  5.94928D+03    |proj g|=  1.82487D+05


 This problem is unconstrained.



At iterate   50    f=  1.28316D+02    |proj g|=  2.20646D+02

At iterate  100    f=  5.98905D+01    |proj g|=  4.22555D+01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
*****    100    174      1     0     0   4.226D+01   5.989D+01
  F =   59.890473429223839     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.9min finished


In [405]:
joblib.dump(model, 'TextClassifie-full-final.pkl')

['TextClassifie-full-final.pkl']

# 模型数据

In [411]:
print("模型方程: y = ", model.named_steps['logisticregression'].coef_, " * x + ", model.named_steps['logisticregression'].intercept_)

print("参数量: ", model.named_steps['logisticregression'].coef_.size + 1)

模型方程: y =  [[-6.66293309e-02  2.37002310e-02 -1.40826367e-03 ... -1.49745354e-05
  -1.50341050e-05 -1.06960967e-05]]  * x +  [-0.45084807]
参数量:  6398638


In [412]:
count_vectorizer = model.steps[0][1]

In [413]:
count_vectorizer.vocabulary_

{'悬疑': 3174349,
 '韩国片': 6268712,
 '蒙太奇': 5388607,
 '解说': 5527963,
 '文案': 3642459,
 '清冷': 4324163,
 '死寂': 4127655,
 '的': 4621191,
 '公寓': 1339274,
 '里': 6059641,
 '一': 283847,
 '女子': 2488499,
 '大白天': 2443299,
 '依然': 1167779,
 '躺': 5799151,
 '在': 2257192,
 '床上': 2940506,
 '她': 2491894,
 '形容': 3052059,
 '颓废': 6312777,
 '生': 4508471,
 '无可': 3699414,
 '恋': 3163329,
 '但': 1082661,
 '又': 1830044,
 '必须': 3115343,
 '活着': 4270523,
 '年前': 2912645,
 '女儿': 2487697,
 '西珍': 5471710,
 '被': 5452703,
 '人': 898291,
 '绑架': 5095030,
 '杀害': 3944916,
 '到': 1586463,
 '现在': 4476309,
 '还': 5870213,
 '没': 4218797,
 '找到': 3340973,
 '凶手': 1484121,
 '这': 5879006,
 '是': 3768682,
 '活下去': 4265430,
 '唯一': 2145571,
 '理由': 4496215,
 '警察': 5543581,
 '吴青浩带': 2068215,
 '着': 4759207,
 '一名': 325477,
 '小': 2727533,
 '警员': 5543518,
 '上门': 456999,
 '拜访': 3413009,
 '当年': 3044093,
 '吴青浩是': 2068217,
 '绑架案': 5095119,
 '负责人': 5699384,
 '他们': 972819,
 '客厅': 2645317,
 '等候': 4950685,
 '只见': 1934548,
 '书架上': 768001,
 '摆着': 3528897,
 '大量':

# Test

In [56]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from tabulate import tabulate

In [414]:
def predict_with_threshold(model, X, threshold=0.5):
    probabilities = model.predict_proba(X)
    
    positive_probabilities = probabilities[:, 1]

    predictions = (positive_probabilities > threshold).astype(int)
    
    return predictions, positive_probabilities

In [415]:
# test_dataset = datadict["test"]
test_dataset = validation_dataset

In [416]:
test_dataset = test_dataset.map(dataset_map_pre_process)

Loading cached processed dataset at /home/ran/WorkSpace/my_Exam-Question-Bank-Dataset-zh_mnbvc/notebook/new_classification_datasets/train/cache-05027825d2f02e0a.arrow


In [417]:
import time

start_time = time.time()

predictions, positive_probabilities = predict_with_threshold(model, test_dataset["text"])

print(f"time { time.time() - start_time}")

time 16.69502592086792


In [418]:
accuracy = accuracy_score(test_dataset["label"], predictions)
recall = recall_score(test_dataset["label"], predictions)
precision = precision_score(test_dataset["label"], predictions)
f1 = f1_score(test_dataset["label"], predictions)

In [419]:
data = [["Accuracy", accuracy], ["Recall", recall], ["Precision", precision], ["F1 Score", f1]]

print(tabulate(data, headers=["Metric", "Score"]))

Metric        Score
---------  --------
Accuracy   0.983211
Recall     0.961415
Precision  0.986799
F1 Score   0.973941


# Check

In [421]:
# 按照0.5threshold判断是否为试卷
for index, prediction in enumerate(predictions):
    if test_dataset['label'][index] != prediction:
        scope = '{:.2f}'.format(positive_probabilities[index])
        print(f"{index} -- {scope} -- {test_dataset['file_path'][index]}")

31 -- 0.12 -- positive_file/b08高考冲刺八：地理图像技能（一）.doc
65 -- 0.18 -- positive_file/必修1 第二单元 单元滚动练(二).docx
125 -- 1.00 -- negative_file/contents of002、忏悔录.doc
232 -- 0.17 -- 
245 -- 0.33 -- positive_file/1高考冲刺：化学实验仪器.doc
308 -- 0.18 -- positive_file/02.doc
383 -- 0.56 -- negative_file/45美国期望的各年龄段阅读水平.docx
402 -- 0.22 -- 
510 -- 0.33 -- positive_file/b09高考冲刺九：地理图像技能（二）.doc
513 -- 0.49 -- 
553 -- 0.08 -- positive_file/公考遴选每日考题10道（2023年2月8日）【官方抖音号：资深秘书】.docx
599 -- 0.00 -- positive_file/2023年“安全生产月”主题知识竞赛测试题（附答案）【官方抖音号：资深秘书】.docx
683 -- 0.39 -- 
874 -- 0.77 -- negative_file/初中地理思维导图20组.docx
930 -- 0.98 -- negative_file/3_页面设置-A4纸与黄金分割.doc
943 -- 0.00 -- 


In [423]:
for line in test_dataset["text"][943].splitlines():
    print(line)

2019年"中南传媒湖南新教材杯"重庆市高中数学竞赛
暨全国高中数学联赛（重庆赛区）预赛试题参考答案
一、填空题(每小题8分，共64分)
1.设为三元集合（三个不同实数组成的集合），集合，若，则集合\_\_\_\_\_\_\_\_.
答案：
提示：设，其中
则解得，从而。
2.函数$f\lf( x \rh) = (\sqr{1 + x} + \sqr{1 - x} - 3)(\sqr{1 - x^{2}} + 1)$的最小值为$$，最大值为$M$，则$\frc{M}{} =$\_\_\_\_\_\_\_\_.
答案： $\frc{3 - \sqr{2}}{2}$
提示：设$ = \sqr{1 + x} + \sqr{1 - x}$,则$ \q 0$且$^{2} = 2 + 2\sqr{1 - x^{2}}$，∴$ \ \lf\lbrck \sqr{2},2 \rh\rbrck$.
$f\lf( x \rh) = \lf(  - 3 \rh) \co \frc{^{2}}{2}$，令$\lf(  \rh) = \frc{1}{2}^{2}\lf(  - 3 \rh)$，$ \ \lbrck\sqr{2},2\rbrck$.
令$^{'}\lf(  \rh) = 0$得$ = 2$，$\lf( \sqr{2} \rh) = \sqr{2} - 3$，$\lf( 2 \rh) = - 2$，
∴$M = {\lf(  \rh)}_{\x} = \sqr{2} - 3$，$ = {\lf(  \rh)}_{\} = - 2$，∴$\frc{M}{} = \frc{3 - \sqr{2}}{2}$.
3.$\ 15^{o} + 2\sqr{2}\s 15^{o} =$\_\_\_\_\_\_\_\_.
答案： $1$
提示：$\ 15^{o} + 2\sqr{2}\s 15^{o} = \frc{{\x{s}}15^{o}}{\cos 15^{o}} + 2\sqr{2}\s 15^{o} = \frc{{\x{s}}15^{o} + \sqr{2}\s 30^{o}}{\cos 15^{o}} = \frc{{\x{s}}15^{o} + \sqr{2}\s\lf( 45^{o} - 15^{o} \rh)}{\cos 15^{o}}$
$= \frc{{\x{s}}15^