In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import joblib
import datasets

# Train

In [38]:
import re

def remove_image_string(input_string):
    pattern = r"!\[(.*?)\]\(.*?\)\{width=\".*?\" height=\".*?\"\}|!\[.*?\]\(.*?\)|\[.*?\]\{.*?\}"
    result = re.sub(pattern, "", input_string)
    return result

def remove_noise_character(input_string):
    pattern = r"[>*|image|data|media|png]"
    
    return re.sub(pattern, "", input_string)

def one_text_pre_process(text):
    precessed_text_split_lines = []
    
    for line in text.splitlines():
        remove_image_line = remove_image_string(line)

        if remove_image_line.strip() in ["", ">"]:
            continue

        remove_image_line = remove_noise_character(remove_image_line)
        precessed_text_split_lines.append(remove_image_line)
        
    return "\n".join(precessed_text_split_lines)

def pre_process(text_list):
    precessed_text_list = [one_text_pre_process(text) for text in text_list]
    return precessed_text_list

def dataset_map_pre_process(row):
    row["text"] = one_text_pre_process(row["text"])
    return row

In [39]:
import jieba

def chinese_tokenizer(text):
    tokens = jieba.cut(text)
    return list(tokens)


def train(dataset):
    vectorizer = CountVectorizer(tokenizer=chinese_tokenizer, ngram_range=(1, 5))
    classifier = LogisticRegression(max_iter=1000, verbose=1)
    model = make_pipeline(vectorizer, classifier)
    
    model.fit(dataset["text"], dataset["label"])
    return model

In [40]:
datadict = datasets.load_dataset("ranWang/test_paper_textClassifier")

Downloading readme:   0%|          | 0.00/457 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /home/ran/.cache/huggingface/datasets/ranWang___parquet/ranWang--test_paper_textClassifier-e29578b85194b497/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/29.3M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/387 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/2779 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /home/ran/.cache/huggingface/datasets/ranWang___parquet/ranWang--test_paper_textClassifier-e29578b85194b497/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [41]:
train_dataset = datadict["train"]

In [42]:
train_dataset = train_dataset.map(dataset_map_pre_process)

Map:   0%|          | 0/2779 [00:00<?, ? examples/s]

In [43]:
model = train(train_dataset)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =     18202725     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.92626D+03    |proj g|=  3.63434D+05

At iterate   50    f=  1.80253D+02    |proj g|=  8.14122D+03

At iterate  100    f=  7.95707D+01    |proj g|=  1.96024D+03

At iterate  150    f=  5.68351D+01    |proj g|=  1.45860D+02

At iterate  200    f=  4.50477D+01    |proj g|=  1.96235D+02

At iterate  250    f=  3.89243D+01    |proj g|=  1.13622D+02



 Bad direction in the line search;
   refresh the lbfgs memory and restart the iteration.



At iterate  300    f=  2.44315D+01    |proj g|=  1.00393D+00

At iterate  350    f=  2.24613D+01    |proj g|=  2.73992D+01

At iterate  400    f=  2.17948D+01    |proj g|=  2.20876D+02

At iterate  450    f=  2.15096D+01    |proj g|=  1.29328D+00

At iterate  500    f=  2.12875D+01    |proj g|=  9.65164D-01

At iterate  550    f=  2.12275D+01    |proj g|=  4.95107D+00

At iterate  600    f=  2.11938D+01    |proj g|=  4.90144D+00

At iterate  650    f=  2.11703D+01    |proj g|=  1.29348D+00

At iterate  700    f=  2.11575D+01    |proj g|=  2.00737D+00

At iterate  750    f=  2.11522D+01    |proj g|=  1.97003D+00

At iterate  800    f=  2.11458D+01    |proj g|=  8.89306D+00

At iterate  850    f=  2.11446D+01    |proj g|=  2.88593D+00

At iterate  900    f=  2.11432D+01    |proj g|=  1.78054D+00

At iterate  950    f=  2.11416D+01    |proj g|=  5.33317D-01

At iterate 1000    f=  2.11410D+01    |proj g|=  1.99100D-01

At iterate 1050    f=  2.11405D+01    |proj g|=  1.24651D+00

At iter

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 48.2min finished


In [36]:
joblib.dump(model, 'TextClassifie-13m.pkl')

['TextClassifier-new-63m.pkl']

# 模型数据

In [44]:
print("模型方程: y = ", model.named_steps['logisticregression'].coef_, " * x + ", model.named_steps['logisticregression'].intercept_)

print("参数量: ", model.named_steps['logisticregression'].coef_.size + 1)

模型方程: y =  [[ 5.69997076e-02 -9.37742454e-04  5.54575040e-07 ...  2.01404136e-08
   2.01404136e-08  2.01404136e-08]]  * x +  [-2.05576517]
参数量:  18202725


In [45]:
count_vectorizer = model.steps[0][1]

In [46]:
count_vectorizer.vocabulary_

{'北师大': 7627371,
 '版': 12690599,
 '小学': 9555423,
 '三年级': 4978322,
 '下册': 5086834,
 '数学': 11240116,
 '第六': 14016926,
 '单元': 7700473,
 '《': 4662663,
 '认识': 15048884,
 '分数': 7271788,
 '》': 4673438,
 '单元测试': 7701102,
 '3': 2039044,
 '（': 16384374,
 '附': 16028151,
 '答案': 14079262,
 '）': 16483839,
 '\n': 0,
 '一': 4749334,
 '、': 4124608,
 '用': 12844651,
 '表示': 14799757,
 '下面': 5103166,
 '每个': 12230159,
 '图里': 8608515,
 '的': 12987750,
 '阴影': 16019957,
 '部分': 15770992,
 '。': 4433296,
 '每空': 12250018,
 '2': 1839560,
 '分': 7221997,
 '，': 16683449,
 '共': 6933667,
 '12': 1751932,
 '二': 5957558,
 '判断题': 7413674,
 '对': 9428348,
 '在': 8631910,
 '括号': 10894029,
 '里': 15827869,
 '打': 10731060,
 '"': 887094,
 '√': 3892070,
 '错': 15914918,
 '×': 3789532,
 '10': 1709087,
 '1': 1602748,
 ' ': 383869,
 '是': 11535280,
 '同': 8174450,
 '分母': 7293829,
 '相加': 13671825,
 '减': 7134218,
 '不变': 5157120,
 '分子': 7261146,
 '＋': 16663564,
 '=': 2452804,
 '4': 2146946,
 '如': 9120412,
 '左图': 9761460,
 '长方形': 15946921,
 '与'

# Test

In [47]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from tabulate import tabulate

In [55]:
def predict_with_threshold(model, X, threshold=0.9):
    probabilities = model.predict_proba(X)
    
    positive_probabilities = probabilities[:, 1]
    
    predictions = (positive_probabilities > threshold).astype(int)
    
    return predictions

In [56]:
test_dataset = datadict["test"]

In [57]:
test_dataset = test_dataset.map(dataset_map_pre_process)

Loading cached processed dataset at /home/ran/.cache/huggingface/datasets/ranWang___parquet/ranWang--test_paper_textClassifier-e29578b85194b497/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-6dfa6fe4807d3d9d.arrow


In [58]:
import time

start_time = time.time()

predictions = predict_with_threshold(model, test_dataset["text"])

print(f"time { time.time() - start_time}")

time 13.097956418991089


In [59]:
accuracy = accuracy_score(test_dataset["label"], predictions)
recall = recall_score(test_dataset["label"], predictions)
precision = precision_score(test_dataset["label"], predictions)
f1 = f1_score(test_dataset["label"], predictions)

In [60]:
data = [["Accuracy", accuracy], ["Recall", recall], ["Precision", precision], ["F1 Score", f1]]

print(tabulate(data, headers=["Metric", "Score"]))

Metric        Score
---------  --------
Accuracy   0.992248
Recall     0.99505
Precision  0.990148
F1 Score   0.992593


# Check

In [61]:
# 1："试卷" 0:“其他”
for index, prediction in enumerate(predictions):
    if test_dataset['label'][index] != prediction:
        print(f"{index}  预期:{test_dataset['label'][index]} -- 结果:{prediction}")

11  预期:1 -- 结果:0
342  预期:0 -- 结果:1
386  预期:0 -- 结果:1


In [62]:
for line in test_dataset["text"][11].splitlines():
    print(line)

Fr高考英语秋季班

43\. Accor o h ss, rslc s  vul's bly \_\_\_\_\_\_\_\_\_\_\_\_. 细节 --- 位置+对象
A. o  crclly
B. o c o's ow 
C. o lv  
D. o rcovr fro vrsy
44\. Wh os h url wor ""   rfr o? 细节 代词向前指代 --- 复数名词
A. Th sycholoss
B. Th  chlr
C. Posv ls
D. Irl locus of corol 单数
45\. Accor o , w c lr h \_\_\_\_\_\_\_\_\_\_\_\_. 细节---第四段重点内容---主题
A. your  y . 主题
B.  r or rcv h . vs对比
C. xrc vrsy rcs h you wll . 肯定
D.  os' ur you wll .
46\. Wh s h uhor's uros of wr hs ss?
主旨---开头结尾
A. To ch ol how o b .
B. To  ol o lv hrouh vrsy.
C. To c ol's rco  fro ch ohr. 对比 彼此不同
D. To  ffr rsrch fs bou . 对比---说明主题的写作方法

67\. Th sycholoss fro h Uvrsy of Mch f h \_\_\_\_\_\_\_\_\_\_\_\_. 研究结论
A.   o chv succss  
B.  r lkly o   
C.   o sh  
D.  r coforbl wh work  
国家对比---文化差异
68\. Th fl xr  \_\_\_\_\_\_\_\_\_\_\_\_. 细节
A. cor ffr  owr 
B. f ffr vws bou rsol succss
C. ju  of ffr rous
D. cofr whch  s br
69\. A  wll b or ssf f h  fro \_\_\_\_\_\_\_\_\_\_\_\_. 细节
A. hs hbors
B. hs clsss
C. hs chrs
D. hs rs
70\. Accor o K Wu,