In [137]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import joblib
import datasets

# Train

In [189]:
import re

def remove_image_string(input_string):
    pattern = r"!\[(.*?)\]\(.*?\)\{width=\".*?\" height=\".*?\"\}|!\[.*?\]\(.*?\)|\[.*?\]\{.*?\}"
    result = re.sub(pattern, "", input_string)
    return result

def is_all_chinese_or_english(text):
    for char in text:
        if not is_chinese_or_english(char):
            return False
    return True

def is_chinese_or_english(char):
    # 判断是否为中文
    if '\u4e00' <= char <= '\u9fff':
        return True
    # 判断是否为英文
    elif 'a' <= char.lower() <= 'z':
        return True
    else:
        return False

def remove_noise_character(input_string):
    pattern = r"[>*|image|data|media|png]"
    
    return re.sub(pattern, "", input_string)

def one_text_pre_process(text):
    precessed_text_split_lines = []
    
    for line in text.splitlines():
        remove_image_line = remove_image_string(line)

        if remove_image_line.strip() in ["", ">"]:
            continue

        remove_image_line = remove_noise_character(remove_image_line)
        precessed_text_split_lines.append(remove_image_line)
        
    return "\n".join(precessed_text_split_lines)

def pre_process(text_list):
    precessed_text_list = [one_text_pre_process(text) for text in text_list]
    return precessed_text_list

def dataset_map_pre_process(row):
    row["text"] = one_text_pre_process(row["text"])
    return row

In [290]:
import jieba


def chinese_tokenizer(text):
    tokens = jieba.cut(text)
    return list(filter(lambda x:len(x) > 1 and is_all_chinese_or_english(x), tokens))


def train(dataset):
    vectorizer = CountVectorizer(tokenizer=chinese_tokenizer, ngram_range=(1, 2))
    classifier = LogisticRegression(max_iter=200, verbose=1)
    model = make_pipeline(vectorizer, classifier)
    
    model.fit(dataset["text"], dataset["label"])
    return model

In [308]:
# datadict = datasets.load_dataset("ranWang/test_paper_textClassifier")
datadict = datasets.load_from_disk("./new_classification_datasets")
datadict

DatasetDict({
    test: Dataset({
        features: ['text', 'label', 'file_path'],
        num_rows: 387
    })
    train: Dataset({
        features: ['text', 'label', 'file_path'],
        num_rows: 13247
    })
})

In [303]:
train_dataset = datadict["train"]
train_dataset = train_dataset.map(dataset_map_pre_process)

Loading cached processed dataset at /home/ran/WorkSpace/my_Exam-Question-Bank-Dataset-zh_mnbvc/notebook/new_classification_datasets/train/cache-168c4eb11baae2f3.arrow


In [304]:
# 随机打乱
# shuffle(36) Accuracy = 0.9835
# shuffle(42) Accuracy = 0.9853
# shuffle(100) Accuracy = 0.9888
train_dataset = train_dataset.shuffle(42)
# 验证集大小
validation_size = int(len(train_dataset)/10)
validation_dataset = train_dataset.select(range(validation_size))
# train_dataset = train_dataset.select(range(validation_size, len(train_dataset)))

Loading cached shuffled indices for dataset at /home/ran/WorkSpace/my_Exam-Question-Bank-Dataset-zh_mnbvc/notebook/new_classification_datasets/train/cache-7b2f72804adbdac9.arrow


In [305]:
train_dataset

Dataset({
    features: ['text', 'label', 'file_path'],
    num_rows: 13158
})

In [306]:
model = train(train_dataset)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =      8140171     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  9.12043D+03    |proj g|=  7.10775D+04


 This problem is unconstrained.



At iterate   50    f=  1.11860D+02    |proj g|=  2.40415D+01

At iterate  100    f=  7.16133D+01    |proj g|=  4.49200D+00



 Bad direction in the line search;
   refresh the lbfgs memory and restart the iteration.



At iterate  150    f=  6.37758D+01    |proj g|=  7.56378D+00

At iterate  200    f=  6.22753D+01    |proj g|=  1.19815D+00

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
*****    200    451      2     0     0   1.198D+00   6.228D+01
  F =   62.275309218033911     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.5min finished


In [307]:
joblib.dump(model, 'TextClassifie-full-final.pkl')

['TextClassifie-full-final.pkl']

# 模型数据

In [292]:
print("模型方程: y = ", model.named_steps['logisticregression'].coef_, " * x + ", model.named_steps['logisticregression'].intercept_)

print("参数量: ", model.named_steps['logisticregression'].coef_.size + 1)

模型方程: y =  [[-1.81238793e-03  9.02865915e-06  1.82451388e-05 ...  1.49851219e-07
  -5.86489000e-05 -5.86489000e-05]]  * x +  [-1.21892501]
参数量:  7666782


In [293]:
count_vectorizer = model.steps[0][1]

In [294]:
count_vectorizer.vocabulary_

{'职位': 6155006,
 '说明书': 6725438,
 '基本': 2596495,
 '名称': 2291754,
 '生产': 5336144,
 '科长': 5696207,
 '情况': 3657282,
 '所属单位': 3798083,
 '轧钢厂': 6910048,
 '所属': 3797712,
 '直接': 5515542,
 '上级': 454412,
 '副厂长': 1786749,
 '下属': 468770,
 '轧辊': 6909947,
 '车间主任': 6909527,
 '调度长': 6743750,
 '设置': 6659726,
 '目的': 5509067,
 '协调': 1938432,
 '组织': 5961338,
 '完成': 2928127,
 '任务': 1069900,
 '厂长': 1994503,
 '交待': 883651,
 '临时工': 699206,
 '工作': 3182290,
 '负责': 6772113,
 '分解': 1683243,
 '落实': 6384607,
 '分厂': 1658109,
 '下达': 477970,
 '生产指标': 5342172,
 '成本': 3731333,
 '指标': 3975369,
 '职责': 6161299,
 '和工': 2358995,
 '作内': 1181251,
 '处理': 2648590,
 '科室': 5691979,
 '公司': 1422888,
 '及其': 2043239,
 '协作': 1933071,
 '事宜': 826298,
 '每月': 4840305,
 '每日': 4839562,
 '安排': 2915473,
 '检修': 4728564,
 '计划': 6593366,
 '安全': 2909110,
 '文明': 4245933,
 '管理': 5850691,
 '劳动纪律': 1848859,
 '纪律': 5937977,
 '工艺': 3207413,
 '备品备件': 2655218,
 '管理工作': 5859777,
 '各项': 2243050,
 '统计': 6036940,
 '磨辊': 5647762,
 '车间': 6908603,
 '奖惩': 278865

# Test

In [172]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from tabulate import tabulate

In [295]:
def predict_with_threshold(model, X, threshold=0.5):
    probabilities = model.predict_proba(X)
    
    positive_probabilities = probabilities[:, 1]

    predictions = (positive_probabilities > threshold).astype(int)
    
    return predictions, positive_probabilities

In [296]:
# test_dataset = datadict["test"]
test_dataset = validation_dataset

In [297]:
test_dataset = test_dataset.map(dataset_map_pre_process)

Map:   0%|          | 0/1315 [00:00<?, ? examples/s]

In [298]:
import time

start_time = time.time()

predictions, positive_probabilities = predict_with_threshold(model, test_dataset["text"])

print(f"time { time.time() - start_time}")

time 17.84801483154297


In [299]:
accuracy = accuracy_score(test_dataset["label"], predictions)
recall = recall_score(test_dataset["label"], predictions)
precision = precision_score(test_dataset["label"], predictions)
f1 = f1_score(test_dataset["label"], predictions)

In [300]:
data = [["Accuracy", accuracy], ["Recall", recall], ["Precision", precision], ["F1 Score", f1]]

print(tabulate(data, headers=["Metric", "Score"]))

Metric        Score
---------  --------
Accuracy   0.98403
Recall     0.971564
Precision  0.97852
F1 Score   0.97503


# Check

In [213]:
# 按照0.5threshold判断是否为试卷
# 文件路径为空则是当初数据集版本没有添加这个字段
for index, prediction in enumerate(predictions):
    if test_dataset['label'][index] != prediction:
        scope = '{:.2f}'.format(positive_probabilities[index])
        print(f"{index} -- {scope} -- {test_dataset['file_path'][index]}")

150 -- 0.63 -- negative_file/人民时评常用词汇【官方抖音号：材料大师姐】.docx
195 -- 0.00 -- positive_file/销售在线测试题.doc
320 -- 0.97 -- negative_file/双体系现场问卷知识汇总.doc
355 -- 0.00 -- positive_file/2016年湖南湘江新区遴选公务员考试真题及答案-纪检监察岗【官方抖音号：资深秘书】.docx
437 -- 1.00 -- negative_file/浙江大学2023年硕士研究生招生考试复试分数线的基本要求.docx
468 -- 1.00 -- negative_file/党建理论知识应知应会【唯一微信 aoling18031988287】.doc
489 -- 0.00 -- 
496 -- 0.44 -- 
517 -- 0.01 -- positive_file/公考遴选每日考题10道（2022年12月24日）【关注抖音号：资深秘书】.docx
529 -- 0.60 -- 
553 -- 0.00 -- positive_file/2016年湖北省直机关遴选公务员考试真题及答案-综合执法类【官方抖音号：资深秘书】.docx
560 -- 1.00 -- negative_file/大作文分论点“思维工具”--观点倍增术（上篇）.docx
606 -- 0.00 -- 
659 -- 0.63 -- negative_file/亚商-东阿阿胶集团—2000年广告.doc
693 -- 1.00 -- negative_file/ch12;.doc
725 -- 1.00 -- negative_file/ch12a.doc
767 -- 0.00 -- positive_file/2011年7月13日河南郑州市遴选公务员考试真题及答案【官方抖音号：资深秘书】.docx
817 -- 0.01 -- positive_file/公考遴选每日考题10道（2022年11月17日）【官方抖音号：资深秘书】.docx
821 -- 0.99 -- negative_file/20201111-职场礼仪问答（88题）【唯一微信 aoling18031988287】.doc
874 -- 0.22 -- 
919 -- 0.99 --

In [221]:
for line in test_dataset["text"][195].splitlines():
    print(line)

                               销售在线测试题
姓名：                     部门：                    分数：
                                 （笔试题）
1. 组织型客户与个人型客户有什么区别？（5分）
2. 销售在线需要收集记录哪些与客户有关的关键信息？（8分）
3. 客户的360度全面查看可以查看哪些主要信息？（5分）
4. 任务的责任人和受分配人有什么区别？（7分）
5. 任务有哪些要素？（7分）
6. 销售在线中业务机会有哪三个要素？（5分）
7. 市场百科全书的目录层次结构是怎样的？（5分）
8. 市场百科全书能够发布的内容有哪三种？（5分）
                                 （上机题）
用户用自己的帐号登录销售在线系统 （5分）
（用户名＝电子邮件用户名，密码＝888888）
将下面的销售活动记录到销售在线系统中：（48分）
1月5日上午9点，销售人员电话预约一家新客户，客户名称为上海××空调公司，地址在上
海市虹口区××路××号，准备中午与客户代表人王×吃饭。中午12点，与王×交谈当中，了
解到该客户将在二月底购买一批CG系列的压缩机，估计金额在500000元左右，销售人员
了解到该客户与竞争对手——另一商家美芝空调压缩机股份有限公司也在商讨购买事宜，
经过商谈后，销售人员觉得这张单子从客户意向收集开始，成功拿下的可能性有20%，同
时自己一个人不够，需要加入一名销售人员，并将该消息通过MES发送给营业所总代表。
下午3点，老客户青岛海尔公司打来电话需要紧急购进一批SH系列的变频压缩机，该销售
人员由于忙于新客户的单子，就用任务指派的形式，将青岛海尔的单子转交给另一销售
人员。
