In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import joblib
import datasets

# Train

In [2]:
import sys

sys.path.append('../../')

from examination_paper_classifier import chinese_tokenizer
from examination_paper_classifier import dataset_map_pre_process

In [3]:
def train(dataset):
    vectorizer = CountVectorizer(tokenizer=chinese_tokenizer, ngram_range=(1, 2))
    classifier = LogisticRegression(max_iter=100, verbose=1)
    model = make_pipeline(vectorizer, classifier)
    
    model.fit(dataset["text"], dataset["label"])
    return model

In [4]:
datadict = datasets.load_dataset("ranWang/test_paper_textClassifier")
# datadict = datasets.load_from_disk("./classification_datasets")
datadict

Found cached dataset parquet (/home/ran/.cache/huggingface/datasets/ranWang___parquet/ranWang--test_paper_textClassifier-17b1cf1968004118/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['text', 'label', 'file_path'],
        num_rows: 387
    })
    train: Dataset({
        features: ['text', 'label', 'file_path'],
        num_rows: 13621
    })
})

In [5]:
train_dataset = datadict["train"]
train_dataset = train_dataset.map(dataset_map_pre_process)

Loading cached processed dataset at /home/ran/.cache/huggingface/datasets/ranWang___parquet/ranWang--test_paper_textClassifier-17b1cf1968004118/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-4791a4353f4885b0.arrow


In [6]:
# 随机打乱
# shuffle(36) Accuracy = 0.9787
# shuffle(42) Accuracy = 0.9809
# shuffle(100) Accuracy = 0.9838
train_dataset = train_dataset.shuffle(7)
# 验证集大小
validation_size = int(len(train_dataset)/10)
validation_dataset = train_dataset.select(range(validation_size))
train_dataset = train_dataset.select(range(validation_size, len(train_dataset)))

Loading cached shuffled indices for dataset at /home/ran/.cache/huggingface/datasets/ranWang___parquet/ranWang--test_paper_textClassifier-17b1cf1968004118/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-dae979c08c5cd77a.arrow


In [7]:
train_dataset

Dataset({
    features: ['text', 'label', 'file_path'],
    num_rows: 12259
})

In [8]:
model = train(train_dataset)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.341 seconds.
Prefix dict has been built successfully.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =      8219976     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  8.49729D+03    |proj g|=  2.31264D+05


 This problem is unconstrained.



At iterate   50    f=  2.23503D+02    |proj g|=  2.86426D+02

At iterate  100    f=  8.44822D+01    |proj g|=  2.38868D+01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
*****    100    156      1     0     0   2.389D+01   8.448D+01
  F =   84.482195459727748     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.4min finished


In [75]:
joblib.dump(model, 'TextClassifie-full-final.pkl')

['TextClassifie-full-final.pkl']

# 模型数据

In [9]:
print("模型方程: y = ", model.named_steps['logisticregression'].coef_, " * x + ", model.named_steps['logisticregression'].intercept_)

print("参数量: ", model.named_steps['logisticregression'].coef_.size + 1)

模型方程: y =  [[-1.80728394e-01  1.12157291e-01 -1.09530181e-03 ... -3.08192342e-05
  -1.63116621e-03 -5.25263332e-06]]  * x +  [-0.51810037]
参数量:  8219976


In [10]:
count_vectorizer = model.steps[0][1]

In [11]:
count_vectorizer.vocabulary_

{'固定资产': 2812305,
 '投资': 4284649,
 '计划': 7121434,
 '管理条例': 6371106,
 '总则': 4010151,
 '为': 842683,
 '加强': 2076692,
 '公司法人': 1672592,
 '治理': 5385355,
 '结构': 6523275,
 '规范': 7073938,
 '公司': 1664735,
 '内部': 1785572,
 '管理程序': 6371811,
 '组织': 6478369,
 '和': 2644232,
 '落实': 6905634,
 '各类': 2509418,
 '项': 8056154,
 '目的': 6016498,
 '文件': 4614044,
 '编制': 6573937,
 '立项': 6261678,
 '申请': 5831463,
 '审批': 3343873,
 '等': 6317286,
 '工作': 3572016,
 '按照': 4376488,
 '的': 5910924,
 '发展': 2353148,
 '战略': 4162107,
 '总体规划': 4009016,
 '依靠': 1466034,
 '科技': 6196452,
 '进': 7576726,
 '步': 5236392,
 '技术创新': 4259327,
 '综合利用': 6566066,
 '资源': 7365919,
 '优化': 1316505,
 '产品': 1108304,
 '努力': 2099439,
 '降低消耗': 7938818,
 '用': 5781741,
 '较': 7469598,
 '少': 3487222,
 '投入': 4279692,
 '快': 3970218,
 '速度': 7671320,
 '建成': 3781124,
 '项目': 8065166,
 '促进': 1476273,
 '特': 5658923,
 '制定': 2010131,
 '本': 4974029,
 '条例': 5036835,
 '所指': 4187498,
 '基建': 2959420,
 '技改': 4255733,
 '大修': 3077948,
 '更新': 4859088,
 '及': 2311234,
 '新': 4

# Test

In [12]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from tabulate import tabulate
from examination_paper_classifier import get_predict_with_threshold

In [13]:
# test_dataset = datadict["test"]
test_dataset = validation_dataset

In [14]:
test_dataset = test_dataset.map(dataset_map_pre_process)

Loading cached processed dataset at /home/ran/.cache/huggingface/datasets/ranWang___parquet/ranWang--test_paper_textClassifier-17b1cf1968004118/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-a29d079fba08ee3d.arrow


In [15]:
import time

start_time = time.time()

predictions, positive_probabilities = get_predict_with_threshold(model, test_dataset["text"])

print(f"time { time.time() - start_time}")

time 25.915993213653564


In [16]:
accuracy = accuracy_score(test_dataset["label"], predictions)
recall = recall_score(test_dataset["label"], predictions)
precision = precision_score(test_dataset["label"], predictions)
f1 = f1_score(test_dataset["label"], predictions)

In [17]:
data = [["Accuracy", accuracy], ["Recall", recall], ["Precision", precision], ["F1 Score", f1]]

print(tabulate(data, headers=["Metric", "Score"]))

Metric        Score
---------  --------
Accuracy   0.980176
Recall     0.97561
Precision  0.964912
F1 Score   0.970232


# Check

In [18]:
# 按照0.5threshold判断是否为试卷
# 文件路径为空则是当初数据集版本没有添加这个字段
for index, prediction in enumerate(predictions):
    if test_dataset['label'][index] != prediction:
        scope = '{:.2f}'.format(positive_probabilities[index])
        print(f"{index} -- {scope} -- {test_dataset['file_path'][index]}")

10 -- 1.00 -- negative_file/【公众号：zsxx_xxyg】第一章 绪论.doc
62 -- 0.38 -- positive_file/单元测试【阿里资源站 aliyunshare.cn】.docx
63 -- 0.83 -- negative_file/笔误更正公告.docx
95 -- 0.99 -- negative_file/《法律基本概念》、《人民警察法》、《社会主义法治理念》、《治安管理处罚法》备考手册.docx
270 -- 0.37 -- 
307 -- 0.00 -- positive_file/05-考点精讲（五）.doc
381 -- 0.19 -- 
477 -- 1.00 -- 
495 -- 0.00 -- 
519 -- 0.00 -- positive_file/7非连续性文本阅读专项突破卷更多资源-XH1080.com.doc
628 -- 0.68 -- 
701 -- 0.08 -- positive_file/第1章  云计算概述（习题答案）.docx
711 -- 0.02 -- 
734 -- 0.51 -- negative_file/wswjy_jy2001.doc
786 -- 0.61 -- negative_file/培训游戏—团队游戏—齐眉1.doc
793 -- 1.00 -- negative_file/2017年上半年 信息安全工程师 应用技术.docx
804 -- 0.98 -- negative_file/07 成为文章鉴赏家-养成批注好习惯/第7讲资料/第七讲巩固解析.docx
849 -- 0.98 -- 
900 -- 0.51 -- negative_file/05、三线表的制作【微信公众号：小七素材】一个永久免费的公众号.docx
1098 -- 0.89 -- negative_file/jczs_qh_mst_jy6011.doc
1137 -- 1.00 -- negative_file/正版心理咨询师教材习题集勘误表.doc
1213 -- 0.47 -- positive_file/2020年山东高考数学试题答案（word版）.docx
1231 -- 0.44 -- positive_file/2007年普通高等学校招生全国统一考试（上海卷）物理试卷

In [221]:
for line in test_dataset["text"][195].splitlines():
    print(line)

                               销售在线测试题
姓名：                     部门：                    分数：
                                 （笔试题）
1. 组织型客户与个人型客户有什么区别？（5分）
2. 销售在线需要收集记录哪些与客户有关的关键信息？（8分）
3. 客户的360度全面查看可以查看哪些主要信息？（5分）
4. 任务的责任人和受分配人有什么区别？（7分）
5. 任务有哪些要素？（7分）
6. 销售在线中业务机会有哪三个要素？（5分）
7. 市场百科全书的目录层次结构是怎样的？（5分）
8. 市场百科全书能够发布的内容有哪三种？（5分）
                                 （上机题）
用户用自己的帐号登录销售在线系统 （5分）
（用户名＝电子邮件用户名，密码＝888888）
将下面的销售活动记录到销售在线系统中：（48分）
1月5日上午9点，销售人员电话预约一家新客户，客户名称为上海××空调公司，地址在上
海市虹口区××路××号，准备中午与客户代表人王×吃饭。中午12点，与王×交谈当中，了
解到该客户将在二月底购买一批CG系列的压缩机，估计金额在500000元左右，销售人员
了解到该客户与竞争对手——另一商家美芝空调压缩机股份有限公司也在商讨购买事宜，
经过商谈后，销售人员觉得这张单子从客户意向收集开始，成功拿下的可能性有20%，同
时自己一个人不够，需要加入一名销售人员，并将该消息通过MES发送给营业所总代表。
下午3点，老客户青岛海尔公司打来电话需要紧急购进一批SH系列的变频压缩机，该销售
人员由于忙于新客户的单子，就用任务指派的形式，将青岛海尔的单子转交给另一销售
人员。
