In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import joblib
import datasets

# Train

In [404]:
import re

def remove_image_string(input_string):
    """Delete Markdown image tags that carry explicit width/height attributes.

    A tag is matched only when it has empty alt text and is immediately
    followed by a ``{width="..." height="..."}`` attribute block. Every such
    occurrence is removed; all other text is returned untouched.
    """
    image_tag = re.compile(r"!\[\]\(.*?\)\{width=\".*?\" height=\".*?\"\}")
    return image_tag.sub("", input_string)

def remove_noise_character(input_string):
    """Return ``input_string`` with every ``>`` and ``*`` character removed."""
    # A translation table whose third argument lists the characters to delete.
    drop_markup = str.maketrans("", "", ">*")
    return input_string.translate(drop_markup)

def pre_process(text_list):
    """Clean a collection of documents line by line.

    For each document, every line has its image tags removed via
    ``remove_image_string``. Lines that are then blank, or consist only of a
    single ``>``, are dropped. The surviving lines are passed through
    ``remove_noise_character`` and re-joined with newlines.

    Args:
        text_list: Iterable of document strings.

    Returns:
        A list with one cleaned string per input document, in the same order.
    """

    def _clean_lines(document):
        # Yield the cleaned, non-empty lines of a single document.
        for raw_line in document.splitlines():
            without_image = remove_image_string(raw_line)
            if without_image.strip() in ("", ">"):
                continue
            yield remove_noise_character(without_image)

    return ["\n".join(_clean_lines(document)) for document in text_list]

In [425]:
# Hand-crafted keyword features; these did not work very well.
vocabulary = [
    "考试",
    "试题",
    "答题卡",
    "选择题",
    "全国统一考试",
    "解法",
    "参考答案",
    "填空题",
    "判断题",
    "考试时间",
    "解答题",
    "年试题",
    "真题试卷",
]

def train(dataset, ngram_range=(1, 5), vocabulary=None, max_iter=1000, verbose=1):
    """Fit a bag-of-n-grams logistic-regression text classifier.

    The texts are cleaned with ``pre_process`` before being vectorized, so
    inputs passed to the returned model at prediction time should be cleaned
    the same way.

    Args:
        dataset: Object indexable by column name, exposing a ``"text"`` column
            (iterable of strings) and a ``"label"`` column of class labels.
        ngram_range: ``(min_n, max_n)`` n-gram sizes passed to
            ``CountVectorizer``.
        vocabulary: Optional fixed vocabulary passed to ``CountVectorizer``.
            ``None`` (the default) learns the vocabulary from the training
            texts.
        max_iter: Maximum number of solver iterations for
            ``LogisticRegression``.
        verbose: Verbosity level passed to ``LogisticRegression``.

    Returns:
        The fitted pipeline (``CountVectorizer`` followed by
        ``LogisticRegression``).
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range, vocabulary=vocabulary)
    classifier = LogisticRegression(max_iter=max_iter, verbose=verbose)
    model = make_pipeline(vectorizer, classifier)

    train_text_list = pre_process(dataset["text"])

    model.fit(train_text_list, dataset["label"])
    return model

In [406]:
# Load the dataset previously saved under ./dataset; later cells index it by
# split name ("train" and "test").
datadict = datasets.load_from_disk("./dataset")

In [407]:
# Fit the classifier on the training split (about 3.3 minutes in the recorded run).
model = train(datadict["train"])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =      9172228     M =           10

At X0         0 variables are exactly at the bounds


 This problem is unconstrained.



At iterate    0    f=  1.92210D+03    |proj g|=  1.02327D+05

At iterate   50    f=  1.56017D+01    |proj g|=  5.73278D-01

At iterate  100    f=  1.52551D+01    |proj g|=  7.38594D-02

At iterate  150    f=  1.52528D+01    |proj g|=  4.83058D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
*****    174    258      1     0     0   7.331D-04   1.525D+01
  F =   15.252815190968130     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.3min finished


In [419]:
# Persist the fitted pipeline to disk so it can be reused without retraining.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(model, 'TextClassifier.pkl')

['TextClassifier.pkl']

# Test

In [409]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from tabulate import tabulate

In [426]:
# The default cut-off of 0.5 did not work well either, hence the stricter default.
def predict_with_threshold(model, X, threshold=0.9):
    """Predict binary labels using a custom probability cut-off.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X: Inputs accepted by ``model.predict_proba``.
        threshold: A sample is labelled 1 only when the probability in
            column 1 of ``predict_proba`` is strictly greater than this value.

    Returns:
        Integer array of 0/1 predictions, one per input sample.
    """
    positive_scores = model.predict_proba(X)[:, 1]
    return (positive_scores > threshold).astype(int)

In [411]:
# Select the test split for evaluation.
test_dataset = datadict["test"]

In [412]:
# Number of examples in the test split.
len(test_dataset)

392

In [413]:
# Clean the test texts with the same pre-processing used for training, then
# classify them with the default probability threshold.
predictions = predict_with_threshold(model, pre_process(test_dataset["text"]))

In [414]:
# Score the thresholded predictions against the ground-truth labels of the test split.
accuracy = accuracy_score(test_dataset["label"], predictions)
recall = recall_score(test_dataset["label"], predictions)
precision = precision_score(test_dataset["label"], predictions)
f1 = f1_score(test_dataset["label"], predictions)

In [415]:
# Collect the four metrics as [name, value] rows and print them as a two-column table.
data = [["Accuracy", accuracy], ["Recall", recall], ["Precision", precision], ["F1 Score", f1]]

print(tabulate(data, headers=["Metric", "Score"]))

Metric        Score
---------  --------
Accuracy   0.987245
Recall     0.990148
Precision  0.985294
F1 Score   0.987715


# Check

In [423]:
# Label meaning: 1 = "试卷" (exam paper), 0 = "其他" (other).
# Print every misclassified test example as "index  true_label -- predicted_label".
# The label column is fetched once up front rather than re-indexed on every iteration.
true_labels = test_dataset["label"]
for index, (true_label, prediction) in enumerate(zip(true_labels, predictions)):
    if true_label != prediction:
        print(f"{index}  {true_label} -- {prediction}")

63  1 -- 0
154  1 -- 0
266  0 -- 1
342  0 -- 1
358  0 -- 1


In [422]:
# Inspect the raw text of test example 63, one of the misclassified items
# reported above (true label 1, predicted 0).
for line in test_dataset["text"][63].splitlines():
    print(line)

**2004年普通高等学校春季招生考试**

英语 （北京卷）

National Matriculation English Test（NMET 2004）

本试卷分第一卷（选择题）和第二卷（非选择题）两部分。第一卷1至16页。第二卷17至20页。共150分。考试时间120分钟。

第一卷（三部分，共115分）

注意事项：

1\. 答第一卷前，考生务必将自己的姓名、准考证号、考试科目用铅笔涂写在答题卡上。

2\. 每小题选出答案后，用铅笔把答题卡上对应题目的答案标号涂黑。如需改动，用橡皮擦干净后，再选涂其它答案标号。不能答在试卷上。

3\. 考试结束后，考生将本试卷和答题卡一并交回。

第一部分：听力理解（共两节，满分30分）

第一节（共5小题；每小题1.5分，满分7.5分）

听下面5段对话。每段对话后有一道小题，从每题所给的A、B、C三个选项中选出最佳选项。听完每段对话后，你将有10秒钟的时间来回答有关小题和阅读下一小题。每段对话你将听一遍。

例：

What is the man going to read?

A. A newspaper.

B. A magazine.

C. A book.

答案是A。

1\. How many students are there in the college?

A. Six hundred.

B. Three hundred.

C. Seven hundred and fifty.

2\. What was the weather like on Wednesday?

A. Rainy.

B. Sunny.

C. Cloudy.

3\. What will Jane have for dinner on Thanksgiving Day?

A. Ham.

B. Potatoes.

C. Fish soup.

4\. What is the woman going to do this weekend?

A. Work in the garden.

B. Go to the beach.

C. Play football.

5\. What is the woman doing?

A. Making suggestions.

B. Making excuses.
