# **英文作文打分**

## 第一步 读取数据

In [49]:
import os
import pandas as pd
# Directory that holds the corpus workbook; change this one constant if the
# data lives somewhere else.
DATASET_DIR = './'
CORPUS_PATH = os.path.join(DATASET_DIR, '作文打分语料（英语）.xlsx')

# Only the first sheet of the workbook is read.
df = pd.read_excel(CORPUS_PATH, sheet_name=0)
print("数据量：", len(df))

# X is the model input (the whole frame; the essay text is in column '作文'),
# y is the regression target (column '分数').
X = df
y = df['分数']

数据量： 1200


In [12]:
# Preview the first rows of the loaded sheet to check the column layout.
df.head()

Unnamed: 0,编号,作文,分数
0,1,"Dear Local Newspaper, I believe the computer d...",8
1,2,"Dear @CAPS1 @CAPS2, I have heard the concern o...",10
2,3,@CAPS1 you know we all like computers and we a...,8
3,4,I Believe that computers have a positive effec...,11
4,5,"Dear newstimes, I really think that computers ...",9


原始 Excel 文件中包含训练语料和测试语料两部分。<br />
由于测试语料中没有评分结果，所以我们只使用训练语料，并采用 5 折交叉验证的方法来进行评估。<br />
通过上面的代码可以看到，一共有 1200 条数据可用。

## 第二步 定义数据预处理函数

In [13]:
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec

def essay_to_wordlist(essay_v, remove_stopwords):
    """Turn a piece of text into a list of lower-cased alphabetic tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and then split on whitespace.

    Args:
        essay_v: The text to process (a sentence or a whole essay), as a str.
        remove_stopwords: When truthy, tokens that appear in NLTK's English
            stop-word list are dropped.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    words = re.sub("[^a-zA-Z]", " ", essay_v).lower().split()
    if remove_stopwords:
        # The stop-word list never changes, so it is read from the corpus only
        # once and kept on the function object instead of being rebuilt on
        # every call (this function runs once per sentence and per essay).
        stops = getattr(essay_to_wordlist, "_stops", None)
        if stops is None:
            stops = frozenset(stopwords.words("english"))
            essay_to_wordlist._stops = stops
        words = [w for w in words if w not in stops]
    return words

def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences and tokenize each one.

    The stripped essay is cut into sentences with NLTK's English punkt
    tokenizer; every non-empty sentence is then passed to
    ``essay_to_wordlist``.

    Args:
        essay_v: The essay text as a str.
        remove_stopwords: Forwarded unchanged to ``essay_to_wordlist``.

    Returns:
        list[list[str]]: One token list per sentence, in essay order.
    """
    splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        essay_to_wordlist(chunk, remove_stopwords)
        for chunk in splitter.tokenize(essay_v.strip())
        if len(chunk) > 0
    ]

def makeFeatureVec(words, model, num_features, maxlen):
    """Build the (maxlen, num_features) word-vector matrix of one essay.

    Row ``i`` holds the embedding of ``words[i]``. A word that has no
    embedding keeps its all-zero row but still occupies a position. Essays
    shorter than ``maxlen`` are zero-padded at the end; longer ones are
    truncated to their first ``maxlen`` words.

    Args:
        words: Iterable of tokens of a single essay.
        model: A trained Word2Vec model (its ``wv`` attribute is used), or the
            keyed-vectors object itself.
        num_features: Dimensionality of the word vectors.
        maxlen: Number of rows (word positions) in the returned matrix.

    Returns:
        numpy.ndarray: float32 array of shape ``(maxlen, num_features)``.
    """
    # Membership test and lookup both go through the keyed vectors. This works
    # on gensim 3.x and 4.x alike (``model[word]`` and ``wv.index2word`` were
    # removed in 4.0) and avoids rebuilding a vocabulary set for every essay.
    vectors = getattr(model, "wv", model)

    featureVec = np.zeros((maxlen, num_features), dtype="float32")
    for position, word in enumerate(words):
        if position >= maxlen:
            break
        if word in vectors:
            featureVec[position] = vectors[word]
    return featureVec

def getAllFeatureVecs(essays, model, num_features, maxlen=200):
    """Stack the word-vector matrices of several essays into one 3-D array.

    Args:
        essays: Sized iterable of token lists, one per essay.
        model: Word2Vec model, forwarded to ``makeFeatureVec``.
        num_features: Dimensionality of the word vectors.
        maxlen: Number of word positions kept per essay (longer essays are
            truncated, shorter ones zero-padded). Defaults to 200, the value
            that used to be hard-coded here.

    Returns:
        numpy.ndarray: float32 array of shape
        ``(len(essays), maxlen, num_features)``.
    """
    essayFeatureVecs = np.zeros((len(essays), maxlen, num_features), dtype="float32")
    for row, essay in enumerate(essays):
        essayFeatureVecs[row] = makeFeatureVec(essay, model, num_features, maxlen)
    return essayFeatureVecs

In [14]:
import nltk

# essay_to_wordlist needs the stop-word corpus; essay_to_sentences loads
# 'tokenizers/punkt/english.pickle', which comes from the separate 'punkt'
# package. Fetch both so the notebook runs in a fresh environment.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\FengYJ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 第三步 对原始数据进行简单分析

In [18]:
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences

essays = X['作文']

# Whitespace-delimited token count of every raw essay. The printed label says
# "sentence", but the unit measured here is a whole essay.
raw_lengths = [len(raw_text.split()) for raw_text in essays]
print("最长的句子词数是", max(raw_lengths))

# Flatten all essays into one list of tokenized sentences (stop words kept).
sentences = []
for raw_text in essays:
    sentences.extend(essay_to_sentences(raw_text, remove_stopwords=False))
print("第一条句子经处理后的结果：", sentences[0])

# Per-essay token count after stop-word removal.
clean_train_essays = [
    len(essay_to_wordlist(raw_text, remove_stopwords=True)) for raw_text in essays
]

# Show the cleaned token list of the first essay only.
for raw_text in essays.iloc[:1]:
    print("第一批篇文章经处理后的结果：", essay_to_wordlist(raw_text, remove_stopwords=True))

print("最长的文章词数是", max(clean_train_essays))
print("平均的文章词数是", sum(clean_train_essays) / len(clean_train_essays))


最长的句子词数是 785
第一条句子经处理后的结果： ['dear', 'local', 'newspaper', 'i', 'believe', 'the', 'computer', 'does', 'have', 'positive', 'perks', 'but', 'it', 'also', 'has', 'negative', 'perks']
第一批篇文章经处理后的结果： ['dear', 'local', 'newspaper', 'believe', 'computer', 'positive', 'perks', 'also', 'negative', 'perks', 'reason', 'one', 'loose', 'interaction', 'people', 'reason', 'two', 'forget', 'excersize', 'reason', 'three', 'health', 'jeopardy', 'firstly', 'loose', 'interaction', 'family', 'friends', 'would', 'spend', 'much', 'time', 'computers', 'ruins', 'marriage', 'married', 'kids', 'risk', 'relationship', 'saying', 'stop', 'using', 'computers', 'use', 'saying', 'make', 'life', 'computer', 'good', 'secondly', 'need', 'excersize', 'stastics', 'show', 'percent', 'people', 'live', 'computers', 'gain', 'weight', 'faster', 'would', 'know', 'num', 'yr', 'old', 'gained', 'little', 'weight', 'computers', 'people', 'computer', 'tend', 'eat', 'caps', 'well', 'known', 'facts', 'proven', 'since', 'walk', 'computer

从结果可以看出，1200 篇文章去除停用词后，平均每篇约 185 个单词，据此可以选择实验参数 maxlen（每篇文章保留的最大词数，后文取 200）。

## 第四步 定义模型

### 我们共使用了四种模型，从模型的复杂程度进行了排序

#### 模型一：一层LSTM<br />
LSTM 的一大优势是能够处理变长序列，我们可以通过在 LSTM 层前加一个 Masking 层来实现此功能（下面的代码中该层默认被注释掉，输入统一为固定长度 200）。

In [37]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential
import keras.backend as K

def get_model1():
    """Build and compile model 1: a single LSTM layer with a dense head.

    Layers, in order: LSTM(256) on input shape [200, 300], Dropout(0.4),
    Dense(64, relu), Dropout(0.4), Dense(1, relu). Compiled with mean squared
    error loss, the 'rmsprop' optimizer and the 'mae' metric. The model
    summary is printed before returning.

    Returns:
        The compiled Sequential model.
    """
    # Variable-length input could be supported by putting
    # Masking(mask_value=0, input_shape=(200, 300)) in front of the LSTM;
    # Masking is not among this cell's imports.
    layer_stack = [
        LSTM(256, dropout=0.4, recurrent_dropout=0.4, input_shape=[200, 300]),
        Dropout(0.4),
        Dense(64, activation='relu'),
        Dropout(0.4),
        Dense(1, activation='relu'),
    ]
    model = Sequential(layer_stack)
    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()
    return model

#### 模型二：两层LSTM

In [42]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten, Conv1D, GlobalMaxPooling1D, Masking
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

def get_model2():
    """Build and compile model 2: two stacked LSTM layers.

    Layers, in order: LSTM(250, return_sequences=True) on input shape
    [200, 300], LSTM(128), Dropout(0.4), Dense(1, relu). Compiled with mean
    squared error loss, the 'rmsprop' optimizer and the 'mae' metric. The
    model summary is printed before returning.

    Returns:
        The compiled Sequential model.
    """
    # For variable-length input, a Masking(mask_value=0, input_shape=(200, 300))
    # layer would go first and the first LSTM would then drop its input_shape.
    lower_lstm = LSTM(
        250,
        dropout=0.4,
        recurrent_dropout=0.4,
        input_shape=[200, 300],
        return_sequences=True,
    )
    upper_lstm = LSTM(128, recurrent_dropout=0.4)

    model = Sequential()
    for layer in (lower_lstm, upper_lstm, Dropout(0.4), Dense(1, activation='relu')):
        model.add(layer)

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()
    return model

#### 模型三：加入CNN卷积层

In [21]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, Lambda
from keras.models import Sequential
import keras.regularizers
import keras.backend as K
def get_model3():
    """Build and compile model 3: a 1-D convolution feeding an LSTM.

    Layers, in order: Conv1D(50 filters, kernel 5, 'same' padding) on input
    shape [200, 300], LSTM(300, return_sequences=True), a Lambda that averages
    over axis 1, Dropout(0.4), Dense(1, relu). Compiled with mean squared
    error loss and the 'adam' optimizer; no extra metrics are requested. The
    model summary is printed before returning.

    Returns:
        The compiled Sequential model.
    """
    conv = Conv1D(filters=50, kernel_size=5, padding='same', input_shape=[200, 300])
    recurrent = LSTM(300, dropout=0.4, recurrent_dropout=0.4, return_sequences=True)
    # Collapse axis 1 of the LSTM output by taking its mean.
    time_average = Lambda(lambda x: K.mean(x, axis=1))
    # The L2 activity regularizer is created with a coefficient of 0.0.
    head = Dense(1, activation='relu', activity_regularizer=keras.regularizers.l2(0.0))

    model = Sequential([conv, recurrent, time_average, Dropout(0.4), head])
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.summary()
    return model

#### 模型四：双向LSTM + 池化层

In [32]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten, Bidirectional, GlobalMaxPooling1D
from keras.models import Sequential
import keras.backend as K

def get_model4():
    """Build and compile model 4: a bidirectional LSTM with max pooling.

    Layers, in order: Bidirectional(LSTM(64, return_sequences=True)) on input
    shape [200, 300], GlobalMaxPooling1D, Dense(16, relu), Dropout(0.4),
    Dense(1, relu). Compiled with mean squared error loss, the 'rmsprop'
    optimizer and the 'mae' metric. The model summary is printed before
    returning.

    Returns:
        The compiled Sequential model.
    """
    bi_lstm = Bidirectional(
        LSTM(units=64, dropout=0.4, return_sequences=True),
        input_shape=[200, 300],
    )
    model = Sequential([
        bi_lstm,
        GlobalMaxPooling1D(),
        Dense(16, activation='relu'),
        Dropout(0.4),
        Dense(1, activation='relu'),
    ])

    compile_options = {
        'loss': 'mean_squared_error',
        'optimizer': 'rmsprop',
        'metrics': ['mae'],
    }
    model.compile(**compile_options)
    model.summary()
    return model

## 第五步 模型训练和结果

我们可以通过选择上述不同的模型来得到不同的结果。<br />
经调试，在此数据集上效果最好的是 model3（CNN + LSTM）。

In [47]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import mean_squared_error
from math import sqrt

# 5-fold cross-validation. A fixed random_state makes the fold assignment
# repeatable between runs; Word2Vec and the Keras training are not seeded
# here, so the scores can still vary slightly.
CV_SEED = 42
cv = KFold(n_splits=5, shuffle=True, random_state=CV_SEED)

# Per-fold scores; these names are read by the summary cell that follows.
results_spear = []
results_perason = []
results_RMSE = []
results_kappa = []

# Word2Vec hyper-parameters, identical for every fold.
num_features = 300      # dimensionality of the word vectors
min_word_count = 40     # words rarer than this are ignored
num_workers = 4         # training threads
context = 10            # context window size
downsampling = 1e-3     # down-sampling threshold for frequent words

for count, (traincv, testcv) in enumerate(cv.split(X), start=1):
    print("\n--------Fold {}--------\n".format(count))
    X_train, X_test = X.iloc[traincv], X.iloc[testcv]
    y_train, y_test = y.iloc[traincv], y.iloc[testcv]
    train_essays = X_train['作文']
    test_essays = X_test['作文']

    # The embedding is fitted on the training fold only, so no text from the
    # held-out fold is used for it.
    sentences = []
    for essay in train_essays:
        sentences += essay_to_sentences(essay, remove_stopwords=True)

    print("Training Word2Vec Model...")
    w2v_model = Word2Vec(sentences, workers=num_workers, size=num_features,
                         min_count=min_word_count, window=context, sample=downsampling)
    w2v_model.init_sims(replace=True)
    # Note: every fold writes to the same file, so only the last fold's
    # vectors remain on disk.
    w2v_model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

    # Tokenize the essays and turn them into fixed-size word-vector tensors.
    # getAllFeatureVecs already returns a float32 array of shape
    # (n_essays, maxlen, num_features), so no np.array()/reshape is needed.
    clean_train_essays = [essay_to_wordlist(essay_v, remove_stopwords=True)
                          for essay_v in train_essays]
    clean_test_essays = [essay_to_wordlist(essay_v, remove_stopwords=True)
                         for essay_v in test_essays]
    trainDataVecs = getAllFeatureVecs(clean_train_essays, w2v_model, num_features)
    testDataVecs = getAllFeatureVecs(clean_test_essays, w2v_model, num_features)

    # ------------------------------------------------------------------
    # Pick the model to evaluate here (get_model1 ... get_model4).
    lstm_model = get_model3()
    lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=10)

    # The network's final Dense layer has a single unit, so predict() yields
    # one column; round to the nearest integer score and flatten to 1-D
    # before handing the predictions to the metrics.
    y_pred = np.around(lstm_model.predict(testDataVecs)).ravel()
    y_true = y_test.values

    # Evaluation on the held-out fold.
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    fold_scores = pd.DataFrame({'true': y_true, 'pred': y_pred})
    spear = fold_scores.corr('spearman').loc['true', 'pred']
    pearson = fold_scores.corr('pearson').loc['true', 'pred']
    RMSE = sqrt(mean_squared_error(y_true, y_pred))
    print()
    print("Spearman’s ρ Score:", spear)
    print("Pearson r Score:", pearson)
    print("RMSE Score:", RMSE)
    print("Cohen’s κ Score:", kappa)

    results_spear.append(spear)
    results_perason.append(pearson)
    results_RMSE.append(RMSE)
    results_kappa.append(kappa)


--------Fold 1--------

Training Word2Vec Model...




Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_2 (Conv1D)            (None, 200, 50)           75050     
_________________________________________________________________
lstm_35 (LSTM)               (None, 200, 300)          421200    
_________________________________________________________________
lambda_6 (Lambda)            (None, 300)               0         
_________________________________________________________________
dropout_29 (Dropout)         (None, 300)               0         
_________________________________________________________________
dense_30 (Dense)             (None, 1)                 301       
Total params: 496,551
Trainable params: 496,551
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

S



Model: "sequential_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_3 (Conv1D)            (None, 200, 50)           75050     
_________________________________________________________________
lstm_36 (LSTM)               (None, 200, 300)          421200    
_________________________________________________________________
lambda_7 (Lambda)            (None, 300)               0         
_________________________________________________________________
dropout_30 (Dropout)         (None, 300)               0         
_________________________________________________________________
dense_31 (Dense)             (None, 1)                 301       
Total params: 496,551
Trainable params: 496,551
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

S



Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_4 (Conv1D)            (None, 200, 50)           75050     
_________________________________________________________________
lstm_37 (LSTM)               (None, 200, 300)          421200    
_________________________________________________________________
lambda_8 (Lambda)            (None, 300)               0         
_________________________________________________________________
dropout_31 (Dropout)         (None, 300)               0         
_________________________________________________________________
dense_32 (Dense)             (None, 1)                 301       
Total params: 496,551
Trainable params: 496,551
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

S



Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_5 (Conv1D)            (None, 200, 50)           75050     
_________________________________________________________________
lstm_38 (LSTM)               (None, 200, 300)          421200    
_________________________________________________________________
lambda_9 (Lambda)            (None, 300)               0         
_________________________________________________________________
dropout_32 (Dropout)         (None, 300)               0         
_________________________________________________________________
dense_33 (Dense)             (None, 1)                 301       
Total params: 496,551
Trainable params: 496,551
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

S



Model: "sequential_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_6 (Conv1D)            (None, 200, 50)           75050     
_________________________________________________________________
lstm_39 (LSTM)               (None, 200, 300)          421200    
_________________________________________________________________
lambda_10 (Lambda)           (None, 300)               0         
_________________________________________________________________
dropout_33 (Dropout)         (None, 300)               0         
_________________________________________________________________
dense_34 (Dense)             (None, 1)                 301       
Total params: 496,551
Trainable params: 496,551
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

S

In [48]:
# Report the mean of every per-fold metric, rounded to four decimals.
summary_rows = (
    ("Average Spearman’s ρ score after a 5-fold cross validation: ", results_spear),
    ("Average Pearson r score after a 5-fold cross validation: ", results_perason),
    ("Average RMSE score after a 5-fold cross validation: ", results_RMSE),
    ("Average Kappa score after a 5-fold cross validation: ", results_kappa),
)
for label, fold_values in summary_rows:
    print(label, np.around(np.array(fold_values).mean(), decimals=4))

Average Spearman’s ρ score after a 5-fold cross validation:  0.6515
Average Pearson r score after a 5-fold cross validation:  0.7117
Average RMSE score after a 5-fold cross validation:  1.0885
Average Kappa score after a 5-fold cross validation:  0.6766
