导入数据

In [37]:
import os
import pandas as pd
# Directory that holds the raw corpus; change this one constant to relocate the data.
DATASET_DIR = './'

# Load the first worksheet of the essay-scoring corpus. The path is built from
# DATASET_DIR so the constant actually controls where the file is read from.
df = pd.read_excel(os.path.join(DATASET_DIR, '作文打分语料（英语）.xlsx'), sheet_name=0)

# Regression target: the score column of the corpus.
y = df['分数']
# X is the same DataFrame object as df (not a copy), so it still contains the
# '分数' column; the modelling cells take the essay text from X['作文'].
X = df

print(type(X))
print(y)

<class 'pandas.core.frame.DataFrame'>
0        8
1       10
2        8
3       11
4        9
        ..
1195     9
1196     8
1197     8
1198     8
1199    11
Name: 分数, Length: 1200, dtype: int64


In [38]:
# Preview the first five rows of the corpus (id, essay text, score).
X.head(n=5)

Unnamed: 0,编号,作文,分数
0,1,"Dear Local Newspaper, I believe the computer d...",8
1,2,"Dear @CAPS1 @CAPS2, I have heard the concern o...",10
2,3,@CAPS1 you know we all like computers and we a...,8
3,4,I Believe that computers have a positive effec...,11
4,5,"Dear newstimes, I really think that computers ...",9


数据预处理

In [39]:
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec

def essay_to_wordlist(essay_v, remove_stopwords):
    """Turn a piece of text into a list of lower-case alphabetic tokens.

    Every character that is not an ASCII letter (digits, punctuation, the
    '@' of anonymisation tags, ...) is replaced by a space before the text is
    lower-cased and split on whitespace.

    Args:
        essay_v: The text to tokenize.
        remove_stopwords: When truthy, tokens found in NLTK's English
            stop-word list are dropped.

    Returns:
        The list of tokens, in their original order.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", essay_v)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    stop_set = set(stopwords.words("english"))
    return [token for token in tokens if token not in stop_set]

def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences and tokenize each sentence into words.

    The stripped essay is segmented with NLTK's English Punkt tokenizer and
    every non-empty sentence is passed to essay_to_wordlist().

    Args:
        essay_v: The essay text.
        remove_stopwords: Forwarded unchanged to essay_to_wordlist().

    Returns:
        A list with one word list per sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        essay_to_wordlist(chunk, remove_stopwords)
        for chunk in punkt.tokenize(essay_v.strip())
        if len(chunk) > 0
    ]

def makeFeatureVec(words, model, num_features, maxlen):
    """Build the (maxlen, num_features) embedding matrix of one essay.

    Row i holds the vector of the i-th word. Words that are missing from the
    embedding vocabulary keep an all-zero row (they still occupy a position),
    and essays shorter than ``maxlen`` are zero-padded at the end. Words beyond
    the first ``maxlen`` are ignored.

    Args:
        words: Iterable of tokens of a single essay.
        model: Object exposing word vectors as ``model.wv``; ``model.wv`` must
            support ``word in model.wv`` and ``model.wv[word]`` (a trained
            gensim Word2Vec model does).
        num_features: Dimensionality of one word vector.
        maxlen: Number of rows (time steps) of the returned matrix.

    Returns:
        A float32 numpy array of shape (maxlen, num_features).
    """
    featureVec = np.zeros((maxlen, num_features), dtype="float32")

    # Look words up through the keyed vectors directly: this avoids rebuilding
    # a vocabulary set on every call and replaces the deprecated ``model[word]``.
    vectors = model.wv

    # zip() with range(maxlen) stops at whichever is shorter, so at most
    # ``maxlen`` words are visited and maxlen == 0 is handled without indexing.
    for position, word in zip(range(maxlen), words):
        if word in vectors:
            featureVec[position] = vectors[word]

    return featureVec

def getAvgFeatureVecs(essays, model, num_features, maxlen=200):
    """Stack the per-essay embedding matrices into one 3-D array.

    Despite the historical name nothing is averaged: each essay is converted
    by makeFeatureVec() into a (maxlen, num_features) matrix and the matrices
    are stacked along a new first axis.

    Args:
        essays: Sized sequence of tokenized essays (each a list of words).
        model: Word-embedding model forwarded to makeFeatureVec().
        num_features: Dimensionality of one word vector.
        maxlen: Number of time steps kept per essay. Defaults to 200, the
            value that used to be hard-coded here and that the models in this
            notebook declare in their ``input_shape``.

    Returns:
        A float32 numpy array of shape (len(essays), maxlen, num_features).
    """
    essayFeatureVecs = np.zeros((len(essays), maxlen, num_features), dtype="float32")
    for row, essay in enumerate(essays):
        essayFeatureVecs[row] = makeFeatureVec(essay, model, num_features, maxlen)
    return essayFeatureVecs

In [40]:
import nltk

# Fetch the NLTK resources the preprocessing helpers need: the stop-word list
# used by essay_to_wordlist() and the Punkt sentence tokenizer that
# essay_to_sentences() loads from 'tokenizers/punkt/english.pickle'.
# Without the second download a fresh environment fails with a LookupError.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\FengYJ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

对原始数据进行一些统计

In [41]:
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences

essays = X['作文']

# Raw whitespace-separated token count of every essay (no cleaning applied).
sentences = [len(text.split()) for text in essays]
print("最长的句子词数是", max(sentences))

# Flatten the whole corpus into one list of tokenized sentences, stop words kept.
sentences = []
for essay in essays:
    sentences.extend(essay_to_sentences(essay, remove_stopwords=False))
print("第一条句子：", sentences[0])

# Token count of every essay after cleaning and stop-word removal.
clean_train_essays = [
    len(essay_to_wordlist(text, remove_stopwords=True)) for text in essays
]
print("最长的文章词数是", max(clean_train_essays))
print("平均的文章词数是", sum(clean_train_essays) / len(clean_train_essays))


最长的句子词数是 785
第一条句子： ['dear', 'local', 'newspaper', 'i', 'believe', 'the', 'computer', 'does', 'have', 'positive', 'perks', 'but', 'it', 'also', 'has', 'negative', 'perks']
最长的文章词数是 427
平均的文章词数是 185.85583333333332


模型一：两层LSTM

In [12]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten, Conv1D, GlobalMaxPooling1D, Masking
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

def get_model():
    """Build, compile and summarise the stacked two-layer LSTM regressor.

    The network takes inputs of shape (200, 300), runs them through a
    250-unit LSTM that returns the full sequence and a 128-unit LSTM, applies
    dropout and ends in a single ReLU unit. It is compiled with mean squared
    error, the RMSprop optimizer and MAE as an extra metric.

    Returns:
        The compiled Keras Sequential model.
    """
    # Optional first layer for variable-length sequences (currently disabled):
    # Masking(mask_value=0, input_shape=(200, 300))
    stack = [
        LSTM(250, dropout=0.4, recurrent_dropout=0.4, input_shape=[200, 300], return_sequences=True),
        LSTM(128, recurrent_dropout=0.4),
        Dropout(0.4),
        Dense(1, activation='relu'),
    ]
    model = Sequential(stack)

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

模型二：加入CNN卷积层

In [42]:
"""
Implements LSTM with Mean over Time layer.
"""
from keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, Lambda
from keras.models import Sequential
import keras.regularizers
import keras.backend as K
def get_model2():
    """Build, compile and summarise the Conv1D + LSTM model with mean over time.

    Inputs of shape (200, 300) pass through a same-padded 1-D convolution
    (50 filters, kernel size 5) and a 300-unit LSTM that returns the whole
    sequence; the sequence is averaged over axis 1, followed by dropout and a
    single ReLU unit. The model is compiled with mean squared error and Adam.

    Returns:
        The compiled Keras Sequential model.
    """
    stack = [
        Conv1D(filters=50, kernel_size=5, padding='same', input_shape=[200, 300]),
        LSTM(300, dropout=0.4, recurrent_dropout=0.4, return_sequences=True),
        # Average the LSTM outputs across axis 1 of the (batch, steps, units) tensor.
        Lambda(lambda x: K.mean(x, axis=1)),
        Dropout(0.4),
        Dense(1, activation='relu', activity_regularizer=keras.regularizers.l2(0.0)),
    ]
    model = Sequential(stack)
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.summary()
    return model

模型三：一层LSTM

In [20]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential
import keras.backend as K

def get_model3():
    """Build, compile and summarise the single-layer LSTM regressor.

    Inputs of shape (200, 300) go through one 64-unit LSTM, dropout, a mean
    reduction and a single output unit. The model is compiled with mean
    squared error, the RMSprop optimizer and MAE as an extra metric.

    The output unit uses ReLU, like the other models in this notebook. The
    previous sigmoid activation limited predictions to the interval (0, 1),
    while the notebook fits on the raw integer scores in ``y`` (values such as
    8-11), so the targets were unreachable.

    Returns:
        The compiled Keras Sequential model.
    """
    model = Sequential()

    model.add(LSTM(64, dropout=0.4, recurrent_dropout=0.4, input_shape=[200, 300]))
    model.add(Dropout(0.5))
    # NOTE(review): the LSTM above does not set return_sequences, so this mean
    # over axis 1 appears to average the 64 hidden units into one value rather
    # than averaging over time steps - confirm this is intended.
    model.add(Lambda(lambda x: K.mean(x, axis=1, keepdims=True)))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

模型四：双向LSTM

In [32]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten, Bidirectional, GlobalMaxPooling1D
from keras.models import Sequential
import keras.backend as K

def get_model4():
    """Build, compile and summarise the bidirectional-LSTM regressor.

    Inputs of shape (200, 300) are read by a bidirectional 64-unit LSTM that
    returns the full sequence; global max pooling collapses the sequence, and
    a 16-unit ReLU layer with dropout feeds a single ReLU output unit. The
    model is compiled with mean squared error, RMSprop and MAE as a metric.

    Returns:
        The compiled Keras Sequential model.
    """
    recurrent = LSTM(units=64, dropout=0.4, return_sequences=True)
    stack = [
        Bidirectional(recurrent, input_shape=[200, 300]),
        GlobalMaxPooling1D(),
        Dense(16, activation='relu'),
        Dropout(0.4),
        Dense(1, activation='relu'),
    ]
    model = Sequential(stack)

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

训练模型

In [33]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import mean_squared_error
from math import sqrt

# --- Cross-validated training and evaluation ---------------------------------
N_SPLITS = 5
# Number of folds that are actually run before the loop stops. It is 1 here,
# matching the single fold this cell has always executed; set it to N_SPLITS
# for a full cross-validation (only then is the final model saved).
MAX_FOLDS = 1
# Fixed seed so the shuffled split is the same on every run.
SEED = 42

cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
results = []       # quadratic-weighted kappa of every evaluated fold
y_pred_list = []

count = 1
for traincv, testcv in cv.split(X):
    print("\n--------Fold {}--------\n".format(count))
    X_train, X_test = X.iloc[traincv], X.iloc[testcv]
    y_train, y_test = y.iloc[traincv], y.iloc[testcv]
    train_essays = X_train['作文']
    test_essays = X_test['作文']

    # Collect every sentence of the training essays; the embedding is fitted
    # on the training split only.
    sentences = []
    for essay in train_essays:
        sentences += essay_to_sentences(essay, remove_stopwords=True)

    print(sentences[0])

    # Word2Vec hyper-parameters.
    num_features = 300
    min_word_count = 40
    num_workers = 4
    context = 10
    downsampling = 1e-3

    print("Training Word2Vec Model...")
    model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count=min_word_count, window=context, sample=downsampling)

    model.init_sims(replace=True)
    model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

    # Turn the tokenized essays into (n_essays, time steps, num_features) arrays.
    clean_train_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in train_essays]
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)
    print(trainDataVecs.shape)

    clean_test_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in test_essays]
    testDataVecs = getAvgFeatureVecs(clean_test_essays, model, num_features)

    # getAvgFeatureVecs() already returns 3-D numpy arrays, so no conversion
    # or reshape is needed before feeding them to the network.
    print(trainDataVecs.shape)
    print(y_train.shape)

    lstm_model = get_model4()
    lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=10)
    #lstm_model.load_weights('./model_weights/final_lstm.h5')

    # The network ends in a single unit, so predict() yields one column per
    # essay; flatten it and round to the nearest integer score so the metrics
    # and the DataFrame below receive a 1-D array of the same kind as y_test.
    y_pred = np.around(lstm_model.predict(testDataVecs)).ravel().astype(int)

    # Save the model of the last fold (reached only when MAX_FOLDS == N_SPLITS).
    if count == N_SPLITS:
        os.makedirs('./model_weights', exist_ok=True)
        lstm_model.save('./model_weights/final_lstm.h5')

    # Evaluate the fold: quadratic-weighted kappa, rank/linear correlation, RMSE.
    kappa = cohen_kappa_score(y_test.values, y_pred, weights='quadratic')
    df2 = pd.DataFrame({'true': y_test.values, 'pred': y_pred})
    # Select the true-vs-pred cell by label instead of by position.
    spear = df2.corr('spearman').loc['true', 'pred']
    pearson = df2.corr('pearson').loc['true', 'pred']
    RMSE = sqrt(mean_squared_error(y_test.values, y_pred))
    print()
    print("Spearman’s ρ Score:", spear)
    print("Pearson r Score:", pearson)
    print("RMSE Score:", RMSE)
    print("Cohen’s κ Score:", kappa)

    results.append(kappa)

    count += 1
    if count > MAX_FOLDS:
        break


--------Fold 1--------

['dear', 'caps', 'caps', 'heard', 'concern', 'many', 'scientists', 'computers']
Training Word2Vec Model...




(960, 200, 300)
(960, 200, 300)
(960,)
Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_3 (Bidirection (None, 200, 128)          186880    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 16)                2064      
_________________________________________________________________
dropout_10 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 17        
Total params: 188,961
Trainable params: 188,961
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/

# embedding处理判断

In [43]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import mean_squared_error
from math import sqrt

# --- Embedding sanity check ---------------------------------------------------
# Train Word2Vec on the first fold, vectorize one essay and compare a row of
# its feature matrix with a direct lookup in the embedding model.
# This cell deliberately does not touch `results` / `y_pred_list`, so running
# it no longer wipes the scores collected by the training cell.
cv = KFold(n_splits = 5, shuffle = True)

# Only the first split is inspected.
count = 1
traincv, testcv = next(iter(cv.split(X)))
print("\n--------Fold {}--------\n".format(count))
X_train = X.iloc[traincv]
train_essays = X_train['作文']

# Collect every sentence of the training essays.
sentences = []
for essay in train_essays:
    sentences += essay_to_sentences(essay, remove_stopwords = True)

print("第一条句子：",sentences[0])

# Word2Vec hyper-parameters.
num_features = 300
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3

print("Training Word2Vec Model...")
model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)

model.init_sims(replace=True)
model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

# Vectorize only the first training essay.
clean_train_essays = [essay_to_wordlist(train_essays.iloc[0], remove_stopwords=True)]
print(clean_train_essays[0])
print(len(clean_train_essays[0]))

trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)
print("一篇文章的shape：",trainDataVecs.shape)

# Inspect the last token that actually fits into the feature matrix. Essays
# longer than the number of time steps are truncated, so indexing with
# len(tokens) - 1 would run past the end of axis 1 for long essays.
last_pos = min(len(clean_train_essays[0]), trainDataVecs.shape[1]) - 1
last_word = clean_train_essays[0][last_pos]
print(trainDataVecs[0, last_pos])

# The row printed above should equal this vector, or be all zeros when the
# word is not in the embedding vocabulary.
if last_word in model.wv:
    print(model.wv[last_word])
else:
    print("不在model中")


--------Fold 1--------

第一条句子： ['dear', 'local', 'newspaper', 'believe', 'computer', 'positive', 'perks', 'also', 'negative', 'perks']
Training Word2Vec Model...
['dear', 'local', 'newspaper', 'believe', 'computer', 'positive', 'perks', 'also', 'negative', 'perks', 'reason', 'one', 'loose', 'interaction', 'people', 'reason', 'two', 'forget', 'excersize', 'reason', 'three', 'health', 'jeopardy', 'firstly', 'loose', 'interaction', 'family', 'friends', 'would', 'spend', 'much', 'time', 'computers', 'ruins', 'marriage', 'married', 'kids', 'risk', 'relationship', 'saying', 'stop', 'using', 'computers', 'use', 'saying', 'make', 'life', 'computer', 'good', 'secondly', 'need', 'excersize', 'stastics', 'show', 'percent', 'people', 'live', 'computers', 'gain', 'weight', 'faster', 'would', 'know', 'num', 'yr', 'old', 'gained', 'little', 'weight', 'computers', 'people', 'computer', 'tend', 'eat', 'caps', 'well', 'known', 'facts', 'proven', 'since', 'walk', 'computer', 'hand', 'stop', 'moving', 'm

