In [3]:
DATASET_DIR = './data'
SAVE_DIR = './'

In [4]:
import os
import pandas as pd

In [12]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Adefemi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Adefemi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [5]:
data_set = pd.read_csv(os.path.join(DATASET_DIR, 'training_set_rel3.tsv'), sep='\t', encoding='ISO-8859-1')
score_col = data_set['domain1_score']
data_set = data_set.dropna(axis=1)
data_set = data_set.drop(columns=['rater1_domain1', 'rater2_domain1'])

In [6]:
data_set.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8


In [7]:
# Max and Min Scores

minimum_scores = [-1, 2, 1, 0, 0, 0, 0, 0, 0]
maximum_scores = [-1, 12, 6, 3, 3, 4, 4, 30, 60]

In [29]:
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec

def essay_to_wordlist(essay_v, remove_stopwords):
    """Remove the tagged labels and word tokenize the sentence."""
    essay_v = re.sub("[^a-zA-Z]", " ", essay_v)
    words = essay_v.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return words  # Removed the tuple to return a list.

def essay_to_sentences(essay_v, remove_stopwords):
    """Sentence tokenize the essay and call essay_to_wordlist() for word tokenization."""
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(essay_v.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(essay_to_wordlist(raw_sentence, remove_stopwords))
    return sentences

def makeFeatureVec(words, model, num_features):
    """Make Feature Vector from the words list of an Essay."""
    featureVec = np.zeros((num_features,), dtype="float32")
    num_words = 0.
    for word in words:
        if word in model.wv.key_to_index:  # Updated to use key_to_index.
            num_words += 1
            featureVec = np.add(featureVec, model.wv[word])  # Updated to use model.wv[word].
    if num_words == 0:
        return featureVec  # Return zeros if no valid words found.
    featureVec = np.divide(featureVec, num_words)
    return featureVec

def getAvgFeatureVecs(essays, model, num_features):
    """Main function to generate the word vectors for word2vec model."""
    essayFeatureVecs = np.zeros((len(essays), num_features), dtype="float32")
    for i, essay in enumerate(essays):
        essayFeatureVecs[i] = makeFeatureVec(essay, model, num_features)
    return essayFeatureVecs


In [17]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

def get_model():
    """Define the model."""
    model = Sequential()
    model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True))
    model.add(LSTM(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

Training Phase
Now we train the model on the dataset.

We will use 5-Fold Cross Validation and measure the Quadratic Weighted Kappa for each fold. We will then calculate Average Kappa for all the folds.

In [7]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

cv = KFold(n_splits = 5, shuffle = True)
results = []
y_pred_list = []
all_y_pred = []
all_y_test = []

count = 1
for traincv, testcv in cv.split(data_set):
    print("\n--------Fold {}--------\n".format(count))
    X_test, X_train, y_test, y_train = data_set.iloc[testcv], data_set.iloc[traincv], score_col.iloc[testcv], score_col.iloc[traincv]
    
    train_essays = X_train['essay']
    test_essays = X_test['essay']
    
    sentences = []

    
    for essay in train_essays:
            # Obtaining all sentences from the training essays.
            sentences += essay_to_sentences(essay, remove_stopwords = True)
            
    # Initializing variables for word2vec model.
    num_features = 300 
    min_word_count = 40
    num_workers = 4
    context = 10
    downsampling = 1e-3

    print("Training Word2Vec Model...")
    model = Word2Vec(sentences, workers=num_workers, vector_size=num_features, min_count = min_word_count, window = context, sample = downsampling)

    model.init_sims(replace=True)
    model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

    clean_train_essays = []
    
    # Generate training and testing data word vectors.
    for essay_v in train_essays:
        clean_train_essays.append(essay_to_wordlist(essay_v, remove_stopwords=True))
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)
    
    clean_test_essays = []
    for essay_v in test_essays:
        clean_test_essays.append(essay_to_wordlist( essay_v, remove_stopwords=True ))
    testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )
    
    trainDataVecs = np.array(trainDataVecs)
    testDataVecs = np.array(testDataVecs)
    # Reshaping train and test vectors to 3 dimensions. (1 represnts one timestep)
    trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))
    
    lstm_model = get_model()
    lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=2)
    #lstm_model.load_weights('./model_weights/final_lstm.h5')
    y_pred = lstm_model.predict(testDataVecs)
    y_pred = np.around(y_pred).flatten()  # Flatten and round predictions
    
    # Append predictions and actual scores to the lists
    all_y_pred.extend(y_pred)
    all_y_test.extend(y_test.values)
    
    # Save any one of the 5 models.
    if count == 5:
         lstm_model.save('./model_weights/final_lstm.h5')
    
    # Round y_pred to the nearest integer.
    y_pred = np.around(y_pred)
    
    # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
    result = cohen_kappa_score(y_test.values,y_pred,weights='quadratic')
    print("Kappa Score: {}".format(result))
    results.append(result)

    count += 1
    
    
# After completing all folds, create and plot the confusion matrix
cm = confusion_matrix(all_y_test, all_y_pred)

plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt="d")
plt.title('Confusion Matrix')
plt.ylabel('Actual Scores')
plt.xlabel('Predicted Scores')
plt.show()
        

NameError: name 'data_set' is not defined

Rata-rata skor kappa is a statistical measure used to assess the level of agreement between two raters in evaluating a particular object or subject. Kappa scores range from -1 to 1, with a value of 1 indicating perfect agreement between the two raters, 0 indicating agreement that is the same as expected by chance, and -1 indicating perfect disagreement between the two raters 1. The average kappa score can be used to evaluate the reliability of an assessment instrument 1.

In [19]:
print("Rata-rata skor kappa: ", np.around(np.array(results).mean(),decimals=4))

Rata-rata skor kappa:  0.7347


In [28]:
import math
from gensim.test.utils import datapath

essay_sample = """
        Dear @CAPS1 @CAPS2 I feel that computers do take away from peoples life and aren�t as important than the other factors of life. 
        First of all you know that the world is becoming obease because of lack of exercise. 
        Also people are becoming more and more anti-social because of computers.
    """

content = essay_sample

if len(content) > 20:
    num_features = 300
    clean_test_essays = []
    clean_test_essays.append(essay_to_wordlist(content, remove_stopwords=True))
    testDataVecs = getAvgFeatureVecs(clean_test_essays, model, num_features)
    testDataVecs = np.array(testDataVecs)
    testDataVecs = np.reshape(
        testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1])
    )

    predict = lstm_model.predict(testDataVecs)

    if math.isnan(predict):
        predict = 0
    else:
        predict = np.round(predict)

    if predict < 0:
        predict = 0
else:
    predict = 0

print(predict)

[[2.]]


  if math.isnan(predict):
