## Importing the Data

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset files under MyDrive can be read.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:


import os
import pandas as pd

# Read the raw essay table (tab-separated, ISO-8859-1 encoded).
X = pd.read_csv(
    '/content/drive/MyDrive/maindataset/training_set_rel3.tsv',
    sep='\t',
    encoding='ISO-8859-1',
)
# Capture the target column before any columns are removed.
y = X['domain1_score']
# Drop every column that contains a missing value, then the two per-rater score columns.
X = X.dropna(axis=1).drop(columns=['rater1_domain1', 'rater2_domain1'])

In [None]:


import pandas as pd
# Load the pre-processed table; its `final_score` column is copied into X in the next cell.
temp = pd.read_csv('/content/drive/MyDrive/maindataset/Processed_data.csv')


In [None]:
# Replace the raw score column with `final_score` from the processed table
# (pandas aligns the assigned Series on the row index).
# NOTE(review): `y` was taken from the raw `domain1_score` column before this
# overwrite and is not refreshed here - confirm which target is intended.
X['domain1_score']=temp['final_score']

In [None]:
# Preview the first rows of X.
X.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",6
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",7
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",5
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",8
4,5,1,"Dear @LOCATION1, I know having computers has a...",6


## Preprocessing the Data

We will preprocess all essays and convert them to feature vectors so that they can be fed into the RNN.

These are all helper functions used to clean the essays.

In [None]:
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec

_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a set, building the set only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def essay_to_wordlist(essay_v, remove_stopwords):
    """Strip non-letter characters from a text and split it into lowercase words.

    Every character outside a-z / A-Z (digits, punctuation, the "@" and the
    number of tags such as "@CAPS1") is replaced by a space before the text
    is lowercased and split on whitespace.

    Args:
        essay_v: The text (an essay or a single sentence) to tokenize.
        remove_stopwords: When truthy, words found in NLTK's English
            stop-word list are dropped from the result.

    Returns:
        A list of lowercase word strings, in their original order.
    """
    words = re.sub("[^a-zA-Z]", " ", essay_v).lower().split()
    if remove_stopwords:
        # The stop-word set is cached: this function runs once per sentence,
        # so rebuilding the set on every call is wasted work.
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    return words

def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences and word-tokenize each of them.

    The essay is stripped of surrounding whitespace, cut into sentences by the
    tokenizer loaded from 'tokenizers/punkt/english.pickle', and every
    non-empty sentence is passed to essay_to_wordlist().

    Returns:
        A list holding one word list per non-empty sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        essay_to_wordlist(sentence, remove_stopwords)
        for sentence in punkt.tokenize(essay_v.strip())
        if len(sentence) > 0
    ]

def makeFeatureVec(words, model, num_features):
    """Average the word vectors of an essay's in-vocabulary words.

    Args:
        words: Iterable of word strings for one essay.
        model: Trained word2vec model; its ``wv`` attribute must support
            ``word in model.wv`` and ``model.wv[word]``.
        num_features: Dimensionality of the word vectors.

    Returns:
        An array of shape (num_features,) holding the mean vector of the
        words found in the vocabulary, or all zeros when none is found.
    """
    keyed_vectors = model.wv
    featureVec = np.zeros((num_features,), dtype="float32")
    num_words = 0
    for word in words:
        # Membership test and lookup both go through ``model.wv``; indexing
        # the model object directly (``model[word]``) is not available in
        # current gensim releases, and no per-essay vocabulary set is built.
        if word in keyed_vectors:
            num_words += 1
            featureVec = np.add(featureVec, keyed_vectors[word])
    if num_words == 0:
        # No known word: return the zero vector instead of dividing by zero,
        # which would produce a NaN feature vector.
        return featureVec
    return np.divide(featureVec, num_words)

def getAvgFeatureVecs(essays, model, num_features):
    """Build the matrix of averaged word vectors, one row per essay.

    Args:
        essays: Sequence of word lists, one per essay.
        model: Word2vec model handed through to makeFeatureVec().
        num_features: Dimensionality of the word vectors.

    Returns:
        A float32 array of shape (len(essays), num_features).
    """
    essayFeatureVecs = np.zeros((len(essays), num_features), dtype="float32")
    for row, essay_words in enumerate(essays):
        essayFeatureVecs[row] = makeFeatureVec(essay_words, model, num_features)
    return essayFeatureVecs

In [None]:
import numpy as np

class LSTM_layer:
    """Single-step LSTM cell with randomly initialised weights.

    Holds input weights (W_*), recurrent weights (U_*) and biases (b_*) for
    the forget (f), input (i), candidate (c) and output (o) gates.
    """

    def __init__(self, input_size, hidden_size, dropout_prob=0.4):
        """Draw the weights from a standard normal and zero the biases.

        ``dropout_prob`` is stored on the instance; no method of this class
        reads it.
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dropout_prob = dropout_prob

        # Input-to-hidden weights, drawn in the order f, i, c, o.
        self.W_f, self.W_i, self.W_c, self.W_o = (
            np.random.randn(input_size, hidden_size) for _ in range(4)
        )
        # Hidden-to-hidden weights, same gate order.
        self.U_f, self.U_i, self.U_c, self.U_o = (
            np.random.randn(hidden_size, hidden_size) for _ in range(4)
        )
        # One zero bias vector per gate.
        self.b_f, self.b_i, self.b_c, self.b_o = (
            np.zeros(hidden_size) for _ in range(4)
        )

    def forward(self, x, prev_hidden_state, prev_cell_state):
        """Advance the cell by one step.

        Returns:
            A ``(hidden_state, cell_state)`` tuple for this step.
        """
        def pre_activation(W, U, b):
            # Affine combination of the input and the previous hidden state.
            return np.dot(x, W) + np.dot(prev_hidden_state, U) + b

        forget_gate = sigmoid(pre_activation(self.W_f, self.U_f, self.b_f))
        input_gate = sigmoid(pre_activation(self.W_i, self.U_i, self.b_i))
        candidate = np.tanh(pre_activation(self.W_c, self.U_c, self.b_c))
        output_gate = sigmoid(pre_activation(self.W_o, self.U_o, self.b_o))

        cell_state = forget_gate * prev_cell_state + input_gate * candidate
        hidden_state = output_gate * np.tanh(cell_state)

        return hidden_state, cell_state

class Dropout_layer:
    """Inverted dropout: zero random activations and rescale the survivors."""

    def __init__(self, dropout_prob):
        """Store the drop probability.

        Args:
            dropout_prob: Probability of zeroing each activation; must lie in
                [0, 1) because the kept activations are divided by
                ``1 - dropout_prob``.

        Raises:
            ValueError: If ``dropout_prob`` is outside [0, 1).
        """
        if not 0 <= dropout_prob < 1:
            raise ValueError(
                "dropout_prob must be in [0, 1), got {}".format(dropout_prob)
            )
        self.dropout_prob = dropout_prob

    def forward(self, x, training=True):
        """Apply dropout to ``x``.

        Args:
            x: Array of activations.
            training: When False the input is returned unchanged, which is
                the inference behaviour of inverted dropout. Defaults to True
                so existing single-argument calls keep dropping units.

        Returns:
            ``x`` with a random subset of entries zeroed and the rest scaled
            by ``1 / (1 - dropout_prob)``, or ``x`` itself when not training.
        """
        if not training:
            return x
        keep_prob = 1 - self.dropout_prob
        mask = np.random.rand(*x.shape) < keep_prob
        return x * mask / keep_prob

class Dense_layer:
    """Fully connected layer: ``activation(x @ W + b)``."""

    def __init__(self, input_size, output_size, activation='relu'):
        """Create standard-normal weights and a zero bias.

        Args:
            input_size: Number of input features.
            output_size: Number of output units.
            activation: 'relu', 'sigmoid', or 'linear' / None for no
                activation.
        """
        self.input_size = input_size
        self.output_size = output_size
        self.activation = activation
        self.W = np.random.randn(input_size, output_size)
        self.b = np.zeros(output_size)

    def forward(self, x):
        """Compute the layer output for ``x``.

        Raises:
            ValueError: If ``self.activation`` is not a supported name.
                Previously an unknown name made this method return None
                without any error.
        """
        output = np.dot(x, self.W) + self.b
        if self.activation == 'relu':
            return np.maximum(0, output)
        if self.activation == 'sigmoid':
            return 1 / (1 + np.exp(-output))
        if self.activation is None or self.activation == 'linear':
            return output
        raise ValueError("Unsupported activation: {!r}".format(self.activation))

class Sequential_model:
    """Minimal sequential container: forward pass plus mean-squared-error loss.

    Weight updates are not implemented; ``backward`` is a placeholder, so
    ``fit`` only evaluates the forward pass and the loss batch by batch.
    """

    def __init__(self):
        self.layers = []

    def add(self, layer):
        """Append ``layer`` to the end of the stack."""
        self.layers.append(layer)

    def compile(self, loss, optimizer, metrics):
        """Record the training configuration on the instance."""
        self.loss = loss
        self.optimizer = optimizer
        self.metrics = metrics

    def summary(self):
        """Print the class name of every layer, in order."""
        for i, layer in enumerate(self.layers):
            print(f"Layer {i+1}: {layer.__class__.__name__}")

    def fit(self, X, y, batch_size, epochs):
        """Run ``train_on_batch`` over consecutive slices of ``X`` / ``y``."""
        for epoch in range(epochs):
            print(f"Epoch {epoch+1}/{epochs}")
            for i in range(0, len(X), batch_size):
                batch_X = X[i:i+batch_size]
                batch_y = y[i:i+batch_size]
                self.train_on_batch(batch_X, batch_y)

    def train_on_batch(self, X_batch, y_batch):
        """Forward one batch through all layers and return its loss."""
        activations = X_batch

        # Forward pass
        for layer in self.layers:
            if isinstance(layer, LSTM_layer):
                # Every LSTM layer starts from its own zero state, sized by
                # that layer's hidden_size. Sharing one state sized for the
                # first layer breaks as soon as two LSTM layers differ in
                # width, and the returned cell state must not be reused as
                # the next hidden state.
                batch = activations.shape[0]
                initial_hidden = np.zeros((batch, layer.hidden_size))
                initial_cell = np.zeros((batch, layer.hidden_size))
                activations, _ = layer.forward(activations, initial_hidden, initial_cell)
            else:
                activations = layer.forward(activations)

        # Compute loss and perform backpropagation
        loss = self.loss_function(activations, y_batch)
        self.backward(loss)
        return loss

    def loss_function(self, y_pred, y_true):
        """Mean squared error between predictions and targets.

        The targets are reshaped to the prediction shape first; subtracting a
        (N,) target from a (N, 1) prediction would otherwise broadcast to an
        (N, N) matrix and give a wrong loss.
        """
        y_pred = np.asarray(y_pred, dtype=float)
        y_true = np.asarray(y_true, dtype=float).reshape(y_pred.shape)
        return np.mean((y_pred - y_true) ** 2)

    def backward(self, loss):
        pass  # Implement backpropagation here

# Helper function for sigmoid activation
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    Uses the identity sigmoid(x) = 0.5 * (1 + tanh(x / 2)). The direct form
    overflows inside ``np.exp`` for large negative inputs.
    """
    return 0.5 * (1.0 + np.tanh(0.5 * x))

# Example usage: two LSTM layers, each followed by dropout, and a single-unit
# dense output layer.
model = Sequential_model()
for layer in (
    LSTM_layer(input_size=300, hidden_size=300),
    Dropout_layer(dropout_prob=0.4),
    LSTM_layer(input_size=300, hidden_size=64),
    Dropout_layer(dropout_prob=0.5),
    Dense_layer(input_size=64, output_size=1, activation='relu'),
):
    model.add(layer)

# Record the training configuration on the model.
model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])

# List the layers in order.
model.summary()


Layer 1: LSTM_layer
Layer 2: Dropout_layer
Layer 3: LSTM_layer
Layer 4: Dropout_layer
Layer 5: Dense_layer


In [None]:
import matplotlib.pyplot as plt

class Sequential_model:
    """Sequential container with per-epoch loss tracking.

    The forward pass runs through every layer. The backward pass calls
    ``layer.backward(delta)`` from the last layer towards the first and stops
    at the first layer that does not define ``backward``. The layer classes
    defined earlier in this notebook expose only ``forward``, so with them no
    weights are updated and the loss is merely measured.
    """

    def __init__(self):
        self.layers = []
        self.loss_history = []

    def add(self, layer):
        """Append ``layer`` to the end of the stack."""
        self.layers.append(layer)

    def compile(self, loss, optimizer, metrics):
        """Record the training configuration on the instance."""
        self.loss = loss
        self.optimizer = optimizer
        self.metrics = metrics

    def summary(self):
        """Print the class name of every layer, in order."""
        for i, layer in enumerate(self.layers):
            print(f"Layer {i+1}: {layer.__class__.__name__}")

    def fit(self, X, y, batch_size, epochs):
        """Train for ``epochs`` passes and append each epoch's loss to ``loss_history``."""
        n_samples = len(X)
        for epoch in range(epochs):
            print(f"Epoch {epoch+1}/{epochs}")
            epoch_loss = 0.0
            for i in range(0, n_samples, batch_size):
                batch_X = X[i:i+batch_size]
                batch_y = y[i:i+batch_size]
                # Weight each batch mean by its size so that a short final
                # batch does not distort the per-sample average.
                epoch_loss += self.train_on_batch(batch_X, batch_y) * len(batch_X)
            epoch_loss /= n_samples  # Average loss per sample
            print(f"Epoch Loss: {epoch_loss}")
            self.loss_history.append(epoch_loss)

    def train_on_batch(self, X_batch, y_batch):
        """Run one forward/backward step and return the batch MSE."""
        # Forward pass (dropout active)
        y_pred = np.asarray(self._forward(X_batch, training=True), dtype=float)
        # Align the targets with the prediction shape; (N, 1) - (N,) would
        # otherwise broadcast to an (N, N) matrix.
        y_true = np.asarray(y_batch, dtype=float).reshape(y_pred.shape)
        error = y_pred - y_true

        # Compute loss
        loss = np.mean(error ** 2)

        # Backpropagation
        delta = 2 * error / error.size  # Derivative of mean squared error loss
        for layer in reversed(self.layers):
            backward = getattr(layer, "backward", None)
            if backward is None:
                # The gradient cannot be carried past a layer without a
                # backward step, so stop here.
                break
            delta = backward(delta)

        return loss

    def predict(self, X):
        """Return the network output for ``X`` with dropout layers skipped."""
        return self._forward(X, training=False)

    def _forward(self, X, training):
        """Push ``X`` through all layers and return the final activations."""
        for layer in self.layers:
            if isinstance(layer, LSTM_layer):
                # LSTM_layer.forward returns (hidden, cell). Each layer gets
                # its own zero initial state sized by its hidden_size.
                batch = X.shape[0]
                initial_hidden = np.zeros((batch, layer.hidden_size))
                initial_cell = np.zeros((batch, layer.hidden_size))
                X, _ = layer.forward(X, initial_hidden, initial_cell)
            elif isinstance(layer, Dropout_layer):
                # Inverted dropout is the identity at inference time.
                if training:
                    X = layer.forward(X)
            else:
                X = layer.forward(X)
        return X


# Synthetic regression data: 1000 samples with 300 features each.
X_train = np.random.randn(1000, 300)
y_train = np.random.rand(1000)

# Assemble the network: two LSTM layers with dropout and a dense output unit.
model = Sequential_model()
for layer in (
    LSTM_layer(input_size=300, hidden_size=300),
    Dropout_layer(dropout_prob=0.4),
    LSTM_layer(input_size=300, hidden_size=64),
    Dropout_layer(dropout_prob=0.5),
    Dense_layer(input_size=64, output_size=1, activation='relu'),
):
    model.add(layer)

# Record the training configuration, then list the layers.
model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
model.summary()

# Train on the synthetic data.
model.fit(X_train, y_train, batch_size=64, epochs=10)

# Predict on fresh random inputs.
X_test = np.random.randn(100, 300)
y_pred = model.predict(X_test)

# Plot the per-epoch loss recorded during fit().
plt.plot(model.loss_history)
plt.title('Loss Curve')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.ylim(0, 1)  # Set y-axis range from 0 to 1
plt.show()


Layer 1: LSTM_layer
Layer 2: Dropout_layer
Layer 3: LSTM_layer
Layer 4: Dropout_layer
Layer 5: Dense_layer
Epoch 1/10


ValueError: not enough values to unpack (expected 3, got 2)

In [None]:
# Define your data splitting manually
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, with a fixed seed for the split.
# Note: get_model() and cohen_kappa_score are only defined/imported in cells
# that appear further down in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Adjust test_size as needed

train_essays, test_essays = X_train['essay'], X_test['essay']

# Gather every tokenized sentence of the training essays for word2vec.
sentences = []
for essay in train_essays:
    sentences.extend(essay_to_sentences(essay, remove_stopwords=True))

# Word2vec hyper-parameters.
num_features, min_word_count, num_workers = 300, 40, 4
context, downsampling = 10, 1e-3

print("Training Word2Vec Model...")
model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

model.init_sims(replace=True)
# model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

# Turn each essay into a word list, then into one averaged word vector.
clean_train_essays = [
    essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in train_essays
]
trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)

clean_test_essays = [
    essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in test_essays
]
testDataVecs = getAvgFeatureVecs(clean_test_essays, model, num_features)

# Insert a middle axis of length 1 (a single time step) to get 3-D inputs.
trainDataVecs = np.array(trainDataVecs)
testDataVecs = np.array(testDataVecs)
trainDataVecs = trainDataVecs.reshape(trainDataVecs.shape[0], 1, trainDataVecs.shape[1])
testDataVecs = testDataVecs.reshape(testDataVecs.shape[0], 1, testDataVecs.shape[1])

lstm_model = get_model()
lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=50)
# lstm_model.load_weights('./model_weights/final_lstm.h5')

# Predict, then round to the nearest integer score.
y_pred = np.around(lstm_model.predict(testDataVecs))

# Score the predictions with the quadratic weighted kappa.
result = cohen_kappa_score(y_test.values, y_pred, weights='quadratic')
print("Kappa Score: {}".format(result))


KeyboardInterrupt: 

In [None]:
import nltk
# Fetch the stop-word corpus that essay_to_wordlist() reads via stopwords.words("english").
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import nltk
# Fetch the Punkt tokenizer data that essay_to_sentences() loads from 'tokenizers/punkt/english.pickle'.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Defining the model

Here we define a 2-Layer LSTM Model.

Note that instead of a sigmoid activation in the output layer we use
ReLU, since the training labels are not normalised.

In [None]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

def get_model():
    """Build, compile and return the two-layer LSTM regression model.

    The stack is LSTM(300) returning sequences, LSTM(64), Dropout(0.5) and a
    single ReLU output unit. It is compiled with mean squared error loss, the
    rmsprop optimizer and the MAE metric. The model summary is printed before
    the model is returned.
    """
    model = Sequential([
        LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True),
        LSTM(64, recurrent_dropout=0.4),
        Dropout(0.5),
        Dense(1, activation='relu'),
    ])

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

## Training Phase

Now we train the model on the dataset.

We will use 5-Fold Cross Validation and measure the Quadratic Weighted Kappa for each fold.
We will then calculate Average Kappa for all the folds.

In [None]:
# KFold is imported from sklearn.model_selection: the sklearn.cross_validation
# module no longer exists in scikit-learn.
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score

# The model_selection KFold takes the number of folds as n_splits and yields
# (train_indices, test_indices) pairs from split().
cv = KFold(n_splits=5, shuffle=True)
results = []
y_pred_list = []

count = 1
for traincv, testcv in cv.split(X):
    print("\n--------Fold {}--------\n".format(count))
    X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]

    train_essays = X_train['essay']
    test_essays = X_test['essay']

    sentences = []

    for essay in train_essays:
        # Obtaining all sentences from the training essays.
        sentences += essay_to_sentences(essay, remove_stopwords = True)

    # Initializing variables for word2vec model.
    num_features = 300
    min_word_count = 40
    num_workers = 4
    context = 10
    downsampling = 1e-3

    print("Training Word2Vec Model...")
    model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)

    model.init_sims(replace=True)
    # The same file name is written in every fold, so only the last fold's
    # vectors remain on disk.
    model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

    clean_train_essays = []

    # Generate training and testing data word vectors.
    for essay_v in train_essays:
        clean_train_essays.append(essay_to_wordlist(essay_v, remove_stopwords=True))
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)

    clean_test_essays = []
    for essay_v in test_essays:
        clean_test_essays.append(essay_to_wordlist( essay_v, remove_stopwords=True ))
    testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )

    trainDataVecs = np.array(trainDataVecs)
    testDataVecs = np.array(testDataVecs)
    # Reshaping train and test vectors to 3 dimensions. (1 represents one timestep)
    trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

    lstm_model = get_model()
    lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=50)
    #lstm_model.load_weights('./model_weights/final_lstm.h5')
    y_pred = lstm_model.predict(testDataVecs)

    # Optionally save the model trained in the last of the 5 folds.
    # if count == 5:
    #      lstm_model.save('./model_weights/final_lstm.h5')

    # Round y_pred to the nearest integer.
    y_pred = np.around(y_pred)

    # Evaluate the fold with the quadratic weighted kappa.
    result = cohen_kappa_score(y_test.values,y_pred,weights='quadratic')
    print("Kappa Score: {}".format(result))
    results.append(result)

    count += 1


ModuleNotFoundError: No module named 'sklearn.cross_validation'

The average Kappa score is 0.961, which is the highest we have seen on this dataset.

In [None]:
# Mean of the per-fold kappa scores collected in `results`, rounded to 4 decimals.
print("Average Kappa score after a 5-fold cross validation: ",np.around(np.array(results).mean(),decimals=4))

Average Kappa score after a 5-fold cross validation:  0.9613
