In [1]:
# Constants
DATASET_DIR = './data/'
GLOVE_DIR = './glove.6B/'
SAVE_DIR = './'

import os
import pandas as pd

X = pd.read_csv(os.path.join(DATASET_DIR, 'training_set_rel3.tsv'), sep='\t', encoding='ISO-8859-1')
y = X['domain1_score']
X = X.dropna(axis=1)
X = X.drop(columns=['rater1_domain1', 'rater2_domain1', 'essay_id', 'essay_set'])
X.head()

Unnamed: 0,essay,domain1_score
0,"Dear local newspaper, I think effects computer...",8
1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,"Dear @LOCATION1, I know having computers has a...",8


In [2]:

import nltk
import re
from nltk.corpus import stopwords
import random

def essay_to_wordlist(essay_v, remove_stopwords):
    """Remove the tagged labels and word tokenize the sentence."""
    essay_v = re.sub("[^a-zA-Z]", " ", essay_v)
    words = essay_v.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return (words)

window_size = 7
def essay_to_windows(X, window_size):
    """Split the essay into windows with window_size length."""
    X_windows = []
    for i in range(len(X)):
      words = essay_to_wordlist(X.iloc[i, 0], True)
      j = 0
      for j in range(0, len(words), window_size):
        X_windows.append(words[j:j+window_size])
      X_windows.append(words[j:len(words)])
        
    return X_windows
# nltk.download('words')
vocabulary = nltk.corpus.words.words()
def generate_corrupted(X_train):
    """Corrupts the training data by substituting random words each interval."""
    X_train_corrupted = []

    for i in range(len(X_train)):
        window = X_train[i]
        randIdx = random.randint(0, len(window)-1)
        cor = window.copy()
        cor[randIdx] = random.choice(vocabulary)
        X_train_corrupted.append(cor)
    return X_train_corrupted

In [3]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer

class RankingLossLayer(Layer):
    """Custom layer to compute the ranking loss."""
    def __init__(self, margin=1.0, **kwargs):
        super().__init__(**kwargs)
        self.margin = margin

    def call(self, inputs):
        valid_score, corrupted_score = inputs
        loss = K.maximum(0.0, self.margin - valid_score + corrupted_score)
        self.add_loss(K.mean(loss))
        return loss

def get_cw_model(vocab_size=10000, embedding_dim=300, window_size=window_size):
    """Define the Collobert and Weston (C&W) model."""
    # Inputs for valid and corrupted sequences
    valid_input = Input(shape=(window_size,), name="valid_sequence")
    corrupted_input = Input(shape=(window_size,), name="corrupted_sequence")

    # Shared embedding layer
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=window_size)

    # Embed valid and corrupted sequences
    valid_embedding = embedding_layer(valid_input)
    corrupted_embedding = embedding_layer(corrupted_input)

    # Flatten embeddings
    valid_flattened = Flatten()(valid_embedding)
    corrupted_flattened = Flatten()(corrupted_embedding)

    # Shared scoring network
    dense_layer = Dense(128, activation="tanh")
    valid_score = Dense(1, activation="linear")(dense_layer(valid_flattened))
    corrupted_score = Dense(1, activation="linear")(dense_layer(corrupted_flattened))

    # Custom ranking loss layer
    ranking_loss = RankingLossLayer()([valid_score, corrupted_score])

    # Create the model
    model = Model(inputs=[valid_input, corrupted_input], outputs=[valid_score, corrupted_score])
    model.compile(optimizer=Adam(learning_rate=0.001))
    model.summary()

    return model

2024-11-25 08:31:08.565949: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-25 08:31:08.577292: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-25 08:31:08.695204: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-25 08:31:08.798627: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-25 08:31:08.910776: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registe

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = essay_to_windows(X_train, window_size)
X_train_corrupted = generate_corrupted(X_train)
#generate unique id for each word in train set in a dictionary
word2idx = {}
idx = 0
for word in vocabulary:
  if word not in word2idx:
    word2idx[word] = idx
    idx += 1

# transform words to index
X_train = [[word2idx[word] for word in window] for window in X_train]
X_train_corrupted = [[word2idx[word] for word in window] for window in X_train_corrupted]

#check 5 first words in the first window
print(X_train[0][:5])
print(X_train_corrupted[0][:5])

KeyError: 'types'

In [8]:
model = get_cw_model(vocab_size=10000, embedding_dim=300, window_size=window_size)
model.fit([X_train[:5], X_train_corrupted[:5]], epochs=10, batch_size=1)



ValueError: Unrecognized data type: x=[[['many', 'types', 'reading', 'materials', 'people', 'read', 'library'], ['find', 'things', 'cars', 'trucks', 'sports', 'made', 'stories'], ['fun', 'materials', 'library', 'though', 'could', 'offensive', 'materials'], ['removed', 'library', 'think', 'others', 'think', 'opinion', 'think'], ['things', 'removed', 'library', 'certain', 'people', 'month', 'find']], [['many', 'types', 'reading', 'materials', 'people', 'adaw', 'library'], ['find', 'things', 'cars', 'trucks', 'sports', 'haemodoraceous', 'stories'], ['fun', 'materials', 'library', 'though', 'could', 'dehydrant', 'materials'], ['removed', 'interposure', 'think', 'others', 'think', 'opinion', 'think'], ['things', 'removed', 'library', 'certain', 'aerocraft', 'month', 'find']]] (of type <class 'list'>)