# Problem Statement
We have a dataset containing excerpts of different pasaages and target column indicating the difficult of the passage i.e the reading ease. We need to predict the reading ease of unknown passages in test data.

## Approach Followed:
Here we follow below steps to solve this problem -
* Combine the train and test excerpts to preprocess them at once
* Apply preprocessing to these texts
* Prepare data for LSTM model by creating padded sequences of same length
* Use Google Word2Vec pretrained embeddings to create an embedding matrix for the LSTM input layer
* Run LSTM model on train data, keeping test split as validation set

# Load Data

In [1]:
import numpy as np
import pandas as pd
train_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')
print(train_df.shape)
print(test_df.shape)
train_df.head(5)

(2834, 6)
(7, 4)


Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


# Clean the excerpt text
1. Remove all characters apart from alphabets
2. Lowercase the text
3. Lemmatize the text data

In [4]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemma = WordNetLemmatizer()
def preprocess(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    words = text.split()
    words = [lemma.lemmatize(word) for word in words if word not in stopwords.words('english')]
    return words

In [5]:
excerpt_text = train_df['excerpt'].append(test_df['excerpt'])
excerpt_text = excerpt_text.apply(lambda x: preprocess(x))
excerpt_text.head()

0    [young, people, returned, ballroom, presented,...
1    [dinner, time, mr, fayre, somewhat, silent, ey...
2    [roger, predicted, snow, departed, quickly, ca...
3    [outside, palace, great, garden, walled, round...
4    [upon, time, three, bear, lived, together, hou...
Name: excerpt, dtype: object

# Build Keras Model

## Prepare data for Keras

In [6]:
# Tokenize the excerpt text data. This will assign unique integer to every unique word in excerpt text data
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(excerpt_text)

In [7]:
# Get total words in excerpt text. This will be used to create the embedding matrix of shape (vocab_size, Dimension(word_embedding))
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size is {}".format(vocab_size))

Vocabulary size is 22597


In [8]:
sequences = tokenizer.texts_to_sequences(excerpt_text)
print(f"There are {excerpt_text.shape[0]} excerpts and {len(sequences)} sequences")
print(f"Min sequence length is {min([len(s) for s in sequences])}")
print(f"Max sequence length is {max([len(s) for s in sequences])}")

There are 2841 excerpts and 2841 sequences
Min sequence length is 52
Max sequence length is 135


In [9]:
from keras.preprocessing.sequence import pad_sequences
sequences = pad_sequences(sequences, padding='post')
print(f"Min sequence length is {min([len(s) for s in sequences])}")
print(f"Max sequence length is {max([len(s) for s in sequences])}")
print(f"Shape of sequences is {sequences.shape}")

Min sequence length is 135
Max sequence length is 135
Shape of sequences is (2841, 135)


## Load Google Word2Vec embeddings

In [10]:
embedding = gensim.models.KeyedVectors.load_word2vec_format('../input/word2vec-google/GoogleNews-vectors-negative300.bin', binary=True)

In [11]:
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = embedding.get_vector(word)
    except:
        embedding_vector = np.zeros((300,))
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
embedding_matrix.shape

(22597, 300)

## Run the model

In [16]:
from tensorflow import keras
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense, LSTM
from keras.initializers import Constant

model = keras.Sequential()
model.add(Embedding(vocab_size, 300, embeddings_initializer=Constant(embedding_matrix), input_length=135, trainable=False))
model.add(LSTM(100))
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 135, 300)          6779100   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 6,939,601
Trainable params: 160,501
Non-trainable params: 6,779,100
_________________________________________________________________


In [13]:
from sklearn.model_selection import train_test_split
target = train_df['target']
X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(sequences).head(train_df.shape[0]), target)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(2125, 135)
(2125,)
(709, 135)
(709,)


In [17]:
print("Training...")
history = model.fit(x=X_train, y=y_train, batch_size=16, epochs=15, validation_data=(X_test, y_test), verbose=2)

Training...
Epoch 1/15
133/133 - 15s - loss: 0.9485 - mean_squared_error: 0.9485 - val_loss: 0.8839 - val_mean_squared_error: 0.8839
Epoch 2/15
133/133 - 12s - loss: 0.7402 - mean_squared_error: 0.7402 - val_loss: 0.7240 - val_mean_squared_error: 0.7240
Epoch 3/15
133/133 - 12s - loss: 0.8970 - mean_squared_error: 0.8970 - val_loss: 0.9136 - val_mean_squared_error: 0.9136
Epoch 4/15
133/133 - 12s - loss: 0.9655 - mean_squared_error: 0.9655 - val_loss: 1.0076 - val_mean_squared_error: 1.0076
Epoch 5/15
133/133 - 12s - loss: 0.9887 - mean_squared_error: 0.9887 - val_loss: 0.9937 - val_mean_squared_error: 0.9937
Epoch 6/15
133/133 - 12s - loss: 0.9879 - mean_squared_error: 0.9879 - val_loss: 1.0007 - val_mean_squared_error: 1.0007
Epoch 7/15
133/133 - 12s - loss: 0.9875 - mean_squared_error: 0.9875 - val_loss: 1.0096 - val_mean_squared_error: 1.0096
Epoch 8/15
133/133 - 12s - loss: 0.9882 - mean_squared_error: 0.9882 - val_loss: 1.0000 - val_mean_squared_error: 1.0000
Epoch 9/15
133/133 -

# Predictions

In [19]:
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

0.7126405569416674

In [34]:
pred = model.predict(pd.DataFrame(sequences).tail(test_df.shape[0])).reshape(test_df.shape[0])
sample_submission = pd.DataFrame(list(zip(test_df['id'], pred)), columns=['id', 'target'])
sample_submission

Unnamed: 0,id,target
0,c0f722661,-0.976133
1,f0953f0a5,-0.757631
2,0df072751,-1.031678
3,04caf4e0c,-1.752487
4,0e63f8bea,-2.273083
5,12537fe78,-1.096488
6,965e592c0,0.050508


In [35]:
sample_submission.to_csv('submission.csv',index=False)