In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line in walk order.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition training set and preview its first rows.
train_csv_path = '/kaggle/input/commonlitreadabilityprize/train.csv'
data = pd.read_csv(train_csv_path)
data.head()

In [None]:
# Null check: number of missing values per column, shown as a percentage of
# the total row count.
null_counts = data.isnull().sum()
null_counts / len(data) * 100

In [None]:
# Drop the two licensing columns, which are not used anywhere below.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.0 no longer accepts. Rebinding `data` instead of using
# `inplace=True` and passing errors='ignore' keeps the cell safe to re-run.
data = data.drop(columns=['url_legal', 'license'], errors='ignore')

In [None]:
from gensim.parsing.preprocessing import remove_stopwords

# Lower-case every excerpt and strip everything except a-z and whitespace.
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the text untouched.
# The raw string avoids the invalid '\s' escape in a normal string literal.
docs = data['excerpt'].str.lower().str.replace(r'[^a-z\s]', '', regex=True)

# Remove gensim's built-in English stop words from each cleaned excerpt.
docs = docs.apply(remove_stopwords)
docs[:10]

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit a Keras tokenizer on the cleaned excerpts; its word_index maps each
# distinct token to an integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)

# Corpus vocabulary as a list of tokens, in word_index order.
vocab = list(tokenizer.word_index.keys())
unique_token_count = len(vocab)

print('Total number of unique tokens in corpus: %d' % unique_token_count)

In [None]:
from zipfile import ZipFile

# Open the pre-trained embeddings archive and list its members.
# `zf` is left open on purpose: later cells read embedding files from it.
zip_path = '/kaggle/input/quora-insincere-questions-classification/embeddings.zip'
zf = ZipFile(zip_path)
zf.infolist()

### GloVe Embedding Layer

In [None]:
# Stream the GloVe text file straight out of the zip archive and keep only the
# vectors for words that occur in the corpus vocabulary.
glove_path = 'glove.840B.300d/glove.840B.300d.txt'
glove_dim = 300  # number of float components on every line of this file

# Set lookup is O(1); testing membership against the `vocab` list would be a
# linear scan for every line of the embedding file.
vocab_lookup = set(vocab)

embeddings_glove = {}
with zf.open(glove_path) as file:
    for raw_line in file:
        # Split from the right so that tokens which themselves contain spaces
        # stay in one piece instead of leaking text into the float vector.
        parts = raw_line.decode('utf-8').rstrip('\r\n').rsplit(' ', glove_dim)
        if len(parts) != glove_dim + 1:
            # Malformed or truncated line: skip it rather than fail the load.
            continue
        curr_word = parts[0]
        if curr_word in vocab_lookup:
            embeddings_glove[curr_word] = np.array(parts[1:], dtype=float)

In [None]:
# Assemble the embedding matrix for the Keras Embedding layer from the GloVe
# vectors: row `wid` holds the vector of the word with tokenizer id `wid`.
# The matrix has len(vocab) + 1 rows; rows for words without a GloVe vector
# stay all-zero.
vocab_size = len(vocab) + 1
embedding_dim = 300

embedding_matrix = np.zeros((vocab_size, embedding_dim))
words_not_available = []
for word, wid in tokenizer.word_index.items():
    if word not in embeddings_glove:
        words_not_available.append(word)
        continue
    embedding_matrix[wid] = embeddings_glove[word]

# Coverage report: share of vocabulary words with / without a vector.
missing_pct = len(words_not_available)/len(vocab)*100
print('Percentage of words not avaialable %.2f%%' % (missing_pct))
print('Percentage of words avaialable %.2f%%' % (100 - missing_pct))

In [None]:
# Fixed length (in tokens) that every document is padded or truncated to.
max_doc_len = 115

# Turn each cleaned excerpt into its list of token ids, then zero-pad at the
# end ('post') so all sequences have exactly max_doc_len entries.
train_x_seq = tokenizer.texts_to_sequences(docs)
train_x_padded = pad_sequences(
    train_x_seq,
    maxlen=max_doc_len,
    padding='post',
)

### Using the word embeddings that have the maximum word coverage, create a regressor using a simple neural network

In [None]:
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

# Simple feed-forward regressor on top of a frozen embedding layer that is
# initialised from `embedding_matrix`:
# Embedding -> Flatten -> Dense(64, relu) -> Dense(1).
frozen_embedding = layers.Embedding(
    vocab_size,
    embedding_dim,
    weights=[embedding_matrix],
    input_length=max_doc_len,
    trainable=False,
)
model = Sequential([
    frozen_embedding,
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1),
])

# Train with plain SGD on mean squared error, tracking MAE.
model.compile(optimizer='sgd', loss='mse', metrics=["mae"])
history = model.fit(train_x_padded, data['target'], epochs=10, verbose=1)

In [None]:
# Load the test excerpts and push them through the same cleaning, stop-word
# removal, tokenisation and padding steps as the training excerpts.
test_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')

# regex=True is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default. The raw string avoids the invalid
# '\s' escape in a normal string literal.
test_docs = test_df['excerpt'].str.lower().str.replace(r'[^a-z\s]', '', regex=True)

test_docs = test_docs.apply(remove_stopwords)

# Reuse the tokenizer fitted on the training corpus so token ids line up with
# the embedding rows.
test_x_seq = tokenizer.texts_to_sequences(test_docs)

test_x_padded = pad_sequences(test_x_seq, padding='post', maxlen=max_doc_len)

In [None]:
# Predict targets for the padded test sequences with the current `model`,
# then print the model's layer summary.
test_y_pred = model.predict(test_x_padded)
model.summary()

In [None]:
# Load the sample submission as a template, overwrite its target column with
# the current predictions and save the result to the working directory.
sample_submission_path = '/kaggle/input/commonlitreadabilityprize/sample_submission.csv'
submission_df = pd.read_csv(sample_submission_path)
submission_df['target'] = test_y_pred
submission_df.to_csv("submission_glove_embedding.csv", index=False)


### Google News Embedding Layer

In [None]:
from gensim.models import KeyedVectors

# Member path of the Google News vectors inside the archive opened as `zf`.
embedding_file = 'GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
# Load the vectors in word2vec binary format directly from the open zip member.
embeddings = KeyedVectors.load_word2vec_format(zf.open(embedding_file), binary=True)

In [None]:
# Rebuild the embedding matrix, this time from the Google News vectors held in
# `embeddings`: row `wid` holds the vector of the word with tokenizer id `wid`.
# The matrix has len(vocab) + 1 rows; rows for words without a vector stay
# all-zero.
vocab_size = len(vocab) + 1
embedding_dim = 300

embedding_matrix = np.zeros((vocab_size, embedding_dim))
words_not_available = []
for word, wid in tokenizer.word_index.items():
    if word not in embeddings:
        words_not_available.append(word)
        continue
    embedding_matrix[wid] = embeddings[word]

# Coverage report: share of vocabulary words with / without a vector.
missing_pct = len(words_not_available)/len(vocab)*100
print('Percentage of words not avaialable %.2f%%' % (missing_pct))
print('Percentage of words avaialable %.2f%%' % (100 - missing_pct))

In [None]:
# Same architecture as the GloVe regressor, with the frozen embedding layer
# initialised from the current `embedding_matrix`:
# Embedding -> Flatten -> Dense(64, relu) -> Dense(1).
frozen_embedding = layers.Embedding(
    vocab_size,
    embedding_dim,
    weights=[embedding_matrix],
    input_length=max_doc_len,
    trainable=False,
)
model = Sequential([
    frozen_embedding,
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1),
])

# Train with plain SGD on mean squared error, tracking MAE.
model.compile(optimizer='sgd', loss='mse', metrics=["mae"])
history = model.fit(train_x_padded, data['target'], epochs=10, verbose=1)

# Score the padded test sequences, then print the layer summary.
test_y_pred = model.predict(test_x_padded)
model.summary()

In [None]:
# Overwrite the target column with the latest predictions and save them under
# a file name specific to the Google News embedding run.
submission_df['target'] = test_y_pred
submission_df.to_csv("submission_google_embedding.csv", index=False)

### Build custom word embeddings using the gensim word2vec model (with window size = 5) and retrain the neural network

In [None]:
from gensim.models import word2vec

# Word2Vec expects a list of token lists, so split every cleaned excerpt on
# single spaces.
docs_words = docs.map(lambda text: text.split(' ')).tolist()
len(docs_words)

In [None]:
# Train a skip-gram (sg=1) word2vec model on the corpus: 100-dimensional
# vectors, context window of 5, ignoring words seen fewer than 50 times.
embedding_dim = 100
w2v_model = word2vec.Word2Vec(sentences=docs_words, vector_size=embedding_dim, min_count=50, window=5, sg=1)

# Keep the word2vec vocabulary under its own name. Rebinding `vocab` here would
# replace the tokenizer vocabulary with this smaller, min_count-filtered one,
# so later `vocab_size = len(vocab) + 1` cells would size their Embedding
# layers too small for the token ids in train_x_padded.
w2v_vocab = w2v_model.wv.index_to_key

# One row per word2vec word, indexed by the word itself.
df_embedding_matrix = pd.DataFrame(w2v_model.wv[w2v_vocab], index=w2v_vocab)
df_embedding_matrix.shape

In [None]:
# Initialise the Embedding layer from the custom word2vec vectors held in
# df_embedding_matrix (indexed by word). Without this the layer starts from
# random weights and the vectors trained in the previous cell are never used.
# Rows are addressed by the Keras tokenizer's word ids, with row 0 reserved for
# padding; words that word2vec dropped (min_count) start as all-zero rows and
# are learned during training because the layer is trainable.
w2v_vocab_size = len(tokenizer.word_index) + 1
w2v_dim = df_embedding_matrix.shape[1]
w2v_embedding_matrix = np.zeros((w2v_vocab_size, w2v_dim))
for word, wid in tokenizer.word_index.items():
    if word in df_embedding_matrix.index:
        w2v_embedding_matrix[wid] = df_embedding_matrix.loc[word].to_numpy()

# Embedding (fine-tuned) -> Flatten -> Dense(64, relu) -> Dense(1).
model = Sequential()
model.add(layers.Embedding(w2v_vocab_size, w2v_dim,
                          weights = [w2v_embedding_matrix],
                          input_length=max_doc_len,
                          trainable=True))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1))
model.compile(optimizer='adam', loss='mse', metrics=["mae"])
history = model.fit(train_x_padded, data['target'], epochs=25, verbose=1)

# Score the padded test sequences, then print the layer summary.
test_y_pred = model.predict(test_x_padded)
model.summary()

### Keras Embedding Layer

In [None]:
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

# Size the embedding table from the tokenizer itself (+1 because id 0 is the
# padding value). `vocab` cannot be trusted here: the word2vec cell rebinds it
# to a much smaller vocabulary, which would make the token ids in
# train_x_padded fall outside the Embedding layer's range.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 300

# Embedding learned from scratch -> Flatten -> Dense(64, relu) -> Dense(1).
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim,
                          input_length=max_doc_len,
                          trainable=True))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1))
model.compile(optimizer='adam', loss='mse', metrics=["mae"])
history = model.fit(train_x_padded, data['target'], epochs=10, verbose=1)

# Score the padded test sequences, then print the layer summary.
test_y_pred = model.predict(test_x_padded)
model.summary()

In [None]:
# Overwrite the target column with the latest predictions and save them under
# a file name specific to the Keras embedding run.
submission_df['target'] = test_y_pred
submission_df.to_csv("submission_keras_embedding.csv", index=False)

In [None]:
# Convert every cleaned training excerpt into its list of tokenizer word ids.
train_x_seq = tokenizer.texts_to_sequences(docs)

In [None]:
# Number of tokens in each training document, drawn as a box plot to help pick
# a padding length.
docs_size = [len(token_ids) for token_ids in train_x_seq]
pd.Series(docs_size).plot.box()

In [None]:
# Fixed length (in tokens) that every document is padded or truncated to.
max_doc_len = 115

# Zero-pad at the end ('post') so every sequence has max_doc_len entries.
train_x_padded = pad_sequences(train_x_seq, padding='post', maxlen=max_doc_len)

In [None]:
# Preview the first five cleaned excerpts.
docs[:5]

In [None]:
# Show the padded id sequences of the first five documents as a table.
pd.DataFrame(train_x_padded[:5])

In [None]:
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

# Size the embedding table from the tokenizer itself (+1 because id 0 is the
# padding value). `vocab` cannot be trusted here: the word2vec cell rebinds it
# to a much smaller vocabulary, which would make the token ids in
# train_x_padded fall outside the Embedding layer's range.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 300

# Embedding learned from scratch -> Flatten -> Dense(64, relu) -> Dense(1).
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim,
                          input_length=max_doc_len,
                          trainable=True))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1))
model.compile(optimizer='adam', loss='mse', metrics=["mae"])
history = model.fit(train_x_padded, data['target'], epochs=25, verbose=1)

In [None]:
# sgd optimizer
# loss: 0.3717 - mae: 0.5002

# rmsprop
# loss: 0.0658 - mae: 0.2122

# adam
# loss: 0.0594 - mae: 0.1468  # Finalized

In [None]:
# Load the test set, keep only the id and excerpt columns, and run the
# excerpts through the same cleaning, stop-word removal, tokenisation and
# padding steps as the training excerpts.
test_data = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')
test_data = test_data[['id', 'excerpt']]

# regex=True is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default. The raw string avoids the invalid
# '\s' escape in a normal string literal.
test_docs = test_data['excerpt'].str.lower().str.replace(r'[^a-z\s]', '', regex=True)

test_docs = test_docs.apply(remove_stopwords)

# Reuse the tokenizer fitted on the training corpus so token ids line up with
# the embedding rows.
test_x_seq = tokenizer.texts_to_sequences(test_docs)
test_x_padded = pad_sequences(test_x_seq, padding='post', maxlen=max_doc_len)
test_docs

In [None]:
# Show the padded id sequences of the test documents as a table.
pd.DataFrame(test_x_padded)

In [None]:
# Predict targets for the padded test sequences with the current `model`,
# then print the model's layer summary.
test_y_pred = model.predict(test_x_padded)
model.summary()

In [None]:
# Reload the sample submission file to use as the output template and show it.
submission_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/sample_submission.csv')
submission_df

In [None]:
# Overwrite the template's target column with the model predictions.
# NOTE(review): test_y_pred is presumably an (n_rows, 1) array in the same row
# order as the sample submission — confirm before submitting.
submission_df['target'] = test_y_pred

In [None]:
# Write the final submission to the working directory, without the index column.
submission_df.to_csv("submission.csv", index=False)