In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns
from datetime import datetime

# Load and Preprocess Dataset

In [None]:
!wget -q -O nytcrosswords.csv 'https://www.dropbox.com/scl/fi/frj3j6vyrg36cjb4rvdtm/nytcrosswords.csv?rlkey=0wsqemquskwy6fta48mjk46f2&dl=0'

In [None]:
# Import and clean data

# latin1 maps every possible byte to a character, so this read cannot raise
# UnicodeDecodeError. The previous fallback chain (ISO-8859-1 is the same
# codec under another name, then utf-8-sig) was therefore unreachable.
data = pd.read_csv('nytcrosswords.csv', encoding='latin1')

# Work with pandas' nullable string dtype throughout, then drop incomplete rows.
data = data.astype("string")
data['word_length'] = data['Word'].str.len()
data = data.dropna()

# Only select words of length 3-8 (both ends inclusive)
data = data[data['word_length'].between(3, 8)]
# Keep only answers that occur more than once in the table ...
data = data[data.duplicated('Word', keep=False)]
# ... then collapse rows that repeat the same (Word, Clue) pair.
data = data.drop_duplicates(subset=['Word','Clue'])
data

Unnamed: 0,Date,Word,Clue,word_length
0,10/31/2021,PAT,"Action done while saying ""Good dog""",3
1,10/31/2021,RASCALS,Mischief-makers,7
2,10/31/2021,PEN,It might click for a writer,3
3,10/31/2021,SEP,Fall mo.,3
4,10/31/2021,ECO,Kind to Mother Nature,3
...,...,...,...,...
781562,11/21/1993,NIOBE,Tantalus's daughter,5
781563,11/21/1993,IRAQI,Kirkuk native,5
781564,11/21/1993,ARS,"""___ magna"" (anagrams, appropriately)",3
781567,11/21/1993,ACE,King's superior,3


In [None]:
# Number of whitespace-separated tokens in each clue.
clue_token_counts = data['Clue'].apply(lambda clue: len(clue.split()))
data['Token Count'] = clue_token_counts

# Grand total of tokens over all clues in the cleaned table.
total_tokens = clue_token_counts.sum()

total_tokens

1789104

In [None]:
# Use the first n rows only: the leading 90% for training, the rest for testing.
# Rows are taken in their existing order; no shuffling is applied here.
n = 50000
split_at = int(0.9*n)
train_df = data.iloc[:split_at]
test_df = data.iloc[split_at:n]

# BERT Model


*   Load pre-trained model
*   Fine-tune on crossword data
*   Assess top 10 outputs



In [None]:
!pip install -q -U tensorflow-text

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Import BERT
import tensorflow_hub as hub
import tensorflow_text

# TF Hub handle of the text-preprocessing model used with uncased English BERT.
bert_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

# Encoder size settings; they fill the L-/H-/A- fields of the encoder handle.
bert_layers, bert_units, bert_heads = 12, 768, 12

# TF Hub handle of the encoder variant selected by the settings above.
bert_encoder = f'https://tfhub.dev/tensorflow/bert_en_uncased_L-{bert_layers}_H-{bert_units}_A-{bert_heads}/4'

In [None]:
# Prepare data for BERT

# Sequence length handed to `bert_pack_inputs` and used as the width of the
# encoder's input tensors.
# NOTE(review): 512 looks very large for short crossword clues; confirm it is intended.
max_length = 512

# Preprocessing model wrapped as a single Keras layer.
bert_preprocess_model = hub.KerasLayer(bert_preprocess)
# The same preprocessing model loaded as an object, so that its `tokenize`
# and `bert_pack_inputs` members can be wrapped individually.
preprocessor = hub.load(bert_preprocess)

# Encoder with frozen weights (trainable=False).
encoder = hub.KerasLayer(bert_encoder, trainable=False)

def bert_textvect(x):
    """
    Tokenise raw strings and pack them into fixed-length BERT inputs.

    Builds a small Keras model (string input -> `preprocessor.tokenize` ->
    `preprocessor.bert_pack_inputs` with seq_length=max_length) and returns
    whatever `predict` produces for `x`. A fresh model is built on every call.
    """
    raw_text = keras.layers.Input(shape=(), dtype=tf.string)
    tokenize_layer = hub.KerasLayer(preprocessor.tokenize)
    pack_layer = hub.KerasLayer(
        preprocessor.bert_pack_inputs,
        arguments=dict(seq_length=max_length),
    )
    # bert_pack_inputs takes a list of tokenised segments; there is only one here.
    packed = pack_layer([tokenize_layer(raw_text)])
    return keras.Model(raw_text, packed).predict(x)

def bert_features(x):
    """
    Run packed BERT inputs through the frozen encoder, one vector per example.

    `x` is passed straight to `predict` on a model whose inputs are keyed
    `input_word_ids`, `input_mask` and `input_type_ids`, each holding
    `max_length` int32 values per example. The result is position 0 of the
    encoder's `sequence_output` for every example.
    """
    input_names = ('input_word_ids', 'input_mask', 'input_type_ids')
    inputs = {
        name: keras.layers.Input(shape=(max_length,), dtype=tf.int32)
        for name in input_names
    }

    sequence_output = encoder(inputs)['sequence_output']
    # Keep only the first token position of every sequence.
    first_token = sequence_output[:, 0, :]
    return keras.Model(inputs, first_token).predict(x)

# Tokenise and pack the clues of each split (train first, then test).
X_bert_train, X_bert_test = (
    bert_textvect(split['Clue']) for split in (train_df, test_df)
)

# Encode the packed inputs into one feature vector per clue.
features_train, features_test = (
    bert_features(packed) for packed in (X_bert_train, X_bert_test)
)

features_train.shape



(45000, 768)

In [None]:
# Train a classifier head on the pre-computed BERT features
#
# One-hot targets with one column per distinct answer in `data`. Only the
# first n rows are used below, so only those rows are encoded. Pinning the
# categories to every answer in `data` keeps the column layout identical to
# pd.get_dummies(data['Word']) without materialising a dense matrix with
# len(data) rows.
word_categories = pd.Categorical(data['Word']).categories
y = pd.get_dummies(
    pd.Categorical(data['Word'].iloc[:n], categories=word_categories)
).to_numpy()
y_train = y[0:int(0.9*n)]
y_test = y[int(0.9*n):n]

# Neural Network
# NOTE(review): `input` shadows the Python builtin from here on; the name is
# kept because other cells may refer to it.
input = keras.Input(shape=(bert_units, ))

# Two hidden Dense(128, relu) layers, each followed by light dropout.
x = keras.layers.Dense(128, activation='relu')(input)
x = keras.layers.Dropout(0.1)(x)
x = keras.layers.Dense(128, activation='relu')(x)
x = keras.layers.Dropout(0.1)(x)

# One softmax unit per answer column of the one-hot targets.
output = keras.layers.Dense(y_train.shape[1], activation='softmax')(x)

# Classifier over the BERT feature vectors
bert_model = keras.Model(input, output)
bert_model.summary()

bert_model.compile(optimizer="adam",
              loss="categorical_crossentropy",
              metrics=["accuracy"])

# Stage 1: fit on the first 200 training examples only.
bert_model.fit(
    x=features_train[:200], y=y_train[:200],
    epochs=20, batch_size=32,
    verbose=1,
)
# evaluate() returns [loss, accuracy]; index 1 is the accuracy metric.
print("\n*** Test accuracy with 200 Examples %.4f ***\n" % bert_model.evaluate(x=features_test, y=y_test)[1])

# Stage 2: fit on the full training split.
# NOTE(review): this continues from the weights learned in stage 1 (same
# model object), so the "All Examples" figure is a warm-started result.
bert_model.fit(
    x=features_train, y=y_train,
    epochs=20, batch_size=32,
    verbose=1,
)
print("\n*** Test accuracy with All Examples %.4f ***\n" % bert_model.evaluate(x=features_test, y=y_test)[1])

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_14 (InputLayer)       [(None, 768)]             0         
                                                                 
 dense (Dense)               (None, 128)               98432     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 46566)             6007014   
                                                                 
Total params: 6121958 (23.35 MB)
Trainable params: 6121958 

In [None]:
# Assess top 10 list
# Model outputs for every example of the test split, one row per example.
model_predictions = bert_model.predict(x=features_test)

def get_top_n_predictions(predictions, n=10):
    """Return, for each row, the column indices of the n largest scores.

    Args:
        predictions: 2-D array of scores, one row per example.
        n: how many indices to keep per row. If n exceeds the number of
            columns, every column index is returned.

    Returns:
        Integer array with one row per example and min(n, n_columns) columns,
        ordered from highest score to lowest.

    Raises:
        ValueError: if n is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    # Sort ascending, then flip so the best-scoring column comes first.
    descending = np.argsort(predictions, axis=1)[:, ::-1]
    # Slicing from the front handles n == 0 correctly; the previous `[:, -n:]`
    # selected every column when n was 0.
    return descending[:, :n]

# Ten highest-scoring class indices for every test example, best first.
top_10_predictions = get_top_n_predictions(model_predictions, n=10)

def top_10_accuracy(y_true, y_pred):
    """Fraction of rows whose true class is among the ten top-scoring classes.

    y_true is treated as one-hot: the argmax along axis 1 is taken as the
    true class index. y_pred holds per-class scores, one row per example.
    """
    ranked = get_top_n_predictions(y_pred, 10)
    true_class = np.argmax(y_true, axis=1)
    # Compare each row's candidate indices against that row's true class.
    hits = (ranked == true_class[:, np.newaxis]).any(axis=1)
    return np.mean(hits)

# Share of test clues whose answer appears among the model's ten best guesses.
top_10_acc = top_10_accuracy(y_true=y_test, y_pred=model_predictions)
print(f"Top 10 Accuracy: {top_10_acc:.4f}")

Top 10 Accuracy: 0.0550
