## Setup

### Set environment variables

In [None]:
import os
from google.colab import userdata

# Note: `userdata.get` is a Colab-only API. Outside Colab, provide
# KAGGLE_USERNAME and KAGGLE_KEY in whatever way suits your system.

# Copy both Kaggle credentials into the process environment under the same names.
os.environ.update(
    {secret: userdata.get(secret) for secret in ("KAGGLE_USERNAME", "KAGGLE_KEY")}
)

In [None]:
from pprint import pprint

### Install dependencies

In [None]:
# Install Keras 3 last. See https://keras.io/getting_started/ for more details.
!pip install -q -U keras-nlp
!pip install -q -U keras>=3

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.15.0 requires keras<2.16,>=2.15.0, but you have keras 3.0.5 which is incompatible.[0m[31m
[0m

### Select a backend

In [None]:
# Backend settings are passed through environment variables, set here ahead of
# the `import keras` in the next cell.
os.environ.update(
    {
        "KERAS_BACKEND": "jax",  # Or "torch" or "tensorflow".
        # Avoid memory fragmentation on JAX backend.
        "XLA_PYTHON_CLIENT_MEM_FRACTION": "1.00",
    }
)

### Import packages

In [None]:
import keras
import keras_nlp

import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns
from datetime import datetime


## Load and Preprocess Dataset

In [None]:
# Download the crossword clue CSV from Dropbox. `-q` silences wget's progress
# output and `-O` saves the download as nytcrosswords.csv in the working directory.
!wget -q -O nytcrosswords.csv 'https://www.dropbox.com/scl/fi/frj3j6vyrg36cjb4rvdtm/nytcrosswords.csv?rlkey=0wsqemquskwy6fta48mjk46f2&dl=0'

In [None]:
# Import and clean data

# latin1 maps every possible byte to a character, so this read cannot raise
# UnicodeDecodeError. The former ISO-8859-1 (an alias of latin1) and utf-8-sig
# fallbacks could never run and have been removed.
data = pd.read_csv('nytcrosswords.csv', encoding='latin1')

# Work with nullable string columns, then drop any row with a missing value.
data = data.astype("string")
data['word_length'] = data['Word'].str.len()
data = data.dropna()

# Only select words of length 3-8 (both bounds inclusive)
data = data[(data['word_length'] >= 3) & (data['word_length'] <= 8)]
# Keep only answers that appear in more than one row ...
data = data[data.duplicated('Word', keep=False)]
# ... then collapse exact (Word, Clue) repeats down to their first occurrence.
data = data.drop_duplicates(subset=['Word','Clue'])
# Drop clues that reference another grid entry such as "17-Across". The group is
# non-capturing so str.contains does not warn about unused match groups.
data = data[~data['Clue'].str.contains(r'\b\d+-(?:across|down)\b', case=False)]
data

  data = data[~data['Clue'].str.contains(r'\b\d+-(across|down)\b', case=False)]


Unnamed: 0,Date,Word,Clue,word_length
0,10/31/2021,PAT,"Action done while saying ""Good dog""",3
1,10/31/2021,RASCALS,Mischief-makers,7
2,10/31/2021,PEN,It might click for a writer,3
3,10/31/2021,SEP,Fall mo.,3
4,10/31/2021,ECO,Kind to Mother Nature,3
...,...,...,...,...
781562,11/21/1993,NIOBE,Tantalus's daughter,5
781563,11/21/1993,IRAQI,Kirkuk native,5
781564,11/21/1993,ARS,"""___ magna"" (anagrams, appropriately)",3
781567,11/21/1993,ACE,King's superior,3


# Reducing Dimensionality

### DO NOT RUN THIS

In [None]:
import pandas as pd
from difflib import SequenceMatcher

def similar(a, b):
    """Return difflib's similarity ratio for the sequences ``a`` and ``b``.

    The ratio is a float between 0.0 and 1.0; identical sequences give 1.0.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def find_similar_groups(df, similarity_threshold=0.9):
    """Find, for every row, the other rows with the same word and a near-identical clue.

    Args:
        df: DataFrame with 'Word' and 'Clue' columns.
        similarity_threshold: Minimum `similar()` ratio between two clues for
            the rows to be grouped together.

    Returns:
        A list of groups in row order. Each group is a list of index labels
        that starts with the row being examined, followed by every other row
        (in row order) that has the same 'Word' and a clue at least
        `similarity_threshold` similar. Rows with no match produce no group.
    """
    # Bucket the rows by word once. Only rows sharing a word can ever be
    # grouped, so clues are compared within a bucket instead of across the
    # whole frame (the all-pairs scan did not finish on the full dataset).
    rows_by_word = {}
    for label, word, clue in zip(df.index, df['Word'], df['Clue']):
        rows_by_word.setdefault(word, []).append((label, clue))

    similar_groups = []
    for i, word, clue in zip(df.index, df['Word'], df['Clue']):
        group = [i]
        for j, other_clue in rows_by_word[word]:
            # Skip the row itself; everything else in the bucket has the same word.
            if i != j and similar(clue, other_clue) >= similarity_threshold:
                group.append(j)
        if len(group) > 1:
            similar_groups.append(group)

    return similar_groups

# Run the grouping on the cleaned clue table; the bare name on the last line
# displays the resulting list of groups.
similar_groups = find_similar_groups(data)
similar_groups


KeyboardInterrupt: 

In [None]:
import pandas as pd
from datasketch import MinHashLSHEnsemble, MinHash
from difflib import SequenceMatcher

def similar(a, b):
    """Score how alike ``a`` and ``b`` are using difflib (1.0 means identical)."""
    return SequenceMatcher(isjunk=None, a=a, b=b).ratio()

def find_similar_groups_2(df, similarity_threshold=0.9, num_perm=128, threshold=0.5):
    """Group rows whose clues are LSH neighbours and whose words are similar.

    Each clue is split on whitespace and hashed into a MinHash signature. The
    signatures are stored in an LSH index, and every row is then queried
    against that index to collect candidate neighbours.

    Args:
        df: DataFrame with string 'Clue' and 'Word' columns. Index labels are
            used (as strings) for the LSH keys, so they must be unique.
        similarity_threshold: Minimum `similar()` ratio between two rows'
            words for the candidate to stay in the group.
        num_perm: Number of permutations used for every MinHash signature.
        threshold: Similarity threshold handed to the LSH index.

    Returns:
        A list of groups, one per row that has at least two members. Each
        group is a list of index labels of `df`, in the order the index
        returned the candidates.
    """
    # MinHashLSHEnsemble has no insert() (that call raised AttributeError), so
    # the insert-then-query workflow below uses MinHashLSH. It is imported
    # here because the cell's import line only brings in the ensemble class.
    from datasketch import MinHashLSH

    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    minhashes = {}
    rows_by_key = {}

    # Create MinHash objects for each row
    for label, word, clue in zip(df.index, df['Word'], df['Clue']):
        key = str(label)
        minhash = MinHash(num_perm=num_perm)
        for token in clue.split():
            minhash.update(token.encode('utf-8'))
        lsh.insert(key, minhash)
        minhashes[key] = minhash
        # Remember label and word per key so candidates are resolved without
        # positional lookups (index labels are not positions once rows have
        # been filtered out of the frame).
        rows_by_key[key] = (label, word)

    # Find similar groups
    similar_groupings = []
    for key, (_, word) in rows_by_key.items():
        candidates = lsh.query(minhashes[key])
        group = [
            rows_by_key[other][0]
            for other in candidates
            if similar(word, rows_by_key[other][1]) >= similarity_threshold
        ]
        if len(group) > 1:
            similar_groupings.append(group)

    return similar_groupings

# Run the MinHash-based grouping on the cleaned clue table; the bare name on
# the last line displays the resulting list of groups.
similar_groupings = find_similar_groups_2(data)
similar_groupings


AttributeError: 'MinHashLSHEnsemble' object has no attribute 'insert'

1789104

In [None]:
# NOTE(review): in the visible notebook `train_df` is first assigned further
# down, in the "Train Test Split" section, so on a fresh top-to-bottom run this
# cell would presumably fail with NameError — confirm and move or remove it.
train_df

Unnamed: 0,Date,Word,Clue,word_length,Token Count
0,10/31/2021,PAT,"Action done while saying ""Good dog""",3,6
1,10/31/2021,RASCALS,Mischief-makers,7,1
2,10/31/2021,PEN,It might click for a writer,3,6
3,10/31/2021,SEP,Fall mo.,3,2
4,10/31/2021,ECO,Kind to Mother Nature,3,4
...,...,...,...,...,...
687780,3/29/1997,TENT,Intensive care room sight,4,4
687781,3/29/1997,ASTAGE,"""A kingdom for ___"": ""Henry V""",6,6
687782,3/29/1997,BARTER,Simple commerce,6,2
687783,3/29/1997,CRIMEA,Where the Light Brigade charged,6,5


# Finetuning Gemma

## Train Test Split

In [None]:
# "Token Count" is the number of whitespace-separated pieces in each clue
# (plain str.split), stored as a new column on `data`.
data['Token Count'] = data['Clue'].map(lambda clue: len(clue.split()))

# Total across all clues; the bare name on the last line displays it.
total_tokens = data['Token Count'].sum()
total_tokens

1754762

In [None]:
# Positional 90/10 split in the existing row order (no shuffling): the first
# 90% of rows form train_df, the remaining rows form test_df.
n = len(data)
split_at = int(0.9*n)
train_df = data.iloc[:split_at]
test_df = data.iloc[split_at:]

In [None]:
# Row counts of the training and test frames, one per line.
print(len(train_df), len(test_df), sep="\n")

472580
52509


In [None]:
# take a small sample to train on: the rows at positions 69 through 419 of train_df
data_sample = train_df.iloc[69:420]

In [None]:
# format data for training
# Every example is a single string: the instruction with the clue filled in,
# followed by the answer word after "Response:".
formatted_data = [
    f"Instruction:\nImagine you are the best New York Times crossword solver in the world. Given the following clue, what is your best guess for the answer: {clue}\n\nResponse:\n{word}"
    for clue, word in zip(data_sample['Clue'], data_sample['Word'])
]


In [None]:
# print examples
import random
# Show three randomly chosen training strings, each followed by a 50-character rule.
for example in random.sample(formatted_data, k=3):
    print(example, "\n", "=" * 50, "\n")

Instruction:
Imagine you are the best New York Times crossword solver in the world. Given the following clue, what is your best guess for the answer: Some fine art

Response:
OILS 

Instruction:
Imagine you are the best New York Times crossword solver in the world. Given the following clue, what is your best guess for the answer: Vote by ___

Response:
MAIL 

Instruction:
Imagine you are the best New York Times crossword solver in the world. Given the following clue, what is your best guess for the answer: The brainy bunch?

Response:
MENSA 



## Load Model

KerasNLP provides access to many [pretrained models](https://keras.io/api/keras_nlp/models/). Let's take a quick look.

In this notebook, we'll fine-tune a 2-billion-parameter base [Gemma LLM](https://blog.google/technology/developers/gemma-open-models/). Note that this model is a **base** LLM. It has **not** been instruction tuned.

We first download the model using the `from_preset` method.

In [None]:
# Build the Gemma causal language model from the "gemma_2b_en" preset, then
# print a summary of the loaded model.
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma_2b_en")
gemma_lm.summary()

Attaching 'config.json' from model 'keras/gemma/keras/gemma_2b_en/2' to your Colab notebook...
Attaching 'config.json' from model 'keras/gemma/keras/gemma_2b_en/2' to your Colab notebook...
Attaching 'model.weights.h5' from model 'keras/gemma/keras/gemma_2b_en/2' to your Colab notebook...
Attaching 'tokenizer.json' from model 'keras/gemma/keras/gemma_2b_en/2' to your Colab notebook...
Attaching 'assets/tokenizer/vocabulary.spm' from model 'keras/gemma/keras/gemma_2b_en/2' to your Colab notebook...


In [None]:
# Prompt layout shared by the cells below: an "Instruction:" section followed
# by a "Response:" section. The response slot is left empty here.
template = "Instruction:\n{instruction}\n\nResponse:\n{response}"
prompt = template.format_map(
    {
        "instruction": "Pretend you are the best New York Times crossword solver. Given the crossword clue 'Part of the body to slap', what is your best guess for the answer? The answer must be 4 letters long.",
        "response": "",
    }
)

print(prompt)

Instruction:
Pretend you are the best New York Times crossword solver. Given the crossword clue 'Part of the body to slap', what is your best guess for the answer? The answer must be 4 letters long.

Response:



In [None]:
# Baseline run before finetuning: compile the model with a top-k sampler (k=5,
# fixed seed) and print what it generates for `prompt` with max_length=256.
sampler = keras_nlp.samplers.TopKSampler(k=5, seed=2)
gemma_lm.compile(sampler=sampler)
print(gemma_lm.generate(prompt, max_length=256))

Instruction:
Pretend you are the best New York Times crossword solver. Given the crossword clue 'Part of the body to slap', what is your best guess for the answer? The answer must be 4 letters long.

Response:
Your best guess is the word 'LIPS'.

Explanation:
In order to solve this puzzle, we need to use our knowledge of the English alphabet, which has 26 letters. The clue "Part of the body to slap" gives us the first two letters of the answer, "LIP", which means the word "Lips" fits the clue. The remaining two letters, which must be added to form the 4-letter-long word "LIPS", can be found by looking at the remaining letters in the alphabet: "S". Therefore, the answer is "LIPS".

In summary, to solve the New York Times crossword puzzle, we need to read the clue and use the alphabet to find the answer. The answer to this puzzle is "LIPS".


In [None]:
# Enable LoRA for the model and set the LoRA rank to 4.
gemma_lm.backbone.enable_lora(rank=4)
# Print the summary again — presumably to compare it with the summary printed
# when the model was loaded.
gemma_lm.summary()

In [None]:
# Limit the input sequence length to 512 (to control memory usage).
gemma_lm.preprocessor.sequence_length = 512


# Use AdamW (a common optimizer for transformer models).
# NOTE(review): the import cell binds `keras` twice (`import keras`, then
# `from tensorflow import keras`), so the `keras.*` objects below come from the
# second binding — confirm that is the Keras API this model expects.
optimizer = keras.optimizers.AdamW(
    learning_rate=5e-5,
    weight_decay=0.01,
)
# Exclude layernorm and bias terms from decay.
optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])
# if you use Adam instead of AdamW, comment out the line above


# Compile for training: sparse categorical cross-entropy computed on logits
# (from_logits=True), the optimizer configured above, and sparse categorical
# accuracy reported as a weighted metric.
gemma_lm.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=optimizer,
    weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

# we will make just one pass through the data
# and each batch will just be one example

# `formatted_data` is the list of instruction/response strings built earlier.
gemma_lm.fit(formatted_data, epochs=1, batch_size=1)

[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m564s[0m 1s/step - loss: 0.2265 - sparse_categorical_accuracy: 0.4939


<keras.src.callbacks.history.History at 0x7afd12d49c60>

In [None]:
# retry prompt with finetuned Gemma
# The template and prompt are rebuilt here so this cell does not rely on the
# earlier prompt cell having been run.
template = "Instruction:\n{instruction}\n\nResponse:\n{response}"
prompt = template.format_map(
    {
        "instruction": "Pretend you are the best New York Times crossword solver. Given the crossword clue 'Part of the body to slap', what is your best guess for the answer? The answer must be 4 letters long.",
        "response": "",
    }
)

# Compile with a top-k sampler (k=5, fixed seed) and pretty-print the generation.
sampler = keras_nlp.samplers.TopKSampler(k=5, seed=2)
gemma_lm.compile(sampler=sampler)
pprint(gemma_lm.generate(prompt, max_length=256))

('Instruction:\n'
 'Pretend you are the best New York Times crossword solver. Given the '
 "crossword clue 'Part of the body to slap', what is your best guess for the "
 'answer? The answer must be 4 letters long.\n'
 '\n'
 'Response:\n'
 'CHEST')


# Accuracy

In [None]:
# Take 50 rows to evaluate on. These are rows 500-549 of train_df (not test_df);
# they lie outside the 69:420 slice that was used for finetuning.
# .copy() makes `subset` an independent frame, so adding a column to it later
# does not trigger pandas' SettingWithCopyWarning.
subset = train_df[500:550].copy()

In [None]:
# run the finetuned gemma on test clues
answers = []

# One generation per row, using the shared `template` with the clue and the
# required answer length filled into the instruction.
for clue, word_length in zip(subset['Clue'], subset['word_length']):
    prompt = template.format(
        instruction=f"Pretend you are the best New York Times crossword solver. Given the crossword clue '{clue}', what is the answer? The answer must be {word_length} letters long.",
        response="",
    )
    # The generated text is stored unmodified, i.e. it still begins with the
    # prompt (see the "Instruction:..." prefix in the raw_answers column).
    answers.append(gemma_lm.generate(prompt, max_length=100))

# assign() returns a new DataFrame with the extra column instead of writing into
# what may be a slice of train_df, which avoids SettingWithCopyWarning.
subset = subset.assign(raw_answers=answers)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset["raw_answers"] = answers


In [None]:
# Display the evaluation rows together with their raw generations.
subset

Unnamed: 0,Date,Word,Clue,word_length,raw_answers
537,10/25/2021,SHAKE,"Malted, e.g.",5,Instruction:\nPretend you are the best New Yor...
538,10/25/2021,THYME,Spice whose name consists of two consecutive p...,5,Instruction:\nPretend you are the best New Yor...
539,10/25/2021,YIP,Bark like a lap dog,3,Instruction:\nPretend you are the best New Yor...
540,10/25/2021,PEAT,Fuel from a bog,4,Instruction:\nPretend you are the best New Yor...
541,10/25/2021,CFL,North-of-the-border sports org.,3,Instruction:\nPretend you are the best New Yor...
542,10/25/2021,IRE,Fury,3,Instruction:\nPretend you are the best New Yor...
543,10/25/2021,LAD,Young fellow,3,Instruction:\nPretend you are the best New Yor...
544,10/25/2021,ART,Good name for a museum curator?,3,Instruction:\nPretend you are the best New Yor...
545,10/25/2021,SAY,"""___ what?""",3,Instruction:\nPretend you are the best New Yor...
546,10/24/2021,ALLAH,"One known as ""the Alive, the Eternal""",5,Instruction:\nPretend you are the best New Yor...


In [None]:
# save results
# Writes `subset` (including the raw_answers column) to CSV without the index.
# NOTE(review): the file name says "train10", while `subset` was sliced as 50
# rows — confirm the intended name.
subset.to_csv('50_train10_test.csv', index=False)

In [None]:
# NOTE(review): this writes the same `subset` frame as the previous cell under a
# second name. Nothing visible in this notebook builds a 1000/100 split, so the
# name is presumably left over from another run — confirm before relying on it.
subset.to_csv('1000_train_100_test.csv', index=False)