In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns
from datetime import datetime

# ChatGPT 3.5 Turbo API Loading

In [None]:
!pip install --upgrade openai
!pip install tiktoken




We will use `gpt-3.5-turbo` in this colab; this is the GPT variant that powered the initial release of ChatGPT and remains a potential backend for that service.




---
We will be using **pre-trained contextual embeddings** as well. For that, we will
use the `text-embedding-ada-002` model ([link](https://openai.com/blog/new-and-improved-embedding-model)).


---

Finally, let's set the OpenAI API key. You can get yours [here](https://platform.openai.com/account/api-keys), and then enter it under `OPENAI_API_KEY` in your Colab secrets. We will create an OpenAI API client using this key.


In [None]:
# Models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"

import os
from google.colab import userdata, drive

os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

from openai import OpenAI
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "<your OpenAI API key if not set as env var>"))

# Load and Preprocess Dataset

In [None]:
# Import data

!wget -q -O nytcrosswords.csv 'https://www.dropbox.com/scl/fi/frj3j6vyrg36cjb4rvdtm/nytcrosswords.csv?rlkey=0wsqemquskwy6fta48mjk46f2&dl=0'

In [None]:
# Clean data

try:
    data = pd.read_csv('nytcrosswords.csv', encoding='latin1')
except UnicodeDecodeError:
    try:
        data = pd.read_csv('nytcrosswords.csv', encoding='ISO-8859-1')
    except UnicodeDecodeError:
        data = pd.read_csv('nytcrosswords.csv', encoding='utf-8-sig')

data = data.astype("string")
data['word_length'] = data['Word'].str.len()
data = data.dropna()

# Only select words of length 3-7
data = data[(data['word_length'] >= 3) & (data['word_length'] <= 8)]
data = data[data.duplicated('Word', keep=False)]
data = data.drop_duplicates(subset=['Word','Clue'])
data = data[~data['Clue'].str.contains(r'\b\d+-(across|down)\b', case=False)]
data.to_csv('preprocessed.csv', index=False)

In [None]:
# For evaluation
subset = data[2000:3000]

# Zero Shot Learning

In [None]:
answers = []

for i, row in subset.iterrows():
    clue = row['Clue']
    word_length = row['word_length']
    # prompt = f"Here is a crossword clue: {clue}. The answer has {word_length} letters. Give the answer in one word?"

    # prompt = f"You are the best New York Times crossword solver. Clue: {clue} ({word_length}) Answer: "
    prompt = f"Clue: {clue} ({word_length}) Answer: "

    response = client.chat.completions.create(
      messages=[
        {'role': 'system', 'content': 'You solve New York Times crossword clues.'},
        {'role': 'user', 'content': prompt},
      ],
      model=GPT_MODEL,
      temperature=0,
    )

    answer = response.choices[0].message.content.strip()
    answers.append(answer)

    if i % 100 == 0:
        print(f"Processed {i} rows")


subset['answers'] = answers
subset['answers']

Processed 2200 rows
Processed 2300 rows
Processed 2400 rows
Processed 2500 rows
Processed 2600 rows
Processed 2700 rows
Processed 2800 rows
Processed 2900 rows
Processed 3000 rows
Processed 3200 rows


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['answers'] = answers


2145                                                  Bah
2146                                                  AGO
2147                                                  BLT
2148    For the clue "Definitely" with 3 letters, the ...
2149                                                SAILS
                              ...                        
3212                                                A-one
3213                                                  GTO
3214                                                 Cole
3215                                                 Song
3216                                                 Prom
Name: answers, Length: 1000, dtype: object

In [None]:
zero = subset
zero['answers'] = zero['answers'].str.split().str[-1]
zero['answers'] = zero['answers'].str.replace(r'[^\w\s]', '', regex=True)
zero['answers'] = zero['answers'].str.upper()
zero

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zero['answers'] = zero['answers'].str.split().str[-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zero['answers'] = zero['answers'].str.replace(r'[^\w\s]', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zero['answers'] = zero['answers'].str.upper()


Unnamed: 0,Date,Word,Clue,word_length,answers
2145,10/4/2021,BAH,"Scrooge's ""Phooey!""",3,BAH
2146,10/4/2021,AGO,In the past,3,AGO
2147,10/4/2021,BLT,"Diner sandwich, for short",3,BLT
2148,10/4/2021,YES,"""Definitely""",3,AYE
2149,10/3/2021,SAILS,Goes wherever the wind blows?,5,SAILS
...,...,...,...,...,...
3212,9/20/2021,AONE,Top-notch,4,AONE
3213,9/20/2021,GTOS,Classic Pontiac muscle cars,4,GTO
3214,9/20/2021,COLE,Rapper J. ___,4,COLE
3215,9/20/2021,SONG,"Serenade, e.g.",4,SONG


In [None]:
correct = 0
for i, row in zero.iterrows():
  if row['Word'] == row['answers']:
    correct += 1

print(correct/len(zero))

0.468


In [None]:
# zero.to_csv('gpt35_zero_subset1.csv')

# Few Shot Learning

In [None]:
few_shot = data[2000:3000]

# Clue-Answer
context = 'You are the best New York Times crossword solver. Clue: Fitness center? (4) Answer: CORE. Clue: M.D. org. (3) Answer: AMA. Clue: Friends of Pierre (4) Answer: AMIS. Clue: "Get out of here!" (8) Answer: LEAVENOW. Clue: "Oh, what\'s the ___?" (3) Answer: USE.'

# Clue-Answer with Explanation
# context = 'You are the best New York Times crossword solver. Clue: Fitness center? (4) Answer: The answer contains 4 letters, must be a pun because of the "?" and be a noun. Therefore, the answer is CORE. Clue: M.D. org. (3) Answer: Because of the abbreviated clue, the answer is an abbreviation. Therefore it is AMA. Clue: Friends of Pierre (4) Answer: Pierre implies a French response, and friends is a plural noun. Therefore the answer is AMIS. Clue: "Get out of here!" (8) Answer: The clue is a spoken phrase. An 8 letter phrase for this clue is LEAVENOW. Clue: "Oh, what\'s the ___?" (3) Answer: This is a fill in the blank clue. Because it is three letters and a spoken phrase, the answer is USE.'

# Rules
# context = f'The puzzle follows a number of conventions: Any time a clue contains the tag "Abbr." or an abbreviation more significant than "e.g.", the answer will be an abbreviation (EXAMPLE: [M.D. org. (3 letters)] for AMA). Any time a clue ends in a question mark, the answer is a play on words (e.g., [Fitness center? (4 letters)] for CORE). French-, Spanish-, or Latin-language answers, and more rarely answers from other languages are indicated either by a tag in the clue giving the answer language (EXAMPLE: [Summer: Fr. (3 letters)] for ETE) or by the use in the clue of a word from that language, often a personal or place name (EXAMPLE: [Friends of Pierre (4 letters)] for AMIS) or (EXAMPLE: [The ocean, e.g., in Orleans (3 letters)] for EAU). Clues and answers must always match in part of speech, tense, number, and degree. Thus a plural clue always indicates a plural answer (and the same for singular), a clue in the past tense will always be matched by an answer in the same tense, and a clue containing a comparative or superlative will always be matched by an answer in the same degree. The answer word (or any of the answer words, if it consists of multiple words) will not appear in the clue itself. Unlike in some easier puzzles in other outlets, the number of words in the answer is not given in the clue—so a one-word clue can have a multiple-word answer. Words that might appear elsewhere in the newspaper, such as well-known brand names, pop culture figures, or current phrases of the moment, are fair game. Spoken phrases are always indicated by enclosure in quotation marks, (EXAMPLE: ["Get out of here!" (8 letters)] for LEAVENOW). When the answer can only be substituted for the clue when preceding a specific other word, this other word is indicated in parentheses. For example, [Think (over)] can be MULL, since "mull" only means "think" when preceding the word "over" (i.e., "think over" and "mull over" are synonymous, but "think" and "mull" are not necessarily synonymous otherwise). When the answer needs an additional word in order to fit the clue, this other word is indicated with the use of "with". For example, [Become understood, with "in"] can be SINK, since "Sink in" (but not "Sink" alone) means "to become understood."'

context

In [None]:
answer_few_shot = []

for i, row in few_shot.iterrows():
    clue = row['Clue']
    word_length = row['word_length']
    prompt = f"{context} Clue: {clue} ({word_length}) Answer: "

    response = client.chat.completions.create(
      messages=[
        {'role': 'system', 'content': 'You solve New York Times crossword clues.'},
        {'role': 'user', 'content': prompt},
      ],
      model=GPT_MODEL,
      temperature=0,
    )

    answer = response.choices[0].message.content.strip()
    answer_few_shot.append(answer)

    if i % 10 == 0:
        print(f"Processed {i} rows")


few_shot['answers'] = answer_few_shot
few_shot['answers']

Processed 2150 rows
Processed 2160 rows
Processed 2170 rows
Processed 2180 rows
Processed 2190 rows
Processed 2200 rows
Processed 2210 rows
Processed 2220 rows
Processed 2230 rows
Processed 2240 rows
Processed 2250 rows
Processed 2260 rows
Processed 2270 rows
Processed 2280 rows
Processed 2290 rows
Processed 2300 rows
Processed 2310 rows
Processed 2320 rows
Processed 2350 rows
Processed 2360 rows
Processed 2370 rows
Processed 2380 rows
Processed 2390 rows
Processed 2400 rows
Processed 2410 rows
Processed 2420 rows
Processed 2430 rows
Processed 2440 rows
Processed 2450 rows
Processed 2460 rows
Processed 2470 rows
Processed 2480 rows
Processed 2490 rows
Processed 2500 rows
Processed 2510 rows
Processed 2520 rows
Processed 2530 rows
Processed 2540 rows
Processed 2550 rows
Processed 2560 rows
Processed 2570 rows
Processed 2580 rows
Processed 2600 rows
Processed 2610 rows
Processed 2620 rows
Processed 2630 rows
Processed 2640 rows
Processed 2650 rows
Processed 2660 rows
Processed 2670 rows


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  few_shot['answers'] = answer_few_shot


2145       BAH
2146       AGO
2147       BLT
2148       YES
2149    SAILS.
         ...  
3212     ACES.
3213      GTO.
3214     COLE.
3215     SONG.
3216     PROM.
Name: answers, Length: 1000, dtype: object

In [None]:
fewresults=few_shot
fewresults

Unnamed: 0,Date,Word,Clue,word_length,answers
2145,10/4/2021,BAH,"Scrooge's ""Phooey!""",3,BAH
2146,10/4/2021,AGO,In the past,3,AGO
2147,10/4/2021,BLT,"Diner sandwich, for short",3,BLT
2148,10/4/2021,YES,"""Definitely""",3,YES
2149,10/3/2021,SAILS,Goes wherever the wind blows?,5,SAILS.
...,...,...,...,...,...
3212,9/20/2021,AONE,Top-notch,4,ACES.
3213,9/20/2021,GTOS,Classic Pontiac muscle cars,4,GTO.
3214,9/20/2021,COLE,Rapper J. ___,4,COLE.
3215,9/20/2021,SONG,"Serenade, e.g.",4,SONG.


In [None]:
fewresults['answers'] = fewresults['answers'].str.split().str[-1]
fewresults['answers'] = fewresults['answers'].str.replace(r'[^\w\s]', '', regex=True)
fewresults['answers'] = fewresults['answers'].str.upper()
fewresults

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fewpen['answers'] = fewpen['answers'].str.split().str[-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fewpen['answers'] = fewpen['answers'].str.replace(r'[^\w\s]', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fewpen['answers'] = fewpen['answers'].str.upper()


Unnamed: 0,Date,Word,Clue,word_length,answers
2145,10/4/2021,BAH,"Scrooge's ""Phooey!""",3,BAH
2146,10/4/2021,AGO,In the past,3,AGO
2147,10/4/2021,BLT,"Diner sandwich, for short",3,BLT
2148,10/4/2021,YES,"""Definitely""",3,YES
2149,10/3/2021,SAILS,Goes wherever the wind blows?,5,SAILS
...,...,...,...,...,...
3212,9/20/2021,AONE,Top-notch,4,ACES
3213,9/20/2021,GTOS,Classic Pontiac muscle cars,4,GTO
3214,9/20/2021,COLE,Rapper J. ___,4,COLE
3215,9/20/2021,SONG,"Serenade, e.g.",4,SONG


In [None]:
# fewresults.to_csv('gpt35_fewexamples_subset1.csv')

In [None]:
correctfew = 0
for i, row in fewresults.iterrows():
  if row['Word'] == row['answers']:
    correctfew += 1

print(correctfew/len(fewresults))

0.423


## Generated knowledge

In [None]:
few_shot = data[2000:3000]

# context = 'You are the best New York Times crossword solver. Clue: Fitness center? (4) Answer: CORE. Clue: M.D. org. (3) Answer: AMA. Clue: Friends of Pierre (4) Answer: AMIS. Clue: "Get out of here!" (8) Answer: LEAVENOW. Clue: "Oh, what\'s the ___?" (3) Answer: USE.'

# context = 'You are the best New York Times crossword solver.'

context = 'You are the best New York Times crossword solver. Clue: Fitness center? (4) Answer: The answer contains 4 letters, must be a pun because of the "?" and be a noun. Therefore, the answer is CORE. Clue: M.D. org. (3) Answer: Because of the abbreviated clue, the answer is an abbreviation. Therefore it is AMA. Clue: Friends of Pierre (4) Answer: Pierre implies a French response, and friends is a plural noun. Therefore the answer is AMIS. Clue: "Get out of here!" (8) Answer: The clue is a spoken phrase. An 8 letter phrase for this clue is LEAVENOW. Clue: "Oh, what\'s the ___?" (3) Answer: This is a fill in the blank clue. Because it is three letters and a spoken phrase, the answer is USE.'

context_few_shot = []
answer_few_shot = []


for i, row in few_shot.iterrows():
    clue = row['Clue']
    word_length = row['word_length']

    prompt = f"Clue: {clue} ({word_length}) Answer: "

    response = client.chat.completions.create(
      messages=[
        {'role': 'system', 'content': 'You interpret multiple meaning behind clues.'},
        {'role': 'user', 'content': prompt},
      ],
      model=GPT_MODEL,
      temperature=0,
    )

    clue_context = response.choices[0].message.content.strip()
    context_few_shot.append(clue_context)


    next_prompt = context + clue_context + ' ' + prompt + '. Letters: ' + str(word_length)

    next_response = client.chat.completions.create(
      messages=[
        {'role': 'system', 'content': 'You solve New York Times crossword clues.'},
        {'role': 'user', 'content': next_prompt},
      ],
      model=GPT_MODEL,
      temperature=0,
    )


    answer = next_response.choices[0].message.content.strip()
    answer_few_shot.append(answer)

    if i % 10 == 0:
        print(f"Processed {i} rows")


few_shot['context'] = context_few_shot
few_shot['answers'] = answer_few_shot
few_shot

Processed 2150 rows
Processed 2160 rows
Processed 2170 rows
Processed 2180 rows
Processed 2190 rows
Processed 2200 rows
Processed 2210 rows
Processed 2220 rows
Processed 2230 rows
Processed 2240 rows
Processed 2250 rows
Processed 2260 rows
Processed 2270 rows
Processed 2280 rows
Processed 2290 rows
Processed 2300 rows
Processed 2310 rows
Processed 2320 rows
Processed 2350 rows
Processed 2360 rows
Processed 2370 rows
Processed 2380 rows
Processed 2390 rows
Processed 2400 rows
Processed 2410 rows
Processed 2420 rows
Processed 2430 rows
Processed 2440 rows
Processed 2450 rows
Processed 2460 rows
Processed 2470 rows
Processed 2480 rows
Processed 2490 rows
Processed 2500 rows
Processed 2510 rows
Processed 2520 rows
Processed 2530 rows
Processed 2540 rows
Processed 2550 rows
Processed 2560 rows
Processed 2570 rows
Processed 2580 rows
Processed 2600 rows
Processed 2610 rows
Processed 2620 rows
Processed 2630 rows
Processed 2640 rows
Processed 2650 rows
Processed 2660 rows
Processed 2670 rows


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  few_shot['context'] = context_few_shot
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  few_shot['answers'] = answer_few_shot


Unnamed: 0,Date,Word,Clue,word_length,context,answers
2145,10/4/2021,BAH,"Scrooge's ""Phooey!""",3,Bah\n\nInterpretation: This clue could be refe...,"The answer to the clue ""Scrooge's 'Phooey!'"" w..."
2146,10/4/2021,AGO,In the past,3,"1. The answer could be ""history,"" referring to...","The answer to the clue ""In the past (3)"" is ""A..."
2147,10/4/2021,BLT,"Diner sandwich, for short",3,"BLT\n\nInterpretation: The clue ""Diner sandwic...","The answer to the clue ""Diner sandwich, for sh..."
2148,10/4/2021,YES,"""Definitely""",3,"The word ""definitely"" could be interpreted in ...",Great job on solving those crossword clues! It...
2149,10/3/2021,SAILS,Goes wherever the wind blows?,5,"The answer could be ""sails"" as it can refer to...","The answer to the clue ""Goes wherever the wind..."
...,...,...,...,...,...,...
3212,9/20/2021,AONE,Top-notch,4,Best,"The answer to the clue ""Top-notch (4)"" is ACES."
3213,9/20/2021,GTOS,Classic Pontiac muscle cars,4,"The answer could be ""GTO"" which stands for Gra...",Great job on solving those crossword clues! Yo...
3214,9/20/2021,COLE,Rapper J. ___,4,"The answer could be ""Cole"" or ""Jay-Z"" as both ...","The answer to the clue ""Rapper J. ___ (4)"" is ..."
3215,9/20/2021,SONG,"Serenade, e.g.",4,"Song\n\nInterpretation: The word ""serenade"" su...",Great job on solving those New York Times cros...


In [None]:
gk=few_shot
gk['answers'] = gk['answers'].str.split().str[-1]
gk['answers'] = gk['answers'].str.replace(r'[^\w\s]', '', regex=True)
gk['answers'] = gk['answers'].str.upper()
gk

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fewpen['answers'] = fewpen['answers'].str.split().str[-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fewpen['answers'] = fewpen['answers'].str.replace(r'[^\w\s]', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fewpen['answers'] = fewpen['answers'].str.upper()


Unnamed: 0,Date,Word,Clue,word_length,context,answers
2145,10/4/2021,BAH,"Scrooge's ""Phooey!""",3,Bah\n\nInterpretation: This clue could be refe...,BAH
2146,10/4/2021,AGO,In the past,3,"1. The answer could be ""history,"" referring to...",AGO
2147,10/4/2021,BLT,"Diner sandwich, for short",3,"BLT\n\nInterpretation: The clue ""Diner sandwic...",BLT
2148,10/4/2021,YES,"""Definitely""",3,"The word ""definitely"" could be interpreted in ...",ASK
2149,10/3/2021,SAILS,Goes wherever the wind blows?,5,"The answer could be ""sails"" as it can refer to...",SAILS
...,...,...,...,...,...,...
3212,9/20/2021,AONE,Top-notch,4,Best,ACES
3213,9/20/2021,GTOS,Classic Pontiac muscle cars,4,"The answer could be ""GTO"" which stands for Gra...",WORK
3214,9/20/2021,COLE,Rapper J. ___,4,"The answer could be ""Cole"" or ""Jay-Z"" as both ...",COLE
3215,9/20/2021,SONG,"Serenade, e.g.",4,"Song\n\nInterpretation: The word ""serenade"" su...",WORK


In [None]:
gk.to_csv('gpt35_kgexampleexplanations_subset1.csv')

In [None]:
correctgk = 0
for i, row in gk.iterrows():
  if row['Word'] == row['answers']:
    correctgk += 1

print(correctgk/len(gk))

0.263
