In [5]:
import requests

# Endpoints of the Votee Wordle API.
url_path = "https://wordle.votee.dev:8000/word/{word}"
url_daily = "https://wordle.votee.dev:8000/daily"
url_random = "https://wordle.votee.dev:8000/random"

# NOTE(review): `url_path` is sent as-is -- the "{word}" placeholder is never
# filled in and no query parameters are supplied -- so this call appears to
# return the API's validation-error payload rather than a guess result.
# Confirm whether that is intended before relying on `words`.
# The timeout keeps an unreachable server from blocking the kernel forever.
words = requests.get(url_path, timeout=10).json()

In [7]:
# Fetch the daily and random puzzle endpoints and parse the JSON bodies.
# The timeout prevents a hung connection from blocking the notebook.
daily = requests.get(url_daily, timeout=10).json()
votee_random = requests.get(url_random, timeout=10).json()

In [3]:
# Take the first entry listed under "detail" in the word-endpoint response
# and display it.
target = words["detail"][0]
target

{'loc': ['query', 'guess'],
 'msg': 'field required',
 'type': 'value_error.missing'}

In [32]:
"""
Considering all API request is failed, I will simulate the given word by ["loc"] in response
"""
data = target["loc"]
data = "I _ a cat, it is beautifully and I play _ _ all day."

#### Split the data into chunks

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters: maximum characters per chunk and overlap between chunks.
chunk_size = 200
chunk_overlap = 15

# Normalise to a list of texts so a single string and a list are handled alike.
texts = data if isinstance(data, list) else [data]

# Split only when some text is longer than one chunk. The size check has to
# look at the text itself (`data`), not at the API error dict `target`.
if any(len(text) > chunk_size for text in texts):
    chunk_ = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, is_separator_regex=False
    )
    docuement = chunk_.create_documents(texts)
else:
    # Short input: keep the text exactly as it is (no splitting).
    docuement = data

#### Call BERT to play the random word puzzle

In [47]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
import copy

# Checkpoint used for masked-word prediction.
# NOTE(review): the name suggests a Romanian BERT ("RoBERT"); confirm it is the
# intended checkpoint for the English puzzle sentence.
model_name = "readerbench/RoBERT-base"

# Load the masked-LM model and its tokenizer from the same checkpoint.
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [54]:
# Swap every standalone "_" placeholder for the tokenizer's mask token.
mask_token = tokenizer.mask_token
data = " ".join(mask_token if word == "_" else word for word in data.split(" "))

# Encode the sentence as PyTorch tensors.
input_token = tokenizer(data, return_tensors="pt")
tokenid = input_token["input_ids"]

# `mask_id` holds the positions of the mask tokens in the encoded sequence
# (not the vocabulary id of the mask token).
mask_id = torch.where(tokenid[0] == tokenizer.mask_token_id)[0]

In [55]:
# Show the positions of the mask tokens within the encoded input.
mask_id

tensor([ 2, 15, 16])

In [56]:
# Run the model on the encoded sentence with gradient tracking disabled.
with torch.no_grad():
    output = model(**input_token)

In [70]:
# Logits at each masked position: one row per mask.
predicts = output.logits[0, mask_id, :]

# Number of candidate tokens to keep for every mask.
topk = 3

# One list of candidate token ids per mask, highest-scoring first.
# A single batched call covers however many masks the sentence contains;
# the previous loop ran `topk` times (not once per mask) and rebound `topk`
# to a list inside the loop.
masked_list = torch.topk(predicts, k=topk, dim=-1).indices.tolist()

In [73]:
# Convert every candidate token id into its text form, for however many masks
# and candidates there are. Entries that are already strings are left as-is,
# so re-running this cell does not fail.
masked_list = [
    [
        token if isinstance(token, str) else tokenizer.decode(token)
        for token in candidates
    ]
    for candidates in masked_list
]

masked_list

[['have', 'am', 'need'], ['the', 'to', 'a'], ['##y', '##e', 'it']]

In [None]:
# Split the masked sentence on whitespace and record which word positions
# hold the mask token.
data = data.split()
val_idx = [position for position, word in enumerate(data) if word == mask_token]
val_idx

[1, 10, 11]

In [80]:
# `masked_list` holds one candidate list per mask; transposing it yields, for
# each rank, the candidate chosen for every mask at that rank.
for candidates in zip(*masked_list):
    # A shallow copy is enough because the words are immutable strings.
    filled = list(data)
    # Works for any number of masks, not only exactly three.
    for position, word in zip(val_idx, candidates):
        filled[position] = word
    print(" ".join(filled))

I have a cat, it is beautifully and I play the ##y all day.
I am a cat, it is beautifully and I play to ##e all day.
I need a cat, it is beautifully and I play a it all day.


#### It is also possible to use GPT-2 for random word prediction; however, GPT-2 is a decoder-only model, so it is not applicable to masked-word prediction, only to **text generation**

In [84]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Local directory holding the GPT-2 checkpoint.
path = r"./Toy_model/GPT_2"

# `AutoModelWithLMHead` is deprecated; `AutoModelForCausalLM` is its
# replacement for decoder-only (text generation) models.
# NOTE: this rebinds `model` and `tokenizer`, replacing the masked-LM pair
# loaded earlier in the notebook.
model = AutoModelForCausalLM.from_pretrained(path)
tokenizer = AutoTokenizer.from_pretrained(path)

In [94]:
# Prompt text that the generation model is asked to continue.
gpt_data = "I love dogs, I have a dogs with grey color and willing to play"

In [95]:
# Encode the prompt as PyTorch tensors (despite its name, `gpt_tokenizer`
# holds the encoded inputs, not a tokenizer).
gpt_tokenizer = tokenizer(gpt_data, return_tensors="pt")

# Generation settings: length cap and the token id to use for padding.
generation_options = {"max_length": 20, "pad_token_id": tokenizer.eos_token_id}
output = model.generate(**gpt_tokenizer, **generation_options)

In [96]:
# Convert the first generated sequence of token ids back into text.
tokenizer.decode(output[0])

'I love dogs, I have a dogs with grey color and willing to play with them. I love'