In [1]:
from hangman import hangman, get_dictionary
import wordfreq
import json

In [2]:
from decimal import Decimal

# Makes the dictionary with the word frequencies
big_dictionary = get_dictionary("words.txt")

# Convert to a dictionary with higher precision
freq_dict = {}
for word in big_dictionary:
    weight = Decimal(wordfreq.word_frequency(word, "en", wordlist='small', minimum=0.0))
    if weight > 0.0 and len(word) > 1:
        freq_dict[word] = float(weight)  # Store as float to ensure JSON compatibility

# Write it to a JSON file with high precision
with open('webppl/word_freq.json', 'w') as f:
    json.dump(freq_dict, f, indent=2)


In [3]:
#get the letter frequencies from freq_dict 

letter_freq = {}
for word in freq_dict:
    for letter in word:
        if letter in letter_freq:
            letter_freq[letter] += freq_dict[word]
        else:
            letter_freq[letter] = freq_dict[word]
            
#normalize it 
total = sum(letter_freq.values())
for letter in letter_freq:
    letter_freq[letter] /= total
    
# Write it to a JSON file with high precision
with open('webppl/letter_freq.json', 'w') as f:
    json.dump(letter_freq, f, indent=2)

In [53]:
""" turn csv into data of the form 

var partial = ["_", "o", "_", "_", "e", "_", "_", "_", "_"]
var in_letters = ["o", "e"]
var not_in_letters = ["s"]
var previousGuesses = in_letters.concat(not_in_letters)

var data = [
  {
    previousGuesses: previousGuesses,
    actualGuess: "l",
    partialWordPattern: partial,
    word: "wonderful"
  },
]


where each guess is piece of data. each row of the csv is a series of guesses eg. 
8	calendar	e	t 	a	l	n	i	o	s	r	c	d	
""" 
import csv
import json

# Define the input CSV file path

def make_data_from_csv(csv_file_path, data_file_path):

    # Helper function to update the partial word pattern
    def update_partial(partial, word, guess):
        return [char if char == guess or char in partial else "_" for char in word]

    # Initialize variables
    output_data = []

    # Read the CSV file
    with open(csv_file_path, "r") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter="\t")
        #skip the first row
        next(csv_reader)
        for row in csv_reader:
            row = row[0].split(",")
            print(row)
            print(len(row))
            if len(row) < 4:
                continue
            if row[0] == "":
                continue
            # Parse the data from the row
            
            step_number, word, *guesses = row
            
            #clean up the whitespace or commas
            word = word.strip()
            guesses = [guess.strip() for guess in guesses]
            guesses = [guess for guess in guesses if guess != ""]
            
            
            step_number = int(step_number)

            # Initialize variables for this row
            partial = ["_" for _ in word]
            in_letters = []
            not_in_letters = []

            # Process each guess in the row
            for i, guess in enumerate(guesses):
               
                if guess in word:
                    in_letters.append(guess)
                    partial = update_partial(partial, word, guess)
                else:
                    not_in_letters.append(guess)

                previous_guesses = in_letters + not_in_letters

                # Create the data point for this guess
                data_point = {
                    "previousGuesses": previous_guesses,
                    "actualGuess": guess,
                    "partialWordPattern": partial.copy(),
                    "word": word
                }

                output_data.append(data_point)

  
    with open(data_file_path, "w") as output_file:
        json.dump(output_data, output_file, indent=2)


In [55]:
# Example Usage
make_data_from_csv("dataset.csv", "webppl/data.js")


['8', 'calendar', 'e', 't ', 'a', 'l', 'n', 'i', 'o', 's', 'r', 'c', 'd', '', '', '', '', '', '', '', '', '', '', '', '', '']
26
['9', 'Custodian', 'e', 't', 'a', 'o', 'i', 'l', 'n', 'r', 's', 'y', 'u', 'd', 'c', '', '', '', '', '', '', '', '', '', '', '']
26
['', 'deliquescent', 'e', 't', 'n', ' a', 's', ' h', '"v', ' "', 'b', 'g', 'd', 'c', 'q', '', '', '', '', '', '', '', '', '', '', '', '']
27
['9', 'Education', 'e', 'o', 'n', 's', 'a', 'x', 'm', 'r', 'l', 'd', 'u', 'c', 't', 'i', '', '', '', '', '', '', '', '', '', '']
26
['8', 'elephant', 'e', 't ', 'l ', 'p', 'h', 'a', 'n', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
26
['9', 'Excellent', 'e', 't', 'n', 'l', 'c', 'x', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
26
['8', 'moonrise', 'e', 't ', 'a', 'o', 'i', 'n', 's', 'r', 'm', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
26
['', 'opportunity', 'e', 'a', 'i', 'o', 'c', 'n', 's', 't', 'p', 'r', 'u', 'y', '', '', ''