In [1]:
import pandas as pd 
import numpy as np 
import os, sys, time
import random
# Stamp the notebook output with the wall-clock time of this run.
print(f"Time Last Script Run: {time.asctime()}")

Time Last Script Run: Sun Mar 10 12:00:15 2024


# Train Test Split

In [3]:
# Read every word from the dictionary file (one word per line), stripping
# surrounding whitespace such as the trailing newline.
# The `with` block guarantees the handle is closed even if reading raises.
with open("./data/raw/words_250000_train.txt","r") as dictionaryFile:
    wordList = [line.strip() for line in dictionaryFile]
lenDict = len(wordList)
print(str(lenDict) + " words found")

227300 words found


In [4]:
# Ensure the folders that hold the training and validation data exist.
# exist_ok=True makes re-running this cell harmless.
for split_dir in ("./data/train", "./data/test"):
    os.makedirs(split_dir, exist_ok=True)

In [39]:
# Shuffle a *copy* of the dictionary with a fixed seed so the 80/20 split is
# reproducible: re-running this cell (or restarting the kernel) regenerates
# exactly the same train/test files instead of silently moving words between
# the two sets, and `wordList` itself keeps its original file order.
SPLIT_SEED = 42
shuffledWordList = list(wordList)
random.Random(SPLIT_SEED).shuffle(shuffledWordList)

# The split point is derived from the list actually being split, so this cell
# does not depend on a length computed in another cell.
train_size = int(0.8*len(shuffledWordList))
trainWordList = shuffledWordList[:train_size]
testWordList = shuffledWordList[train_size:]

# saving files: one word per line.
train_path = "./data/train/train.txt"
with open(train_path, 'w') as f:
    f.writelines(word + '\n' for word in trainWordList)
test_path = "./data/test/test.txt"
with open(test_path, 'w') as f:
    f.writelines(word + '\n' for word in testWordList)

print(f"creating training and testing splits: \nTrain: {len(trainWordList)} \nTest: {len(testWordList)}")
print(f"Saving train and test files to {train_path} & {test_path} respectively")

creating training and testing splits: 
Train: 181840 
Test: 45460
Saving train and test files to ./data/train/train.txt & ./data/test/test.txt respectively


___
# Training Data Preparation

In [3]:
import string 
import itertools
# import torch
# import torch.nn.functional as Fun
from tqdm import tqdm
# Module-level alias for the lowercase ASCII letters 'a'..'z'.
# NOTE(review): the functions visible in this section use string.ascii_lowercase
# directly rather than this alias — confirm it is still needed elsewhere.
ascii_lowercase = string.ascii_lowercase

In [4]:
def allUniqCombinations(word, drop_combinations = 0):
    """Return a random selection of the letter combinations of ``word``.

    Every non-empty combination of the word's distinct characters is
    enumerated (as tuples of characters in sorted order), then a fraction of
    them is discarded at random.

    Parameters
    ----------
    word : str
        Word whose distinct characters are combined.
    drop_combinations : float, default 0
        Fraction of the enumerated combinations to drop; the number dropped
        is ``int(total * drop_combinations)``.

    Returns
    -------
    list of tuple of str
        The surviving combinations, in the random order produced by
        ``random.sample``.
    """
    letters = np.unique(list(word)).tolist()

    # Enumerate by increasing size: all 1-letter tuples, then 2-letter, and so
    # on up to the tuple holding every distinct letter.
    every_combo = list(
        itertools.chain.from_iterable(
            itertools.combinations(letters, size)
            for size in range(1, len(letters) + 1)
        )
    )

    total = len(every_combo)
    keep = total - int(total * drop_combinations)
    return random.sample(every_combo, k=keep)

def wordDecay(word,chars_to_remove):
    """Build one comma-separated dataset row for ``word`` with some letters hidden.

    Parameters
    ----------
    word : str
        The complete word.
    chars_to_remove : iterable of str
        Characters to hide; every occurrence in ``word`` is replaced by ``_``.

    Returns
    -------
    str
        ``"<word>,<masked word>,<hidden characters joined>,<letters>"`` where
        the last field holds the lowercase ASCII letters that do not appear in
        the masked word.
    """
    masked = word
    for hidden in chars_to_remove:
        masked = masked.replace(hidden, '_')

    # Lowercase letters that are not visible in the masked word.
    unseen = ''.join(
        letter for letter in string.ascii_lowercase if letter not in masked
    )
    targets = ''.join(chars_to_remove)

    return f"{word},{masked},{targets},{unseen}"


def prepareDataset(source_path,save_path, drop_combinations = 0):
    """Expand a word list into masked-word training rows and save them shuffled.

    For every word read from ``source_path`` a random selection of its
    letter combinations is obtained from ``allUniqCombinations`` and each
    combination is turned into a ``word,input,target,available`` row by
    ``wordDecay``. The rows are written to ``save_path``, loaded back into a
    DataFrame, shuffled, and written with a header row to ``save_path`` with
    its last four characters replaced by ``.csv``.

    Parameters
    ----------
    source_path : str
        Text file containing one word per line.
    save_path : str
        Destination of the intermediate header-less row file. The final CSV
        name is built by dropping the last four characters, so a
        four-character extension such as ``.txt`` is expected.
    drop_combinations : float, default 0
        Fraction of each word's letter combinations to discard; forwarded to
        ``allUniqCombinations``.
    """
    with open(source_path,'r') as s:
        wordList = [line.strip() for line in s]

    # The context manager flushes and closes the row file even if generating
    # rows for some word raises part-way through.
    with open(save_path,'w') as file:
        for word in tqdm(wordList):
            dataset_word = [
                wordDecay(word, combo)
                for combo in allUniqCombinations(word, drop_combinations)
            ]
            file.write('\n'.join(dataset_word))
            file.write('\n')

    print("Loading File from storage")
    # na_filter=False keeps every field as literal text. With pandas' default
    # NA parsing, real dictionary words such as "nan" or "null" would be
    # turned into missing values and corrupt those rows.
    df = pd.read_csv(
        save_path,
        names=["word","input","target","available"],
        na_filter=False,
    )
    print("Shuffling data")
    df = df.sample(frac=1)
    df.to_csv(save_path[:-4] + ".csv",index=False)
    print("Saving data as a csv file.")

In [6]:
prepareDataset(source_path="./data/train/train.txt",save_path="./data/train/train_io.txt", drop_combinations=0.95)

100%|██████████| 181840/181840 [00:31<00:00, 5733.75it/s]


Loading File from storage
Shuffling data
Saving data as a csv file.
