In [1]:
import os
import urllib
import sys

# Raw text of the "tiny shakespeare" corpus hosted in the char-rnn repository.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

# Make sure the local download directory exists; a no-op when it already does.
os.makedirs("data", exist_ok=True)

def download(url):
    """Fetch ``url`` into ``./data`` unless the file is already there.

    The local file name is the last ``/``-separated component of ``url``.
    While the transfer runs, progress is written to stdout. The data is first
    written to a ``.part`` file and only renamed to its final name once the
    transfer has completed, so an interrupted download is never mistaken for
    a finished one on the next call.

    Args:
        url: Address of the file to retrieve.

    Returns:
        None.

    Raises:
        ValueError: If ``url`` does not end in a file name (e.g. ends in "/").
        Exception: Anything raised by ``urllib.request.urlretrieve`` is
            propagated after the partial file has been removed.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    file_name = url.split("/")[-1]
    if not file_name:
        # Without this check the path would resolve to the ./data directory
        # itself and be reported as "already downloaded".
        raise ValueError(f"Cannot derive a file name from URL: {url!r}")

    download_path = os.path.join("./data", file_name)

    if os.path.exists(download_path):
        print("Already downloaded!")
        return

    print("Downloading, sit tight!")
    # Do not rely on the caller having created the target directory.
    os.makedirs(os.path.dirname(download_path), exist_ok=True)

    def _progress(count, block_size, total_size):
        # total_size is 0 for an empty file and negative when the server does
        # not report a length; fall back to a byte counter in those cases.
        if total_size > 0:
            # count * block_size overshoots on the last block, so clamp.
            percent = min(100.0, count * block_size * 100.0 / total_size)
            status = f"{percent:.1f}%"
        else:
            status = f"{count * block_size} bytes"
        sys.stdout.write(f"\r>> Downloading {file_name} {status}")
        sys.stdout.flush()

    partial_path = download_path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path, _progress)
        os.replace(partial_path, download_path)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a truncated file behind.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    print()
    print(
        f"Successfully downloaded {file_name} {os.stat(download_path).st_size} bytes")
        
# Fetch the corpus; prints "Already downloaded!" when the file is already on disk.
download(url)

Already downloaded!


In [2]:
import string

def read_data(file_path="./data/input.txt"):
    """Read a text file and return it as one punctuation-free line.

    Newlines are replaced by single spaces and every character listed in
    ``string.punctuation`` (ASCII punctuation, including apostrophes) is
    removed. Letter case and any other characters are left untouched.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The cleaned file contents as a single ``str``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # An explicit exception instead of ``assert``: assertions are stripped
    # when Python runs with -O, and this gives a clearer message.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No such file: {file_path!r}")

    # Fix the encoding so the result does not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.read()

    data = data.replace("\n", " ")
    return data.translate(str.maketrans("", "", string.punctuation))
    
# Load the corpus from the default path with newlines and punctuation removed.
data = read_data()

In [3]:
from tqdm.auto import tqdm

def create_vocab(data=data):
    """Tokenise ``data`` on single spaces and build word/index lookups.

    Args:
        data: Text to tokenise. The default is the module-level ``data`` as
            it was when this function was defined.

    Returns:
        A 4-tuple ``(tokens, words, word_to_idx, idx_to_word)``:
        ``tokens`` is the list of non-empty tokens in corpus order,
        ``words`` is the set of distinct tokens, and the two dicts map each
        word to an integer index and back.
    """
    # Splitting on " " yields empty strings for runs of spaces; drop them.
    tokens = [piece for piece in data.split(" ") if piece != ""]
    words = set(tokens)

    # Indices follow the iteration order of the set, i.e. they are not sorted.
    idx_to_word = dict(tqdm(enumerate(words), total=len(words)))
    word_to_idx = {word: idx for idx, word in idx_to_word.items()}

    return tokens, words, word_to_idx, idx_to_word
    
    
# toks: token list, words: set of distinct tokens, wti / itw: word<->index dicts.
toks, words, wti, itw = create_vocab()

  0%|          | 0/14745 [00:00<?, ?it/s]

In [4]:
# Report the number of distinct words found in the corpus.
print(f"Vocab Size: {len(words)}")

Vocab Size: 14745


In [5]:
import numpy as np

# Per-word occurrence counts, one integer slot per vocabulary entry.
# Slot numbers are meant to be looked up through itw and wti.

vocab_size = len(words)
freqs = np.zeros(vocab_size, dtype=np.int32)

In [6]:
# Count how many times each token occurs in the corpus.
from collections import Counter

counts = Counter(toks)

In [7]:
# Copy every word's count into its slot. The slot number comes from
# enumerating the same `words` set that create_vocab enumerated.
for slot, word in tqdm(enumerate(words), total=vocab_size):
    freqs[slot] = counts[word]

freqs

  0%|          | 0/14745 [00:00<?, ?it/s]

array([10,  7, 39, ...,  1,  4,  1], dtype=int32)

In [8]:
# Scale each word count by the vocabulary size.
# NOTE(review): the divisor is the number of distinct words, not the total
# number of tokens, so these values do not sum to 1 — confirm this is intended
# (freqs / freqs.sum() would give relative frequencies).
probs = freqs / float(vocab_size)
probs

array([6.78195999e-04, 4.74737199e-04, 2.64496439e-03, ...,
       6.78195999e-05, 2.71278399e-04, 6.78195999e-05])

In [9]:
from scipy.special import softmax

# Turn the scaled counts into a distribution with a softmax over the vector.
prob_dist = softmax(probs, axis=0)

# Sanity check: the entries should add up to 1 (up to float rounding).
prob_dist.sum()
    

0.9999999999999998