In [None]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt

# Fetch the "punkt" tokenizer package through NLTK's downloader.
nltk.download("punkt")

# Add the current working directory to the list of places NLTK searches for data.
nltk.data.path.append('.')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import re
import jieba

path = "sample.txt"

# Pull the entire corpus into memory as a single UTF-8 string.
with open(path, 'r', encoding='utf-8') as file:
    data = file.read()

# Collapse every listed clause/sentence punctuation mark into one full stop.
data = re.sub(r'[，。！？；]', '。', data)

# Segment with jieba, keeping only alphabetic pieces and the full-stop marker,
# lower-casing whatever is kept.
data_tokens = [
    piece.lower()
    for piece in jieba.cut(data)
    if piece == '。' or piece.isalpha()
]

print(data_tokens[:10])

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.530 seconds.
DEBUG:jieba:Loading model cost 1.530 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


['中国', '是', '世界', '上', '历史悠久', '且', '文化', '丰富', '的', '国家']


In [None]:
# Show how many tokens were kept, followed by the first 20 characters of `data`.
print(f"The Number of tokens = {len(data_tokens)} \n {data[:20]}")

The Number of tokens = 389 
 中国是世界上历史悠久且文化丰富的国家之一


In [None]:
# Tally how often each token occurs; the number of distinct keys is the vocabulary size.
fdist = nltk.FreqDist(data_tokens)
print("Size of vocabulary: ", len(fdist))
print("Most frequent tokens: ", fdist.most_common(20))

Size of vocabulary:  182
Most frequent tokens:  [('。', 40), ('的', 34), ('和', 18), ('中国', 16), ('了', 11), ('在', 7), ('上', 5), ('历史', 5), ('其', 5), ('全球', 5), ('世界', 4), ('文化', 4), ('国家', 4), ('经济', 4), ('如', 4), ('发展', 4), ('等', 4), ('重要', 4), ('之一', 3), ('悠久', 3)]


In [None]:
import jieba

def get_dict(data_tokens):
    """Build forward and reverse vocabulary lookups.

    The distinct items of ``data_tokens`` are sorted and numbered from 0.

    Args:
        data_tokens: iterable of hashable, mutually comparable items
            (e.g. a list of token strings).

    Returns:
        A pair ``(word2Ind, Ind2word)``: item -> index and index -> item.
    """
    vocabulary = sorted(set(data_tokens))
    word2Ind = {}
    Ind2word = {}
    for position, token in enumerate(vocabulary):
        word2Ind[token] = position
        Ind2word[position] = token
    return word2Ind, Ind2word

In [None]:
path = "sample.txt"

with open(path, 'r', encoding='utf-8') as file:
    data = file.read()

# Segment the raw text with jieba; every piece jieba returns is kept here.
data_tokens = list(jieba.cut(data))

word2Ind, Ind2word = get_dict(data_tokens)

V = len(word2Ind)
print("Size of vocabulary: ", V)

# Look the index up instead of hard-coding it, so the label always matches
# the value that is actually printed, whatever the corpus contains.
zai_index = word2Ind.get('在')
print("Index of '在' in vocabulary: ", zai_index)
print(f"Word corresponding to index {zai_index}: ", Ind2word.get(zai_index))

# Round-trip check: word -> index -> word.
zhongguo_index = word2Ind['中国']
print("Index of the word '中国' :  ", zhongguo_index)
print(f"Word which has index {zhongguo_index}:  ", Ind2word[zhongguo_index])

Size of vocabulary:  185
Index of '在' in vocabulary:  68
Word corresponding to index 0:  在
Index of the word '中国' :   14
Word which has index 2743:   在


In [None]:
import numpy as np

def init_parameters(N: int, V: int, random_seed: int = 1) -> dict:
    """Create the initial weights and biases of the two-layer network.

    Seeds NumPy's global random generator, then draws both weight matrices
    uniformly from [0, 1). Biases start at zero.

    Args:
        N: size of the hidden layer.
        V: size of the vocabulary (input and output dimension).
        random_seed: value passed to ``np.random.seed``.

    Returns:
        dict with ``W1`` (N, V), ``b1`` (N, 1), ``W2`` (V, N), ``b2`` (V, 1).
    """
    np.random.seed(random_seed)

    # W1 must be drawn before W2 so a given seed keeps producing the same values.
    input_to_hidden = np.random.rand(N, V)
    hidden_to_output = np.random.rand(V, N)

    return {
        "W1": input_to_hidden,
        "b1": np.zeros((N, 1)),
        "W2": hidden_to_output,
        "b2": np.zeros((V, 1)),
    }

In [None]:
# Sanity check for init_parameters using a small hidden size and vocabulary.
tmp_N = 4
tmp_V = 10
params = init_parameters(tmp_N,tmp_V)
# W1 should be (hidden, vocab) and W2 should be (vocab, hidden).
assert params['W1'].shape == ((tmp_N,tmp_V))
assert params['W2'].shape == ((tmp_V,tmp_N))
print(f"W1 shape: {params['W1'].shape}")
print(f"W2 shape: {params['W2'].shape}")
print(f"b1 shape: {params['b1'].shape}")
print(f"b2 shape: {params['b2'].shape}")

W1 shape: (4, 10)
W2 shape: (10, 4)
b1 shape: (4, 1)
b2 shape: (10, 1)


In [None]:
from typing import List, Union

def softmax(z: Union[float, List[float], np.ndarray]) -> Union[float, np.ndarray]:
    """Softmax normalised along axis 0 (each column sums to 1).

    The per-column maximum is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``exp`` from overflowing
    to ``inf`` (and the division from producing NaN) for large logits.

    Args:
        z: logits; for a 2-D array each column is one example.

    Returns:
        Array of the same shape as ``z`` holding the normalised probabilities.
    """
    z = np.asarray(z)
    if z.size == 0:
        # Nothing to normalise: hand back an empty float array of the same shape.
        return np.exp(z)

    shifted = z - np.max(z, axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)  # computed once and reused for the denominator
    y = exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)

    return y

def sigmoid(z: Union[float, List[float]]) -> Union[float, List[float]]:
    """Element-wise logistic function ``1 / (1 + exp(-z))``.

    Args:
        z: a scalar, a list of numbers, or a NumPy array.

    Returns:
        The logistic value(s); array-like input yields a NumPy array.
    """
    # Plain Python lists do not support unary minus, so coerce to an array first.
    z = np.asarray(z)
    return 1.0 / (1.0 + np.exp(-z))

def relu(z: Union[float, List[float]]) -> Union[float, List[float]]:
    """Rectified linear unit: element-wise maximum of ``z`` and 0."""
    rectified = np.maximum(z, 0)
    return rectified


In [None]:
# Small 2x3 example to inspect what softmax returns.
tmp = np.array([[1,2,3],
                [1,1,1]
               ])
tmp_sm = softmax(tmp)
display(tmp_sm)

array([[0.5       , 0.73105858, 0.88079708],
       [0.5       , 0.26894142, 0.11920292]])

In [None]:
def forward_propagation(x: np.ndarray, params: dict) -> tuple:
    """Run the network forward and return the output scores and hidden activations.

    Args:
        x: input array that is multiplied by ``params['W1']``.
        params: dict providing ``W1``, ``b1``, ``W2`` and ``b2``.

    Returns:
        ``(y_hat, h)``: the output-layer scores (no softmax is applied here)
        and the hidden-layer activations after ReLU.
    """
    # Hidden layer: affine map followed by ReLU.
    hidden_pre_activation = np.dot(params['W1'], x) + params['b1']
    h = relu(hidden_pre_activation)

    # Output layer: affine map only.
    y_hat = np.dot(params['W2'], h) + params['b2']

    return y_hat, h

In [None]:
# Column vector of length 10 with a single 1 in position 1, used as a test input.
tmp_x = np.array([[0,1,0,0,0,0,0,0,0,0]]).T
tmp_z, tmp_h = forward_propagation(tmp_x, params)
print("call forward_prop")
print()
# Look at output
print(f"z has shape {tmp_z.shape}")
print("z has values:")
print(tmp_z)

call forward_prop

z has shape (10, 1)
z has values:
[[1.82887324]
 [1.38466287]
 [0.60100484]
 [0.83284144]
 [1.37936774]
 [1.20420827]
 [1.26273995]
 [2.0149547 ]
 [1.10825153]
 [1.93910889]]


In [None]:
def cross_entropy_loss(y_hat: np.ndarray, y: np.ndarray,
                       batch_size: int) -> Union[float, np.ndarray]:
    """Cross-entropy between predictions and targets, divided by ``batch_size``.

    Args:
        y_hat: predicted probabilities.
        y: target array with the same shape as ``y_hat``.
        batch_size: divisor applied to the summed loss.

    Returns:
        The scalar loss ``-(1 / batch_size) * sum(y * log(y_hat))``.
    """
    # A probability that underflows to exactly 0 would give log(0) = -inf, and
    # -inf * 0 = NaN would poison the whole sum. Clamp to a tiny positive floor.
    min_prob = 1e-12
    safe_probs = np.maximum(y_hat, min_prob)

    logprobs = np.multiply(np.log(safe_probs), y)
    cost = -1/batch_size * np.sum(logprobs)
    cost = np.squeeze(cost)
    return cost



def backward_propagation(x: np.ndarray, y: np.ndarray, y_hat: np.ndarray, h: np.ndarray,
                         params: dict, batch_size: int) -> dict:
    """Compute gradients of the loss with respect to W1, W2, b1 and b2.

    Args:
        x: network input, one example per column.
        y: target array, same shape as ``y_hat``.
        y_hat: predicted probabilities.
        h: hidden-layer activations after ReLU.
        params: dict providing ``W2`` (the only entry read here).
        batch_size: divisor applied to every gradient.

    Returns:
        dict with keys ``W1``, ``W2``, ``b1``, ``b2`` holding the gradients.
    """
    grads_params = {}

    output_error = y_hat - y

    # Back-propagate through W2, then through the ReLU. The ReLU derivative is
    # 1 where the unit was active (h > 0) and 0 elsewhere, so inactive units
    # must not pass any gradient on to W1 / b1.
    l1 = np.dot(params['W2'].T, output_error) * (h > 0)

    grads_params['W1'] = np.dot(l1, x.T) / batch_size
    grads_params['W2'] = np.dot(output_error, h.T) / batch_size

    grads_params['b1'] = np.sum(l1, axis=1, keepdims=True) / batch_size
    grads_params['b2'] = np.sum(output_error, axis=1, keepdims=True) / batch_size

    return grads_params

In [None]:
from collections import defaultdict
import numpy as np
import jieba

def get_idx(words, word2Ind):
    """Translate each word into its vocabulary index.

    Args:
        words: iterable of words to look up.
        word2Ind: mapping from word to index.

    Returns:
        List of indices in the same order as ``words``.

    Raises:
        KeyError: if a word is missing from ``word2Ind``.
    """
    return [word2Ind[word] for word in words]

def pack_idx_with_frequency(context_words, word2Ind):
    """Pair every context word's vocabulary index with its count in the context.

    Args:
        context_words: sequence of context words.
        word2Ind: mapping from word to index.

    Returns:
        List of ``(index, frequency)`` tuples, one per entry of
        ``context_words`` (repeated words therefore appear more than once).
    """
    occurrences = defaultdict(int)
    for token in context_words:
        occurrences[token] += 1

    indices = get_idx(context_words, word2Ind)
    return [
        (index, occurrences[token])
        for index, token in zip(indices, context_words)
    ]

def get_vectors(data, word2Ind, V, C):
    """Endlessly generate (context, center) training vectors.

    For each position the center item becomes a one-hot vector of length
    ``V``, and the ``C`` items on each side are encoded as a length-``V``
    vector whose entries are each context item's count divided by the
    number of context items. When the position reaches ``len(data) - C``
    it wraps back to ``C`` and a message is printed.

    Args:
        data: sliceable sequence of vocabulary items.
        word2Ind: mapping from item to index.
        V: vocabulary size (length of the produced vectors).
        C: context half-width.

    Yields:
        ``(x, y)``: the context vector and the one-hot center vector.
    """
    position = C
    while True:
        target = np.zeros(V)
        context_vec = np.zeros(V)

        target[word2Ind[data[position]]] = 1

        left = data[(position - C) : position]
        right = data[(position + 1) : (position + C + 1)]
        window = left + right
        window_size = len(window)

        for slot, count in pack_idx_with_frequency(window, word2Ind):
            context_vec[slot] = count / window_size

        yield context_vec, target

        position += 1
        if position >= len(data) - C:
            print("i is being set to", C)
            position = C

def get_batches(data, word2Ind, V, C, batch_size):
    """Group the examples produced by ``get_vectors`` into fixed-size batches.

    Args:
        data: sequence of vocabulary items, forwarded to ``get_vectors``.
        word2Ind: mapping from item to index.
        V: vocabulary size.
        C: context half-width.
        batch_size: number of examples per batch; must be at least 1.

    Yields:
        ``(batch_x, batch_y)`` arrays of shape ``(V, batch_size)``, one
        example per column.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    batch_x = []
    batch_y = []
    for x, y in get_vectors(data, word2Ind, V, C):
        # Store the example first, then emit once the batch is full, so the
        # example that completes (or follows) a batch is never discarded.
        batch_x.append(x)
        batch_y.append(y)
        if len(batch_x) == batch_size:
            yield np.array(batch_x).T, np.array(batch_y).T
            batch_x = []
            batch_y = []

In [None]:
import numpy as np

def gradient_descent(data: list[str], word2Ind: dict, N: int,
                     V: int, epochs: int, alpha: float = 0.03,
                     random_seed: int = 282, batch_size: int = 128,
                     C: int = 2) -> dict:
    """Train the network with mini-batch gradient descent.

    Args:
        data: sequence of vocabulary items to train on.
        word2Ind: mapping from item to index.
        N: hidden-layer size.
        V: vocabulary size.
        epochs: number of mini-batch updates to perform (one update per
            batch, not one pass over ``data``).
        alpha: initial learning rate; multiplied by 0.66 after every 100 updates.
        random_seed: seed forwarded to ``init_parameters``.
        batch_size: number of examples per update (default 128).
        C: context half-width (default 2).

    Returns:
        dict with the trained ``W1``, ``W2``, ``b1`` and ``b2``.
    """
    param = init_parameters(N, V, random_seed=random_seed)

    # The loop below only stops when the update counter equals `epochs`; with a
    # non-positive target that never happens, so return the initial parameters.
    if epochs <= 0:
        return param

    epoch = 0

    for x, y in get_batches(data, word2Ind, V, C, batch_size):

        z, h = forward_propagation(x, param)
        yhat = softmax(z)

        cost = cross_entropy_loss(yhat, y, batch_size)
        if (epoch + 1) % 10 == 0:
            print(f"Iteration: {epoch + 1} Cost: {cost:.6f}")

        grads = backward_propagation(x, y, yhat, h, param, batch_size)

        # Plain gradient-descent step on every parameter.
        param['W1'] -= alpha * grads['W1']
        param['W2'] -= alpha * grads['W2']
        param['b1'] -= alpha * grads['b1']
        param['b2'] -= alpha * grads['b2']

        epoch += 1
        if epoch == epochs:
            break
        if epoch % 100 == 0:
            # Step decay of the learning rate.
            alpha *= 0.66

    return param



# Training configuration. Note that C is assigned here but is not passed to
# gradient_descent in the call below.
C = 2
N = 50
# NOTE(review): `data` is the raw string read from sample.txt, not the jieba
# token list (`data_tokens`), so this vocabulary is built per character and
# training below is per character too. Confirm this is intended.
word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
num_iters = 150
print("Call gradient_descent")
params = gradient_descent(data, word2Ind, N, V, num_iters)

Call gradient_descent
i is being set to 2
Iteration: 10 Cost: 5.977854
i is being set to 2
i is being set to 2
Iteration: 20 Cost: 6.006075
i is being set to 2
i is being set to 2
Iteration: 30 Cost: 6.037121
i is being set to 2
i is being set to 2
Iteration: 40 Cost: 5.966813
i is being set to 2
i is being set to 2
Iteration: 50 Cost: 5.864473
i is being set to 2
i is being set to 2
Iteration: 60 Cost: 5.844921
i is being set to 2
i is being set to 2
Iteration: 70 Cost: 5.797549
i is being set to 2
Iteration: 80 Cost: 5.738867
i is being set to 2
i is being set to 2
Iteration: 90 Cost: 5.619266
i is being set to 2
i is being set to 2
Iteration: 100 Cost: 5.624046
i is being set to 2
i is being set to 2
Iteration: 110 Cost: 5.600146
i is being set to 2
i is being set to 2
Iteration: 120 Cost: 5.590284
i is being set to 2
i is being set to 2
Iteration: 130 Cost: 5.626671
i is being set to 2
i is being set to 2
Iteration: 140 Cost: 5.582384
i is being set to 2
i is being set to 2
Iterati