# Data preprocessing: tokenization, context windows, and one-hot vectors


In [8]:
import re
import nltk

nltk.download('punkt')

import emoji
import numpy as np
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
# Toy example sentence; later cells tokenize it and build training vectors from it.
corpus = 'I am happy because I am learning'
corpus

'I am happy because I am learning'

In [41]:
def tokenize(corpus):
    """Turn raw text into a list of lowercase word tokens plus '.' markers.

    Each run of the characters , ! ? ; - is first replaced by a single '.',
    then the text is split with nltk.word_tokenize. Only tokens that are
    purely alphabetic, or exactly '.', survive the filter.

    Args:
        corpus: the raw text as a string.

    Returns:
        A list of the kept tokens, lowercased, in their original order.
    """
    normalized = re.sub(r'[,!?;-]+','.',corpus)
    kept = []
    for token in nltk.word_tokenize(normalized):
        # '.' is not alphabetic, so it needs its own check to be retained.
        if token.isalpha() or token == '.':
            kept.append(token.lower())
    return kept

In [52]:
# Tokenize the example corpus; `words` is reused by the window and vocabulary cells.
words = tokenize(corpus)
print(words)

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']


In [48]:
def get_window(wordlist,C=2):
    """Slide a window over `wordlist`, yielding each center with its context.

    Only positions that have C items available on both sides are used as
    centers, so the first C and last C items never appear as a center.

    Args:
        wordlist: sliceable sequence of tokens (e.g. a list of strings).
        C: number of context items taken on each side of the center.

    Yields:
        (center_word, context_word) pairs, where context_word is the C items
        before the center followed by the C items after it.
    """
    for pos in range(C, len(wordlist) - C):
        center = wordlist[pos]
        before = wordlist[pos - C:pos]
        after = wordlist[pos + 1:pos + C + 1]
        yield center, before + after


In [59]:
# Show every (center, context) pair produced with get_window's default C=2.
print(words)
for center_word, context_word in get_window(words):
    print(f"center: {center_word} \t\t context: {context_word}")

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']
center: happy 		 context: ['i', 'am', 'because', 'i']
center: because 		 context: ['am', 'happy', 'i', 'am']
center: i 		 context: ['happy', 'because', 'am', 'learning']


In [77]:
def get_dict(wordlist):
    """Build word<->index lookup tables over the distinct items of `wordlist`.

    Indices start at 0 and follow the sorted order of the distinct items.

    Args:
        wordlist: iterable of hashable, mutually comparable tokens.

    Returns:
        A tuple (word2IncDic, Inc2WordDic, v): the word -> index dict, the
        index -> word dict, and the number of distinct words.
    """
    ordered_vocab = sorted(set(wordlist))
    word2IncDic = {word: idx for idx, word in enumerate(ordered_vocab)}
    Inc2WordDic = dict(enumerate(ordered_vocab))
    return word2IncDic, Inc2WordDic, len(ordered_vocab)

In [79]:
# Build both lookup tables and the vocabulary size from the tokenized corpus.
word2IncDic,Inc2WordDic,V = get_dict(words)
print(word2IncDic)
print(Inc2WordDic)
print(V)

{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}
{0: 'am', 1: 'because', 2: 'happy', 3: 'i', 4: 'learning'}
5


In [83]:
def get_vector_center(word, word2IncDic, V):
    """Return the one-hot vector of length V for `word`.

    Args:
        word: the token to encode; must be a key of `word2IncDic`.
        word2IncDic: mapping from token to its index in the vector.
        V: length of the returned vector.

    Returns:
        A NumPy array of V zeros with a 1 at the index mapped to `word`.
        A word missing from the mapping propagates the lookup error
        (KeyError for a plain dict).
    """
    one_hot = np.zeros(V)
    position = word2IncDic[word]
    one_hot[position] = 1
    return one_hot

In [84]:
def get_vector_context(wordList, word2IncDic, V):
    """Average the one-hot vectors of the given context words.

    Args:
        wordList: iterable of context tokens, each a key of `word2IncDic`.
        word2IncDic: mapping from token to its index in the vector.
        V: length of each one-hot vector.

    Returns:
        The element-wise mean (np.mean over axis 0) of the one-hot vectors
        produced by get_vector_center for every token in `wordList`.
    """
    one_hot_rows = []
    for context_token in wordList:
        one_hot_rows.append(get_vector_center(context_token, word2IncDic, V))
    return np.mean(one_hot_rows, axis=0)

In [86]:
# One-hot vector for a center word, and the averaged one-hot vector of its context words.
vec_center = get_vector_center('happy',word2IncDic,V)
vec_context = get_vector_context(['i','am','because','i'],word2IncDic,V)
print(vec_center)
print(vec_context)

[0. 0. 1. 0. 0.]
[0.25 0.25 0.   0.5  0.  ]


In [87]:
def get_all_training_data(corpus, C=2):
    """Yield (center vector, context vector) training pairs for a raw corpus.

    The corpus is tokenized, a vocabulary is built from the tokens, and every
    window produced by get_window is converted to vectors over that vocabulary.

    Args:
        corpus: the raw text as a string.
        C: number of context words taken on each side of the center word.
            Defaults to 2, which matches get_window's own default, so
            existing one-argument calls behave exactly as before.

    Yields:
        (vec_center, vec_context) pairs: the one-hot vector of the center
        word and the averaged one-hot vector of its context words.
    """
    data = tokenize(corpus)
    # Only the word -> index table and vocabulary size are needed here.
    word2IncDic, _, V = get_dict(data)
    for center_word, context_word in get_window(data, C):
        vec_center = get_vector_center(center_word, word2IncDic, V)
        vec_context = get_vector_context(context_word, word2IncDic, V)
        yield vec_center, vec_context

In [88]:
# Each yielded pair is (center vector, context vector), in that order.
for x,y in get_all_training_data(corpus):
    print(f'center vector {x}')
    print(f'context vector {y}')

center vector [0. 0. 1. 0. 0.]
context vector [0.25 0.25 0.   0.5  0.  ]
center vector [0. 1. 0. 0. 0.]
context vector [0.5  0.   0.25 0.25 0.  ]
center vector [0. 0. 0. 1. 0.]
context vector [0.25 0.25 0.25 0.   0.25]
