In [1]:
import spacy
import numpy as np
nlp = spacy.load('en_core_web_sm')

In [2]:
L = list(nlp.vocab.strings)

In [3]:
numWords = len(L)
print(numWords)

84780


In [4]:
# Two-way lookup tables between vocabulary strings and their positions in L.
# The positions come from np.arange, so they are NumPy integers rather than
# built-in ints.
W2I = {word: idx for idx, word in zip(np.arange(numWords), L)}
I2W = {idx: word for idx, word in zip(np.arange(numWords), L)}

In [5]:
W2I['game']

np.int64(49865)

In [6]:
I2W[49865]

'game'

In [7]:
I2W[84779]

'￥dd'

# One Hot Encoding

In [8]:
def oneHotVector(word, W2I, numWords):
    """Return the one-hot encoding of ``word`` as a float vector.

    Parameters
    ----------
    word : the string to encode.
    W2I : mapping from vocabulary strings to their integer positions.
    numWords : length of the returned vector (the vocabulary size).

    Returns
    -------
    A NumPy array of length ``numWords`` holding 1 at ``W2I[word]`` and 0
    everywhere else. A word that is not a key of ``W2I`` yields an all-zero
    vector.
    """
    encoding = np.zeros(numWords)
    if word not in W2I:
        # Out-of-vocabulary: nothing to mark.
        return encoding
    encoding[W2I[word]] = 1
    return encoding

In [9]:
v = oneHotVector('game', W2I, numWords)

In [10]:
v.shape

(84780,)

In [11]:
v[W2I['game']]

np.float64(1.0)

## Term Frequency (TF)

In [12]:
# Sample text for the term-frequency demo. '￥dd' is the last vocabulary entry
# (index 84779, see the lookup above). The original note places '￥28' third
# from the end of the vocabulary -- TODO confirm that position.
doc = 'How are you today. I know most of ￥28 ￥dd the time how you feel. ￥28 ￥28 ￥dd'
tokens = [tok.text for tok in nlp(doc)]

In [13]:
# Term-frequency vector: add up one one-hot vector per token, so each position
# holds how many times that vocabulary entry occurs in `tokens`. Tokens that
# are not in W2I contribute an all-zero vector.
v = sum(
    (oneHotVector(word, W2I, numWords) for word in tokens),
    np.zeros(numWords),
)

In [14]:
v

array([0., 0., 0., ..., 0., 3., 2.], shape=(84780,))

## TFIDF

In [15]:
from sklearn.datasets import fetch_20newsgroups as getData

In [16]:
# Load the 20 Newsgroups training split with headers, footers and quoted
# replies stripped. The third option has to be spelled exactly 'quotes' for
# the quoted-reply text to be removed.
corpus = getData(subset='train', remove=('headers', 'footers', 'quotes'))

In [17]:
docs = corpus.data

In [18]:
len(docs)

11314

In [19]:
docs[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [None]:
# Document frequency: df[i] is the number of documents in which vocabulary
# entry i occurs as a token.
#
# Each document is tokenized once and reduced to its set of distinct token
# texts. Matching is done on whole tokens (the same unit the TF cells count),
# not on raw substrings, so a short term such as "he" is not counted for a
# document that only contains "the". This also replaces the
# numWords x len(docs) substring scans with a single pass over the corpus.
df = np.zeros(numWords)

for text in docs:  # use docs[:100] here for a quick trial run
    # Only the tokenizer is needed; the rest of the spaCy pipeline is skipped.
    distinct_tokens = {token.text for token in nlp.tokenizer(text)}
    for term in distinct_tokens:
        position = W2I.get(term)
        if position is not None:  # ignore tokens outside the vocabulary
            df[position] += 1

In [None]:
df

In [None]:
N = len(docs)
N

In [None]:
# Inverse document frequency with add-one smoothing in the denominator:
# df is 0 for every vocabulary entry that was never counted, and dividing N by
# zero would not give a usable value.
Idf = np.log10(np.divide(N, df + 1))

In [None]:
# Count vector for the sample sentence: one one-hot vector per spaCy token,
# summed over the whole text (same construction as the TF section).
doc = 'How are you today. I know most of the time how you feel.'
v = sum(
    (oneHotVector(tok.text, W2I, numWords) for tok in nlp(doc)),
    np.zeros(numWords),
)

# tokens = [token.text for token in nlp(doc)]
# for token in tokens:
#     v += oneHotVector(token,W2I,numWords)

In [None]:
# Log-scaled term frequency; the +1 keeps entries with a zero count at
# log10(1) = 0.
tf = np.log10(1 + v)

In [None]:
# TF-IDF weight per vocabulary entry: element-wise product of the log-scaled
# term frequency and the inverse document frequency.
tfidf = np.multiply(tf, Idf)
tfidf

In [None]:
np.sum(tfidf != 0)