# Vector Space Model

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

## Tokenize

In [6]:
# Sample text (a headline followed by a sentence fragment) used to demonstrate tokenization.
text = """
TO REVISE THE CHARTER; Governor Soon to Announce His Choice of Commissioners.
The Commissioners declared that
"""

In [7]:
import nltk
from nltk.stem.porter import *
from collections import Counter

In [8]:
# Split the sample text into word and punctuation tokens with NLTK.
tokens = nltk.word_tokenize(text)
print(tokens)

['TO', 'REVISE', 'THE', 'CHARTER', ';', 'Governor', 'Soon', 'to', 'Announce', 'His', 'Choice', 'of', 'Commissioners', '.', 'The', 'Commissioners', 'declared', 'that']


In [9]:
# Normalise every token: lowercase it first, then reduce it to its Porter stem.
stemmer = PorterStemmer()
norm = [stemmer.stem(token.lower()) for token in tokens]
print(norm)

['to', 'revis', 'the', 'charter', ';', 'governor', 'soon', 'to', 'announc', 'hi', 'choic', 'of', 'commission', '.', 'the', 'commission', 'declar', 'that']


In [10]:
# Bag of words: normalised term -> number of occurrences, most frequent first.
term_counts = Counter(norm)
bag = {term: count for term, count in term_counts.most_common()}
print(bag)

{'to': 2, 'the': 2, 'commission': 2, 'revis': 1, 'charter': 1, ';': 1, 'governor': 1, 'soon': 1, 'announc': 1, 'hi': 1, 'choic': 1, 'of': 1, '.': 1, 'declar': 1, 'that': 1}


In [11]:
# One-column frame of term counts, shown transposed so that each term becomes a column.
i = pd.Series(bag).to_frame()
display(i.transpose())

Unnamed: 0,to,the,commission,revis,charter,;,governor,soon,announc,hi,choic,of,.,declar,that
0,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1


## spaCy

In [12]:
import spacy

# Load the `en_core_web_sm` spaCy pipeline by name; `nlp` is reused by the tokenizer below.
nlp = spacy.load('en_core_web_sm')

In [13]:
def tokenize(x):
    """Turn raw text into a list of lemmas.

    The text is lowercased, run through the module-level spaCy pipeline
    ``nlp``, and every token whose coarse part-of-speech tag is whitespace,
    punctuation or a determiner is dropped.

    Parameters
    ----------
    x : str
        Raw document text.

    Returns
    -------
    list of str
        The ``lemma_`` of each remaining token, in document order.
    """
    excluded_pos = ('SPACE', 'PUNCT', 'DET')
    return [t.lemma_ for t in nlp(x.lower()) if t.pos_ not in excluded_pos]

## 20 Newsgroups

In [14]:
from collections import defaultdict
from sklearn.datasets import fetch_20newsgroups

In [16]:
# Fetch the training split with headers, footers and quoted replies stripped.
# `remove` is documented as a tuple, and scikit-learn releases that validate
# parameters strictly reject a list, so a tuple is passed here.
data_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    data_home='/tmp/',
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [17]:
# Number of documents to keep; only this prefix of the training set is processed below.
N_DOCS = 1000
corpus = data_train.data[:N_DOCS]

In [18]:
# Sparse term counts per document: I[doc_id][term] -> raw frequency.
# A document for which `tokenize` yields nothing never gets an entry.
I = defaultdict(lambda: defaultdict(int))

# `total=` lets tqdm size the progress bar without first copying the enumeration
# into a list; `doc_id` (rather than `i`) keeps the loop from overwriting the
# `i` frame built earlier in the notebook.
for doc_id, doc in tqdm(enumerate(corpus), total=len(corpus)):
    for term in tokenize(doc):
        I[doc_id][term] += 1

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [19]:
# Document-term matrix: the nested dict yields one column per document,
# so transpose to get documents as rows and terms as columns.
m = pd.DataFrame(I).transpose()

In [20]:
# A term missing from a document shows up as NaN; treat it as a count of zero.
m = m.fillna(0)
m.head()

Unnamed: 0,i,be,wonder,if,anyone,out,there,could,enlighten,-PRON-,...,timer,macine,tantrumy,stair,akron,afoul,ye,colossians,3:12,angrily
0,3.0,8.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8.0,12.0,1.0,4.0,0.0,1.0,1.0,3.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Term frequency scaled per document: divide every row by that row's largest
# count, so the most frequent term of each document gets Tf == 1.
Tf = m.div(m.max(axis=1), axis='index')

In [22]:
# Ten highest term frequencies in document 10.
Tf.loc[10].sort_values(ascending=False).head(10)

-PRON-    1.000000
and       0.833333
i         0.666667
be        0.666667
out       0.500000
leak      0.333333
of        0.333333
with      0.333333
$         0.333333
/         0.333333
Name: 10, dtype: float64

In [23]:
# Newsgroup that document 10 belongs to.
label_id = data_train.target[10]
data_train.target_names[label_id]

'rec.motorcycles'

In [24]:
# Inverse document frequency per term: log(N / df), where df is the number of
# documents with a nonzero count for the term (this relies on `m` holding 0,
# not NaN, for absent terms). N is taken from the corpus itself instead of
# repeating the slice size as a literal.
Idf = np.log(len(corpus) / np.count_nonzero(m, axis=0))

In [25]:
# Weight each term-frequency column by that term's inverse document frequency.
TfIdf = Tf.mul(Idf, axis='columns')

In [26]:
# Display the TF-IDF matrix (documents as rows, terms as columns).
TfIdf

Unnamed: 0,i,be,wonder,if,anyone,out,there,could,enlighten,-PRON-,...,timer,macine,tantrumy,stair,akron,afoul,ye,colossians,3:12,angrily
0,0.159243,0.166055,0.399273,0.234123,0.523893,0.182090,0.155599,0.229856,0.690183,0.082437,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.141549,0.110703,0.000000,0.312164,0.000000,0.000000,0.000000,0.000000,0.000000,0.054958,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.283099,0.166055,0.266182,0.312164,0.000000,0.121393,0.103733,0.459713,0.000000,0.123656,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.164875,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.166055,0.000000,0.234123,0.000000,0.000000,0.000000,0.000000,0.000000,0.164875,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.424648,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
996,0.070775,0.055352,0.000000,0.156082,0.000000,0.242786,0.000000,0.000000,0.000000,0.164875,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
997,0.424648,0.166055,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.164875,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
998,0.000000,0.166055,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.041219,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [27]:
# Ten highest-weighted terms of document 0.
TfIdf.loc[0].sort_values(ascending=False).head(10)

car           1.585043
60s/          0.863469
2-door        0.863469
tellme        0.863469
bricklin      0.776826
bumper        0.726143
funky         0.726143
enlighten     0.690183
spec          0.639499
production    0.620231
Name: 0, dtype: float64