In [1]:
import spacy
import numpy as np
nlp = spacy.load('en_core_web_sm')

In [2]:
L = list(nlp.vocab.strings)

In [3]:
numWords = len(L)
print(numWords)

83431


In [4]:
W2I = dict(zip(L,np.arange(numWords)))
I2W = dict(zip(np.arange(numWords),L))

In [5]:
W2I['game']

5700

In [7]:
I2W[5700]

'game'

In [8]:
def oneHotVector(word,W2I,numWords):
    v = np.zeros(numWords)
    v[W2I[word]] = 1
    return v

In [9]:
v = oneHotVector('game',W2I,numWords)

In [10]:
v

array([0., 0., 0., ..., 0., 0., 0.])

In [11]:
v.shape

(83431,)

In [12]:
v[W2I['game']]

1.0

In [13]:
doc = 'How are you today. I know most of the time how you feel.'
tokens = [token.text for token in nlp(doc)]

In [14]:
tokens

['How',
 'are',
 'you',
 'today',
 '.',
 'I',
 'know',
 'most',
 'of',
 'the',
 'time',
 'how',
 'you',
 'feel',
 '.']

In [16]:
v = np.zeros(numWords)
for token in tokens:
    v += oneHotVector(token,W2I,numWords)

In [17]:
v

array([0., 0., 0., ..., 0., 0., 0.])

In [18]:
v[W2I['.']]

2.0

In [19]:
v.shape

(83431,)

In [21]:
from sklearn.datasets import fetch_20newsgroups as getData

In [22]:
corpus = getData(subset='train',remove=('headers','footers','qoutes'))

In [24]:
docs = corpus.data

In [25]:
len(docs)

11314

In [27]:
print(docs[0])

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [28]:
df = np.zeros(numWords)

In [30]:
for term in L:
    dft = 0
    for doc in docs[:100]:
        if term in doc:
            dft += 1
    df[W2I[term]] = dft

In [31]:
df

array([20., 89., 99., ...,  0.,  0.,  0.])

In [32]:
N = 100

In [34]:
Idf = np.log10(N/(df+1))

In [40]:
doc = 'How are you today. I know most of the time how you feel.'
v = np.zeros(numWords)
for token in nlp(doc):
    v += oneHotVector(token.text,W2I,numWords)

In [41]:
tf = np.log10(v+1)

In [42]:
tfidf = tf*Idf

In [43]:
tfidf

array([0., 0., 0., ..., 0., 0., 0.])

In [45]:
np.sum(tfidf != 0)

13

In [47]:
from sklearn.datasets import fetch_20newsgroups as getData
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import GaussianNB as NB

In [50]:
def loadCorpus():
    corpus = getData(subset = 'train',
                    remove=('header','footer','quotes'),
                    categories=['alt.atheism',
                               'comp.graphics',
                               'soc.religion.christian'])
    return corpus

In [51]:
corpus = loadCorpus()

In [54]:
def buildTFIDFModel(docs):
    cv = CountVectorizer()
    ct = cv.fit(docs)
    counts = ct.transform(docs)
    tfidf = TfidfTransformer().fit(counts)
    return ct,tfidf

In [55]:
ct,tfidf = buildTFIDFModel(corpus.data)

In [56]:
def computeTFIDFFeatures(docs,ct,tfidf):
    counts = ct.transform(docs)
    xF = tfidf.transform(counts)
    return xF.toarray()

In [57]:
xF = computeTFIDFFeatures(corpus.data,ct,tfidf)

In [58]:
xF.shape

(1663, 25692)

In [59]:
corpus.target_names

['alt.atheism', 'comp.graphics', 'soc.religion.christian']

In [61]:
corpus.target_names[corpus.target[100]]

'comp.graphics'

In [62]:
clf = NB().fit(xF,corpus.target)

In [63]:
docs_new = ['God loves everyone',
           'OpenGL works fast',
           'No one is there']
xF_new = computeTFIDFFeatures(docs_new,ct,tfidf)

In [64]:
xF_new.shape

(3, 25692)

In [65]:
predicted = clf.predict(xF_new)

In [69]:
print(corpus.target_names[predicted[2]])

alt.atheism
