In [1]:
import spacy
import numpy as np
nlp = spacy.load('en_core_web_sm')

In [2]:
L = list(nlp.vocab.strings)

In [3]:
numWords = len(L)
print(numWords)

84780


In [4]:
W2I = dict(zip(L, np.arange(numWords)))
I2W = dict(zip(np.arange(numWords), L))

In [5]:
W2I['game']

np.int64(49865)

In [6]:
I2W[49865]

'game'

In [7]:
I2W[84779]

'￥dd'

# One Hot Encoding

In [8]:
def oneHotVector(word, W2I, numWords):
    v = np.zeros(numWords)
    if word in W2I:
        v[W2I[word]] = 1
    return v

In [9]:
v = oneHotVector('game', W2I, numWords)

In [10]:
v.shape

(84780,)

In [11]:
v[W2I['game']]

np.float64(1.0)

## Term Frequency (TF)

In [12]:
doc = 'How are you today. I know most of ￥28 ￥dd the time how you feel. ￥28 ￥28 ￥dd' #￥28 is the 3rd last number
tokens = [token.text for token in nlp(doc)]

In [13]:
v = np.zeros(numWords)
for token in tokens:
    v += oneHotVector(token,W2I,numWords)

In [14]:
v

array([0., 0., 0., ..., 0., 3., 2.], shape=(84780,))

## TFIDF

In [15]:
from sklearn.datasets import fetch_20newsgroups as getData

In [16]:
corpus = getData(subset='train',remove=('headers','footers','qoutes'))

In [17]:
docs = corpus.data

In [18]:
len(docs)

11314

In [19]:
docs[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [20]:
df = np.zeros(numWords)

for term in L:
    dft = 0
    # for doc in docs[:100]:  #100 docs for testing only(full docs(11314) takes much time)
    for doc in docs:
        if term in doc:
            dft += 1
    df[W2I[term]] = dft

In [21]:
df

array([2243.,  873.,    0., ...,    0.,    0.,    0.], shape=(84780,))

In [22]:
N = len(docs)
N

11314

In [23]:
Idf = np.log10(N/(df+1)) #df+1 because some values are 0; so creates error

In [24]:
doc = 'How are you today. I know most of the time how you feel.' #as above TF ^^
v = np.zeros(numWords)

for token in nlp(doc):
    v += oneHotVector(token.text,W2I,numWords)

# tokens = [token.text for token in nlp(doc)]
# for token in tokens:
#     v += oneHotVector(token,W2I,numWords)

In [25]:
tf = np.log10(v+1)

In [26]:
tfidf = tf * Idf
tfidf

array([0., 0., 0., ..., 0., 0., 0.], shape=(84780,))

In [27]:
np.sum(tfidf != 0)

np.int64(13)

## TFIDF using Scikit-learn

In [116]:
from sklearn.datasets import fetch_20newsgroups as getData
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import GaussianNB as NB

In [117]:
def loadCorpus():
    corpus = getData(subset = 'train',
                    remove=('header','footer','quotes'),
                    categories=['sci.electronics',
                                'rec.motorcycles',
                               'comp.graphics',
                               'talk.religion.misc',
                               'talk.politics.guns'])
    return corpus

In [118]:
corpus = loadCorpus()
corpus.target_names

['comp.graphics',
 'rec.motorcycles',
 'sci.electronics',
 'talk.politics.guns',
 'talk.religion.misc']

In [119]:
# def loadTFIDFModel(docs):
#     cv = CountVectorizer()
#     ct = cv.fit(docs)  #this is model for TF
#     counts = ct.transform(docs)  ##this is TF
#     tfidf = TfidfTransformer().fit(counts)
#     return ct,tfidf

def loadTFIDFModel(docs):
    cv = CountVectorizer()
    cv.fit(docs)  #this is model for TF
    counts = cv.transform(docs)  ##this is TF
    tfidf = TfidfTransformer()
    tfidf.fit(counts)
    return cv,tfidf

In [120]:
ct,tfidf = loadTFIDFModel(corpus.data)

In [121]:
def computeTFIDFFeatures(docs,ct,tfidf):
    counts = ct.transform(docs)
    xF = tfidf.transform(counts)
    return xF.toarray()

In [122]:
xF = computeTFIDFFeatures(corpus.data,ct,tfidf) #xF = TFIDF of given data
xF.shape

(2696, 37909)

In [123]:
corpus.target_names

['comp.graphics',
 'rec.motorcycles',
 'sci.electronics',
 'talk.politics.guns',
 'talk.religion.misc']

In [124]:
corpus.target_names[corpus.target[100]]

'rec.motorcycles'

## 1

### 1.a

### 1.b

## 2

In [125]:
Xtrain = xF
ytrain = corpus.target
clf = NB().fit(Xtrain,ytrain)

In [140]:
docs_new = ['Trust in the Lord with all your heart.',
           'OpenGL works fast',
            'Honda cd 125 was most popular in Bangladesh',
           'No one is there',
            '4 inch revolver is very cute',
           'Iphone 13 mini is a very handy smartphone']
xF_new = computeTFIDFFeatures(docs_new,ct,tfidf)
xF_new.shape

(6, 37909)

In [141]:
Xtest = xF_new
predicted = clf.predict(Xtest)

In [142]:
i = 0
for value in predicted:
    print(docs_new[i]+'\n= '+corpus.target_names[value])
    i+=1

Trust in the Lord with all your heart.
= talk.religion.misc
OpenGL works fast
= comp.graphics
Honda cd 125 was most popular in Bangladesh
= rec.motorcycles
No one is there
= talk.religion.misc
4 inch revolver is very cute
= talk.politics.guns
Iphone 13 mini is a very handy smartphone
= sci.electronics
