In [1]:
# !pip show spacy-transformers transformers
# !pip install transformers==4.36.0
# !pip install gensim

In [2]:
from gensim import corpora,models,similarities
from sklearn.datasets import fetch_20newsgroups as getData
from sklearn.model_selection import train_test_split
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
import re
import numpy as np

In [3]:
corpus = getData(subset = 'train',
                    remove=('header','footer','quotes'),
                    categories=['sci.electronics',
                                'rec.motorcycles',
                               'comp.graphics',
                               'talk.religion.misc',
                               'talk.politics.guns'])

# corpus = getData(subset='train', 
#                              remove=('headers', 'footers', 'quotes'),
#                             categories=['alt.atheism','comp.graphics',
#                                        'soc.religion.christian'])
    
X = corpus.data
y = corpus.target
y_names = corpus.target_names

In [4]:
print(X[0],len(X),y_names[y[0]])

Organization: Penn State University
From: <LRR105@psuvm.psu.edu>
Subject: Re: Tools Tools Tools
 <1993Apr1.162709.16643@osf.org> <1993Apr2.235809.3241@kronos.arc.nasa.gov>
 <1993Apr5.165548.21479@research.nj.nec.com>
Lines: 1

WHAT IS THE FLANK DRIVE EVERYONES TALKING ABOUT?
 2696 rec.motorcycles


In [5]:
Xtrain,Xtest,ytrain,ytest = train_test_split(X,y,test_size=0.1,shuffle=True)

In [6]:
stoplist = set('for a of the and to in'.split(' '))
texts = [[word for word in re.split('\W+',doc.lower()) if word not in stoplist] for doc in Xtrain]

In [7]:
print(len(texts))

2426


In [8]:
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

cutOffValue = 10
processed_corpus = [[token for token in text if frequency[token] > cutOffValue] 
                    for text in texts]

In [9]:
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary<5023 unique tokens: ['', '13', 'am', 'an', 'any']...>


In [10]:
print(dictionary.token2id)



In [11]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]  #bow = bag of corpus

In [12]:
print(bow_corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 3), (11, 2), (12, 1), (13, 1), (14, 3), (15, 1), (16, 1), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 3), (28, 1), (29, 1), (30, 2), (31, 1), (32, 2), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 3), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 2), (50, 1), (51, 2), (52, 2), (53, 1), (54, 1), (55, 1), (56, 2)]


In [13]:
model = models.TfidfModel(bow_corpus)
#model = models.LsiModel(bow_corpus)
#model = models.LdaModel(bow_corpus)

In [14]:
index = similarities.SparseMatrixSimilarity(model[bow_corpus],num_features=len(dictionary))

In [20]:
t = np.random.randint(len(Xtest))
query_document = re.split('\W+',Xtest[t].lower())
query_bow = dictionary.doc2bow(query_document)
sims = index[model[query_bow]]
docNumber = sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[0][0]
print('Predicted:',y_names[ytrain[docNumber]])
print('Ground Truth:',y_names[ytest[t]])

Predicted: comp.graphics
Ground Truth: comp.graphics
