In [12]:
import logging
from pprint import pprint
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.test.utils import get_tmpfile
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore") 

In [13]:
def generateSimilarityIndex(corpus, num_topics=100):
    ##############################################################
    # Create TFIDF and LSI Models on the corpus
    ##############################################################
    tfidfModel = models.TfidfModel(corpus)
    corpus_tfidf = tfidfModel[corpus]

    # Reduce to 100 dimensions
    lsiModel = models.LsiModel(corpus_tfidf, id2word=dictionary,
                               num_topics=num_topics)  # initialize an LSI transformation
    # lsi_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=10)  # initialize an LSI transformation
    # lsi_model = models.HdpModel(corpus_tfidf, id2word=dictionary)  # initialize an LSI transformation
    corpus_lsi = lsiModel[corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

    # Print the topics generated by the lsi model
    lsiModel.print_topics()

    # Create an index on which if we hit we will get the
    # similarity of the hitting object with all these documents
    # in LSI space

    # In memory computation - better for small datasets that fit in memory
    # index = similarities.MatrixSimilarity(corpus_lsi)  # transform corpus to LSI space and index it

    # Non In memory computation - better for big datasets that dont fit in memory
    index_temp = get_tmpfile("lsimodel")  # create a temporary file named lsimodel to save things
    # Note that you need to give num_topics again here as num_features
    index = similarities.Similarity(index_temp, corpus_lsi,
                                    num_features=num_topics)  # transform corpus to LSI space and index it

    return [index, tfidfModel, lsiModel]

In [14]:
# Import the logging module for gensim
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Load the dataset
f = open('./rawData.csv', 'rb')
data = f.readlines()

documents = []
for document in data:
    documents.append(document)

In [15]:
# Remove common words and tokenize
stoplist = set('for a of the and to in'.split())

texts = []
i = 0
idToDocumentDict = {}  # id-> document
for document in documents:
    s = [word for word in document.lower().split() if word not in stoplist]
    s = [x for x in s if x not in set(stopwords.words('english'))]
    texts.append(s)
    idToDocumentDict[i] = document
    i += 1

In [16]:
# Create a dictionary
dictionary = corpora.Dictionary(texts)
# pprint(dictionary)

# Create a corpus
corpus = [dictionary.doc2bow(text) for text in texts]
# pprint(corpus)

# Create the TFIDF and LSI Models
[index, tfidfModel, lsiModel] = generateSimilarityIndex(corpus, num_topics=100)

# Transform the search keyword to lsi space
search_text = 'kidnap'.lower().split()

2019-11-16 16:54:00,545 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-11-16 16:54:00,600 : INFO : built Dictionary(10190 unique tokens: ['"in', '"the', '100', '4:00pm', '500']...) from 300 documents (total 59881 corpus positions)
2019-11-16 16:54:00,652 : INFO : collecting document frequencies
2019-11-16 16:54:00,653 : INFO : PROGRESS: processing document #0
2019-11-16 16:54:00,659 : INFO : calculating IDF weights for 300 documents and 10189 features (38568 matrix non-zeros)
2019-11-16 16:54:00,681 : INFO : using serial LSI version on this node
2019-11-16 16:54:00,682 : INFO : updating model with new documents
2019-11-16 16:54:00,814 : INFO : preparing a new chunk of documents
2019-11-16 16:54:00,822 : INFO : using 100 extra samples and 2 power iterations
2019-11-16 16:54:00,822 : INFO : 1st phase: constructing (10190, 200) action matrix
2019-11-16 16:54:00,833 : INFO : orthonormalizing (10190, 200) action matrix
2019-11-16 16:54:01,320 : INFO : 2nd phase: running

In [17]:
# Get its bag of words
corpus_search_keyword = dictionary.doc2bow(search_text)
# print 'corpus_search_keyword:', corpus_search_keyword

tfidf_search_keyword = tfidfModel[corpus_search_keyword]
# print 'tfidf_search_keyword:', tfidf_search_keyword

lsi_search_keyword = lsiModel[tfidf_search_keyword]
# print 'lsi_search_keyword:',lsi_search_keyword

#################################################################
# find the similarity of search keyword to the documents in lsi space

sims = index[lsi_search_keyword]  # perform a similarity query against the corpus
# print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples

sims = sorted(enumerate(sims), key=lambda item: -item[1])

# sims = sorted(enumerate(sims), key=lambda item: -item[-1][1])

count = 0
articles2bsent = []
maxArticlesTobSent = 10
for internal_id, (document_id, lsicosine) in enumerate(sims):
    article = idToDocumentDict[document_id]
    # print(internal_id, (document_id, lsicosine), article)

    articles2bsent.append(article)
    if count >= maxArticlesTobSent:
        break

    count += 1

pprint(articles2bsent)

2019-11-16 16:54:01,618 : INFO : creating matrix with 300 documents and 100 features
2019-11-16 16:54:01,619 : INFO : creating dense shard #0
2019-11-16 16:54:01,620 : INFO : saving index shard to C:\Users\sasid\AppData\Local\Temp\lsimodel.0
2019-11-16 16:54:01,620 : INFO : saving MatrixSimilarity object under C:\Users\sasid\AppData\Local\Temp\lsimodel.0, separately None
2019-11-16 16:54:01,622 : INFO : saved C:\Users\sasid\AppData\Local\Temp\lsimodel.0
2019-11-16 16:54:01,623 : INFO : loading MatrixSimilarity object from C:\Users\sasid\AppData\Local\Temp\lsimodel.0
2019-11-16 16:54:01,631 : INFO : loaded C:\Users\sasid\AppData\Local\Temp\lsimodel.0


[b'A British man has been found guilty by a unanimous verdict of the kidnap and'
 b' murder of an eight-year-old schoolgirl whose death in July 2000 shocked Bri'
 b'tain and set off a rampage of anti-paedophile vigilantes. Roy Whiting was se'
 b'ntenced to life imprisonment for the abduction and murder of eight-year-old '
 b'Sarah Payne with a recommendation by trial judge Justice Richard Curtis that'
 b' he never be released. "You are indeed an evil man. You are in no way mental'
 b'ly unwell. I have seen you for a month and in my view you are a glib and cun'
 b'ning liar," Justice Curtis said. There were cheers of delight as the verdict'
 b's were read out by the foreman at Lewes Crown Court. The jury of nine men an'
 b'd three women had been deliberating for nine hours. As soon as the verdicts '
 b"were declared, the court heard details of Whiting's previous conviction for "
 b'the kidnap and indecent assault of a nine-year-old girl in 1995. Prosecutor '
 b'Timothy Langdale told the