In [None]:
# Table of contents

# text processing
# document similarity based on TF-IDF
# document similarity based on LDA

In [2]:
# text processing ############################

from gensim import corpora, models, similarities,matutils
from gensim.parsing.preprocessing import preprocess_string

# Load the training corpus.
# The file holds one project description per line; every line goes through
# gensim's preprocess_string (lower-casing, stop-word removal, stemming, ...)
# and comes back as a list of stemmed terms.
with open('ProjectTexts.txt', 'r', encoding='utf-8') as f:
    train = [preprocess_string(raw_line) for raw_line in f]

print('training data: ',train[0])

# Map every distinct term seen in the training documents to an integer id
dictionary = corpora.Dictionary(train)

# Load the test text: only the first line of the file is read
with open('ProjectTexts_TestText.txt', 'r', encoding='utf-8') as test_file:
    first_line = test_file.readline()

# Same preprocessing as the training data, then convert the tokens to a
# bag-of-words vector using the training dictionary
test = preprocess_string(first_line)
test_vec = dictionary.doc2bow(test)
print('test texts: ',test)

training data:  ['credit', 'card', 'fraud', 'detect', 'import', 'credit', 'card', 'compani', 'abl', 'recogn', 'fraudul', 'credit', 'card', 'transact', 'custom', 'charg', 'item', 'purchas', 'dataset', 'contain', 'transact', 'credit', 'card', 'septemb', 'european', 'cardhold', 'dataset', 'present', 'transact', 'occur', 'dai', 'fraud', 'transact', 'dataset', 'highli', 'unbalanc', 'posit', 'class', 'fraud', 'account', 'transact', 'contain', 'numer', 'input', 'variabl', 'result', 'pca', 'transform', 'unfortun', 'confidenti', 'issu', 'provid', 'origin', 'featur', 'background', 'inform', 'data', 'featur', 'princip', 'compon', 'obtain', 'pca', 'featur', 'transform', 'pca', 'time', 'featur', 'time', 'contain', 'second', 'elaps', 'transact', 'transact', 'dataset', 'featur', 'transact', 'featur', 'exampl', 'depend', 'cost', 'senstiv', 'learn', 'featur', 'class', 'respons', 'variabl', 'take', 'valu', 'case', 'fraud']
test texts:  ['health', 'insur', 'marketplac', 'public', 'us', 'file', 'contain',

In [3]:
# document similarity by using TF-IDF weighted matrix ############################


# Vocabulary size: number of unique terms registered in the dictionary
count = len(dictionary.token2id)
print('total terms: ',count)

# Persist the dictionary to disk
dictionary.save('dict.txt')

# Bag-of-words corpus: one list of (term_id, term_frequency) pairs per document
corpus_tf = list(map(dictionary.doc2bow, train))
print('corpus size:',len(corpus_tf))

# Fit the TF-IDF model on the bag-of-words corpus.
# NOTE: despite its name, `corpus_tfidf` is the fitted model; apply it with
# corpus_tfidf[bow] to obtain TF-IDF weighted vectors.
corpus_tfidf = models.TfidfModel(corpus_tf)

# Dump the TF-IDF vectors, one document per line.
# Each vector lists only the terms that occur in that document.
with open('ProjectTexts_TFIDF.txt', 'w', encoding='utf-8') as tfidf_file:
    tfidf_file.writelines(f'{weights}\n' for weights in corpus_tfidf[corpus_tf])
    print('TF-IDF file saved.')

# Show the raw term frequencies of the first document ...
print('TF in doc1:',corpus_tf[0])
# ... and the same document after TF-IDF weighting
print('TF-idf in doc1',corpus_tfidf[corpus_tf[0]])

from sklearn.metrics.pairwise import cosine_similarity
from gensim.similarities.docsim import MatrixSimilarity

# Cosine-similarity index over the TF-IDF weighted training documents.
# Every query must be expressed in the same TF-IDF space as the indexed
# vectors, so raw bag-of-words vectors are passed through the TF-IDF model
# (corpus_tfidf[...]) before they are looked up in the index.
index = similarities.MatrixSimilarity(corpus_tfidf[corpus_tf], num_features=count)

# similarity between the test document and all documents in train
test_vec_tfidf = corpus_tfidf[test_vec]
sims = index[test_vec_tfidf]
print('\nSimilarity between test text and others:\n',list(enumerate(sims)))

# similarity between the first training document and all documents in train
# (the weighted query makes doc0's similarity to itself come out as 1.0)
doc0 = corpus_tfidf[corpus_tf[0]]
sims = index[doc0]
print('\nSimilarity between doc0 and others:\n',list(enumerate(sims)))

# get top-10 most similar documents: with num_best set, the index returns
# only the 10 best (document_position, similarity) pairs
index = similarities.MatrixSimilarity(corpus_tfidf[corpus_tf], num_features=count, num_best=10)
sims = index[test_vec_tfidf]
print('\nSimilarity between test text and others (top-10):\n',sims)

total terms:  1289
corpus size: 50
TF-IDF file saved.
TF in doc1: [(0, 1), (1, 1), (2, 1), (3, 4), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 3), (12, 1), (13, 4), (14, 1), (15, 1), (16, 1), (17, 4), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 7), (24, 4), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 3), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 2), (51, 8), (52, 2), (53, 1), (54, 1), (55, 1), (56, 2)]
TF-idf in doc1 [(0, 0.04730669241449516), (1, 0.043560889562723706), (2, 0.08037265141657753), (3, 0.23120649190054587), (4, 0.08037265141657753), (5, 0.043560889562723706), (6, 0.05780162297513647), (7, 0.13226383600832953), (8, 0.04730669241449516), (9, 0.06613191800416476), (10, 0.08037265141657753), (11, 0.07022915330077939), (12, 0.05780162297513647), (13, 0.26452767201665905), (14, 0.06613191800416476), (15, 0.0403938583

In [4]:
# document similarity by using LDA ############################

# TF-IDF weighted view of the training corpus; this is what LDA is trained on
corpus_tfidfdata = corpus_tfidf[corpus_tf]

# build LDA based on 10 topics
num_topics = 10
# random_state pins the model's random initialisation so that re-running the
# cell yields the same topics and the same similarity scores
lda = models.LdaModel(corpus_tfidfdata, id2word=dictionary,
                      num_topics=num_topics, random_state=0)
print('\nPrint topics:\n',lda.print_topics(num_topics))

# represent documents by using the distribution over topics
corpus_doc2topic = lda[corpus_tfidfdata]
print('\nPrint topic-vector for doc0:\n',corpus_doc2topic[0])

# Similarity index in topic space: one feature per topic
index = similarities.MatrixSimilarity(corpus_doc2topic, num_features=num_topics)

# Project the test document the same way the training documents were
# projected: bag-of-words -> TF-IDF -> LDA topic distribution.
test_vec_lda = lda[corpus_tfidf[test_vec]]
# similarity between the test document and all documents in train
sims = index[test_vec_lda]
print('\nSimilarity between test text and others:\n',list(enumerate(sims)))

# similarity between the first training document and all documents in train
doc0 = corpus_doc2topic[0]
sims = index[doc0]
print('\nSimilarity between doc0 and others:\n',list(enumerate(sims)))

# get top-10 most similar documents.
# The index has to live in the same topic space as the query vector, so it is
# built from corpus_doc2topic with num_topics features; an index over the
# TF-IDF corpus would read the topic ids in the query as vocabulary ids.
index = similarities.MatrixSimilarity(corpus_doc2topic, num_features=num_topics, num_best=10)
sims = index[test_vec_lda]
print('\nSimilarity between test text and others (top-10):\n',sims)


Print topics:
 [(0, '0.005*"user" + 0.005*"anim" + 0.004*"transact" + 0.004*"chicago" + 0.004*"gym" + 0.004*"homicid" + 0.004*"iri" + 0.004*"french" + 0.003*"predict" + 0.003*"featur"'), (1, '0.003*"metacrit" + 0.003*"activ" + 0.003*"elect" + 0.003*"sale" + 0.003*"game" + 0.003*"candid" + 0.003*"correl" + 0.003*"download" + 0.003*"republican" + 0.003*"observ"'), (2, '0.006*"sale" + 0.004*"deputi" + 0.004*"movi" + 0.003*"epidem" + 0.003*"zika" + 0.003*"menu" + 0.003*"actor" + 0.003*"director" + 0.003*"nutrit" + 0.003*"game"'), (3, '0.005*"appoint" + 0.005*"postcod" + 0.005*"visa" + 0.004*"feder" + 0.004*"happi" + 0.004*"return" + 0.003*"flood" + 0.003*"rate" + 0.003*"tax" + 0.003*"risk"'), (4, '0.005*"emot" + 0.004*"homicid" + 0.004*"physic" + 0.003*"murder" + 0.003*"datasheet" + 0.003*"chang" + 0.003*"perpetr" + 0.002*"project" + 0.002*"victim" + 0.002*"relationship"'), (5, '0.005*"contest" + 0.005*"voic" + 0.005*"review" + 0.005*"mushroom" + 0.004*"world" + 0.004*"indic" + 0.004*"ama