# Loading

In [None]:
import requests
import json
import pandas as pd

# Show up to 200 characters per cell so titles and text previews stay readable.
pd.set_option('display.max_colwidth', 200)

# Read the corpus from Dropbox, keeping only the two columns used downstream.
corpus_df = pd.read_csv(
    'https://www.dropbox.com/s/pag5jseq2e9wcvb/corpus.csv?raw=1',
    usecols=['title', 'text'],
)

# Number of documents (rows); later cells iterate over range(length).
length = len(corpus_df)
corpus_df.head()

Unnamed: 0,title,text
0,Cross-Evaluation of Term Extraction Tools by Measuring Terminological Saturation,Synopsis of the Refinements and Extensions Compared to the Publication in the Conference Proceedings This submission is a refined and extended paper based on the ICTERI 2017 PhD Symposium paper...
1,Enhancing Public Procurement in the European Union through Constructing and Exploiting an Integrated Knowledge Graph,"Enhancing Public Procurement in the European Union through Constructing and Exploiting an Integrated Knowledge Graph Ahmet Soylu1, Oscar Corcho2, Brian Elvesæter1, Carlos Badenes-Olmedo2, Francisc..."
2,Drugs4Covid: Making drug information available from scientific publications,"Drugs4Covid: Making drug information available from scientific publications Carlos Badenes-Olmedo1, David Chaves-Fraga1, Mar´ıa Poveda-Villal´on1, Ana Iglesias-Molina1, Pablo Calleja1, Socorro Ber..."
3,Distributing Text Mining tasks with librAIry,"Distributing Text Mining tasks with librAIry Carlos Badenes-Olmedo cbadenes@f.upm.es Universidad Polit´ecnica de Madrid Ontology Engineering Group Boadilla del Monte, Spain Jos´e Luis Redondo-Garc..."
4,Large-Scale Semantic Exploration of Scientific Literature using Topic-based Hashing Algorithms,"Semantic Web 0 (0) 1 1 IOS Press Large-Scale Semantic Exploration of Scientific Literature using Topic-based Hashing Algorithms Editor(s): Tomi Kauppinen, Aalto University, Finland; Daniel Garijo,..."


# Preprocessing

Change: adding a stemmer to get similar terms grouped together

Change: also tokenizing the title

In [None]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from nltk.stem.porter import *

def is_valid(token):
  """Decide whether a token should be kept for indexing.

  A token is rejected when its text is a single character (or empty),
  when it is flagged as a digit, or when it is flagged as a stop word.

  Args:
    token: object exposing `text`, `is_digit` and `is_stop`
      (a spaCy token in this notebook).

  Returns:
    True if the token passes all three filters, False otherwise.
  """
  if len(token.text) <= 1:
    return False
  if token.is_digit or token.is_stop:
    return False
  return True

# spaCy pipeline called by preprocess() to split text into tokens and provide their lemmas.
nlp = spacy.load("en_core_web_sm")
# Porter stemmer; preprocess() applies it to each lemma so related word forms share one term.
stemmer = PorterStemmer()

def preprocess(text):
  """Convert raw text into a list of normalized terms.

  The text is run through the module-level spaCy pipeline `nlp`; tokens
  rejected by `is_valid` are dropped, and each remaining token's lemma is
  stemmed with the module-level `stemmer`.

  Args:
    text: the string to process.

  Returns:
    A list of stemmed lemmas, in the order the tokens appear in the text.
  """
  return [stemmer.stem(tok.lemma_) for tok in nlp(text) if is_valid(tok)]

# Tokenize the title and the body separately, then concatenate the two token
# lists per document (title terms first).
corpus_df["tokens"] = (
    corpus_df['title'].apply(preprocess) + corpus_df['text'].apply(preprocess)
)

corpus_df.head()

Unnamed: 0,title,text,tokens
0,Cross-Evaluation of Term Extraction Tools by Measuring Terminological Saturation,Synopsis of the Refinements and Extensions Compared to the Publication in the Conference Proceedings This submission is a refined and extended paper based on the ICTERI 2017 PhD Symposium paper...,"[cross, evalu, term, extract, tool, measur, terminolog, satur, synopsi, refin, extens, compar, public, confer, proceed, submiss, refin, extend, paper, base, icteri, phd, symposium, paper, kosa, et..."
1,Enhancing Public Procurement in the European Union through Constructing and Exploiting an Integrated Knowledge Graph,"Enhancing Public Procurement in the European Union through Constructing and Exploiting an Integrated Knowledge Graph Ahmet Soylu1, Oscar Corcho2, Brian Elvesæter1, Carlos Badenes-Olmedo2, Francisc...","[enhanc, public, procur, european, union, construct, exploit, integr, knowledg, graph, enhanc, public, procur, european, union, construct, exploit, integr, knowledg, graph, ahmet, soylu1, oscar, c..."
2,Drugs4Covid: Making drug information available from scientific publications,"Drugs4Covid: Making drug information available from scientific publications Carlos Badenes-Olmedo1, David Chaves-Fraga1, Mar´ıa Poveda-Villal´on1, Ana Iglesias-Molina1, Pablo Calleja1, Socorro Ber...","[drugs4covid, make, drug, inform, avail, scientif, public, drugs4covid, make, drug, inform, avail, scientif, public, carlo, baden, olmedo1, david, chave, fraga1, mar´ıa, poveda, villal´on1, ana, i..."
3,Distributing Text Mining tasks with librAIry,"Distributing Text Mining tasks with librAIry Carlos Badenes-Olmedo cbadenes@f.upm.es Universidad Polit´ecnica de Madrid Ontology Engineering Group Boadilla del Monte, Spain Jos´e Luis Redondo-Garc...","[distribut, text, mine, task, librairi, distribut, text, mine, task, librairi, carlo, baden, olmedo, cbadenes@f.upm., universidad, polit´ecnica, de, madrid, ontolog, engin, group, boadilla, del, m..."
4,Large-Scale Semantic Exploration of Scientific Literature using Topic-based Hashing Algorithms,"Semantic Web 0 (0) 1 1 IOS Press Large-Scale Semantic Exploration of Scientific Literature using Topic-based Hashing Algorithms Editor(s): Tomi Kauppinen, Aalto University, Finland; Daniel Garijo,...","[larg, scale, semant, explor, scientif, literatur, topic, base, hash, algorithm, semant, web, io, press, larg, scale, semant, explor, scientif, literatur, topic, base, hash, algorithm, editor(, to..."


# TF-IDF 

TF-IDF measures how important a term is to a document. It combines the number of times the term appears in the document itself (term frequency, TF) with the number of documents in the corpus that contain the term (document frequency, DF): the more documents contain a term, the less that term contributes (inverse document frequency, IDF).

Cons:

1.   Context around the words is not detected



In [None]:
from collections import defaultdict
from nltk.text import TextCollection

# Corpus-wide collection of token lists; tf_idf() below scores a term against
# one document relative to this whole collection.
texts = TextCollection(corpus_df['tokens'])
vectors = []

for doc in corpus_df['tokens']:
  # One {term: tf-idf} mapping per document.
  features = defaultdict(int)
  # Score every distinct term exactly once. dict.fromkeys() removes repeats
  # while keeping first-occurrence order, so the mapping has the same keys,
  # order and values as scoring every occurrence would give, without
  # recomputing the score for each repetition of a term.
  for term in dict.fromkeys(doc):
    features[term] = texts.tf_idf(term, doc)
  vectors.append(features)

corpus_df['tf-idf'] = vectors
corpus_df.head()

Unnamed: 0,title,text,tokens,tf-idf
0,Cross-Evaluation of Term Extraction Tools by Measuring Terminological Saturation,Synopsis of the Refinements and Extensions Compared to the Publication in the Conference Proceedings This submission is a refined and extended paper based on the ICTERI 2017 PhD Symposium paper...,"[cross, evalu, term, extract, tool, measur, terminolog, satur, synopsi, refin, extens, compar, public, confer, proceed, submiss, refin, extend, paper, base, icteri, phd, symposium, paper, kosa, et...","{'cross': 0.0008525532959078879, 'evalu': 0.0010744572670414395, 'term': 0.0, 'extract': 0.009039702474861145, 'tool': 0.003630116741873373, 'measur': 0.006548445887300987, 'terminolog': 0.0118674..."
1,Enhancing Public Procurement in the European Union through Constructing and Exploiting an Integrated Knowledge Graph,"Enhancing Public Procurement in the European Union through Constructing and Exploiting an Integrated Knowledge Graph Ahmet Soylu1, Oscar Corcho2, Brian Elvesæter1, Carlos Badenes-Olmedo2, Francisc...","[enhanc, public, procur, european, union, construct, exploit, integr, knowledg, graph, enhanc, public, procur, european, union, construct, exploit, integr, knowledg, graph, ahmet, soylu1, oscar, c...","{'enhanc': 0.005489528400065187, 'public': 0.007984810058742447, 'procur': 0.043521518417300634, 'european': 0.0011908273166532537, 'union': 0.0014806804913326697, 'construct': 0.00228211296978383..."
2,Drugs4Covid: Making drug information available from scientific publications,"Drugs4Covid: Making drug information available from scientific publications Carlos Badenes-Olmedo1, David Chaves-Fraga1, Mar´ıa Poveda-Villal´on1, Ana Iglesias-Molina1, Pablo Calleja1, Socorro Ber...","[drugs4covid, make, drug, inform, avail, scientif, public, drugs4covid, make, drug, inform, avail, scientif, public, carlo, baden, olmedo1, david, chave, fraga1, mar´ıa, poveda, villal´on1, ana, i...","{'drugs4covid': 0.018349366948258117, 'make': 0.00029242766861206117, 'drug': 0.03913563113495007, 'inform': 0.0, 'avail': 0.0003792569138219235, 'scientif': 0.0049542744373601705, 'public': 0.002..."
3,Distributing Text Mining tasks with librAIry,"Distributing Text Mining tasks with librAIry Carlos Badenes-Olmedo cbadenes@f.upm.es Universidad Polit´ecnica de Madrid Ontology Engineering Group Boadilla del Monte, Spain Jos´e Luis Redondo-Garc...","[distribut, text, mine, task, librairi, distribut, text, mine, task, librairi, carlo, baden, olmedo, cbadenes@f.upm., universidad, polit´ecnica, de, madrid, ontolog, engin, group, boadilla, del, m...","{'distribut': 0.0005788067599452932, 'text': 0.0009003660710260116, 'mine': 0.0015249160720076154, 'task': 0.0009478372930061125, 'librairi': 0.005725979251001631, 'carlo': 6.431186221614367e-05, ..."
4,Large-Scale Semantic Exploration of Scientific Literature using Topic-based Hashing Algorithms,"Semantic Web 0 (0) 1 1 IOS Press Large-Scale Semantic Exploration of Scientific Literature using Topic-based Hashing Algorithms Editor(s): Tomi Kauppinen, Aalto University, Finland; Daniel Garijo,...","[larg, scale, semant, explor, scientif, literatur, topic, base, hash, algorithm, semant, web, io, press, larg, scale, semant, explor, scientif, literatur, topic, base, hash, algorithm, editor(, to...","{'larg': 0.00046468098343569044, 'scale': 0.0004537994030901811, 'semant': 0.0002546584782801699, 'explor': 0.0004289362924021758, 'scientif': 0.0012613477701726682, 'literatur': 0.000561779380380..."


# Doc2Vector

Pros:
1.   It takes into consideration the ordering of words within a narrow context, similar to an n-gram model
2.   The combined result generalizes and has a lower dimensionality

Cons:

1. Importance of the terms is not taken into account

In [None]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

# Tag each token list with its position in the corpus so the trained vector
# can be looked up by that same position afterwards.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(corpus_df['tokens'])]

# Doc2Vec training is stochastic. Fix the seed and train on a single worker so
# that a fresh "Restart & Run All" gives the same vectors; with several workers
# thread scheduling changes the training order between runs.
# NOTE(review): gensim's documentation also mentions fixing PYTHONHASHSEED for
# reproducibility across interpreter launches - confirm for the installed version.
D2V_SEED = 42
model = Doc2Vec(documents, vector_size=5, min_count=0, window=2, workers=1, seed=D2V_SEED)

# gensim 4 exposes the document vectors as `dv` (the old `docvecs` name is
# deprecated); fall back to `docvecs` on older releases.
doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs

# Tags are the positions 0..len(documents)-1 assigned above.
docvecs = [doc_vectors[pos] for pos in range(len(documents))]

corpus_df['d2v'] = docvecs
corpus_df.head()

Unnamed: 0,title,text,tokens,tf-idf,d2v
0,Cross-Evaluation of Term Extraction Tools by Measuring Terminological Saturation,Synopsis of the Refinements and Extensions Compared to the Publication in the Conference Proceedings This submission is a refined and extended paper based on the ICTERI 2017 PhD Symposium paper...,"[cross, evalu, term, extract, tool, measur, terminolog, satur, synopsi, refin, extens, compar, public, confer, proceed, submiss, refin, extend, paper, base, icteri, phd, symposium, paper, kosa, et...","{'cross': 0.0008525532959078879, 'evalu': 0.0010744572670414395, 'term': 0.0, 'extract': 0.009039702474861145, 'tool': 0.003630116741873373, 'measur': 0.006548445887300987, 'terminolog': 0.0118674...","[-11.09809, 13.234375, -0.89116114, -1.018338, -11.218707]"
1,Enhancing Public Procurement in the European Union through Constructing and Exploiting an Integrated Knowledge Graph,"Enhancing Public Procurement in the European Union through Constructing and Exploiting an Integrated Knowledge Graph Ahmet Soylu1, Oscar Corcho2, Brian Elvesæter1, Carlos Badenes-Olmedo2, Francisc...","[enhanc, public, procur, european, union, construct, exploit, integr, knowledg, graph, enhanc, public, procur, european, union, construct, exploit, integr, knowledg, graph, ahmet, soylu1, oscar, c...","{'enhanc': 0.005489528400065187, 'public': 0.007984810058742447, 'procur': 0.043521518417300634, 'european': 0.0011908273166532537, 'union': 0.0014806804913326697, 'construct': 0.00228211296978383...","[-8.81651, 14.339468, 0.6958606, -0.16466422, -1.2553015]"
2,Drugs4Covid: Making drug information available from scientific publications,"Drugs4Covid: Making drug information available from scientific publications Carlos Badenes-Olmedo1, David Chaves-Fraga1, Mar´ıa Poveda-Villal´on1, Ana Iglesias-Molina1, Pablo Calleja1, Socorro Ber...","[drugs4covid, make, drug, inform, avail, scientif, public, drugs4covid, make, drug, inform, avail, scientif, public, carlo, baden, olmedo1, david, chave, fraga1, mar´ıa, poveda, villal´on1, ana, i...","{'drugs4covid': 0.018349366948258117, 'make': 0.00029242766861206117, 'drug': 0.03913563113495007, 'inform': 0.0, 'avail': 0.0003792569138219235, 'scientif': 0.0049542744373601705, 'public': 0.002...","[-6.0409255, 13.622596, -0.20537792, -0.8264259, 1.3080983]"
3,Distributing Text Mining tasks with librAIry,"Distributing Text Mining tasks with librAIry Carlos Badenes-Olmedo cbadenes@f.upm.es Universidad Polit´ecnica de Madrid Ontology Engineering Group Boadilla del Monte, Spain Jos´e Luis Redondo-Garc...","[distribut, text, mine, task, librairi, distribut, text, mine, task, librairi, carlo, baden, olmedo, cbadenes@f.upm., universidad, polit´ecnica, de, madrid, ontolog, engin, group, boadilla, del, m...","{'distribut': 0.0005788067599452932, 'text': 0.0009003660710260116, 'mine': 0.0015249160720076154, 'task': 0.0009478372930061125, 'librairi': 0.005725979251001631, 'carlo': 6.431186221614367e-05, ...","[-6.459851, 12.050546, -0.52066594, -1.0036011, -0.20109762]"
4,Large-Scale Semantic Exploration of Scientific Literature using Topic-based Hashing Algorithms,"Semantic Web 0 (0) 1 1 IOS Press Large-Scale Semantic Exploration of Scientific Literature using Topic-based Hashing Algorithms Editor(s): Tomi Kauppinen, Aalto University, Finland; Daniel Garijo,...","[larg, scale, semant, explor, scientif, literatur, topic, base, hash, algorithm, semant, web, io, press, larg, scale, semant, explor, scientif, literatur, topic, base, hash, algorithm, editor(, to...","{'larg': 0.00046468098343569044, 'scale': 0.0004537994030901811, 'semant': 0.0002546584782801699, 'explor': 0.0004289362924021758, 'scientif': 0.0012613477701726682, 'literatur': 0.000561779380380...","[-7.239603, 14.807331, -2.3573773, -1.5848103, 5.0019665]"


# Final model

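Combining tf-idf with word sequences (Doc2Vec): for every pair of documents, the summed tf-idf importance of the terms they share is multiplied by the cosine similarity of their Doc2Vec vectors.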

In [None]:
import math
from scipy import spatial

def importance(x, dic, dic2):
  """Combine the weights of one term from two documents into a single score.

  The score is the Euclidean norm of the two weights,
  sqrt(dic[x]**2 + dic2[x]**2). Alternatives the author experimented with
  (the product of the weights, and their sum divided by their absolute
  difference) are not used.

  Args:
    x: the term to score; must be a key of both mappings.
    dic: term -> weight mapping of the first document.
    dic2: term -> weight mapping of the second document.

  Returns:
    The combined score as a float.
  """
  first_weight = dic[x]
  second_weight = dic2[x]
  return math.sqrt(first_weight**2 + second_weight**2)

# Fetch the columns once; the nested loops below only read from them.
titles = corpus_df["title"]
tfidf_by_doc = corpus_df["tf-idf"]
d2v_by_doc = corpus_df["d2v"]

# For every document, score all documents (itself included) and print them
# from the highest to the lowest combined score.
for query_pos in range(length):
  query_weights = tfidf_by_doc[query_pos]
  query_vector = d2v_by_doc[query_pos]
  ranking = []

  for other_pos in range(length):
    other_weights = tfidf_by_doc[other_pos]

    # Add up the importance of every term the two documents share, walking
    # the terms in the query document's order.
    overlap = 0
    for term in query_weights:
      if term in other_weights:
        overlap += importance(term, query_weights, other_weights)

    # Cosine similarity of the Doc2Vec vectors (1 - cosine distance) scales
    # the term-overlap score.
    similarity = 1 - spatial.distance.cosine(query_vector, d2v_by_doc[other_pos])
    ranking.append([titles[other_pos], overlap * similarity])

  ranking.sort(key=lambda entry: entry[1], reverse=True)
  print(titles[query_pos])
  for title, combined in ranking:
    print(title, ":", combined)
  print("_____________________________________________________________________________________")

Cross-Evaluation of Term Extraction Tools by Measuring Terminological Saturation
Cross-Evaluation of Term Extraction Tools by Measuring Terminological Saturation : 1.3878805463235038
Semantic Saturation in Retrospective Text Document  Collections : 0.6024896789620394
Enhancing Public Procurement in the European Union through Constructing and Exploiting an Integrated Knowledge Graph : 0.27306372824701103
Drugs4Covid: Making drug information available from scientific publications : 0.24544624361459577
Large-Scale Semantic Exploration of Scientific Literature using Topic-based Hashing Algorithms : 0.2293212707536847
Efficient Clustering from Distributions over Topics : 0.21369515001332887
Scalable Cross-lingual Document Similarity through Language-specific Concept Hierarchies : 0.20896840660942304
An initial Analysis of Topic-based Similarity among Scientific Documents based on their Rhetorical Discourse Parts : 0.2029907912913551
Legal Documents Retrieval Across Languages: Topic Hierarch