In [None]:
# Sample document: the title and abstract of a paper, used as input for every
# keyword extractor in this notebook.
title = "VECTORIZATION OF TEXT USING DATA MINING METHODS"
text = "In the text mining tasks, textual representation should be not only efficient but also interpretable, as this enables an understanding of the operational logic underlying the data mining models. Traditional text vectorization methods such as TF-IDF and bag-of-words are effective and characterized by intuitive interpretability, but suffer from the «curse of dimensionality», and they are unable to capture the meanings of words. On the other hand, modern distributed methods effectively capture the hidden semantics, but they are computationally intensive, time-consuming, and uninterpretable. This article proposes a new text vectorization method called Bag of weighted Concepts BoWC that presents a document according to the concepts’ information it contains. The proposed method creates concepts by clustering word vectors (i.e. word embedding) then uses the frequencies of these concept clusters to represent document vectors. To enrich the resulted document representation, a new modified weighting function is proposed for weighting concepts based on statistics extracted from word embedding information. The generated vectors are characterized by interpretability, low dimensionality, high accuracy, and low computational costs when used in data mining tasks. The proposed method has been tested on five different benchmark datasets in two data mining tasks; document clustering and classification, and compared with several baselines, including Bag-of-words, TF-IDF, Averaged GloVe, Bag-of-Concepts, and VLAC. The results indicate that BoWC outperforms most baselines and gives 7% better accuracy on average"

# Title and abstract joined into the single string that is passed to each extractor.
full_text = ", ".join((title, text))

banner = "The whole text to be used\n"
print(banner, full_text)

The whole text to be used
 VECTORIZATION OF TEXT USING DATA MINING METHODS, In the text mining tasks, textual representation should be not only efficient but also interpretable, as this enables an understanding of the operational logic underlying the data mining models. Traditional text vectorization methods such as TF-IDF and bag-of-words are effective and characterized by intuitive interpretability, but suffer from the «curse of dimensionality», and they are unable to capture the meanings of words. On the other hand, modern distributed methods effectively capture the hidden semantics, but they are computationally intensive, time-consuming, and uninterpretable. This article proposes a new text vectorization method called Bag of weighted Concepts BoWC that presents a document according to the concepts’ information it contains. The proposed method creates concepts by clustering word vectors (i.e. word embedding) then uses the frequencies of these concept clusters to represent document v

# Rake

In [None]:
pip install multi_rake

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting multi_rake
  Downloading multi_rake-0.0.2-py3-none-any.whl (31 kB)
Collecting pycld2>=0.41
  Downloading pycld2-0.41.tar.gz (41.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pycld2
  Building wheel for pycld2 (setup.py) ... [?25l[?25hdone
  Created wheel for pycld2: filename=pycld2-0.41-cp38-cp38-linux_x86_64.whl size=9833523 sha256=2138ddbb217bc40e44ac99376ea6c9f1619e27e8476c9600f88a887de6f2a19f
  Stored in directory: /root/.cache/pip/wheels/2b/3a/82/d990040cbe6c3527732e931e2925785e83fe9aaa5a11c313ca
Successfully built pycld2
Installing collected packages: pycld2, multi_rake
Successfully installed multi_rake-0.0.2 pycld2-0.41


In [None]:
from multi_rake import Rake

# Rake extractor with the library's default settings.
rake = Rake()

# Stored under a method-specific name so this result neither clobbers nor is
# clobbered by the results of the other extractors in this notebook.
rake_keywords = rake.apply(full_text)

# Show the first ten (phrase, score) pairs.
rake_keywords[:10]

[('data mining methods', 9.0),
 ('operational logic underlying', 9.0),
 ('data mining models', 9.0),
 ('modified weighting function', 9.0),
 ('weighting concepts based', 9.0),
 ('data mining tasks', 9.0),
 ('weighted concepts bowc', 8.5),
 ('low computational costs', 8.5),
 ('text mining tasks', 8.0),
 ('represent document vectors', 7.916666666666666)]

# Gensim

In [None]:
# Gensim can also be used to extract keywords from a given text.
# NOTE: the `gensim.summarization` module was removed in Gensim 4.0, so this
# cell only runs with gensim<4.

# Aliased so the imported function does not share a name with the keyword
# results produced by the other cells.
from gensim.summarization import keywords as gensim_keywords


# Print up to 10 lemmatized keywords together with their scores.
print(gensim_keywords(full_text, words=10, scores=True, lemmatize=True))

[('method', 0.29745612425605206), ('document', 0.29361417291600866), ('concept', 0.26033869092639655), ('mining', 0.2046815733679113), ('vectors', 0.2017427642608579), ('low', 0.17066803843219266), ('text', 0.16283467751402456), ('logic', 0.16190888956939853), ('benchmark', 0.16190888956939722), ('bag', 0.15586763594148423)]


# YAKE

In [None]:
pip install git+https://github.com/LIAAD/yake

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/LIAAD/yake
  Cloning https://github.com/LIAAD/yake to /tmp/pip-req-build-mqljzq7a
  Running command git clone --filter=blob:none --quiet https://github.com/LIAAD/yake /tmp/pip-req-build-mqljzq7a
  Resolved https://github.com/LIAAD/yake to commit 8d71d94ded93fb77f1361f62e5264f19b9c91cd7
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting segtok
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Collecting jellyfish
  Downloading jellyfish-0.9.0.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 KB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: yake, jellyfish
  Building wheel for yake (setup.py) ... [?25l[?25hdone
  Created wheel for yake: filename=yake-0.4.8-py2.py3-none-any.whl size=62600 sha256=5cc1ed5fe9

In [None]:
import yake

# Keep the 10 best candidates; stopwords=None passes no custom stopword list.
kw_extractor = yake.KeywordExtractor(top=10, stopwords=None)

# Stored under a method-specific name so it does not overwrite other cells' `keywords`.
yake_keywords = kw_extractor.extract_keywords(full_text)

# Each entry is a (keyphrase, score) pair. Per the YAKE documentation a lower
# score means a more relevant keyphrase, hence the ascending scores in the output.
for keyphrase, score in yake_keywords:
    print("Keyphrase: ", keyphrase, ": score", score)

Keyphrase:  operational logic underlying : score 0.008502958451052589
Keyphrase:  text vectorization methods : score 0.015613284939549285
Keyphrase:  text vectorization : score 0.02310717508615897
Keyphrase:  Traditional text vectorization : score 0.02325791341228692
Keyphrase:  data mining models : score 0.02830809004349318
Keyphrase:  data mining tasks : score 0.033863083795882626
Keyphrase:  DATA MINING : score 0.03618462463953267
Keyphrase:  text mining tasks : score 0.037652251074155374
Keyphrase:  enables an understanding : score 0.04036782511075581
Keyphrase:  operational logic : score 0.04036782511075581


# TextRank

In [None]:
pip install summa

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting summa
  Downloading summa-1.2.0.tar.gz (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.9/54.9 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: summa
  Building wheel for summa (setup.py) ... [?25l[?25hdone
  Created wheel for summa: filename=summa-1.2.0-py3-none-any.whl size=54410 sha256=cd5e6732ccc1a96c137958543f4693a67274ce1848a7a9080c77c583ef0d553e
  Stored in directory: /root/.cache/pip/wheels/fd/6a/dd/209eb19d5f2266b9cfd06827539bf70435b0ad5fe8244e52d3
Successfully built summa
Installing collected packages: summa
Successfully installed summa-1.2.0


In [None]:
# Aliased so the summa module does not take over the generic name `keywords`,
# which other cells use for their extraction results.
from summa import keywords as summa_keywords


# scores=True returns (keyword, score) pairs instead of bare keywords.
TR_keywords = summa_keywords.keywords(full_text, scores=True)

# Show the first ten pairs.
TR_keywords[:10]

[('methods', 0.29585314188985434),
 ('method', 0.29585314188985434),
 ('document', 0.29300649554724484),
 ('concepts', 0.2597209892723852),
 ('concept', 0.2597209892723852),
 ('mining', 0.20425273810869513),
 ('vectorization', 0.20080655873686565),
 ('word vectors', 0.18267366210822228),
 ('computationally', 0.16718186386765732),
 ('computational', 0.16718186386765732)]

# KeyBERT

In [None]:
pip install keybert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keybert
  Downloading keybert-0.7.0.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.3.8
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rich>=10.4.0
  Downloading rich-13.1.0-py3-none-any.whl (238 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.4/238.4 KB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting commonmark<0.10.0,>=0.9.0
  Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.1/51.1 KB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (

In [None]:
# keybert is installed by the preceding install cell, so the duplicate
# `!pip install keybert` is dropped. `!pip` runs in a subshell and may target a
# different environment than the kernel.
from keybert import KeyBERT

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# You can select any model from sentence-transformers [here](https://www.sbert.net/docs/pretrained_models.html) and pass it to KeyBERT through the `model` argument.
#
# NOTE: For a full overview of all possible transformer models, see sentence-transformers. I would advise either "all-MiniLM-L6-v2" for English documents, or "paraphrase-multilingual-MiniLM-L12-v2" for multilingual documents or documents in any other language.




In [None]:
# Model name passed to KeyBERT's `model` argument.
# Candidates noted by the author: "all-mpnet-base-v2" (used here) and "all-MiniLM-L6-v2".
kw_model = KeyBERT(model="all-mpnet-base-v2")

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Extraction settings: candidate phrases of one or two words, English stop
# words, highlighting enabled, and the 10 best results.
extraction_options = {
    "keyphrase_ngram_range": (1, 2),
    "stop_words": "english",
    "highlight": True,
    "top_n": 10,
}

# Run KeyBERT over the combined title and abstract.
keywords = kw_model.extract_keywords(full_text, **extraction_options)
