In [8]:
import pandas as pd
from keyphrase_vectorizers import KeyphraseCountVectorizer
from keybert import KeyBERT
from langdetect import detect

# Lazily-created KeyBERT instance shared by every extract_keywords() call.
_KEYBERT_MODEL = None


def _get_keybert_model():
    """Return the shared KeyBERT model, constructing it on first use."""
    global _KEYBERT_MODEL
    if _KEYBERT_MODEL is None:
        _KEYBERT_MODEL = KeyBERT()
    return _KEYBERT_MODEL


def extract_keywords(text):
    """Extract scored keyphrases from ``text`` with KeyBERT.

    Args:
        text: The document to analyse.

    Returns:
        dict: Maps each extracted keyphrase (one or two words, per
        ``keyphrase_ngram_range=(1, 2)``) to the score KeyBERT returned for it.
    """
    # The earlier version also fitted a KeyphraseCountVectorizer here, but its
    # matrix and feature names were never used, so that work has been dropped.
    # The KeyBERT model is reused across calls instead of being rebuilt each time.
    keywords = _get_keybert_model().extract_keywords(
        docs=[text], keyphrase_ngram_range=(1, 2)
    )

    # For a one-document list the result is a list of (phrase, score) pairs,
    # which converts directly into a dict.
    return dict(keywords)

def filter_keywords_by_language(keywords, target_language='en', detector=None):
    """Keep only the keywords whose detected language equals ``target_language``.

    Args:
        keywords: Mapping of keyword -> score.
        target_language: Language code to keep; compared for equality with the
            value the detector returns (default ``'en'``).
        detector: Optional callable taking a string and returning a language
            code. Defaults to the ``detect`` function imported from langdetect.

    Returns:
        dict: A new dict holding the surviving ``keyword -> score`` items in
        their original order. The input mapping is not modified.
    """
    if detector is None:
        detector = detect

    language_filtered_keywords = {}
    for keyword, score in keywords.items():
        try:
            detected_language = detector(keyword)
        except Exception:
            # A detector failure on one keyword (e.g. a string with nothing to
            # classify) means "language unknown": drop that keyword only,
            # rather than aborting the whole filter.
            continue
        if detected_language == target_language:
            language_filtered_keywords[keyword] = score

    return language_filtered_keywords

def get_ngram_theme(text, top_n=5):
    """Return up to ``top_n`` language-filtered keywords for ``text``.

    Args:
        text: The document to analyse.
        top_n: Maximum number of keywords to return (default 5).

    Returns:
        dict: ``keyword -> score`` for the highest-scoring keywords that pass
        ``filter_keywords_by_language``. If any step raises an ordinary
        exception, the sentinel ``{"Null": 0.5}`` is returned instead.
    """
    try:
        # Extract keywords from the text, then keep only the target language.
        extracted_keywords = extract_keywords(text)
        filtered_by_language = filter_keywords_by_language(extracted_keywords)

        # Sort by score so "top" does not depend on the order the extractor
        # happened to return; the sort is stable, so ties keep their order.
        ranked = sorted(
            filtered_by_language.items(),
            key=lambda item: item[1],
            reverse=True,
        )

        # Slicing already copes with fewer than top_n items.
        return dict(ranked[:top_n])
    except Exception:
        # Best-effort fallback for callers; unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return {"Null": 0.5}

# Example usage: run get_ngram_theme on a short English passage and print the result.
# NOTE(review): the recorded output shows this guard passes when the cell is run
# in the notebook, so the demo executes every time the cell is executed.
if __name__ == "__main__":
    # Sample document used as input for the keyword pipeline.
    text = """
    Supervised learning is the machine learning task of learning a function that
    maps an input to an output based on example input-output pairs. It infers a
    function from labeled training data consisting of a set of training examples.
    In supervised learning, each example is a pair consisting of an input object
    (typically a vector) and a desired output value (also called the supervisory signal).
    A supervised learning algorithm analyzes the training data and produces an inferred function,
    which can be used for mapping new examples. An optimal scenario will allow for the
    algorithm to correctly determine the class labels for unseen instances. This requires
    the learning algorithm to generalize from the training data to unseen situations in a
    'reasonable' way (see inductive bias).
    """

    # Prints the dict returned by get_ngram_theme (keyword -> score).
    ngram_theme = get_ngram_theme(text)
    print(ngram_theme)


2023-10-19 10:36:54,151 - KeyphraseVectorizer - INFO - It looks like you do not have downloaded a list of stopwords yet. It is attempted to download the stopwords now.
INFO:KeyphraseVectorizer:It looks like you do not have downloaded a list of stopwords yet. It is attempted to download the stopwords now.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

{'supervised learning': 0.6779, 'labeled training': 0.6013}


In [6]:
# %pip installs into the running kernel's environment; the version is pinned to the one recorded in this cell's output.
%pip install langdetect==1.0.9

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993224 sha256=a6ff46fe3e1a475e5ca1711efbe915487dac67ec0ca300fa879446014f4cf0c1
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [2]:
# %pip installs into the running kernel's environment; the version is pinned to the one recorded in this cell's output.
%pip install keyphrase_vectorizers==0.0.11

Collecting keyphrase_vectorizers
  Downloading keyphrase_vectorizers-0.0.11-py3-none-any.whl (29 kB)
Collecting spacy-transformers>=1.1.6 (from keyphrase_vectorizers)
  Downloading spacy_transformers-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (197 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.8/197.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<4.35.0,>=3.4.0 (from spacy-transformers>=1.1.6->keyphrase_vectorizers)
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers>=1.1.6->keyphrase_vectorizers)
  Downloading spacy_alignments-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (313 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.0/314.0 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
Coll

In [4]:
# %pip installs into the running kernel's environment; the version is pinned to the one recorded in this cell's output.
%pip install keybert==0.8.3

Collecting keybert
  Downloading keybert-0.8.3.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.3.8 (from keybert)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers>=0.3.8->keybert)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: keybert, sentence-transformers
  Building wheel for keybert (setup.py) ... [?25l[?25hdone
  Created wheel for keybert: filename=keybert-0.8.3-py3-none-any.whl size=39124 sha256=424621292982da2867452b7d77cf61508e07f0c9613bc684775911b5081c520b
  Stored in direct

In [None]:
# The keyword extraction and language filtering steps live in separate functions
# (extract_keywords and filter_keywords_by_language), which keeps the code organized and easier to maintain.
# Note: no similarity-filtering step or similarity_threshold parameter is defined in the cells shown here;
# if one is added, tune similarity_threshold to control how similar two keywords must be before one is removed.

In [12]:
from itertools import combinations
import nltk
from nltk.util import ngrams

# Fetch the NLTK "punkt" tokenizer data (presumably what nltk.word_tokenize, used below, relies on).
# The recorded output shows the call only reports "already up-to-date" when the package is present.
nltk.download('punkt')  # Download NLTK punkt tokenizer data

def get_ngram_theme(text, max_ngrams=5):
    """Return the language-filtered keywords plus the leading n-grams of ``text``.

    Args:
        text: The document to analyse.
        max_ngrams: Maximum number of uni-, bi- and tri-grams to return for
            each size (default 5).

    Returns:
        tuple: ``(keywords, unigrams, bigrams, trigrams)`` where ``keywords``
        is the list of keywords kept by ``filter_keywords_by_language`` and
        each n-gram list holds the first ``max_ngrams`` n-grams of the
        tokenized text in document order (they are not ranked or
        de-duplicated). If any step raises an ordinary exception the fallback
        ``({"Null": 0.5}, [], [], [])`` is returned; note that its first
        element is a dict, not a list.
    """
    try:
        # Extract keywords from the text, then keep only the target language.
        extracted_keywords = extract_keywords(text)
        filtered_by_language = filter_keywords_by_language(extracted_keywords)

        # Dict keys are already unique, so this is just the keyword list.
        unique_keywords = list(filtered_by_language)

        # Tokenize once and reuse the tokens for every n-gram size.
        words = nltk.word_tokenize(text)

        # Slicing already copes with texts that yield fewer than max_ngrams n-grams.
        unigrams, bigrams, trigrams = (
            list(ngrams(words, size))[:max_ngrams] for size in (1, 2, 3)
        )

        return unique_keywords, unigrams, bigrams, trigrams
    except Exception:
        # Best-effort fallback for callers; unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return {"Null": 0.5}, [], [], []

# Example usage with additional functionality: print the keywords and the
# uni-, bi- and tri-grams returned by get_ngram_theme for a sample passage.
# NOTE(review): the recorded output shows this guard passes when the cell is run
# in the notebook, so the demo executes every time the cell is executed.
if __name__ == "__main__":
    # Sample document used as input for the keyword and n-gram pipeline.
    text = """
    Supervised learning is the machine learning task of learning a function that
    maps an input to an output based on example input-output pairs. It infers a
    function from labeled training data consisting of a set of training examples.
    In supervised learning, each example is a pair consisting of an input object
    (typically a vector) and a desired output value (also called the supervisory signal).
    A supervised learning algorithm analyzes the training data and produces an inferred function,
    which can be used for mapping new examples. An optimal scenario will allow for the
    algorithm to correctly determine the class labels for unseen instances. This requires
    the learning algorithm to generalize from the training data to unseen situations in a
    'reasonable' way (see inductive bias).
    """

    # get_ngram_theme returns a 4-tuple; unpack it and print each part on its own line.
    keywords, unigrams, bigrams, trigrams = get_ngram_theme(text)
    print("Keywords:", keywords)
    print("Unigrams:", unigrams)
    print("Bigrams:", bigrams)
    print("Trigrams:", trigrams)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Keywords: ['supervised learning', 'labeled training']
Unigrams: [('Supervised',), ('learning',), ('is',), ('the',), ('machine',)]
Bigrams: [('Supervised', 'learning'), ('learning', 'is'), ('is', 'the'), ('the', 'machine'), ('machine', 'learning')]
Trigrams: [('Supervised', 'learning', 'is'), ('learning', 'is', 'the'), ('is', 'the', 'machine'), ('the', 'machine', 'learning'), ('machine', 'learning', 'task')]


In [None]:
# The get_ngram_theme function now returns both the language-filtered keywords (unique, because they are dictionary keys)
#  and the first five uni-, bi-, and tri-grams of the tokenized input text, in document order (they are not ranked by importance).