In [1]:
import pandas as pd
import string
import operator
from collections import defaultdict
from gensim import corpora
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define a list of stopwords and punctuation
stop_words = set(stopwords.words('english'))
punctuation = string.punctuation
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Preprocess the text by tokenizing, removing stopwords, punctuation, and lemmatizing.

    Args:
    text (str): Input text.

    Returns:
    list: List of preprocessed words.
    """
    # Tokenize the text
    words = word_tokenize(text.lower())

    # Remove stopwords, punctuation, and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and word not in punctuation]

    return words

def perform_lda(text_data, num_topics=3, num_words=5, passes=15):
    try:
        preprocessed_data = [preprocess_text(doc) for doc in text_data]
        dictionary = corpora.Dictionary(preprocessed_data)
        corpus = [dictionary.doc2bow(text) for text in preprocessed_data]
        lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes)
        topics = lda_model.print_topics(num_words=num_words)
        words_list = []
        scores_list = []
        for topic in topics:
            topic_number, word_score_pairs = topic
            word_score_pairs = word_score_pairs.split('+')
            words = [pair.split('*')[1].strip() for pair in word_score_pairs]
            scores = [float(pair.split('*')[0]) for pair in word_score_pairs]
            words_list.append(words)
            scores_list.append(scores)
        return words_list, scores_list
    except:
        return [["Null"]], [[0.5]]

def get_unique_words_and_max_scores(text_data):
    try:
        text_data = [text_data]
        num_topics = 2
        num_words = 5
        passes = 15
        words_list, scores_list = perform_lda(text_data, num_topics, num_words, passes)
        all_words = [word for sublist in words_list for word in sublist]
        word_max_scores = defaultdict(float)
        for words, scores in zip(words_list, scores_list):
            for word, score in zip(words, scores):
                if word not in word_max_scores or score > word_max_scores[word]:
                    word_max_scores[word] = score

        singular_word_max_scores = {}

        for word, score in word_max_scores.items():
            singular_word = lemmatizer.lemmatize(word)
            if singular_word not in singular_word_max_scores or score > singular_word_max_scores[singular_word]:
                singular_word_max_scores[singular_word] = score

        res = dict(sorted(singular_word_max_scores.items(), key=operator.itemgetter(1), reverse=True))
        return res
    except:
        return {"Null": 0.5}

# Your new text data
new_text_data = """
(D)ecreasing of energy consumption and environmentally friendly energy resources are the issues in the foreground nowadays. As the electric energy consumed for the illumination is high, long-lasting and low-consumption LED (light-emitting diode) technology gets prominent. There have been made much research regarding the use of photovoltaic systems in meeting the energy demand in housing and industry. However, there is need for more research with regards to photovoltaic systems' integration with energy efficiency systems. In this study, for the environments which have different lighting levels due to daylight factor, there has been proposed a low-cost PV (photovoltaics) based and distributed sensor smart LED illuminating system and there has been acquired 72.075% more energy saving in comparison with conventional LED illuminating system. (C) 2017 Elsevier Inc. All rights reserved
"""

# Get unique words and their maximum scores
word_scores_dict = get_unique_words_and_max_scores(new_text_data)

# Print the result
print(word_scores_dict)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


{'"energy"': 0.058, '"system"': 0.049, '"led"': 0.031, '"research"': 0.022, '"illuminating"': 0.022, '"photovoltaic"': 0.015}
