In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF


In [2]:
with open('../data/documents.txt', 'r', encoding='utf-8') as file:
    text = file.read()
paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

In [3]:
# Define a custom list of common words to remove (can be modified as needed)
custom_stopwords = [
    'science', 'understanding', 'continues', 'however', 'also', 'among',
    'using', 'within', 'based', 'many', 'different', 'new', 'including',
    'related', 'often', 'such', 'one', 'two', 'first', 'second'
]

In [4]:
# Convert paragraphs to numerical representation using TF-IDF with enhancements
vectorizer = TfidfVectorizer(
    stop_words='english',    # Remove default English stop words
    max_df=0.85,             # Ignore words appearing in more than 85% of paragraphs
    min_df=2,                # Ignore words appearing in fewer than 2 paragraphs
    max_features=None        # No limit on maximum number of words
)

In [5]:
# Merge custom stop words list with the default stop words list
vectorizer.stop_words_ = vectorizer.get_stop_words().union(custom_stopwords)

X = vectorizer.fit_transform(paragraphs)

# Determine the required number of topics
n_topics = 5

In [6]:
nmf_model = NMF(n_components=n_topics, random_state=42, init='nndsvda', max_iter=500)
W = nmf_model.fit_transform(X)
H = nmf_model.components_

In [7]:
# print the important topics
def print_topics(H, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(H):
        print(f"Topic {topic_idx + 1}:")
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        print(", ".join(top_features))
        print()


In [8]:
feature_names = vectorizer.get_feature_names_out()
print_topics(H, feature_names)

Topic 1:
practical, remain, significant, despite, advances, applications, challenges, continues, sustainable, chronological

Topic 2:
epidemiological, security, navigation, entanglement, augmentation, continues, enhancement, prosthetics, expansion, circuitry

Topic 3:
approaches, promise, traditionally, new, field, combining, dominated, singularities, continues, astrophysical

Topic 4:
biotechnological, dialects, preservation, ecosystems, synthesis, currencies, kinetics, approaches, continues, expansion

Topic 5:
renewable, nanotechnological, robotics, enhancement, security, energy, dialects, biodiversity, civilizations, realities

