In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora
import string

nltk.download('stopwords')
nltk.download('wordnet')
import requests
from bs4 import BeautifulSoup


[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/user/nltk_data...


In [6]:
def get_wikipedia_summary(topic):
    try:
        url = f"https://en.wikipedia.org/wiki/{topic.replace('_', ' ')}"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        summary = soup.find('p').get_text()
        return summary
    except Exception as e:
        return "Error or no summary available"

In [9]:
def preprocess_data(doc_set):
    """
    Perform basic pre-processing of the text data.
    """
    # Preparing the stopwords list
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation) 
    lemma = WordNetLemmatizer()

    # Cleaning and preprocessing
    def clean(doc):
        stop_free = " ".join([word for word in doc.lower().split() if word not in stop])
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
        normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
        return normalized

    doc_clean = [clean(doc).split() for doc in doc_set]  
    return doc_clean

In [12]:
documents = preprocess_data([get_wikipedia_summary('Machine_learning')])

In [35]:

# Assume 'documents' is a list of preprocessed text data for each topic
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=22)

# Viewing the topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.035*"algorithm" + 0.035*"perform" + 0.035*"approach" + 0.035*"study" + 0.035*"artificial" + 0.035*"machine" + 0.035*"learning" + 0.020*"email" + 0.020*"concerned" + 0.020*"needed"
Topic: 1 
Words: 0.022*"machine" + 0.022*"learning" + 0.022*"approach" + 0.022*"study" + 0.022*"perform" + 0.022*"computer" + 0.022*"field" + 0.022*"develop" + 0.022*"artificial" + 0.022*"previous"
Topic: 2 
Words: 0.022*"artificial" + 0.022*"learning" + 0.022*"study" + 0.022*"machine" + 0.022*"approach" + 0.022*"algorithm" + 0.022*"perform" + 0.022*"statistical" + 0.022*"language" + 0.022*"large"
