In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load the CSV file into a DataFrame
df = pd.read_csv('blueprint-health-blogs.csv')

# Preprocessing function
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

# Apply text preprocessing to the "Description" column in the DataFrame
df['processed_text'] = df['Description'].apply(preprocess_text)

# Create a dictionary from the preprocessed texts
dictionary = corpora.Dictionary(df['processed_text'])

# Create a corpus (bag of words) representation of the texts
corpus = [dictionary.doc2bow(text) for text in df['processed_text']]

# Train an LDA model using the corpus and dictionary
lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=5, passes=10)

# Get the topics and their corresponding keywords
topics = lda_model.print_topics(num_topics=5)

# Print the topics and keywords
for topic in topics:
    print(topic)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\13477\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\13477\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\13477\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


(0, '0.081*"," + 0.041*"." + 0.017*"client" + 0.009*"care" + 0.009*"therapist" + 0.006*"treatment" + 0.006*"clinician" + 0.006*":" + 0.006*"data" + 0.006*"help"')
(1, '0.046*"," + 0.034*"." + 0.014*"care" + 0.011*")" + 0.009*"(" + 0.007*"health" + 0.007*"code" + 0.007*"payer" + 0.006*"client" + 0.006*"measurement-based"')
(2, '0.041*"," + 0.020*"." + 0.017*"session" + 0.014*"client" + 0.013*"therapist" + 0.009*"blueprint" + 0.008*"time" + 0.007*"clinician" + 0.006*"care" + 0.006*"week"')
(3, '0.037*"," + 0.012*"." + 0.010*"clinician" + 0.010*"blueprint" + 0.010*"way" + 0.009*"care" + 0.007*"client" + 0.007*"mental" + 0.007*"measurement-based" + 0.007*"health"')
(4, '0.043*"," + 0.032*"." + 0.018*"session" + 0.017*"client" + 0.012*":" + 0.008*"week" + 0.008*"mbc" + 0.008*"feel" + 0.007*"care" + 0.007*"measurement-based"')
