In [1]:
import pandas as pd

# Load the dataset: read the CSV (path is relative to the working directory) into `df`.
# NOTE(review): the topic output further down contains fragments such as "weâ" and
# "youâ", which suggests mis-encoded apostrophes in the text — confirm the file's
# encoding before relying on the results.
df = pd.read_csv("blueprint-health-blogs.csv")

# PREPROCESSING

Text preprocessing: Clean and preprocess the text data by removing stopwords and punctuation and performing any other necessary cleaning steps. The text is also tokenized and lemmatized.

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download the NLTK resources used below (a no-op when they are already installed).
# "omw-1.4" is included because it was otherwise fetched only by a commented-out
# cell, so a fresh environment running the notebook top to bottom would not get it.
# NOTE(review): presumably required by the WordNet lemmatizer on some NLTK versions — confirm.
for resource in ("stopwords", "punkt", "wordnet", "omw-1.4"):
    nltk.download(resource)

# Create stopwords list
# NOTE: this rebinds the name `stopwords` (imported above from nltk.corpus) to a
# plain set of English stopwords. After this line the corpus object is no longer
# reachable under that name unless it is imported again.
stopwords = set(stopwords.words("english"))

# Create lemmatizer (shared by every call to preprocess_text below)
lemmatizer = WordNetLemmatizer()

# Text preprocessing function
# Characters treated as punctuation when filtering tokens.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmas.

    Steps: lowercase, tokenize with ``word_tokenize``, drop English stopwords
    and punctuation-only tokens, lemmatize what is left, and join with spaces.

    Relies on the module-level ``stopwords`` set and ``lemmatizer`` instance.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the NaN pandas uses for a missing cell) is treated as empty.

    Returns:
        The processed text as a single string; ``""`` for non-string input.
    """
    # Missing cells arrive as float NaN and have no .lower(); map them to an
    # empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return ""

    tokens = word_tokenize(text.lower())

    # A token is punctuation when *every* character is a punctuation mark.
    # A plain `token in string.punctuation` is a substring test, so it misses
    # multi-character tokens such as "``", "''", "..." and "--".
    kept = [
        token
        for token in tokens
        if token not in stopwords
        and not all(ch in _PUNCTUATION_CHARS for ch in token)
    ]

    return " ".join(lemmatizer.lemmatize(token) for token in kept)

# Apply text preprocessing to the "Description" column and store the result in a new "processed_text" column
df["processed_text"] = df["Description"].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\13477\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\13477\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\13477\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Disabled cell: one-off download of the NLTK "omw-1.4" package. The output shown
# below was produced while these lines were still active.
##import nltk
##nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\13477\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# VECTORIZATION

Vectorize the text data: Convert the preprocessed text data into numerical vectors using techniques like TF-IDF or CountVectorizer.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer with scikit-learn's default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the processed text and build the document-term matrix X.
# NOTE(review): X is passed to LatentDirichletAllocation in the next section; LDA is
# commonly fit on raw term counts (CountVectorizer) rather than TF-IDF weights —
# confirm that TF-IDF is intended here.
X = vectorizer.fit_transform(df["processed_text"])


# LDA

Topic modeling using LDA: Apply Latent Dirichlet Allocation (LDA) to the vectorized text data to extract topics.

In [7]:
from sklearn.decomposition import LatentDirichletAllocation

# Create an LDA model with 5 topics; the fixed random_state makes the fit repeatable
lda = LatentDirichletAllocation(n_components=5, random_state=42)

# Fit the LDA model to the vectorized data
lda.fit(X)

# Vocabulary terms, in the same order as the columns of lda.components_.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of get_feature_names_out(); fall back to the old name only on releases that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
num_top_words = 10

# Print the highest-weighted words of each topic, strongest first
for topic_idx, topic in enumerate(lda.components_):
    # argsort() is ascending, so the reversed slice yields the indices of the
    # num_top_words largest weights in descending order.
    top_words = [feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]
    print(f"Topic #{topic_idx + 1}:")
    print(", ".join(top_words))
    print()


Topic #1:
dropout, care, payer, code, symptom, mbc, client, insurance, cpt, based

Topic #2:
blueprint, weâ, care, body, mind, clinician, co, client, health, free

Topic #3:
client, session, youâ, ve, itâ, item, mbc, treatment, feel, measurement

Topic #4:
effectiveness, therapist, ema, behavior, strength, plant, mood, sleep, patient, client

Topic #5:
client, friend, trail, session, instead, language, alliance, therapist, person, say



