In [2]:
import os
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary
from gensim.models import LdaModel, TfidfModel
from nltk.stem import PorterStemmer

# Download stopwords and tokenize data
nltk.download('stopwords')
nltk.download('punkt_tab')

# Define the folder containing the text files
folder_path = 'doc'

# Set up stop words and preprocessing function
stop_words = set(stopwords.words('english'))
downweight_words = {"assessment", "graded", "credit", "requires", "cmkl" }  # Words to down-weight



def preprocess(text):
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

# Read and preprocess each file in the folder
documents = []
competency_labels = []
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        # Extract competency name from the first line
        first_line = f.readline().strip()
        competency_name = re.findall(r'\((.*?)\)', first_line)
        competency_labels.append(competency_name[0] if competency_name else "Unknown")

        text = f.read()
        documents.append(preprocess(text))

# Check the first processed document
print(documents[0])

# Create a dictionary from the processed documents
dictionary = Dictionary(documents)

# Adjust the frequency of specific words in the dictionary
for word in downweight_words:
    if word in dictionary.token2id:
        token_id = dictionary.token2id[word]
        dictionary.dfs[token_id] *= 0.1  # Down-weight by 90%

# Convert documents into a bag-of-words format
corpus_bow = [dictionary.doc2bow(doc) for doc in documents]

# Create the TF-IDF model
tfidf_model = TfidfModel(corpus_bow)

# Apply the TF-IDF model to the corpus to get the TF-IDF representation
corpus_tfidf = tfidf_model[corpus_bow]

# Train the LDA model using the TF-IDF corpus
num_topics = 50  # Number of topics
lda_model = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_topics, passes=300)

# Print topics with keywords
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx + 1}: {topic}")

# Group documents by top 5 topics
document_groups = {}  # Dictionary to store groups of documents by top 5 topics

print("\nDocument Grouping by Top 5 Topics:")
for i, doc in enumerate(corpus_tfidf):
    # Get topic probabilities for the document
    topic_probabilities = lda_model.get_document_topics(doc)
    # Sort topics by probability in descending order and select the top 5 topics
    top_5_topics = sorted(topic_probabilities, key=lambda x: x[1], reverse=True)[:5]

    # Create a sorted tuple of the top 5 topic IDs to use as a unique key
    top_5_topic_ids = tuple(sorted([topic_id for topic_id, prob in top_5_topics]))

    # Group documents by the unique combination of top 5 topics
    if top_5_topic_ids not in document_groups:
        document_groups[top_5_topic_ids] = []
    document_groups[top_5_topic_ids].append(f"Document {i + 1} (Original: {competency_labels[i]})")

# Print each group of documents
for topic_ids, docs in document_groups.items():
    print(f"\nGroup with Top 5 Topics {topic_ids}:")
    for doc in docs:
        print(f"  {doc}")

# Save the model and dictionary
lda_model.save("lda_model.model")
dictionary.save("dictionary.dict")

# Example: Find top topics for the first document
document_topics = lda_model.get_document_topics(corpus_tfidf[0])
top_5_topics = sorted(document_topics, key=lambda x: x[1], reverse=True)[:5]
print("\nTop 5 topics for the first document:", top_5_topics)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


['identity', 'code', 'required', 'false', 'credits', 'graded', 'responsible', 'instructor', 'puttha', 'sakkaplangkul', 'prerequisites', 'none', 'distribution', 'areas', 'skills', 'assessments', 'skill', 'code', 'knowledge', 'topic', 'differentiation', 'assessment', 'type', 'imported', 'assessment', 'assessment', 'title', 'assessment', 'concept', 'limits', 'derivatives', 'assessment', 'description', 'take', 'quiz', 'show', 'solve', 'problems', 'definition', 'limits', 'derivatives', 'problem', 'solving', 'class', 'complexity', 'level', 'understand', 'skill', 'code', 'knowledge', 'topic', 'differentiation', 'assessment', 'type', 'imported', 'assessment', 'assessment', 'title', 'assessment', 'limits', 'derivatives', 'problem', 'solving', 'assessment', 'description', 'take', 'quiz', 'show', 'solve', 'problems', 'definition', 'limits', 'derivatives', 'class', 'complexity', 'level', 'apply']
Topic 1: 0.008*"random" + 0.004*"ecosystem" + 0.004*"events" + 0.004*"variable" + 0.004*"tournaments" 

In [None]:
!pip install bertopic


Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB

Need to do stemming and input more down weight words

testing changing the valuable like topic number

Use Stemmer


In [9]:
import os
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary
from gensim.models import LdaModel, TfidfModel
from nltk.stem import PorterStemmer

# Download stopwords and tokenize data (if not already downloaded)
nltk.download('stopwords')
nltk.download('punkt_tab')

# Define the folder containing the text files
folder_path = 'doc'

# Set up stop words and preprocessing function
stop_words = set(stopwords.words('english'))
downweight_words = {"assessment", "graded", "credit", "requires", "cmkl", "complex", "level", "detail", "competency", "identity", "description", "instructor" }  # Words to down-weight
#initialize stemmer
stemmer = PorterStemmer()


def preprocess(text):
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    tokens = [stemmer.stem(word) for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

# Read and preprocess each file in the folder
documents = []
competency_labels = []
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        # Extract competency name from the first line
        first_line = f.readline().strip()
        competency_name = re.findall(r'\((.*?)\)', first_line)
        competency_labels.append(competency_name[0] if competency_name else "Unknown")

        text = f.read()
        documents.append(preprocess(text))

# Check the first processed document
print(documents[0])

# Create a dictionary from the processed documents
dictionary = Dictionary(documents)

# Adjust the frequency of specific words in the dictionary
for word in downweight_words:
    if word in dictionary.token2id:
        token_id = dictionary.token2id[word]
        dictionary.dfs[token_id] *= 0.1  # Down-weight by 90%

# Convert documents into a bag-of-words format
corpus_bow = [dictionary.doc2bow(doc) for doc in documents]

# Create the TF-IDF model
tfidf_model = TfidfModel(corpus_bow)

# Apply the TF-IDF model to the corpus to get the TF-IDF representation
corpus_tfidf = tfidf_model[corpus_bow]

# Train the LDA model using the TF-IDF corpus
num_topics = 50  # Number of topics
lda_model = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_topics, passes=300)

# Print topics with keywords
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx + 1}: {topic}")

# Group documents by top 5 topics
document_groups = {}  # Dictionary to store groups of documents by top 5 topics

print("\nDocument Grouping by Top 5 Topics:")
for i, doc in enumerate(corpus_tfidf):
    # Get topic probabilities for the document
    topic_probabilities = lda_model.get_document_topics(doc)
    # Sort topics by probability in descending order and select the top 5 topics
    top_5_topics = sorted(topic_probabilities, key=lambda x: x[1], reverse=True)[:5]

    # Create a sorted tuple of the top 5 topic IDs to use as a unique key
    top_5_topic_ids = tuple(sorted([topic_id for topic_id, prob in top_5_topics]))

    # Group documents by the unique combination of top 5 topics
    if top_5_topic_ids not in document_groups:
        document_groups[top_5_topic_ids] = []
    document_groups[top_5_topic_ids].append(f"Document {i + 1} (Original: {competency_labels[i]})")

# Print each group of documents
for topic_ids, docs in document_groups.items():
    print(f"\nGroup with Top 5 Topics {topic_ids}:")
    for doc in docs:
        print(f"  {doc}")

# Save the model and dictionary
lda_model.save("lda_model.model")
dictionary.save("dictionary.dict")

# Example: Find top topics for the first document
document_topics = lda_model.get_document_topics(corpus_tfidf[0])
top_5_topics = sorted(document_topics, key=lambda x: x[1], reverse=True)[:5]
print("\nTop 5 topics for the first document:", top_5_topics)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


['ident', 'code', 'requir', 'fals', 'credit', 'grade', 'respons', 'instructor', 'puttha', 'sakkaplangkul', 'prerequisit', 'none', 'distribut', 'area', 'skill', 'assess', 'skill', 'code', 'knowledg', 'topic', 'differenti', 'assess', 'type', 'import', 'assess', 'assess', 'titl', 'assess', 'concept', 'limit', 'deriv', 'assess', 'descript', 'take', 'quiz', 'show', 'solv', 'problem', 'definit', 'limit', 'deriv', 'problem', 'solv', 'class', 'complex', 'level', 'understand', 'skill', 'code', 'knowledg', 'topic', 'differenti', 'assess', 'type', 'import', 'assess', 'assess', 'titl', 'assess', 'limit', 'deriv', 'problem', 'solv', 'assess', 'descript', 'take', 'quiz', 'show', 'solv', 'problem', 'definit', 'limit', 'deriv', 'class', 'complex', 'level', 'appli']
Topic 1: 0.014*"magnet" + 0.012*"storag" + 0.011*"platform" + 0.011*"field" + 0.011*"oversea" + 0.010*"ml" + 0.009*"limit" + 0.009*"websit" + 0.008*"storytel" + 0.008*"studi"
Topic 2: 0.009*"leadership" + 0.006*"inclus" + 0.005*"persuas" + 

Use Lemmatizer

In [18]:
import os
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary
from gensim.models import LdaModel, TfidfModel
from nltk.stem import WordNetLemmatizer

# Download stopwords and tokenize data (if not already downloaded)
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Define the folder containing the text files
folder_path = 'doc'

# Set up stop words and preprocessing function
stop_words = set(stopwords.words('english'))
downweight_words = {"assessment", "graded", "credit", "require", "cmkl", "complex", "level", "detail", "competency", "identity", "description", "instructor"}  # Words to down-weight
#initialize stemmer
stemmer = WordNetLemmatizer()


def preprocess(text):
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    tokens = [stemmer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

# Read and preprocess each file in the folder
documents = []
competency_labels = []
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        # Extract competency name from the first line
        first_line = f.readline().strip()
        competency_name = re.findall(r'\((.*?)\)', first_line)
        competency_labels.append(competency_name[0] if competency_name else "Unknown")

        text = f.read()
        documents.append(preprocess(text))

# Check the first processed document
print(documents[0])

# Create a dictionary from the processed documents
dictionary = Dictionary(documents)

#remove low value token
dictionary.filter_extremes(no_below=5, no_above=0.5)  # Remove words in <5 docs or >50% of docs

# Adjust the frequency of specific words in the dictionary
for word in downweight_words:
    if word in dictionary.token2id:
        token_id = dictionary.token2id[word]
        dictionary.dfs[token_id] *= 0.1  # Down-weight by 90%

# Convert documents into a bag-of-words format
corpus_bow = [dictionary.doc2bow(doc) for doc in documents]

# Create the TF-IDF model
tfidf_model = TfidfModel(corpus_bow)

# Apply the TF-IDF model to the corpus to get the TF-IDF representation
corpus_tfidf = tfidf_model[corpus_bow]

# Train the LDA model using the TF-IDF corpus
num_topics = 50  # Number of topics
lda_model = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_topics, passes=300)

# Print topics with keywords
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx + 1}: {topic}")

# Group documents by top 5 topics
document_groups = {}  # Dictionary to store groups of documents by top 5 topics

print("\nDocument Grouping by Top 5 Topics:")
for i, doc in enumerate(corpus_tfidf):
    # Get topic probabilities for the document
    topic_probabilities = lda_model.get_document_topics(doc)
    # Sort topics by probability in descending order and select the top 5 topics
    top_5_topics = sorted(topic_probabilities, key=lambda x: x[1], reverse=True)[:5]

    # Print the top 5 topics for the document
    print(f"Document {i + 1} (Original: {competency_labels[i]}):")
    for topic_id, prob in top_5_topics:
        print(f"  Topic {topic_id}: {prob:.4f}")

    # Create a sorted tuple of the top 5 topic IDs to use as a unique key
    top_5_topic_ids = tuple(sorted([topic_id for topic_id, prob in top_5_topics]))

    # Group documents by the unique combination of top 5 topics
    if top_5_topic_ids not in document_groups:
        document_groups[top_5_topic_ids] = []
    document_groups[top_5_topic_ids].append(f"Document {i + 1} (Original: {competency_labels[i]})")


# Print each group of documents
for topic_ids, docs in document_groups.items():
    print(f"\nGroup with Top 5 Topics {topic_ids}:")
    for doc in docs:
        print(f"  {doc}")

# Save the model and dictionary
lda_model.save("lda_model.model")
dictionary.save("dictionary.dict")

# Example: Find top topics for the first document
document_topics = lda_model.get_document_topics(corpus_tfidf[0])
top_5_topics = sorted(document_topics, key=lambda x: x[1], reverse=True)[:5]



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['identity', 'code', 'required', 'false', 'credit', 'graded', 'responsible', 'instructor', 'puttha', 'sakkaplangkul', 'prerequisite', 'none', 'distribution', 'area', 'skill', 'assessment', 'skill', 'code', 'knowledge', 'topic', 'differentiation', 'assessment', 'type', 'imported', 'assessment', 'assessment', 'title', 'assessment', 'concept', 'limit', 'derivative', 'assessment', 'description', 'take', 'quiz', 'show', 'solve', 'problem', 'definition', 'limit', 'derivative', 'problem', 'solving', 'class', 'complexity', 'level', 'understand', 'skill', 'code', 'knowledge', 'topic', 'differentiation', 'assessment', 'type', 'imported', 'assessment', 'assessment', 'title', 'assessment', 'limit', 'derivative', 'problem', 'solving', 'assessment', 'description', 'take', 'quiz', 'show', 'solve', 'problem', 'definition', 'limit', 'derivative', 'class', 'complexity', 'level', 'apply']
Topic 1: 0.042*"must" + 0.034*"safety" + 0.030*"written" + 0.021*"full" + 0.017*"prototype" + 0.015*"political" + 0.0

with only 10 topic

In [11]:
import os
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary
from gensim.models import LdaModel, TfidfModel
from nltk.stem import WordNetLemmatizer

# Download stopwords and tokenize data (if not already downloaded)
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Define the folder containing the text files
folder_path = 'doc'

# Set up stop words and preprocessing function
stop_words = set(stopwords.words('english'))
downweight_words = {"assessment", "graded", "credit", "requires", "cmkl","complex", "level", "detail", "competency", "identity", "description", "instructor"}  # Words to down-weight
#initialize stemmer
stemmer = WordNetLemmatizer()


def preprocess(text):
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    tokens = [stemmer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

# Read and preprocess each file in the folder
documents = []
competency_labels = []
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        # Extract competency name from the first line
        first_line = f.readline().strip()
        competency_name = re.findall(r'\((.*?)\)', first_line)
        competency_labels.append(competency_name[0] if competency_name else "Unknown")

        text = f.read()
        documents.append(preprocess(text))

# Check the first processed document
print(documents[0])

# Create a dictionary from the processed documents
dictionary = Dictionary(documents)

#remove low value token
dictionary.filter_extremes(no_below=5, no_above=0.5)  # Remove words in <5 docs or >50% of docs

# Adjust the frequency of specific words in the dictionary
for word in downweight_words:
    if word in dictionary.token2id:
        token_id = dictionary.token2id[word]
        dictionary.dfs[token_id] *= 0.1  # Down-weight by 90%

# Convert documents into a bag-of-words format
corpus_bow = [dictionary.doc2bow(doc) for doc in documents]

# Create the TF-IDF model
tfidf_model = TfidfModel(corpus_bow)

# Apply the TF-IDF model to the corpus to get the TF-IDF representation
corpus_tfidf = tfidf_model[corpus_bow]

# Train the LDA model using the TF-IDF corpus
num_topics = 10  # Number of topics
lda_model = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_topics, passes=300)

# Print topics with keywords
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx + 1}: {topic}")

# Group documents by top 5 topics
document_groups = {}  # Dictionary to store groups of documents by top 5 topics

print("\nDocument Grouping by Top 5 Topics:")
for i, doc in enumerate(corpus_tfidf):
    # Get topic probabilities for the document
    topic_probabilities = lda_model.get_document_topics(doc)
    # Sort topics by probability in descending order and select the top 5 topics
    top_5_topics = sorted(topic_probabilities, key=lambda x: x[1], reverse=True)[:5]

    # Print the top 5 topics for the document
    print(f"Document {i + 1} (Original: {competency_labels[i]}):")
    for topic_id, prob in top_5_topics:
        print(f"  Topic {topic_id}: {prob:.4f}")

    # Create a sorted tuple of the top 5 topic IDs to use as a unique key
    top_5_topic_ids = tuple(sorted([topic_id for topic_id, prob in top_5_topics]))

    # Group documents by the unique combination of top 5 topics
    if top_5_topic_ids not in document_groups:
        document_groups[top_5_topic_ids] = []
    document_groups[top_5_topic_ids].append(f"Document {i + 1} (Original: {competency_labels[i]})")


# Print each group of documents
for topic_ids, docs in document_groups.items():
    print(f"\nGroup with Top 5 Topics {topic_ids}:")
    for doc in docs:
        print(f"  {doc}")

# Save the model and dictionary
lda_model.save("lda_model.model")
dictionary.save("dictionary.dict")

# Example: Find top topics for the first document
document_topics = lda_model.get_document_topics(corpus_tfidf[0])
top_5_topics = sorted(document_topics, key=lambda x: x[1], reverse=True)[:5]



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['identity', 'code', 'required', 'false', 'credit', 'graded', 'responsible', 'instructor', 'puttha', 'sakkaplangkul', 'prerequisite', 'none', 'distribution', 'area', 'skill', 'assessment', 'skill', 'code', 'knowledge', 'topic', 'differentiation', 'assessment', 'type', 'imported', 'assessment', 'assessment', 'title', 'assessment', 'concept', 'limit', 'derivative', 'assessment', 'description', 'take', 'quiz', 'show', 'solve', 'problem', 'definition', 'limit', 'derivative', 'problem', 'solving', 'class', 'complexity', 'level', 'understand', 'skill', 'code', 'knowledge', 'topic', 'differentiation', 'assessment', 'type', 'imported', 'assessment', 'assessment', 'title', 'assessment', 'limit', 'derivative', 'problem', 'solving', 'assessment', 'description', 'take', 'quiz', 'show', 'solve', 'problem', 'definition', 'limit', 'derivative', 'class', 'complexity', 'level', 'apply']
Topic 1: 0.008*"industry" + 0.006*"business" + 0.005*"participant" + 0.005*"project" + 0.005*"domain" + 0.004*"strate

with 20 topic

In [15]:
import os
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary
from gensim.models import LdaModel, TfidfModel
from nltk.stem import WordNetLemmatizer

# Download stopwords and tokenize data (if not already downloaded)
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Define the folder containing the text files
folder_path = 'doc'

# Set up stop words and preprocessing function
stop_words = set(stopwords.words('english'))
downweight_words = {"assessment", "graded", "credit", "requires", "cmkl","complex", "level", "detail", "competency", "identity", "description", "instructor" }  # Words to down-weight
#initialize stemmer
stemmer = WordNetLemmatizer()


def preprocess(text):
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    tokens = [stemmer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

# Read and preprocess each file in the folder
documents = []
competency_labels = []
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        # Extract competency name from the first line
        first_line = f.readline().strip()
        competency_name = re.findall(r'\((.*?)\)', first_line)
        competency_labels.append(competency_name[0] if competency_name else "Unknown")

        text = f.read()
        documents.append(preprocess(text))

# Check the first processed document
print(documents[0])

# Create a dictionary from the processed documents
dictionary = Dictionary(documents)

#remove low value token
dictionary.filter_extremes(no_below=5, no_above=0.5)  # Remove words in <5 docs or >50% of docs

# Adjust the frequency of specific words in the dictionary
for word in downweight_words:
    if word in dictionary.token2id:
        token_id = dictionary.token2id[word]
        dictionary.dfs[token_id] *= 0.1  # Down-weight by 90%

# Convert documents into a bag-of-words format
corpus_bow = [dictionary.doc2bow(doc) for doc in documents]

# Create the TF-IDF model
tfidf_model = TfidfModel(corpus_bow)

# Apply the TF-IDF model to the corpus to get the TF-IDF representation
corpus_tfidf = tfidf_model[corpus_bow]

# Train the LDA model using the TF-IDF corpus
num_topics = 20  # Number of topics
lda_model = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_topics, passes=300)

# Print topics with keywords
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx + 1}: {topic}")

# Group documents by top 5 topics
document_groups = {}  # Dictionary to store groups of documents by top 5 topics

print("\nDocument Grouping by Top 5 Topics:")
for i, doc in enumerate(corpus_tfidf):
    # Get topic probabilities for the document
    topic_probabilities = lda_model.get_document_topics(doc)
    # Sort topics by probability in descending order and select the top 5 topics
    top_5_topics = sorted(topic_probabilities, key=lambda x: x[1], reverse=True)[:5]

    # Print the top 5 topics for the document
    print(f"Document {i + 1} (Original: {competency_labels[i]}):")
    for topic_id, prob in top_5_topics:
        print(f"  Topic {topic_id}: {prob:.4f}")

    # Create a sorted tuple of the top 5 topic IDs to use as a unique key
    top_5_topic_ids = tuple(sorted([topic_id for topic_id, prob in top_5_topics]))

    # Group documents by the unique combination of top 5 topics
    if top_5_topic_ids not in document_groups:
        document_groups[top_5_topic_ids] = []
    document_groups[top_5_topic_ids].append(f"Document {i + 1} (Original: {competency_labels[i]})")


# Print each group of documents
for topic_ids, docs in document_groups.items():
    print(f"\nGroup with Top 5 Topics {topic_ids}:")
    for doc in docs:
        print(f"  {doc}")

# Save the model and dictionary
lda_model.save("lda_model.model")
dictionary.save("dictionary.dict")

# Example: Find top topics for the first document
document_topics = lda_model.get_document_topics(corpus_tfidf[0])
top_5_topics = sorted(document_topics, key=lambda x: x[1], reverse=True)[:5]



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['identity', 'code', 'required', 'false', 'credit', 'graded', 'responsible', 'instructor', 'puttha', 'sakkaplangkul', 'prerequisite', 'none', 'distribution', 'area', 'skill', 'assessment', 'skill', 'code', 'knowledge', 'topic', 'differentiation', 'assessment', 'type', 'imported', 'assessment', 'assessment', 'title', 'assessment', 'concept', 'limit', 'derivative', 'assessment', 'description', 'take', 'quiz', 'show', 'solve', 'problem', 'definition', 'limit', 'derivative', 'problem', 'solving', 'class', 'complexity', 'level', 'understand', 'skill', 'code', 'knowledge', 'topic', 'differentiation', 'assessment', 'type', 'imported', 'assessment', 'assessment', 'title', 'assessment', 'limit', 'derivative', 'problem', 'solving', 'assessment', 'description', 'take', 'quiz', 'show', 'solve', 'problem', 'definition', 'limit', 'derivative', 'class', 'complexity', 'level', 'apply']
Topic 1: 0.011*"community" + 0.007*"local" + 0.005*"firm" + 0.005*"web" + 0.005*"economics" + 0.004*"agile" + 0.004*"

use topic number 30


In [16]:
import os
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary
from gensim.models import LdaModel, TfidfModel
from nltk.stem import WordNetLemmatizer

# Download stopwords and tokenize data (if not already downloaded)
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Define the folder containing the text files
folder_path = 'doc'

# Set up stop words and preprocessing function
stop_words = set(stopwords.words('english'))
downweight_words = {"assessment", "graded", "credit", "requires", "cmkl","complex", "level", "detail", "competency", "identity", "description", "instructor"}  # Words to down-weight
#initialize stemmer
stemmer = WordNetLemmatizer()


def preprocess(text):
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    tokens = [stemmer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

# Read and preprocess each file in the folder
documents = []
competency_labels = []
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        # Extract competency name from the first line
        first_line = f.readline().strip()
        competency_name = re.findall(r'\((.*?)\)', first_line)
        competency_labels.append(competency_name[0] if competency_name else "Unknown")

        text = f.read()
        documents.append(preprocess(text))

# Check the first processed document
print(documents[0])

# Create a dictionary from the processed documents
dictionary = Dictionary(documents)

#remove low value token
dictionary.filter_extremes(no_below=5, no_above=0.5)  # Remove words in <5 docs or >50% of docs

# Adjust the frequency of specific words in the dictionary
for word in downweight_words:
    if word in dictionary.token2id:
        token_id = dictionary.token2id[word]
        dictionary.dfs[token_id] *= 0.1  # Down-weight by 90%

# Convert documents into a bag-of-words format
corpus_bow = [dictionary.doc2bow(doc) for doc in documents]

# Create the TF-IDF model
tfidf_model = TfidfModel(corpus_bow)

# Apply the TF-IDF model to the corpus to get the TF-IDF representation
corpus_tfidf = tfidf_model[corpus_bow]

# Train the LDA model using the TF-IDF corpus
num_topics = 30  # Number of topics
lda_model = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_topics, passes=300)

# Print topics with keywords
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx + 1}: {topic}")

# Group documents by top 5 topics
document_groups = {}  # Dictionary to store groups of documents by top 5 topics

print("\nDocument Grouping by Top 5 Topics:")
for i, doc in enumerate(corpus_tfidf):
    # Get topic probabilities for the document
    topic_probabilities = lda_model.get_document_topics(doc)
    # Sort topics by probability in descending order and select the top 5 topics
    top_5_topics = sorted(topic_probabilities, key=lambda x: x[1], reverse=True)[:5]

    # Print the top 5 topics for the document
    print(f"Document {i + 1} (Original: {competency_labels[i]}):")
    for topic_id, prob in top_5_topics:
        print(f"  Topic {topic_id}: {prob:.4f}")

    # Create a sorted tuple of the top 5 topic IDs to use as a unique key
    top_5_topic_ids = tuple(sorted([topic_id for topic_id, prob in top_5_topics]))

    # Group documents by the unique combination of top 5 topics
    if top_5_topic_ids not in document_groups:
        document_groups[top_5_topic_ids] = []
    document_groups[top_5_topic_ids].append(f"Document {i + 1} (Original: {competency_labels[i]})")


# Print each group of documents
for topic_ids, docs in document_groups.items():
    print(f"\nGroup with Top 5 Topics {topic_ids}:")
    for doc in docs:
        print(f"  {doc}")

# Save the model and dictionary
lda_model.save("lda_model.model")
dictionary.save("dictionary.dict")

# Example: Find top topics for the first document
document_topics = lda_model.get_document_topics(corpus_tfidf[0])
top_5_topics = sorted(document_topics, key=lambda x: x[1], reverse=True)[:5]



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['identity', 'code', 'required', 'false', 'credit', 'graded', 'responsible', 'instructor', 'puttha', 'sakkaplangkul', 'prerequisite', 'none', 'distribution', 'area', 'skill', 'assessment', 'skill', 'code', 'knowledge', 'topic', 'differentiation', 'assessment', 'type', 'imported', 'assessment', 'assessment', 'title', 'assessment', 'concept', 'limit', 'derivative', 'assessment', 'description', 'take', 'quiz', 'show', 'solve', 'problem', 'definition', 'limit', 'derivative', 'problem', 'solving', 'class', 'complexity', 'level', 'understand', 'skill', 'code', 'knowledge', 'topic', 'differentiation', 'assessment', 'type', 'imported', 'assessment', 'assessment', 'title', 'assessment', 'limit', 'derivative', 'problem', 'solving', 'assessment', 'description', 'take', 'quiz', 'show', 'solve', 'problem', 'definition', 'limit', 'derivative', 'class', 'complexity', 'level', 'apply']
Topic 1: 0.009*"web" + 0.007*"explainability" + 0.007*"mobile" + 0.007*"prototyping" + 0.006*"compression" + 0.006*"r

use 40 topic


In [19]:
import os
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary
from gensim.models import LdaModel, TfidfModel
from nltk.stem import WordNetLemmatizer

# Download stopwords and tokenize data (if not already downloaded)
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Define the folder containing the text files
folder_path = 'doc'

# Set up stop words and preprocessing function
stop_words = set(stopwords.words('english'))
downweight_words = {"assessment", "graded", "credit", "requires", "cmkl","complex", "level", "detail", "competency", "identity", "description", "instructor"}  # Words to down-weight
#initialize stemmer
stemmer = WordNetLemmatizer()


def preprocess(text):
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    tokens = [stemmer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

# Read and preprocess each file in the folder
documents = []
competency_labels = []
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        # Extract competency name from the first line
        first_line = f.readline().strip()
        competency_name = re.findall(r'\((.*?)\)', first_line)
        competency_labels.append(competency_name[0] if competency_name else "Unknown")

        text = f.read()
        documents.append(preprocess(text))

# Check the first processed document
print(documents[0])

# Create a dictionary from the processed documents
dictionary = Dictionary(documents)

#remove low value token
dictionary.filter_extremes(no_below=5, no_above=0.5)  # Remove words in <5 docs or >50% of docs

# Adjust the frequency of specific words in the dictionary
for word in downweight_words:
    if word in dictionary.token2id:
        token_id = dictionary.token2id[word]
        dictionary.dfs[token_id] *= 0.1  # Down-weight by 90%

# Convert documents into a bag-of-words format
corpus_bow = [dictionary.doc2bow(doc) for doc in documents]

# Create the TF-IDF model
tfidf_model = TfidfModel(corpus_bow)

# Apply the TF-IDF model to the corpus to get the TF-IDF representation
corpus_tfidf = tfidf_model[corpus_bow]

# Train the LDA model using the TF-IDF corpus
num_topics = 40  # Number of topics
lda_model = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_topics, passes=300)

# Print topics with keywords
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx + 1}: {topic}")

# Group documents by top 5 topics
document_groups = {}  # Dictionary to store groups of documents by top 5 topics

print("\nDocument Grouping by Top 5 Topics:")
for i, doc in enumerate(corpus_tfidf):
    # Get topic probabilities for the document
    topic_probabilities = lda_model.get_document_topics(doc)
    # Sort topics by probability in descending order and select the top 5 topics
    top_5_topics = sorted(topic_probabilities, key=lambda x: x[1], reverse=True)[:5]

    # Print the top 5 topics for the document
    print(f"Document {i + 1} (Original: {competency_labels[i]}):")
    for topic_id, prob in top_5_topics:
        print(f"  Topic {topic_id}: {prob:.4f}")

    # Create a sorted tuple of the top 5 topic IDs to use as a unique key
    top_5_topic_ids = tuple(sorted([topic_id for topic_id, prob in top_5_topics]))

    # Group documents by the unique combination of top 5 topics
    if top_5_topic_ids not in document_groups:
        document_groups[top_5_topic_ids] = []
    document_groups[top_5_topic_ids].append(f"Document {i + 1} (Original: {competency_labels[i]})")


# Print each group of documents
for topic_ids, docs in document_groups.items():
    print(f"\nGroup with Top 5 Topics {topic_ids}:")
    for doc in docs:
        print(f"  {doc}")

# Save the model and dictionary
lda_model.save("lda_model.model")
dictionary.save("dictionary.dict")

# Example: Find top topics for the first document
document_topics = lda_model.get_document_topics(corpus_tfidf[0])
top_5_topics = sorted(document_topics, key=lambda x: x[1], reverse=True)[:5]



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['identity', 'code', 'required', 'false', 'credit', 'graded', 'responsible', 'instructor', 'puttha', 'sakkaplangkul', 'prerequisite', 'none', 'distribution', 'area', 'skill', 'assessment', 'skill', 'code', 'knowledge', 'topic', 'differentiation', 'assessment', 'type', 'imported', 'assessment', 'assessment', 'title', 'assessment', 'concept', 'limit', 'derivative', 'assessment', 'description', 'take', 'quiz', 'show', 'solve', 'problem', 'definition', 'limit', 'derivative', 'problem', 'solving', 'class', 'complexity', 'level', 'understand', 'skill', 'code', 'knowledge', 'topic', 'differentiation', 'assessment', 'type', 'imported', 'assessment', 'assessment', 'title', 'assessment', 'limit', 'derivative', 'problem', 'solving', 'assessment', 'description', 'take', 'quiz', 'show', 'solve', 'problem', 'definition', 'limit', 'derivative', 'class', 'complexity', 'level', 'apply']
Topic 1: 0.009*"negotiation" + 0.008*"matrix" + 0.006*"linear" + 0.005*"message" + 0.005*"investigative" + 0.005*"per