In [6]:
import os
from pathlib import Path

import gensim
import gensim.corpora as corpora
import pandas as pd
from gensim.models import LdaModel

In [7]:
# Load datasets
# The data directory defaults to the location used when this notebook was
# written, but can be redirected by setting the TOPIC_MODELLING_DATA_DIR
# environment variable, so the notebook runs on other machines without
# editing code.
DATA_DIR = Path(os.environ.get("TOPIC_MODELLING_DATA_DIR", r"C:\Users\vidya\Topic Modelling_3020"))

print("Loading datasets...")
# Both CSVs are later filtered on a "Thematic" column and read from a
# "cleaned_description" column (see the per-thematic loop below).
courses_df = pd.read_csv(DATA_DIR / "Thematic_course_outlines.csv")
jobs_df = pd.read_csv(DATA_DIR / "Thematic_skillsNer.csv")

Loading datasets...


In [8]:
# Number of LDA topics to fit per thematic area, separately for job postings
# ('job') and course outlines ('course'). The counts are based on coherence
# scores; note that, despite its name, this dict stores topic counts rather
# than the scores themselves.
coherence_scores = dict(
    communications=dict(job=19, course=13),
    computer=dict(job=19, course=13),
    controls=dict(job=20, course=9),
    electronics=dict(job=13, course=10),
    power=dict(job=17, course=9),
)

# Thematic areas to process, in the insertion order of the table above.
thematic_areas = list(coherence_scores)

In [9]:
# Function to perform topic modeling and print results
def perform_and_print_topic_modeling(documents, num_topics, dataset_name,
                                     passes=50, random_state=42, num_words=10):
    """Fit an LDA topic model on tokenised documents and print every topic.

    Parameters
    ----------
    documents : list of list of str
        One list of tokens per document.
    num_topics : int
        Number of topics the LDA model is asked to learn.
    dataset_name : str
        Label used in the printed header and in error messages.
    passes : int, optional
        Number of training passes handed to ``LdaModel`` (default 50).
    random_state : int, optional
        Seed handed to ``LdaModel`` so runs are repeatable (default 42).
    num_words : int, optional
        Number of top words printed for each topic (default 10).

    Returns
    -------
    LdaModel
        The trained model.

    Raises
    ------
    ValueError
        If no vocabulary is left after building and filtering the dictionary
        (for example when ``documents`` is empty).
    """
    dictionary = corpora.Dictionary(documents)
    # no_below=1 keeps even tokens seen in a single document; no_above=0.95
    # drops tokens that occur in more than 95% of the documents.
    dictionary.filter_extremes(no_below=1, no_above=0.95)

    # LdaModel cannot be trained on an empty vocabulary; fail here with a
    # message that names the dataset instead of a context-free error later.
    if len(dictionary) == 0:
        raise ValueError(
            f"No vocabulary available for {dataset_name}: "
            "the document list is empty or every token was filtered out."
        )

    corpus = [dictionary.doc2bow(doc) for doc in documents]

    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                         random_state=random_state, passes=passes)

    # Print the topics
    print(f"\nTopics for {dataset_name} ({num_topics} topics):")
    # print_topics() shows at most 20 topics by default; num_topics=-1 asks
    # for all of them so the listing always matches the header above.
    for topic in lda_model.print_topics(num_topics=-1, num_words=num_words):
        print(topic)

    return lda_model

In [10]:
def _documents_for(frame, thematic):
    """Return token lists for the rows of ``frame`` whose "Thematic" equals ``thematic``.

    Rows with a missing "cleaned_description" are skipped; each remaining
    description is split on ", " into a list of tokens.
    """
    descriptions = frame.loc[frame["Thematic"] == thematic, "cleaned_description"].dropna()
    return descriptions.apply(lambda text: text.split(", ")).tolist()


# Trained models for every thematic area, addressed as
# lda_models[thematic]['job'] and lda_models[thematic]['course'].
# Without this, each iteration would overwrite the previous area's models.
lda_models = {}

for thematic in thematic_areas:
    print(f"\n\n=== Processing thematic area: {thematic.upper()} ===")

    # Extract relevant documents
    job_documents = _documents_for(jobs_df, thematic)
    course_documents = _documents_for(courses_df, thematic)

    # Get number of topics
    num_topics_jobs = coherence_scores[thematic]['job']
    num_topics_courses = coherence_scores[thematic]['course']

    # Perform and print topic modeling
    print("\n--- JOB POSTINGS ---")
    lda_jobs = perform_and_print_topic_modeling(job_documents, num_topics_jobs, "Job Postings")

    print("\n--- COURSE OUTLINES ---")
    lda_courses = perform_and_print_topic_modeling(course_documents, num_topics_courses, "Course Outlines")

    # Keep both models so they remain available after the loop finishes.
    lda_models[thematic] = {'job': lda_jobs, 'course': lda_courses}



=== Processing thematic area: COMMUNICATIONS ===

--- JOB POSTINGS ---

Topics for Job Postings (19 topics):
(0, '0.016*"cctv" + 0.016*"install" + 0.015*"infrastructure" + 0.014*"information security" + 0.014*"best practices" + 0.014*"access policy" + 0.011*"fire alarms" + 0.011*"active directory" + 0.010*"business process" + 0.009*"security system"')
(1, '0.055*"troubleshooting" + 0.035*"installation" + 0.029*"commissioning" + 0.023*"integration" + 0.023*"technical writing" + 0.023*"host" + 0.023*"m" + 0.023*"technical documentation" + 0.023*"fiber coaxial" + 0.023*"systems systems"')
(2, '0.039*"cloud computing" + 0.032*"installation" + 0.032*"system monitoring" + 0.026*"linux" + 0.026*"cloud migration" + 0.019*"troubleshooting" + 0.013*"infrastructure" + 0.013*"information technology" + 0.013*"windows" + 0.013*"system engineering"')
(3, '0.047*"certified professional" + 0.047*"manage" + 0.047*"citrix" + 0.032*"azure cloud" + 0.032*"best practices" + 0.024*"access policy" + 0.024*"