# 02 - Topic Modeling

Author: Santosh Yadaw
Email: santoshyadawprl@gmail.com

## 1. Setup

In [32]:
# Imports
import os
import re
import string
import pandas as pd

import nltk
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import LdaModel
from gensim.corpora import Dictionary

from sklearn.cluster import KMeans
import numpy as np

In [26]:
# Constants
BASE_DIR = os.path.dirname(os.getcwd()) 
print(f"BASE_DIR: {BASE_DIR}")

DATA_FOLDER = os.path.join(BASE_DIR,"data", "raw")
print(f"DATA_FOLDER: {DATA_FOLDER}")

FINAL_DATA_FOLDER = os.path.join(BASE_DIR,"data", "processed")
print(f"FINAL_DATA_FOLDER: {FINAL_DATA_FOLDER}")

RAW_DATA_PATH = os.path.join(DATA_FOLDER, "DS2-assessment-simulated-employee-text.xlsx")
print(f"RAW_DATA_PATH: {RAW_DATA_PATH}")

FINAL_DATA_PATH = os.path.join(FINAL_DATA_FOLDER, "processed_data.csv")
print(f"FINAL_DATA_PATH: {FINAL_DATA_PATH}")

stop_words = set(stopwords.words('english'))

BASE_DIR: /home/jupyter/Topic-Review-Extraction-HR
DATA_FOLDER: /home/jupyter/Topic-Review-Extraction-HR/data/raw
FINAL_DATA_FOLDER: /home/jupyter/Topic-Review-Extraction-HR/data/processed
RAW_DATA_PATH: /home/jupyter/Topic-Review-Extraction-HR/data/raw/DS2-assessment-simulated-employee-text.xlsx
FINAL_DATA_PATH: /home/jupyter/Topic-Review-Extraction-HR/data/processed/processed_data.csv


## 2. Data Prep

In [7]:
# Read data
raw_data = pd.read_excel(RAW_DATA_PATH)
raw_data.head()

Unnamed: 0,unique_identifier,employee_feedback,department
0,3565,There's a culture of blame within the company ...,Dept A
1,7323,The company's approach to feedback and perform...,Dept A
2,5008,"While page limits have been set, some departme...",Dept A
3,3460,na,Dept A
4,2179,The culture of collaboration within our team i...,Dept A


In [16]:
# Initialise the wordnet lemmatizer
lemmatizer = WordNetLemmatizer()

def process_text(text):
    # 1. Convert to lower case
    text = text.lower()

    # 2. Remove any digits and words containing digits
    text = ' '.join([word for word in text.split() if not any(c.isdigit() for c in word)])

    # 3. Remove punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # 4. Remove unicode characters
    text = text.encode("ascii", "ignore").decode()
    
    # 5. Remove extrace spaces
    text = re.sub(' +', ' ', text).strip()

    # 6. Remove stop words in English
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]

    # 7. Lemmatize the words
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(lemmatized_tokens)

# Apply processing function
raw_data['clean_employee_feedback'] = raw_data['employee_feedback'].apply(process_text)

In [24]:
# Remove any samples with less than 1 words
raw_data["word_count"] = raw_data['employee_feedback'].apply(lambda x : len(x.split()))
data_final = raw_data[raw_data["word_count"] > 1]
data_final

Unnamed: 0,unique_identifier,employee_feedback,department,clean_employee_feedback,word_count
0,3565,There's a culture of blame within the company ...,Dept A,there culture blame within company make diffic...,25
1,7323,The company's approach to feedback and perform...,Dept A,company approach feedback performance review g...,85
2,5008,"While page limits have been set, some departme...",Dept A,page limit set department ignoring still long ...,31
4,2179,The culture of collaboration within our team i...,Dept A,culture collaboration within team truly someth...,57
5,6830,While the workload can be overwhelming at time...,Dept A,workload overwhelming time appreciate company ...,36
...,...,...,...,...,...
150,7590,Our documentation is thorough. Onboarding new ...,Dept D,documentation thorough onboarding new member q...,14
151,4622,Feedback flows freely. Suggestions to improve ...,Dept D,feedback flow freely suggestion improve seen o...,12
152,2635,Our team leads by example. The standards they ...,Dept D,team lead example standard set motivate excel,13
153,3272,Cross-training and job shadowing help broaden ...,Dept D,crosstraining job shadowing help broaden skill...,16


In [27]:
# Save as final data
data_final.to_csv(FINAL_DATA_PATH, index=False)

## 3. Modeling

### 3.1 Latent Dirichlet Allocation (LDA)

A popular method for topic method that helps identify overarching topics in the entire dataset

#### 3.1.1 All response LDA

In [30]:
dictionary = Dictionary(raw_data['clean_employee_feedback'].str.split())
corpus = [dictionary.doc2bow(text.split()) for text in raw_data['clean_employee_feedback']]
lda = LdaModel(corpus, id2word=dictionary, num_topics=5) # adjust num_topics based on your needs

# Displaying topics
for idx, topic in lda.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.019*"company" + 0.016*"work" + 0.010*"good" + 0.008*"balance" + 0.008*"feel" + 0.008*"process" + 0.007*"expectation" + 0.006*"available" + 0.006*"worklife" + 0.006*"make"
Topic: 1 
Words: 0.022*"work" + 0.016*"company" + 0.014*"feedback" + 0.012*"employee" + 0.010*"would" + 0.010*"difficult" + 0.009*"lack" + 0.009*"like" + 0.008*"need" + 0.008*"workload"
Topic: 2 
Words: 0.033*"feel" + 0.030*"company" + 0.017*"work" + 0.015*"team" + 0.014*"like" + 0.009*"would" + 0.009*"goal" + 0.008*"employee" + 0.008*"manager" + 0.008*"appreciate"
Topic: 3 
Words: 0.012*"work" + 0.008*"make" + 0.008*"effort" + 0.008*"still" + 0.007*"difficult" + 0.007*"workload" + 0.007*"everyone" + 0.006*"even" + 0.006*"company" + 0.006*"level"
Topic: 4 
Words: 0.023*"company" + 0.015*"employee" + 0.013*"work" + 0.013*"would" + 0.012*"help" + 0.010*"could" + 0.009*"feel" + 0.009*"like" + 0.008*"investment" + 0.007*"make"


#### 3.1.2 LDA by Department

For specific concerns by different departments, we can segment the data by department and conduct topic modeling (LDA) for each

In [31]:
departments = raw_data['department'].unique()

for dept in departments:
    dept_df = raw_data[raw_data['department'] == dept]
    dept_corpus = [dictionary.doc2bow(text.split()) for text in dept_df['clean_employee_feedback']]
    
    dept_lda = LdaModel(dept_corpus, id2word=dictionary, num_topics=5) # adjust num_topics if needed
    
    print(f"\nTopics for department {dept}:")
    for idx, topic in dept_lda.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))


Topics for department Dept A:
Topic: 0 
Words: 0.020*"company" + 0.012*"work" + 0.012*"package" + 0.011*"benefit" + 0.010*"feel" + 0.009*"employee" + 0.009*"offer" + 0.008*"workload" + 0.006*"like" + 0.006*"area"
Topic: 1 
Words: 0.023*"company" + 0.013*"feel" + 0.013*"like" + 0.010*"appreciate" + 0.010*"could" + 0.008*"work" + 0.008*"employee" + 0.008*"however" + 0.008*"concern" + 0.007*"na"
Topic: 2 
Words: 0.020*"company" + 0.014*"work" + 0.008*"employee" + 0.008*"success" + 0.008*"incredibly" + 0.008*"feel" + 0.008*"available" + 0.007*"manager" + 0.007*"goal" + 0.007*"share"
Topic: 3 
Words: 0.015*"company" + 0.011*"feedback" + 0.009*"make" + 0.009*"difficult" + 0.009*"mistake" + 0.005*"review" + 0.005*"health" + 0.005*"approach" + 0.005*"performance" + 0.005*"work"
Topic: 4 
Words: 0.015*"feedback" + 0.008*"would" + 0.008*"work" + 0.007*"make" + 0.007*"something" + 0.007*"manager" + 0.007*"everyone" + 0.007*"issue" + 0.006*"employee" + 0.006*"performance"

Topics for department D

#### 3.1.3 Infer profile individuals

To deduce the profile, we can try to cluster employees based on their topic wieghts from the LDA model, suggesting those with similar topic weights might have similar sentiments or converns

In [35]:
# Extracting topic distributions for each response
topic_weights = [lda.get_document_topics(item, minimum_probability=0) for item in corpus]
topic_distr = np.array([[weight for _, weight in item] for item in topic_weights])

# Clustering using KMeans
kmeans = KMeans(n_clusters=5, random_state=42).fit(topic_distr) # adjust the number of clusters based on your needs

# Attaching cluster labels to df
raw_data['cluster'] = kmeans.labels_

# You can then analyze each cluster, exploring potential profiles. Example:
for cluster_num in range(5): # adjust based on the number of clusters used
    cluster_data = raw_data[raw_data['cluster'] == cluster_num]
    print(f"Sample responses from Cluster {cluster_num}:")
    print(cluster_data['clean_employee_feedback'].sample(5)) # showing 5 sample responses from the cluster

Sample responses from Cluster 0:
82     appreciate flexibility offered company sometim...
9      company benefit package generally competitive ...
59     worklife balance option excellent whether need...
77     appreciate company commitment diversity inclus...
144    team celebrates win together maintains momentu...
Name: clean_employee_feedback, dtype: object
Sample responses from Cluster 1:
105           career advancement depends managerial whim
43     opendoor policy management willingness listen ...
41     relook benchmarking investment bank compensati...
24                                                    na
53     would help create positive motivated work envi...
Name: clean_employee_feedback, dtype: object
Sample responses from Cluster 2:
133    clear objective set everyone understood goal p...
21     however doesnt seem much encouragement share m...
2      page limit set department ignoring still long ...
40                                                      
27           

  super()._check_params_vs_input(X, default_n_init=10)
