In [80]:
import pandas as pd

Import dataset

In [81]:
df = pd.read_csv('./Employee_Complaints.csv')
df.head(10)

Unnamed: 0,Genre,Report,Employee Age,Employee Role,Gender
0,Communication Issues,I never receive clear instructions for my tasks.,46,Senior,Male
1,Communication Issues,Updates from management are inconsistent and c...,21,Intern,Female
2,Communication Issues,I often misunderstand what's expected due to v...,49,Senior,Male
3,Communication Issues,Important information is never shared on time.,36,Junior,Male
4,Communication Issues,"My queries go unanswered, making it hard to pr...",24,Intern,Female
5,Communication Issues,"Meetings are held without clear agendas, wasti...",44,Senior,Male
6,Communication Issues,"During changes, communication is poor and leav...",36,Junior,Male
7,Communication Issues,Our team goals are never properly communicated.,38,Junior,Male
8,Communication Issues,Feedback on my performance is hardly provided.,21,Intern,Female
9,Communication Issues,"My job responsibilities are unclear, causing c...",36,Junior,Male


In [82]:
# Info of the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 482 entries, 0 to 481
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Genre          482 non-null    object
 1   Report         480 non-null    object
 2   Employee Age   482 non-null    int64 
 3   Employee Role  482 non-null    object
 4   Gender         482 non-null    object
dtypes: int64(1), object(4)
memory usage: 19.0+ KB


In [83]:
# Check for missing values
df.isnull().sum()

Genre            0
Report           2
Employee Age     0
Employee Role    0
Gender           0
dtype: int64

In [84]:
df.duplicated().sum()

0

Let's find out how many kinds of reports there are! But the important question is, what are they?

In [85]:
df['Genre'].unique()

array(['Communication Issues', 'Workload and Stress',
       'Management Lifestyle', 'Compensation and Benefits',
       'Career Development', 'Workplace Environment',
       'Lack of training and development'], dtype=object)

In [86]:
df_count=df['Genre'].value_counts()
df_count

Genre
Communication Issues                100
Workload and Stress                 100
Management Lifestyle                100
Lack of training and development     70
Workplace Environment                52
Compensation and Benefits            30
Career Development                   30
Name: count, dtype: int64

We can see here that there are 7 types of report

Plotting to visualize data

In [87]:
import plotly.express as px
# Assuming df is your DataFrame
fig = px.scatter(df, x='Employee Role', y='Employee Age', color='Genre')

# Display the plot in the notebook
fig.show()

# Save the Plotly figure as an HTML file
fig.write_html("scatter_plot.html")


Insight:
if we filter out only Lack of training and development we will find that most reports are from inters and juniors and that is makes sense as they are seeking chances for skill enhancement and career growth.
Also if we filter out Workload and Stress we will find that most reports are from Seniors and that is makes sense as they are experiencing challenges and pressures in their roles

Age and Role to Genre

In [88]:
# Assuming df is your DataFrame
fig_1 = px.scatter(df, x='Employee Role', y='Employee Age', color='Genre')

# Display the plot in the notebook
fig_1.show()
# Save the Plotly figure as an HTML file
fig_1.write_html("scatter_plot_1.html")


Data Preprocessing

In [89]:
# Assuming df is your DataFrame
df.drop(['Employee Age', 'Employee Role', 'Gender'], axis=1, inplace=True)

# Display the modified DataFrame
df

Unnamed: 0,Genre,Report
0,Communication Issues,I never receive clear instructions for my tasks.
1,Communication Issues,Updates from management are inconsistent and c...
2,Communication Issues,I often misunderstand what's expected due to v...
3,Communication Issues,Important information is never shared on time.
4,Communication Issues,"My queries go unanswered, making it hard to pr..."
...,...,...
477,Lack of training and development,Opportunities for job rotation and diversifica...
478,Lack of training and development,Leadership doesn't recognize employees' aspira...
479,Lack of training and development,Employees feel unsupported in their pursuit of...
480,Lack of training and development,No structured program for mentoring new employ...


In [90]:
import string

sentences = []

for sentence in df['Report']:
    # Check if the value is a string, otherwise handle accordingly
    if isinstance(sentence, str):
        sentence = sentence.translate(str.maketrans('', '', string.punctuation))
        sentence = sentence.replace('"', '')
        sentences.append(sentence)
    else:
        sentences.append("")  # Replace non-string values with an empty string

df['Reports'] = pd.DataFrame(sentences)


Use of TF-IDF vectorize and Bag of Words (BoW) techniques to vectorize the texts

Tokenization

In [91]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oumaymabamoh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oumaymabamoh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [92]:
from nltk.tokenize import word_tokenize
i=0
all_tokens = []
for report in df['Report']:
  proto=word_tokenize(str(report))
  all_tokens.append(proto)
print(all_tokens)

[['I', 'never', 'receive', 'clear', 'instructions', 'for', 'my', 'tasks', '.'], ['Updates', 'from', 'management', 'are', 'inconsistent', 'and', 'confusing', '.'], ['I', 'often', 'misunderstand', 'what', "'s", 'expected', 'due', 'to', 'vague', 'emails', '.'], ['Important', 'information', 'is', 'never', 'shared', 'on', 'time', '.'], ['My', 'queries', 'go', 'unanswered', ',', 'making', 'it', 'hard', 'to', 'proceed', '.'], ['Meetings', 'are', 'held', 'without', 'clear', 'agendas', ',', 'wasting', 'time', '.'], ['During', 'changes', ',', 'communication', 'is', 'poor', 'and', 'leaves', 'us', 'in', 'the', 'dark', '.'], ['Our', 'team', 'goals', 'are', 'never', 'properly', 'communicated', '.'], ['Feedback', 'on', 'my', 'performance', 'is', 'hardly', 'provided', '.'], ['My', 'job', 'responsibilities', 'are', 'unclear', ',', 'causing', 'confusion', '.'], ['Management', 'seems', 'to', 'ignore', 'our', 'suggestions', 'for', 'improvement', '.'], ['Too', 'much', 'jargon', 'is', 'used', 'in', 'company

In [93]:
df_token = pd.DataFrame({'Tokens': all_tokens})

# print the DataFrame
print(df_token)

                                                Tokens
0    [I, never, receive, clear, instructions, for, ...
1    [Updates, from, management, are, inconsistent,...
2    [I, often, misunderstand, what, 's, expected, ...
3    [Important, information, is, never, shared, on...
4    [My, queries, go, unanswered, ,, making, it, h...
..                                                 ...
477  [Opportunities, for, job, rotation, and, diver...
478  [Leadership, does, n't, recognize, employees, ...
479  [Employees, feel, unsupported, in, their, purs...
480  [No, structured, program, for, mentoring, new,...
481  [Lack, of, training, programs, affects, our, c...

[482 rows x 1 columns]


In [94]:
df['Tokenized']=all_tokens
df

Unnamed: 0,Genre,Report,Reports,Tokenized
0,Communication Issues,I never receive clear instructions for my tasks.,I never receive clear instructions for my tasks,"[I, never, receive, clear, instructions, for, ..."
1,Communication Issues,Updates from management are inconsistent and c...,Updates from management are inconsistent and c...,"[Updates, from, management, are, inconsistent,..."
2,Communication Issues,I often misunderstand what's expected due to v...,I often misunderstand whats expected due to va...,"[I, often, misunderstand, what, 's, expected, ..."
3,Communication Issues,Important information is never shared on time.,Important information is never shared on time,"[Important, information, is, never, shared, on..."
4,Communication Issues,"My queries go unanswered, making it hard to pr...",My queries go unanswered making it hard to pro...,"[My, queries, go, unanswered, ,, making, it, h..."
...,...,...,...,...
477,Lack of training and development,Opportunities for job rotation and diversifica...,Opportunities for job rotation and diversifica...,"[Opportunities, for, job, rotation, and, diver..."
478,Lack of training and development,Leadership doesn't recognize employees' aspira...,Leadership doesnt recognize employees aspirati...,"[Leadership, does, n't, recognize, employees, ..."
479,Lack of training and development,Employees feel unsupported in their pursuit of...,Employees feel unsupported in their pursuit of...,"[Employees, feel, unsupported, in, their, purs..."
480,Lack of training and development,No structured program for mentoring new employ...,No structured program for mentoring new employees,"[No, structured, program, for, mentoring, new,..."


Remove Stopwords

Download ready-to-use stop words from nltk library

In [95]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oumaymabamoh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [96]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [97]:
new_stopwords=['I', 'me', 'my','My', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", "I'm", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',"we're" "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", "ke", "keep", "keeps", "kept", "keys", "kg", "kh", "ki", "kind", "km", "kn", "knew", "know", "known", "knows", "kp", "kr", "kw", "ky", "kz", "l", "la", "large", "largely", "last", "lately", "later", "latest", "latter", "latterly", "lb", "lc", "least", "length", "less", "lest", "let", "let", "lets", "li", "like", "liked", "likely", "likewise", "line", "little", "lk", "ll", "long", "longer", "longest", "look", "looking", "looks", "low", "lower", "lr", "ls", "lt", "ltd", "lu", "lv", "ly", "m", "ma", "made", "mainly", "make", "makes", "making", "man", "many", "may", "maybe", "mayn't", "maynt", "mc", "md", "me", "mean", "means", "meantime", "meanwhile", "member", "members", "men", "merely", "mg", "mh", "microsoft", "might", "mightve", "mightnt", "mightnt", "mil", "mill", "million", "mine", "minus", "miss", "mk", "ml", "mm", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mp", "mq", "mr", "mrs", "ms", "msie", "mt", "mu", "much", "mug", "must", "must've", "mustn't", "mustnt", "mv", "mw", "mx", "my", "myself", "myse", "mz", "n", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needed", "needing", "needn't", "neednt", "needs", "neither", "net", "netscape", "never", "neverf", "neverless", "nevertheless", "new", "newer", "newest", "next", "nf", "ng", "ni", "nine", "ninety", "nl", "no", "no-one", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "notwithstanding", "novel", "now", "nowhere", "np", "nr", "nu", "null", "number", "numbers", "nz", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "older", "oldest", "om", "omitted", "on", "once", "one", "one's", "ones", "only", "onto", "open", "opened", "opening", "opens", "opposite", 'or', "ord", "order", "ordered", "ordering", "orders", "org", "other", "others", "otherwise", "ought", "oughtnt", "oughtnt", "our",
"Sarah","ua", "ug", "uk", "um", "un", "under", "underneath", "undoing", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "up", "upon", "ups", "upwards", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "versus", "very", "via", "viz", "vn", "vol", "vols", "vs", "w", "want", "wanted", "wanting", "wants", "was", "wasn't", "wasnt", "way", "ways", "we", "we'd", "we'll", "we're", "we've", "web", "webpage", "website", "wed", "welcome", "well", "wells", "went", "were", "weren't", "werent", "weve", "wf", "what", "what'll", "what's", "what've", "whatever", "whats", "when", "whence", "whenever", "where", "where's", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "which", "whichever", "while", "whilst", "whim", "whither", "who", "who'd", "who'll", "who's", "whod", "whoever", "whole", "wholl", "whom", "whomever", "whos", "whose", "why", "widely", "width", "will", "willing", "wish", "with", "within", "without", "won't", "wont", "words", "work", "worked", "working", "works", "world", "would", "would've", "wouldn't", "wouldnt", "www", "x", "y", "ye", "year", "years", "yes", "yet", "you", "you'd", "you'll", "you're", "you've", "youd", "youll", "your", "youre", "yours", "yourself", "yourselves", "youve", "yt", "yu", "z", "za", "zero",
"Shereen",
"Tarek",
"Yasmine","'ll", "'tis", "'twas", "'ve", "10", "39", "a", "a's", "able", "ableabout", "about", "above", "abroad", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "adopted", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ago", "ah", "ahead", "ai", "ain't", "aint", "al", "all", "allow", "allows", "almost", "alone", "along", "alongside", "already", "also", "although", "always", "am", "amid", "amidst", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "aq", "ar", "are", "area", "areas", "aren", "aren't", "arent", "arise", "around", "arpa", "as", "aside", "ask", "asked", "asking", "asks", "associated", "at", "au", "auth", "available", "aw", "away", "awfully", "az", "b", "ba", "back", "backed", "backing", "backs", "backward", "backwards", "bb", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "began", "begin", "beginning", "beginnings", "begins", "behind", "being", "beings", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bf", "bg", "bh", "bi", "big", "bill", "billion", "biol", "bj", "bm", "bn", "bo", "both", "bottom", "br", "brief", "briefly", "bs", "bt", "but", "buy", "bv", "bw", "by", "bz", "c", "c'mon", "c's", "ca", "call", "came", "can", "can't", "cannot", "cant", "caption", "case", "cases", "cause", "causes", "cc", "cd", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "ck", "cl", "clear", "clearly", "click", "cm", "cmon", "cn", "co", "co.", "com", "come", "comes", "computer", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "copy", "corresponding", "could", "could've", "couldn", "couldn't", "couldnt", "course", "cr", "cry", "cs", "cu", "currently", "cv", "cx", "cy", "cz",
"Yasser","d","dare","daren't","darent","date","de","dear","definitely","describe","described","despite","detail","did","didn","didn't","didnt","differ","different","differently","directly","dj","dk","dm","do","does","doesn","doesn't","doesnt","doing","don","don't","done","dont","doubtful","down","downed","downing","downs","downwards","due","during","dz","e","each","early","ec","ed","edu","ee","effect","eg","eh","eight","eighty","either","eleven","else","elsewhere","empty","end","ended","ending","ends","enough","entirely","er","es","especially","et","et-al","etc","even","evenly","ever","evermore","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","face","faces","fact","facts","fairly","far","farther","felt","few","fewer","ff","fi","fifteen","fifth","fifty","fify","fill","find","finds","fire","first","five","fix","fj","fk","fm","fo","followed","following","follows","for","forever","former","formerly","forth","forty","forward","found","four","fr","free","from","front","full","fully","further","furthered","furthering","furthermore","furthers","fx","g","ga","gave","gb","gd","ge","general","generally","get","gets","getting","gf","gg","gh","gi","give","given","gives","giving","gl","gm","gmt","gn","go","goes","going","gone","good","goods","got","gotten","gov","gp","gq","gr","great","greater","greatest","greetings","group","grouped","grouping","groups","gs","gt","gu","gw","gy","h","had","hadn't","hadnt","half","happens","hardly","has","hasn","hasn't","hasnt","have","haven","haven't","havent","having","he","he'd","he'll","he's","hed","hell","hello","help","hence","her","here","here's","hereafter","hereby","herein","heres","hereupon","hers","herself","herse","hes","hi","hid","high","higher","highest","him","himself","himse","his","hither","hk","hm","hn","home","homepage","hopefully","how","howd","howll","hows","howbeit","however","hr","ht","htm","html","http","hu","hundred","i","id","ill","im","ive","i.e.","id","ie","if","ignored","ii","il","ill","im","immediate","immediately","importance","important","in","inasmuch","inc","inc.","indeed","index","indicate","indicated","indicates","information","inner","inside","insofar","instead","int","interest","interested","interesting","interests","into","invention","inward","io","iq","ir","is","isn","isn't","isnt","it","it'd","it'll","it's","itd","itll","its","itself","itse”","ive","j","je","jm","jo","join","jp","just","k",
"Youssef", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p", "pa", "page", "pages", "part", "parted", "particular", "particularly", "parting", "parts", "past", "pe", "per", "perhaps", "pf", "pg", "ph", "pk", "pl", "place", "placed", "places", "please", "plus", "pm", "pmid", "pn", "point", "pointed", "pointing", "points", "poorly", "possible", "possibly", "potentially", "pp", "pr", "predominantly", "present", "presented", "presenting", "presents", "presumably", "previously", "primarily", "probably", "problem", "problems", "promptly", "proud", "provided", "provides", "pt", "put", "puts", "pw", "py", "q", "qa", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "reserved", "respectively", "resulted", "resulting", "results", "right", "ring", "ro", "room", "rooms", "round", "ru", "run", "rw", "s", "sa", "said", "same", "saw", "say", "saying", "says", "sb", "sc", "sd", "se", "sec", "second", "secondly", "seconds", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "sees", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "seventy", "several", "sg", "sh", "shall", "shan't", "shant", "she", "she'd", "she'll", "she's", "shed", "shell", "shes", "should", "should've", "shouldn", "shouldn't", "shouldnt", "show", "showed", "showing", "shown", "showns", "shows", "si", "side", "sides", "significant", "significantly", "similar", "similarly", "since", "sincere", "site", "six", "sixty", "sj", "sk", "sl", "slightly", "sm", "small", "smaller", "smallest", "sn", "so", "some", "somebody", "someday", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "sr", "st", "state", "states", "still", "stop", "strongly", "su", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sv", "sy", "system", "sz",]

In [98]:
stpwrd = nltk.corpus.stopwords.words('english')
stpwrd.extend(new_stopwords)

In [99]:
all=[]
for list1 in df['Tokenized']:
  for word in list1:
    if word in new_stopwords:
      list1.remove(word)
  list1=" ".join(list1)
  all.append(list1)

In [100]:
df_token2 = pd.DataFrame({'Tokens': all})
df['cleaned']=df_token2
df[['Report','cleaned']]

Unnamed: 0,Report,cleaned
0,I never receive clear instructions for my tasks.,never receive instructions my tasks .
1,Updates from management are inconsistent and c...,Updates management inconsistent confusing .
2,I often misunderstand what's expected due to v...,often misunderstand 's expected to vague emails .
3,Important information is never shared on time.,Important is shared time .
4,"My queries go unanswered, making it hard to pr...","queries unanswered , it hard proceed ."
...,...,...
477,Opportunities for job rotation and diversifica...,Opportunities job rotation diversification abs...
478,Leadership doesn't recognize employees' aspira...,Leadership n't recognize employees ' aspiratio...
479,Employees feel unsupported in their pursuit of...,Employees feel unsupported their pursuit skill...
480,No structured program for mentoring new employ...,No structured program mentoring employees .


Lemmatization & Stemming

In [103]:
import spacy
import lemminflect
nlp = spacy.load('en_core_web_sm')


In [104]:
from nltk.stem import PorterStemmer
ps=PorterStemmer()
from nltk.stem import WordNetLemmatizer
lem=WordNetLemmatizer()

In [105]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/oumaymabamoh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [106]:
words=word_tokenize(df['cleaned'].iloc[0])
for w in words:
    print(w, " : ", ps.stem(w))

never  :  never
receive  :  receiv
instructions  :  instruct
my  :  my
tasks  :  task
.  :  .


In [107]:
lemminf=[]
words=[]
doc = nlp(df['Reports'].iloc[0])
for token in doc:
    print(token," : ",token._.lemma())

I  :  I
never  :  never
receive  :  receive
clear  :  clear
instructions  :  instruction
for  :  for
my  :  my
tasks  :  task


In [108]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pandas as pd

# Assuming 'cleaned' is your text column
df['cleaned'].fillna("", inplace=True)

# Initialize the Porter Stemmer
ps = PorterStemmer()

# Lists to store lemmatized and stemmed tokens
lemmatized_tokens = []
stemmed_tokens = []

# Loop through each row in the DataFrame
for i in range(len(df)):
    words = word_tokenize(df['cleaned'].iloc[i])

    # Initialize empty lists for lemmatized and stemmed tokens
    lemmatized = []
    stemmed = []

    # Loop through each word in the row
    for w in words:
        # Perform stemming
        stemmed_token = ps.stem(w)

        # Append to the lists
        stemmed.append(stemmed_token)

    # Append lists to the respective token lists
    lemmatized_tokens.append(" ".join(lemmatized))
    stemmed_tokens.append(" ".join(stemmed))

# Add new columns to your DataFrame
df['lemmatized_cleaned'] = lemmatized_tokens
df['stemmed_cleaned'] = stemmed_tokens

# Display the DataFrame
print(df[['cleaned', 'lemmatized_cleaned', 'stemmed_cleaned']])


                                               cleaned lemmatized_cleaned  \
0                never receive instructions my tasks .                      
1          Updates management inconsistent confusing .                      
2    often misunderstand 's expected to vague emails .                      
3                           Important is shared time .                      
4               queries unanswered , it hard proceed .                      
..                                                 ...                ...   
477  Opportunities job rotation diversification abs...                      
478  Leadership n't recognize employees ' aspiratio...                      
479  Employees feel unsupported their pursuit skill...                      
480        No structured program mentoring employees .                      
481       Lack training programs our competitiveness .                      

                                       stemmed_cleaned  
0                 

In [109]:
df['stemmed_cleaned'] = df['stemmed_cleaned'].apply(' '.join)
df

Unnamed: 0,Genre,Report,Reports,Tokenized,cleaned,lemmatized_cleaned,stemmed_cleaned
0,Communication Issues,I never receive clear instructions for my tasks.,I never receive clear instructions for my tasks,"[never, receive, instructions, my, tasks, .]",never receive instructions my tasks .,,n e v e r r e c e i v i n s t r u c t m ...
1,Communication Issues,Updates from management are inconsistent and c...,Updates from management are inconsistent and c...,"[Updates, management, inconsistent, confusing, .]",Updates management inconsistent confusing .,,u p d a t m a n a g i n c o n s i s t c ...
2,Communication Issues,I often misunderstand what's expected due to v...,I often misunderstand whats expected due to va...,"[often, misunderstand, 's, expected, to, vague...",often misunderstand 's expected to vague emails .,,o f t e n m i s u n d e r s t a n d ' s ...
3,Communication Issues,Important information is never shared on time.,Important information is never shared on time,"[Important, is, shared, time, .]",Important is shared time .,,i m p o r t i s s h a r e t i m e .
4,Communication Issues,"My queries go unanswered, making it hard to pr...",My queries go unanswered making it hard to pro...,"[queries, unanswered, ,, it, hard, proceed, .]","queries unanswered , it hard proceed .",,"q u e r i u n a n s w , i t h a r d ..."
...,...,...,...,...,...,...,...
477,Lack of training and development,Opportunities for job rotation and diversifica...,Opportunities for job rotation and diversifica...,"[Opportunities, job, rotation, diversification...",Opportunities job rotation diversification abs...,,o p p o r t u n j o b r o t a t d i v e ...
478,Lack of training and development,Leadership doesn't recognize employees' aspira...,Leadership doesnt recognize employees aspirati...,"[Leadership, n't, recognize, employees, ', asp...",Leadership n't recognize employees ' aspiratio...,,l e a d e r s h i p n ' t r e c o g n e ...
479,Lack of training and development,Employees feel unsupported in their pursuit of...,Employees feel unsupported in their pursuit of...,"[Employees, feel, unsupported, their, pursuit,...",Employees feel unsupported their pursuit skill...,,e m p l o y e f e e l u n s u p p o r t ...
480,Lack of training and development,No structured program for mentoring new employ...,No structured program for mentoring new employees,"[No, structured, program, mentoring, employees...",No structured program mentoring employees .,,n o s t r u c t u r p r o g r a m m e n ...


In [110]:
lemminf = []

for i in range(len(df)):
    words = []

    # Check if the 'Reports' value is not NaN
    if not pd.isna(df['Reports'].iloc[i]):
        doc = nlp(df['Reports'].iloc[i])

        for token in doc:
            if str(token) not in stpwrd:
                words.append(token._.lemma())

        lemminf.append(" ".join(words))
    else:
        lemminf.append("")  # Replace NaN with an empty string

# Create a new column in your DataFrame
df['lemmatized'] = lemminf



In [111]:
dflemm=pd.DataFrame(lemminf)
df['Lemminf']=dflemm
df

Unnamed: 0,Genre,Report,Reports,Tokenized,cleaned,lemmatized_cleaned,stemmed_cleaned,lemmatized,Lemminf
0,Communication Issues,I never receive clear instructions for my tasks.,I never receive clear instructions for my tasks,"[never, receive, instructions, my, tasks, .]",never receive instructions my tasks .,,n e v e r r e c e i v i n s t r u c t m ...,receive instruction task,receive instruction task
1,Communication Issues,Updates from management are inconsistent and c...,Updates from management are inconsistent and c...,"[Updates, management, inconsistent, confusing, .]",Updates management inconsistent confusing .,,u p d a t m a n a g i n c o n s i s t c ...,Update management inconsistent confusing,Update management inconsistent confusing
2,Communication Issues,I often misunderstand what's expected due to v...,I often misunderstand whats expected due to va...,"[often, misunderstand, 's, expected, to, vague...",often misunderstand 's expected to vague emails .,,o f t e n m i s u n d e r s t a n d ' s ...,misunderstand expect vague email,misunderstand expect vague email
3,Communication Issues,Important information is never shared on time.,Important information is never shared on time,"[Important, is, shared, time, .]",Important is shared time .,,i m p o r t i s s h a r e t i m e .,Important share time,Important share time
4,Communication Issues,"My queries go unanswered, making it hard to pr...",My queries go unanswered making it hard to pro...,"[queries, unanswered, ,, it, hard, proceed, .]","queries unanswered , it hard proceed .",,"q u e r i u n a n s w , i t h a r d ...",query unanswered hard proceed,query unanswered hard proceed
...,...,...,...,...,...,...,...,...,...
477,Lack of training and development,Opportunities for job rotation and diversifica...,Opportunities for job rotation and diversifica...,"[Opportunities, job, rotation, diversification...",Opportunities job rotation diversification abs...,,o p p o r t u n j o b r o t a t d i v e ...,Opportunity job rotation diversification absent,Opportunity job rotation diversification absent
478,Lack of training and development,Leadership doesn't recognize employees' aspira...,Leadership doesnt recognize employees aspirati...,"[Leadership, n't, recognize, employees, ', asp...",Leadership n't recognize employees ' aspiratio...,,l e a d e r s h i p n ' t r e c o g n e ...,Leadership nt recognize employee aspiration gr...,Leadership nt recognize employee aspiration gr...
479,Lack of training and development,Employees feel unsupported in their pursuit of...,Employees feel unsupported in their pursuit of...,"[Employees, feel, unsupported, their, pursuit,...",Employees feel unsupported their pursuit skill...,,e m p l o y e f e e l u n s u p p o r t ...,Employee feel unsupported pursuit skill enhanc...,Employee feel unsupported pursuit skill enhanc...
480,Lack of training and development,No structured program for mentoring new employ...,No structured program for mentoring new employees,"[No, structured, program, mentoring, employees...",No structured program mentoring employees .,,n o s t r u c t u r p r o g r a m m e n ...,No structured program mentor employee,No structured program mentor employee


Vectorization

In [130]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'Lemminf' is your text column
df['Lemminf'].fillna("", inplace=True)

# Create the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the corpus
E = vectorizer.fit_transform(df['Lemminf'])

# Get feature names
feature_names = vectorizer.get_feature_names_out()


In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'cleaned' is your text column
corpus_cleaned = df['cleaned'].apply(lambda x: ' '.join(x.split()))

# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the data
Z_cleaned = vectorizer.fit_transform(corpus_cleaned)

# Get feature names
feature_names_cleaned = vectorizer.get_feature_names_out()

# Create a DataFrame
df_tfidfvect_cleaned = pd.DataFrame(data=Z_cleaned.toarray(), columns=feature_names_cleaned)

# Display the DataFrame
print(df_tfidfvect_cleaned)


     abilities  ability  above    absent  access  accessibility  accommodate  \
0          0.0      0.0    0.0  0.000000     0.0            0.0          0.0   
1          0.0      0.0    0.0  0.000000     0.0            0.0          0.0   
2          0.0      0.0    0.0  0.000000     0.0            0.0          0.0   
3          0.0      0.0    0.0  0.000000     0.0            0.0          0.0   
4          0.0      0.0    0.0  0.000000     0.0            0.0          0.0   
..         ...      ...    ...       ...     ...            ...          ...   
477        0.0      0.0    0.0  0.499563     0.0            0.0          0.0   
478        0.0      0.0    0.0  0.000000     0.0            0.0          0.0   
479        0.0      0.0    0.0  0.000000     0.0            0.0          0.0   
480        0.0      0.0    0.0  0.000000     0.0            0.0          0.0   
481        0.0      0.0    0.0  0.000000     0.0            0.0          0.0   

     accommodated  accountability  achi

BOW

In [164]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

#  df is the DataFrame
corpus = df['Lemminf']

# Replace NaN values with an empty string
corpus = corpus.replace(np.nan, '', regex=True)

# Create the CountVectorizer
vectorizer_bow = CountVectorizer()
X_bow = vectorizer_bow.fit_transform(corpus)
feature_names_bow = vectorizer_bow.get_feature_names_out()

In [176]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
import nltk
nltk.download('punkt')

# Function to apply stemming to a text
def stem_text(text):
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [PorterStemmer().stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

# Assuming 'cleaned' is your preprocessed column
corpus = df['cleaned']

# Apply stemming to the 'cleaned' column
corpus_stemmed = corpus.apply(stem_text)

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the data
X_bow_stemmed = vectorizer.fit_transform(corpus_stemmed)

# Get feature names
feature_names_bow_stemmed = vectorizer.get_feature_names_out()

# Display feature names or other relevant information
print("Feature Names:", feature_names_bow_stemmed)
df_bow_stemmed = pd.DataFrame(data=X_bow_stemmed.toarray(), columns=feature_names_bow_stemmed)
print("DataFrame with BOW for Stemming:")
print(df_bow_stemmed)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oumaymabamoh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Feature Names: ['abil' 'absent' 'access' 'accommod' 'account' 'achiev' 'acknowledg'
 'acquisit' 'action' 'activ' 'ad' 'adapt' 'add' 'address' 'adequ' 'adjust'
 'administr' 'advanc' 'advoc' 'affect' 'agenda' 'air' 'align' 'alloc'
 'anonym' 'anxieti' 'app' 'appreci' 'arbitrari' 'arrang' 'aspir' 'assign'
 'attend' 'autonomi' 'avenu' 'awar' 'balanc' 'barrier' 'base' 'belong'
 'benefit' 'blame' 'bombard' 'bonus' 'break' 'breakdown' 'brush' 'budget'
 'bureaucraci' 'burnout' 'career' 'careerrel' 'cater' 'caus' 'certif'
 'challeng' 'chang' 'channel' 'chaotic' 'clariti' 'clean' 'cleanli'
 'closur' 'collabor' 'comfort' 'commit' 'common' 'commun' 'compani'
 'compens' 'compet' 'competit' 'competitor' 'complaint' 'complet'
 'complex' 'concentr' 'concern' 'condit' 'confidenti' 'conflict' 'confus'
 'consid' 'consider' 'consist' 'constant' 'constantli' 'construct'
 'consult' 'context' 'continu' 'contribut' 'control' 'costofliv' 'cover'
 'creat' 'creativ' 'crise' 'criteria' 'critic' 'crosscultur' 'cros

In [177]:
# Display the Bag of Words matrix for Lemmatized data
df_bow_lemmatized = pd.DataFrame(data=X_bow.toarray(), columns=feature_names_bow)
df_bow_lemmatized

Unnamed: 0,ability,absent,access,accessibility,accommodate,accountability,achievement,acknowledge,acquisition,action,...,wellbeing,wellexplain,wellmaintain,wellness,work,worker,worklife,workload,workshop,workspace
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
478,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [178]:
# Display the Bag of Words matrix for Stemmed data
df_bow_stemmed = pd.DataFrame(data=X_bow_stemmed.toarray(), columns=feature_names_bow_stemmed)
# Assuming df_bow_stemmed is your Bag of Words dataframe for Stemmed data
df_bow_stemmed

Unnamed: 0,abil,absent,access,accommod,account,achiev,acknowledg,acquisit,action,activ,...,well,wellb,wellexplain,wellmaintain,worker,worklif,workload,workshop,workspac,wors
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
478,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Use of TF-IDF vectorize and Bag of Words (BoW) techniques to vectorize the texts

Clustering Using Kmeans

In [180]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=12, init='k-means++', random_state= 42)
y_predict_Z= kmeans.fit_predict(Z_cleaned)
df['y_predict_stemm'] = pd.DataFrame(data = y_predict_Z)

In [181]:
kmeans = KMeans(n_clusters=12, init='k-means++', random_state= 42)
y_predict_E= kmeans.fit_predict(E)
df['y_predict_inflemm'] = pd.DataFrame(data = y_predict_E)

In [182]:
df['count']=1

In [183]:
import plotly.express as px

fig = px.bar(df, x="y_predict_stemm", y="count", color="Genre", title="Long-Form Input")
fig.show()

In [184]:
from collections import Counter

# Group the data by cluster
cluster_groups = df.groupby('y_predict_stemm')

# Count the frequency of each word in each cluster
word_counts = {}
for name, group in cluster_groups:
    words = ' '.join(group['Lemminf']).split()
    word_counts[name] = Counter(words)

# Get the top 15 words for each cluster
top_words = {}
for cluster, counter in word_counts.items():
    top_words[cluster] = [word for word, count in counter.most_common(15)]
for cluster, words in top_words.items():
    print(f"Cluster {cluster}: {', '.join(words)}")

Cluster 0: stressful, miss, fear, workload, Fear, stress, Manage, task, Important, share, time, Feedback, disregard, Clear, instruction
Cluster 1: Management, lack, management, communication, support, Time, tool, nt, stress, workload, direction, feedback, expectation, unclear, Lack
Cluster 2: nt, Manager, concern, address, team, raise, meeting, decision, lack, Management, contribution, conflict, employee, effectively, brush
Cluster 3: Lack, training, lack, proper, lead, opportunity, offer, feature, program, skill, growth, communication, manage, task, add
Cluster 4: communicate, Project, share, company, offer, consistently, update, Communication, effectively, Company, Update, Change, policy, regularly, communication
Cluster 5: nt, team, communication, Micromanagement, Employee, Leader, stress, performance, cause, Opportunity, skill, employee, support, development, explanation
Cluster 6: Supervisor, nt, provide, growth, opportunity, support, feedback, adequate, offer, time, challenge, ad

In [185]:
fig = px.bar(df, x="y_predict_inflemm", y="count", color="Genre", title="Long-Form Input")
fig.show()

In [186]:
from collections import Counter

# Group the data by cluster
cluster_groups = df.groupby('y_predict_inflemm')

# Count the frequency of each word in each cluster
word_counts = {}
for name, group in cluster_groups:
    words = ' '.join(group['Lemminf']).split()
    word_counts[name] = Counter(words)

# Get the top 15 words for each cluster
top_words = {}
for cluster, counter in word_counts.items():
    top_words[cluster] = [word for word, count in counter.most_common(15)]
for cluster, words in top_words.items():
    print(f"Cluster {cluster}: {', '.join(words)}")

Cluster 0: Overcommitting, fear, miss, stressful
Cluster 1: lack, understanding, Management, role, Manager, efficiency
Cluster 2: No, provision, ongoing, skill, development
Cluster 3: Lack, training, lead, turnover, lack, growth
Cluster 4: Team, discussion, lack, engagement, manager
Cluster 5: worklife, balance, nt, respect, completely, sync, Management, support, Leader, Supervisor
Cluster 6: Manage, workload, feel, impossible, proper, tool
Cluster 7: Update, company, performance, lack
Cluster 8: Miscommunication, common, remote, team
Cluster 9: involvement, Leadership, lack, employee, development, Lack, leadership, presence, project, noticeable, hamper, teamwork, support, skill
Cluster 10: nt, No, lack, employee, Leadership, workload, communication, Lack, Supervisor, support, task, stress, team, training, Management
Cluster 11: opportunity, Opportunity, skill, nt, growth, learn, limit, development, provide, Supervisor, offer, Lack, advancement, lack, communicate


Clustring BOW

In [190]:
from sklearn.feature_extraction.text import CountVectorizer

# Assuming 'corpus' is a list of documents
corpus = df['Lemminf']  # Use Lemmatized data or 'cleaned' if desired
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(corpus)

In [188]:
from sklearn.cluster import KMeans

kmeans_bow = KMeans(n_clusters=12, init='k-means++', random_state=42)
y_predict_bow = kmeans_bow.fit_predict(X_bow)
df['y_predict_bow'] = pd.DataFrame(data=y_predict_bow)

In [191]:
import plotly.express as px

fig = px.bar(df, x="y_predict_bow", y="count", color="Genre", title="Bag of Words Clustering")
fig.show()

In [192]:
from collections import Counter

# Group the data by cluster
cluster_groups_bow = df.groupby('y_predict_bow')

# Count the frequency of each word in each cluster
word_counts_bow = {}
for name, group in cluster_groups_bow:
    words = ' '.join(group['Lemminf']).split()
    word_counts_bow[name] = Counter(words)

# Get the top 15 words for each cluster
top_words_bow = {}
for cluster, counter in word_counts_bow.items():
    top_words_bow[cluster] = [word for word, count in counter.most_common(15)]
for cluster, words in top_words_bow.items():
    print(f"Cluster {cluster} (BoW): {', '.join(words)}")

Cluster 0 (BoW): workload, manage, No, peak, tool, Workload, management, Time, support, Manage, overwhelming, nt, handle, feel, impossible
Cluster 1 (BoW): lack, Lack, Leadership, training, lead, proper, skill, opportunity, employee, communication, transparency, involvement, development, feature, program
Cluster 2 (BoW): nt, Leadership, employee, Manager, Leader, recognize, guidance, skill, decision, support, provide, Benefit, Employee, Office, accommodate
Cluster 3 (BoW): Inadequate, space, employee, relax, recharge
Cluster 4 (BoW): No, employee, communication, team, Micromanagement, Employee, inconsistent, skill, development, share, provision, stress, performance, cause, Leadership
Cluster 5 (BoW): workload, heavy, lead, Heavy, Handle, communication, gap, hinder, career, growth, Deal, impact, performance, unnoticed, management
Cluster 6 (BoW): lack, Management, stress, management, support, communication, Lack, add, direction, understanding, morale, consistency, Stress, inadequate, co

Hirerachical clustering

In [193]:
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer  # Import CountVectorizer

# Assuming 'corpus' is your list of text data
corpus = df['Lemminf']

# TF-IDF Vectorization
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(corpus)

# Convert the sparse TF-IDF matrix to a dense numpy array
X_tfidf_dense = X_tfidf.toarray()

# Bag of Words (BOW) Vectorization
vectorizer_bow = CountVectorizer()
X_bow = vectorizer_bow.fit_transform(corpus)

# Convert the sparse BOW matrix to a dense numpy array
X_bow_dense = X_bow.toarray()

# Perform K-Means clustering on TF-IDF
kmeans_tfidf = KMeans(n_clusters=12, init='k-means++', random_state=42)
y_predict_tfidf = kmeans_tfidf.fit_predict(X_tfidf_dense)
df['y_predict_tfidf'] = pd.DataFrame(data=y_predict_tfidf)

# Perform K-Means clustering on Bag of Words
kmeans_bow = KMeans(n_clusters=12, init='k-means++', random_state=42)
y_predict_bow = kmeans_bow.fit_predict(X_bow_dense)
df['y_predict_bow'] = pd.DataFrame(data=y_predict_bow)

# Visualize the results for TF-IDF
fig_tfidf = px.bar(df, x="y_predict_tfidf", y="count", color="Genre", title="K-Means Clustering Results - TF-IDF")
fig_tfidf.show()

# Visualize the results for Bag of Words
fig_bow = px.bar(df, x="y_predict_bow", y="count", color="Genre", title="K-Means Clustering Results - Bag of Words")
fig_bow.show()

Semantic Analysis - Topic Modeling:

In [194]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# df is DataFrame with the 'cleaned' column containing preprocessed text
corpus = df['cleaned']

# Vectorize the text using CountVectorizer
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(corpus)

# Apply LDA
lda = LatentDirichletAllocation(n_components=12, random_state=42)
lda.fit(X)

# Display the top words for each topic
for i, topic in enumerate(lda.components_):
    top_words_indices = topic.argsort()[:-10 - 1:-1]
    top_words = [vectorizer.get_feature_names_out()[index] for index in top_words_indices]
    print(f"Topic {i + 1}: {', '.join(top_words)}")

Topic 1: shared, direction, meetings, initiatives, missing, updates, explanation, performance, company, focused
Topic 2: management, workload, nt, time, concerns, tools, address, managing, performance, team
Topic 3: employees, career, guidance, tasks, feedback, growth, workload, receive, advancement, nt
Topic 4: leaders, nt, leadership, training, communication, employees, offered, discussions, tasks, proper
Topic 5: managers, employees, lack, professional, provisions, nt, development, decisions, proper, pressure
Topic 6: nt, supervisors, provide, leadership, opportunities, employee, skill, offer, support, benefits
Topic 7: motivation, outdated, equipment, leads, common, frustration, neglected, affect, productivity, training
Topic 8: stress, employees, support, tasks, causing, lack, management, opportunities, managing, lacking
Topic 9: team, communicated, development, skill, company, support, project, wellbeing, projects, leaderships
Topic 10: lack, communication, leaderships, managemen

In [195]:
from sklearn.decomposition import NMF

#  DataFrame with the 'cleaned' column containing preprocessed text
corpus = df['cleaned']

# Vectorize the text using CountVectorizer
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(corpus)

# Apply NMF
nmf = NMF(n_components=12, random_state=42)
nmf.fit(X)

# Display the top words for each topic
for i, topic in enumerate(nmf.components_):
    top_words_indices = topic.argsort()[:-10 - 1:-1]
    top_words = [vectorizer.get_feature_names_out()[index] for index in top_words_indices]
    print(f"Topic {i + 1}: {', '.join(top_words)}")

Topic 1: nt, leadership, managers, leaders, company, recognize, decisions, office, employee, benefits
Topic 2: lack, leaderships, managements, leads, involvement, direction, morale, proper, transparency, employee
Topic 3: employees, roles, recognition, leadership, receive, career, feel, goals, guidance, professional
Topic 4: workload, managing, tools, heavy, time, feels, unpredictable, impossible, provisions, leads
Topic 5: communication, inconsistent, issues, teams, tools, effectively, addressing, gaps, proper, rumors
Topic 6: stress, support, causing, adds, remote, missing, unpredictable, adequate, workloads, peak
Topic 7: supervisors, nt, provide, growth, recognition, support, adequate, feedback, challenges, advocate
Topic 8: opportunities, skill, development, growth, limited, career, professional, offer, leadership, advancement
Topic 9: training, programs, leads, lack, proper, offered, outdated, effective, materials, inefficiencies
Topic 10: management, time, lacks, address, tools,

In [196]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from collections import Counter

# df is DataFrame with the 'cleaned' column containing preprocessed text
corpus = df['cleaned']

# Vectorize the text using CountVectorizer
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(corpus)

# Apply LDA
lda = LatentDirichletAllocation(n_components=12, random_state=42)
lda.fit(X)

# Display the top words for each topic
for i, topic in enumerate(lda.components_):
    top_words_indices = topic.argsort()[:-10 - 1:-1]
    top_words = [vectorizer.get_feature_names_out()[index] for index in top_words_indices]
    print(f"Topic {i + 1}: {', '.join(top_words)}")

# Extracting topics programmatically
lda_topic_distribution = np.argmax(lda.transform(X), axis=1)
topic_counts = Counter(lda_topic_distribution)
most_common_topics = topic_counts.most_common()

# Print the count of each topic
for topic, count in most_common_topics:
    print(f"Topic {topic + 1}: Count={count}")

Topic 1: shared, direction, meetings, initiatives, missing, updates, explanation, performance, company, focused
Topic 2: management, workload, nt, time, concerns, tools, address, managing, performance, team
Topic 3: employees, career, guidance, tasks, feedback, growth, workload, receive, advancement, nt
Topic 4: leaders, nt, leadership, training, communication, employees, offered, discussions, tasks, proper
Topic 5: managers, employees, lack, professional, provisions, nt, development, decisions, proper, pressure
Topic 6: nt, supervisors, provide, leadership, opportunities, employee, skill, offer, support, benefits
Topic 7: motivation, outdated, equipment, leads, common, frustration, neglected, affect, productivity, training
Topic 8: stress, employees, support, tasks, causing, lack, management, opportunities, managing, lacking
Topic 9: team, communicated, development, skill, company, support, project, wellbeing, projects, leaderships
Topic 10: lack, communication, leaderships, managemen

In [197]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from collections import Counter

# df is  DataFrame with the 'cleaned' column containing preprocessed text
corpus = df['cleaned']

# Vectorize the text using CountVectorizer
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(corpus)

# Apply NMF
nmf = NMF(n_components=12, random_state=42)
nmf.fit(X)

# Display the top words for each topic
for i, topic in enumerate(nmf.components_):
    top_words_indices = topic.argsort()[:-10 - 1:-1]
    top_words = [vectorizer.get_feature_names_out()[index] for index in top_words_indices]
    print(f"Topic {i + 1}: {', '.join(top_words)}")

# Extracting topics programmatically
nmf_topic_distribution = np.argmax(nmf.transform(X), axis=1)
topic_counts = Counter(nmf_topic_distribution)
most_common_topics = topic_counts.most_common()

# Print the count of each topic
for topic, count in most_common_topics:
    print(f"Topic {topic + 1}: Count={count}")

Topic 1: nt, leadership, managers, leaders, company, recognize, decisions, office, employee, benefits
Topic 2: lack, leaderships, managements, leads, involvement, direction, morale, proper, transparency, employee
Topic 3: employees, roles, recognition, leadership, receive, career, feel, goals, guidance, professional
Topic 4: workload, managing, tools, heavy, time, feels, unpredictable, impossible, provisions, leads
Topic 5: communication, inconsistent, issues, teams, tools, effectively, addressing, gaps, proper, rumors
Topic 6: stress, support, causing, adds, remote, missing, unpredictable, adequate, workloads, peak
Topic 7: supervisors, nt, provide, growth, recognition, support, adequate, feedback, challenges, advocate
Topic 8: opportunities, skill, development, growth, limited, career, professional, offer, leadership, advancement
Topic 9: training, programs, leads, lack, proper, offered, outdated, effective, materials, inefficiencies
Topic 10: management, time, lacks, address, tools,

Coherence Score

In [198]:
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# Assuming df is DataFrame with the 'cleaned' column containing preprocessed text
corpus = df['cleaned']

# Tokenize your text (assuming it's not tokenized already)
tokenized_text = [text.split() for text in corpus]

# Create a dictionary representation of the documents.
dictionary = corpora.Dictionary(tokenized_text)

# Convert the dictionary to a bag-of-words corpus.
corpus_lda = [dictionary.doc2bow(text) for text in tokenized_text]

# Build the LDA model
lda = LdaModel(corpus=corpus_lda, id2word=dictionary, num_topics=12, random_state=42)

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda, texts=tokenized_text, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

print(f'Coherence Score (LDA): {coherence_lda}')

Coherence Score (LDA): 0.619381672972284


In [199]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Assuming df is DataFrame with the 'cleaned' column containing preprocessed text
corpus = df['cleaned']

# Vectorize the text using CountVectorizer for NMF
vectorizer_nmf = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X_nmf = vectorizer_nmf.fit_transform(corpus)

# Apply NMF
nmf = NMF(n_components=12, random_state=42)
nmf.fit(X_nmf)

# Transform the corpus into NMF topic space
nmf_topic_distribution = nmf.transform(X_nmf)

# Calculate coherence using cosine similarity
coherence_values = []

# Calculate cosine similarity between each pair of NMF topics
similarity_matrix = cosine_similarity(nmf_topic_distribution)

# Exclude self-similarity
np.fill_diagonal(similarity_matrix, 0)

# Calculate average coherence
average_coherence = np.mean(similarity_matrix.max(axis=0))

print(f'Average Coherence Score (NMF): {average_coherence}')

Average Coherence Score (NMF): 0.9667035911370009
