# Topic Modelling template

In [34]:
import pandas as pd

import numpy as np

import spacy
nlp = spacy.load('en_core_web_sm')

import gensim
from gensim import corpora

from datetime import datetime as dt

import pyLDAvis.gensim

from operator import itemgetter

In [35]:
# Load the Glassdoor review export and convert the review date column to datetimes.
df = pd.read_csv('glassdoor_full_dataset.csv')
df['date'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,date,star,subject,pros,cons,advice,location,clean_member_type,tenure,employee_status,recommend,outlook,ceo,Career Opportunities,Compensation and Benefits,Culture & Values,Senior Management,Work/Life Balance,star_group
0,2019-11-04,5,Big dynamic company with many opportunities,Fast grows up if you are hard worker,unstable rota and early morning shifts,,London,Team Leader,I have been working at Pret A Manger full-time...,Current Employee,,,,Positive,Positive,Positive,Positive,Neutral,Positive
1,2020-01-20,5,Great Vacation Work,Lots of shifts and flexibility of hours,Management are lax on food waste,,Gatwick,Team Member,I worked at Pret A Manger full-time for less t...,Former Employee,,,,Positive,Positive,Positive,Positive,Positive,Positive
2,2020-01-20,4,Great company,"Great training, benefits and salary",Nothing to say so far,,,Team Leader,I have been working at Pret A Manger full-time,Current Employee,,,,Positive,Positive,Positive,Positive,Neutral,Positive
3,2020-01-16,3,Something Helpful,The worst thing is loss of bonus.,New management created silly rules and people ...,Respect your staff more.They are amazing hard ...,London,Production Team Member,I worked at Pret A Manger full-time for more t...,Former Employee,True,positive,positive,Negative,Negative,Negative,Neutral,Negative,Neutral
4,2020-01-15,3,One day probation,"Lot to learn, busy, new people",Very busy,Be more patient with newbies,Palmers Green,Kitchen Staff,I worked at Pret A Manger full-time for less t...,Former Employee,,,,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral


## spaCy 101

In [36]:
# Tokenise the free-text review columns with spaCy (intro: https://spacy.io/usage/spacy-101).
# Each cell of the new columns holds the spaCy Doc produced by `nlp` for that review.
#
# pandas reads empty CSV cells as NaN (a float), which `nlp` cannot process, so missing
# text is replaced by an empty string first; such reviews simply yield an empty Doc.
df['pro_tokens'] = df['pros'].fillna('').astype(str).map(lambda text: nlp(text))
df['con_tokens'] = df['cons'].fillna('').astype(str).map(lambda text: nlp(text))

In [37]:
# Preview the first rows, now including the pro_tokens / con_tokens columns.
df.head()

Unnamed: 0,date,star,subject,pros,cons,advice,location,clean_member_type,tenure,employee_status,...,outlook,ceo,Career Opportunities,Compensation and Benefits,Culture & Values,Senior Management,Work/Life Balance,star_group,pro_tokens,con_tokens
0,2019-11-04,5,Big dynamic company with many opportunities,Fast grows up if you are hard worker,unstable rota and early morning shifts,,London,Team Leader,I have been working at Pret A Manger full-time...,Current Employee,...,,,Positive,Positive,Positive,Positive,Neutral,Positive,"(Fast, grows, up, if, you, are, hard, worker)","(unstable, rota, and, early, morning, shifts)"
1,2020-01-20,5,Great Vacation Work,Lots of shifts and flexibility of hours,Management are lax on food waste,,Gatwick,Team Member,I worked at Pret A Manger full-time for less t...,Former Employee,...,,,Positive,Positive,Positive,Positive,Positive,Positive,"(Lots, of, shifts, and, flexibility, of, hours)","(Management, are, lax, on, food, waste)"
2,2020-01-20,4,Great company,"Great training, benefits and salary",Nothing to say so far,,,Team Leader,I have been working at Pret A Manger full-time,Current Employee,...,,,Positive,Positive,Positive,Positive,Neutral,Positive,"(Great, training, ,, benefits, and, salary)","(Nothing, to, say, so, far)"
3,2020-01-16,3,Something Helpful,The worst thing is loss of bonus.,New management created silly rules and people ...,Respect your staff more.They are amazing hard ...,London,Production Team Member,I worked at Pret A Manger full-time for more t...,Former Employee,...,positive,positive,Negative,Negative,Negative,Neutral,Negative,Neutral,"(The, worst, thing, is, loss, of, bonus, .)","(New, management, created, silly, rules, and, ..."
4,2020-01-15,3,One day probation,"Lot to learn, busy, new people",Very busy,Be more patient with newbies,Palmers Green,Kitchen Staff,I worked at Pret A Manger full-time for less t...,Former Employee,...,,,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,"(Lot, to, learn, ,, busy, ,, new, people)","(Very, busy)"


## Clean token column

In [46]:
def text_cleaner(tokens, root_word=True, keep_pos=("VERB", "NOUN", "ADJ", "ADV")):
    """Reduce a tokenised text to its content words.

    A token is kept when it is not a stop word and its coarse part-of-speech
    tag is one of ``keep_pos``.

    Parameters
    ----------
    tokens : iterable
        Token-like objects exposing ``is_stop``, ``pos_``, ``lemma_`` and
        ``lower_`` (e.g. the tokens of a spaCy ``Doc``).
    root_word : bool, default True
        If truthy, return each kept token's lemma (``lemma_``); otherwise
        return its lower-cased surface form (``lower_``).
    keep_pos : iterable of str, default ("VERB", "NOUN", "ADJ", "ADV")
        Part-of-speech tags to keep. The default reproduces the original
        hard-coded filter.

    Returns
    -------
    list of str
        The kept words, in their original order. Empty if nothing qualifies.
    """
    # Set membership replaces the chained `or` comparisons.
    allowed_pos = frozenset(keep_pos)

    return [
        token.lemma_ if root_word else token.lower_
        for token in tokens
        if not token.is_stop and token.pos_ in allowed_pos
    ]


In [47]:
# Reduce each tokenised review to its list of cleaned content words
# (text_cleaner defaults apply, so lemmas are returned).
df['pro_clean_token'] = df['pro_tokens'].map(text_cleaner)
df['con_clean_token'] = df['con_tokens'].map(text_cleaner)


In [48]:
# Preview the first rows, now including the pro_clean_token / con_clean_token columns.
df.head()

Unnamed: 0,date,star,subject,pros,cons,advice,location,clean_member_type,tenure,employee_status,...,Career Opportunities,Compensation and Benefits,Culture & Values,Senior Management,Work/Life Balance,star_group,pro_tokens,con_tokens,pro_clean_token,con_clean_token
0,2019-11-04,5,Big dynamic company with many opportunities,Fast grows up if you are hard worker,unstable rota and early morning shifts,,London,Team Leader,I have been working at Pret A Manger full-time...,Current Employee,...,Positive,Positive,Positive,Positive,Neutral,Positive,"(Fast, grows, up, if, you, are, hard, worker)","(unstable, rota, and, early, morning, shifts)","[fast, grow, hard, worker]","[unstable, early, morning, shift]"
1,2020-01-20,5,Great Vacation Work,Lots of shifts and flexibility of hours,Management are lax on food waste,,Gatwick,Team Member,I worked at Pret A Manger full-time for less t...,Former Employee,...,Positive,Positive,Positive,Positive,Positive,Positive,"(Lots, of, shifts, and, flexibility, of, hours)","(Management, are, lax, on, food, waste)","[lot, shift, flexibility, hour]","[management, lax, food, waste]"
2,2020-01-20,4,Great company,"Great training, benefits and salary",Nothing to say so far,,,Team Leader,I have been working at Pret A Manger full-time,Current Employee,...,Positive,Positive,Positive,Positive,Neutral,Positive,"(Great, training, ,, benefits, and, salary)","(Nothing, to, say, so, far)","[great, training, benefit, salary]",[far]
3,2020-01-16,3,Something Helpful,The worst thing is loss of bonus.,New management created silly rules and people ...,Respect your staff more.They are amazing hard ...,London,Production Team Member,I worked at Pret A Manger full-time for more t...,Former Employee,...,Negative,Negative,Negative,Neutral,Negative,Neutral,"(The, worst, thing, is, loss, of, bonus, .)","(New, management, created, silly, rules, and, ...","[bad, thing, loss, bonus]","[new, management, create, silly, rule, people,..."
4,2020-01-15,3,One day probation,"Lot to learn, busy, new people",Very busy,Be more patient with newbies,Palmers Green,Kitchen Staff,I worked at Pret A Manger full-time for less t...,Former Employee,...,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,"(Lot, to, learn, ,, busy, ,, new, people)","(Very, busy)","[lot, learn, busy, new, people]",[busy]


## Pro comments Topic Modelling

In [49]:
# Build the vocabulary for the pro comments: every distinct token gets an integer id.
dictionary = corpora.Dictionary(df['pro_clean_token'])

# Encode each review as a bag of words: a list of (token_id, count) pairs,
# where the count is the number of occurrences within that single review.
corpus = list(map(dictionary.doc2bow, df['pro_clean_token']))


In [53]:
%%time
number_topics = 2

lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=number_topics, id2word=dictionary, passes=15)


Wall time: 4.64 s


In [54]:
# Summarise every pro topic by its five highest-weighted terms and print one per line.
topics = lda_model.print_topics(num_words=5)

for topic_summary in topics:
    print(topic_summary)


(0, '0.080*"food" + 0.071*"free" + 0.052*"pay" + 0.038*"good" + 0.016*"coffee"')
(1, '0.055*"good" + 0.055*"work" + 0.034*"great" + 0.027*"people" + 0.024*"team"')


In [55]:
# Prepare the pyLDAvis visualisation for the pro-comment model and render it inline.
# sort_topics = False is passed so pyLDAvis is asked not to re-order the topics
# (presumably keeping them aligned with the model's own topic ids - confirm in the pyLDAvis docs).
lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, sort_topics = False)
pyLDAvis.display(lda_display)

In [56]:
# Export the pro-comment topic visualisation as a standalone HTML file.
pyLDAvis.save_html(lda_display, 'Pro_Topics.html')

In [117]:
# Score every review with the pro model: each cell holds that review's
# list of (topic_id, probability) pairs.
df['pro_topic_probability'] = df['pro_clean_token'].map(
    lambda tokens: lda_model[dictionary.doc2bow(tokens)]
)

# Keep the id of the most probable topic for each review.
df['pro_topic_id'] = df['pro_topic_probability'].map(
    lambda topic_probs: max(topic_probs, key=itemgetter(1))[0]
)

# Human-readable names for the two topic ids (topic 1 -> 'Work environment',
# anything else -> 'Benefits').
df['pro_topic_label'] = df['pro_topic_id'].apply(
    lambda topic_id: 'Benefits' if topic_id != 1 else 'Work environment'
)

df.head()

Unnamed: 0,date,star,subject,pros,cons,advice,location,clean_member_type,tenure,employee_status,...,pro_clean_token,con_clean_token,pro_topic,pro_topic_label,pro_topic_probability,pro_topic_id,con_topic_probability,con_topic_id,con_topic_label,unique_id
0,2019-11-04,5,Big dynamic company with many opportunities,Fast grows up if you are hard worker,unstable rota and early morning shifts,,London,Team Leader,I have been working at Pret A Manger full-time...,Current Employee,...,"[fast, grow, hard, worker]","[unstable, early, morning, shift]","[(0, 0.54603934), (1, 0.4539607)]",Benefits,"[(0, 0.54555506), (1, 0.45444492)]",0,"[(0, 0.11103573), (1, 0.8889643)]",1,Culture,0
1,2020-01-20,5,Great Vacation Work,Lots of shifts and flexibility of hours,Management are lax on food waste,,Gatwick,Team Member,I worked at Pret A Manger full-time for less t...,Former Employee,...,"[lot, shift, flexibility, hour]","[management, lax, food, waste]","[(0, 0.4185669), (1, 0.5814331)]",Work environment,"[(0, 0.41846275), (1, 0.58153725)]",1,"[(0, 0.19058193), (1, 0.8094181)]",1,Culture,1
2,2020-01-20,4,Great company,"Great training, benefits and salary",Nothing to say so far,,,Team Leader,I have been working at Pret A Manger full-time,Current Employee,...,"[great, training, benefit, salary]",[far],"[(0, 0.12189384), (1, 0.8781062)]",Work environment,"[(0, 0.121889845), (1, 0.8781102)]",1,"[(0, 0.25403154), (1, 0.74596846)]",1,Culture,2
3,2020-01-16,3,Something Helpful,The worst thing is loss of bonus.,New management created silly rules and people ...,Respect your staff more.They are amazing hard ...,London,Production Team Member,I worked at Pret A Manger full-time for more t...,Former Employee,...,"[bad, thing, loss, bonus]","[new, management, create, silly, rule, people,...","[(0, 0.11536618), (1, 0.88463384)]",Work environment,"[(0, 0.11538027), (1, 0.8846197)]",1,"[(0, 0.25526074), (1, 0.74473923)]",1,Culture,3
4,2020-01-15,3,One day probation,"Lot to learn, busy, new people",Very busy,Be more patient with newbies,Palmers Green,Kitchen Staff,I worked at Pret A Manger full-time for less t...,Former Employee,...,"[lot, learn, busy, new, people]",[busy],"[(0, 0.08738959), (1, 0.9126104)]",Work environment,"[(0, 0.08738959), (1, 0.9126104)]",1,"[(0, 0.74890715), (1, 0.25109285)]",0,Work,4


## Con comments Topic Modelling


In [118]:
# Build the vocabulary for the con comments: every distinct token gets an integer id.
con_dictionary = corpora.Dictionary(df['con_clean_token'])

# Encode each review as a bag of words: a list of (token_id, count) pairs,
# where the count is the number of occurrences within that single review.
# NOTE: this rebinds `corpus`, which until here held the pro-comment corpus;
# cells above that read `corpus` must not be re-run after this one without
# re-running the pro corpus cell first.
corpus = list(map(con_dictionary.doc2bow, df['con_clean_token']))


In [119]:
%%time
number_topics = 2

con_lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=number_topics, id2word=con_dictionary, passes=15)


Wall time: 5.59 s


In [120]:
# Summarise every con topic by its five highest-weighted terms and print one per line.
con_topics = con_lda_model.print_topics(num_words=5)

for topic_summary in con_topics:
    print(topic_summary)

(0, '0.039*"work" + 0.023*"manager" + 0.015*"hard" + 0.015*"lot" + 0.013*"job"')
(1, '0.036*"hour" + 0.030*"work" + 0.018*"time" + 0.017*"pay" + 0.015*"long"')


In [121]:
# Prepare the pyLDAvis visualisation for the con-comment model and render it inline.
# `corpus` here is the con-comment corpus (rebound when the con dictionary was built).
# sort_topics = False is passed so pyLDAvis is asked not to re-order the topics
# (presumably keeping them aligned with the model's own topic ids - confirm in the pyLDAvis docs).
con_lda_display = pyLDAvis.gensim.prepare(con_lda_model, corpus, con_dictionary, sort_topics = False)
pyLDAvis.display(con_lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [122]:
# Export the con-comment topic visualisation as a standalone HTML file.
# Must be con_lda_display: passing lda_display would write the *pro* topics
# into Con_Topics.html.
pyLDAvis.save_html(con_lda_display, 'Con_Topics.html')

In [123]:
# Score every review with the con model: each cell holds that review's
# list of (topic_id, probability) pairs.
df['con_topic_probability'] = df['con_clean_token'].map(
    lambda tokens: con_lda_model[con_dictionary.doc2bow(tokens)]
)

# Keep the id of the most probable topic for each review.
df['con_topic_id'] = df['con_topic_probability'].map(
    lambda topic_probs: max(topic_probs, key=itemgetter(1))[0]
)

# Human-readable names for the two topic ids (topic 1 -> 'Culture',
# anything else -> 'Work').
df['con_topic_label'] = df['con_topic_id'].apply(
    lambda topic_id: 'Work' if topic_id != 1 else 'Culture'
)

df.head()

Unnamed: 0,date,star,subject,pros,cons,advice,location,clean_member_type,tenure,employee_status,...,pro_clean_token,con_clean_token,pro_topic,pro_topic_label,pro_topic_probability,pro_topic_id,con_topic_probability,con_topic_id,con_topic_label,unique_id
0,2019-11-04,5,Big dynamic company with many opportunities,Fast grows up if you are hard worker,unstable rota and early morning shifts,,London,Team Leader,I have been working at Pret A Manger full-time...,Current Employee,...,"[fast, grow, hard, worker]","[unstable, early, morning, shift]","[(0, 0.54603934), (1, 0.4539607)]",Benefits,"[(0, 0.54555506), (1, 0.45444492)]",0,"[(0, 0.8649142), (1, 0.13508579)]",0,Work,0
1,2020-01-20,5,Great Vacation Work,Lots of shifts and flexibility of hours,Management are lax on food waste,,Gatwick,Team Member,I worked at Pret A Manger full-time for less t...,Former Employee,...,"[lot, shift, flexibility, hour]","[management, lax, food, waste]","[(0, 0.4185669), (1, 0.5814331)]",Work environment,"[(0, 0.41846275), (1, 0.58153725)]",1,"[(0, 0.118999586), (1, 0.8810004)]",1,Culture,1
2,2020-01-20,4,Great company,"Great training, benefits and salary",Nothing to say so far,,,Team Leader,I have been working at Pret A Manger full-time,Current Employee,...,"[great, training, benefit, salary]",[far],"[(0, 0.12189384), (1, 0.8781062)]",Work environment,"[(0, 0.121889845), (1, 0.8781102)]",1,"[(0, 0.74737763), (1, 0.25262237)]",0,Work,2
3,2020-01-16,3,Something Helpful,The worst thing is loss of bonus.,New management created silly rules and people ...,Respect your staff more.They are amazing hard ...,London,Production Team Member,I worked at Pret A Manger full-time for more t...,Former Employee,...,"[bad, thing, loss, bonus]","[new, management, create, silly, rule, people,...","[(0, 0.11536618), (1, 0.88463384)]",Work environment,"[(0, 0.11538027), (1, 0.8846197)]",1,"[(0, 0.24565211), (1, 0.7543479)]",1,Culture,3
4,2020-01-15,3,One day probation,"Lot to learn, busy, new people",Very busy,Be more patient with newbies,Palmers Green,Kitchen Staff,I worked at Pret A Manger full-time for less t...,Former Employee,...,"[lot, learn, busy, new, people]",[busy],"[(0, 0.08738959), (1, 0.9126104)]",Work environment,"[(0, 0.08738959), (1, 0.9126104)]",1,"[(0, 0.7417878), (1, 0.2582122)]",0,Work,4


# Pro Comments Word Counts

In [124]:
# Give every review a stable identifier taken from the row index.
df['unique_id'] = df.index

# Take an explicit copy of the columns needed for the pro word counts, so that
# later assignments to pro_df modify an independent frame instead of a slice of
# df (which raises SettingWithCopyWarning).
pro_df = df[['unique_id', 'date', 'pro_topic_label', 'star_group', 'pro_clean_token']].copy()

pro_df.head()

Unnamed: 0,unique_id,date,pro_topic_label,star_group,pro_clean_token
0,0,2019-11-04,Benefits,Positive,"[fast, grow, hard, worker]"
1,1,2020-01-20,Work environment,Positive,"[lot, shift, flexibility, hour]"
2,2,2020-01-20,Work environment,Positive,"[great, training, benefit, salary]"
3,3,2020-01-16,Work environment,Neutral,"[bad, thing, loss, bonus]"
4,4,2020-01-15,Work environment,Neutral,"[lot, learn, busy, new, people]"


In [125]:
# Collapse each review's token list into one comma-separated string.
# - `assign` returns a new frame, so nothing is written onto a slice of df
#   (no SettingWithCopyWarning).
# - Values that are already strings are left untouched, so re-running this cell
#   does not comma-join the individual characters of an already-joined string.
pro_df = pro_df.assign(
    pro_clean_token=[
        tokens if isinstance(tokens, str) else ','.join(map(str, tokens))
        for tokens in pro_df['pro_clean_token']
    ]
)

pro_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,unique_id,date,pro_topic_label,star_group,pro_clean_token
0,0,2019-11-04,Benefits,Positive,"fast,grow,hard,worker"
1,1,2020-01-20,Work environment,Positive,"lot,shift,flexibility,hour"
2,2,2020-01-20,Work environment,Positive,"great,training,benefit,salary"
3,3,2020-01-16,Work environment,Neutral,"bad,thing,loss,bonus"
4,4,2020-01-15,Work environment,Neutral,"lot,learn,busy,new,people"


In [126]:
# Reshape to one row per (review, word) for the pro comments.
#
# The rows are collected in a plain list and the DataFrame is built once at the
# end. Growing a DataFrame with DataFrame.append inside a loop copies the whole
# frame on every iteration (quadratic time), and DataFrame.append was removed
# in pandas 2.0.
#
# NOTE: a review with no cleaned tokens is stored as '' and ''.split(',')
# returns [''], so such a review still contributes one row with an empty word.
pro_split_columns = ['pro_clean_token', 'date', 'star_group', 'unique_id', 'pro_topic_label']

pro_records = [
    (word, review.date, review.star_group, review.unique_id, review.pro_topic_label)
    for review in pro_df.itertuples(index=False)
    for word in review.pro_clean_token.split(',')
]

pro_split_df = pd.DataFrame(pro_records, columns=pro_split_columns)

print(pro_split_df.shape)
pro_split_df.head()

(4678, 5)


Unnamed: 0,pro_clean_token,date,star_group,unique_id,pro_topic_label
0,fast,2019-11-04,Positive,0,Benefits
1,grow,2019-11-04,Positive,0,Benefits
2,hard,2019-11-04,Positive,0,Benefits
3,worker,2019-11-04,Positive,0,Benefits
4,lot,2020-01-20,Positive,1,Work environment


In [127]:
# Write the one-row-per-word pro table to CSV, omitting the DataFrame index.
pro_split_df.to_csv('Pros_word_count.csv', index = False)

# Con Comments Word Counts

In [164]:
# Take an explicit copy of the columns needed for the con word counts, so the
# assignment below modifies an independent frame instead of a slice of df
# (which raises SettingWithCopyWarning).
con_df = df[['unique_id', 'date', 'con_topic_label', 'star_group', 'con_clean_token']].copy()

# Collapse each review's token list into one comma-separated string.
con_df['con_clean_token'] = [','.join(map(str, tokens)) for tokens in con_df['con_clean_token']]

con_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,unique_id,date,con_topic_label,star_group,con_clean_token
0,0,2019-11-04,Work,Positive,"unstable,early,morning,shift"
1,1,2020-01-20,Culture,Positive,"management,lax,food,waste"
2,2,2020-01-20,Work,Positive,far
3,3,2020-01-16,Culture,Neutral,"new,management,create,silly,rule,people,work,m..."
4,4,2020-01-15,Work,Neutral,busy


In [165]:
# Reshape to one row per (review, word) for the con comments.
#
# The rows are collected in a plain list and the DataFrame is built once at the
# end. Growing a DataFrame with DataFrame.append inside a loop copies the whole
# frame on every iteration (quadratic time), and DataFrame.append was removed
# in pandas 2.0.
#
# NOTE: a review with no cleaned tokens is stored as '' and ''.split(',')
# returns [''], so such a review still contributes one row with an empty word.
con_split_columns = ['con_clean_token', 'date', 'star_group', 'unique_id', 'con_topic_label']

con_records = [
    (word, review.date, review.star_group, review.unique_id, review.con_topic_label)
    for review in con_df.itertuples(index=False)
    for word in review.con_clean_token.split(',')
]

con_split_df = pd.DataFrame(con_records, columns=con_split_columns)

print(con_split_df.shape)
con_split_df.head()


(5704, 5)


Unnamed: 0,con_clean_token,date,star_group,unique_id,con_topic_label
0,unstable,2019-11-04,Positive,0,Work
1,early,2019-11-04,Positive,0,Work
2,morning,2019-11-04,Positive,0,Work
3,shift,2019-11-04,Positive,0,Work
4,management,2020-01-20,Positive,1,Culture


In [166]:
# Write the one-row-per-word con table to CSV, omitting the DataFrame index.
con_split_df.to_csv('Con_word_count.csv', index = False)