This Notebook shows you how to perform topic modeling using LDA, NMF, and LSI. 

In [1]:
import pandas as pd
df = pd.read_excel("../kevin/us_data_subset.xlsx") 
df.head()

Unnamed: 0,TI,AB,SO,PY
0,The Role of Business Intelligence and Communic...,This study examines the role that business int...,JOURNAL OF THE ASSOCIATION FOR INFORMATION SYS...,2017
1,"OPERATIONAL IT FAILURES, IT VALUE DESTRUCTION,...",This paper presents an empirical study of chan...,MIS QUARTERLY,2017
2,ON THE ROLE OF FAIRNESS AND SOCIAL DISTANCE IN...,Online referral systems help firms attract new...,MIS QUARTERLY,2017
3,KNOWLEDGE MANAGEMENT SYSTEM USE AND JOB PERFOR...,This paper seeks to develop a better understan...,MIS QUARTERLY,2017
4,REPEATED INTERACTIONS VERSUS SOCIAL TIES: QUAN...,The growing importance of online social networ...,MIS QUARTERLY,2017


In [3]:
#combine the title and abstract 
df['text'] = df['TI'] + ". " + df['AB'] #title followed by a period and then the abstractdf
df.head()

Unnamed: 0,TI,AB,SO,PY,text
0,The Role of Business Intelligence and Communic...,This study examines the role that business int...,JOURNAL OF THE ASSOCIATION FOR INFORMATION SYS...,2017,The Role of Business Intelligence and Communic...
1,"OPERATIONAL IT FAILURES, IT VALUE DESTRUCTION,...",This paper presents an empirical study of chan...,MIS QUARTERLY,2017,"OPERATIONAL IT FAILURES, IT VALUE DESTRUCTION,..."
2,ON THE ROLE OF FAIRNESS AND SOCIAL DISTANCE IN...,Online referral systems help firms attract new...,MIS QUARTERLY,2017,ON THE ROLE OF FAIRNESS AND SOCIAL DISTANCE IN...
3,KNOWLEDGE MANAGEMENT SYSTEM USE AND JOB PERFOR...,This paper seeks to develop a better understan...,MIS QUARTERLY,2017,KNOWLEDGE MANAGEMENT SYSTEM USE AND JOB PERFOR...
4,REPEATED INTERACTIONS VERSUS SOCIAL TIES: QUAN...,The growing importance of online social networ...,MIS QUARTERLY,2017,REPEATED INTERACTIONS VERSUS SOCIAL TIES: QUAN...


In [7]:
df.isnull().sum() #check for null 

TI      0
AB      1
SO      0
PY      0
text    1
dtype: int64

In [4]:
#remove the null record from text
df.dropna(subset = ['text'], inplace = True)

In [5]:
df.shape

(2854, 5)

In [6]:
#preprocess data
import spacy
import re
from spacy.lang.en.stop_words import STOP_WORDS
stopword_list = list(STOP_WORDS)
nlp_en = spacy.load('en_core_web_lg')
def preprocess(txt):
    txt = txt.lower() #normalize text
    txt = re.sub(r'[^a-zA-Z]', ' ', txt)
    docs = nlp_en(txt)
    #lemmatize and remove stopwords
    word_list = [doc.lemma_ for doc in docs if doc.text 
                            not in stopword_list ]
    txt = " ".join(word_list)
    txt = txt.replace("-PRON-","")
    txt = txt.replace("PRON","")
    return txt

In [7]:
corpus = list(df['text'])
cleaned_corpus = [preprocess(txt) for txt in corpus]

In [12]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
vectorizer = CountVectorizer(min_df=10, max_df=0.9, 
                             stop_words='english', lowercase=True)
data_vectorized = vectorizer.fit_transform(cleaned_corpus)


In [14]:
#function to get topics and put them in dataframe columns - we pass an empty dataframe to it
def get_topics(model, vectorizer, model_name, df, top_n = 20):
        result = []
        for idx, topic in enumerate(model.components_):
            #print("Topic %d:" % (idx))
            topic_label = model_name + "_topic_" + str(idx)
            score = "SCORE_" + str(idx)
            aList = [(vectorizer.get_feature_names()[i], topic[i])
                            for i in topic.argsort()[:-top_n - 1:-1]]
            l1, l2 = zip(*aList)
            df[topic_label] = l1
            df[score] = l2

In [15]:
#Let us assume we have to get 20 topics --- use coherence or some other metric to determine optimum topics
n = 20

In [16]:
print("Working on " + str(n) + " topics .....")
df = pd.DataFrame()
# Build a Latent Dirichlet Allocation Model
print("LDA being performed ...")
lda_model = LatentDirichletAllocation(n_components=n, max_iter=5000, learning_method='online')
doc_topics = lda_model.fit_transform(data_vectorized)
print("Generating LDA Model .....")
get_topics(lda_model, vectorizer, "LDA", df) #populates the dataframe
#display the first 10 records of the dataframe
df.head(10)
    

Working on 20 topics .....
LDA being performed ...
Generating LDA Model .....


Unnamed: 0,LDA_topic_0,SCORE_0,LDA_topic_1,SCORE_1,LDA_topic_2,SCORE_2,LDA_topic_3,SCORE_3,LDA_topic_4,SCORE_4,...,LDA_topic_15,SCORE_15,LDA_topic_16,SCORE_16,LDA_topic_17,SCORE_17,LDA_topic_18,SCORE_18,LDA_topic_19,SCORE_19
0,health,316.211782,service,1279.646793,online,1055.046495,research,919.872726,information,836.531112,...,infrastructure,310.119522,trust,670.849261,project,1043.582836,social,1352.629815,security,489.072662
1,adoption,203.755454,customer,692.650189,product,919.853722,information,703.939482,digital,441.265733,...,governance,278.266962,recommendation,256.545535,decision,819.169268,network,948.095209,user,341.438946
2,patient,203.25776,platform,492.041782,consumer,726.511794,new,599.011124,innovation,422.782872,...,initiative,177.798606,commerce,238.825821,control,417.652569,community,704.581548,internet,273.636199
3,care,163.1493,provider,282.547692,review,528.87992,theory,583.155996,privacy,415.923357,...,supplier,169.114108,agent,158.967136,option,195.328016,medium,498.771883,search,243.661221
4,healthcare,149.812563,quality,134.885778,web,431.31417,work,468.795813,study,255.333146,...,return,99.279357,game,133.628069,flexibility,152.645132,source,362.419679,advertising,235.342768
5,hospital,141.959444,personalization,130.026394,effect,420.038168,technology,428.557839,organization,247.503386,...,event,98.846774,study,111.415504,management,151.876933,open,313.554854,mobile,200.962228
6,capacity,137.22521,erp,121.14662,website,319.329448,practice,421.626239,theory,237.537228,...,market,81.438555,online,98.250804,make,149.117578,online,280.140726,content,174.972624
7,reuse,104.730569,resource,92.190965,information,315.891228,use,419.176357,research,228.271905,...,announcement,65.904768,electronic,94.740431,manager,129.378238,user,262.13841,threat,159.710713
8,delivery,104.325308,computing,92.069565,quality,294.127147,paper,362.132349,organizational,224.73281,...,ra,60.166796,human,78.693713,real,118.69991,participation,243.788059,effect,146.464131
9,physician,97.865877,base,90.305625,site,285.572712,change,348.061326,policy,223.438153,...,informational,39.483719,center,78.276841,goal,102.770698,oss,196.904133,cue,135.633315


Note that the scores shown have to be normalized to get probabilities.

In [17]:
#Let us try non-negative matrix factorization
nmf_df = pd.DataFrame()
print("NMF being performed ....")
# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=n, max_iter=5000)
nmf_Z = nmf_model.fit_transform(data_vectorized)
   
print("Generating NMF Model ....")
get_topics(nmf_model, vectorizer, "NMF", nmf_df)
nmf_df.head()
    

NMF being performed ....
Generating NMF Model ....


Unnamed: 0,NMF_topic_0,SCORE_0,NMF_topic_1,SCORE_1,NMF_topic_2,SCORE_2,NMF_topic_3,SCORE_3,NMF_topic_4,SCORE_4,...,NMF_topic_15,SCORE_15,NMF_topic_16,SCORE_16,NMF_topic_17,SCORE_17,NMF_topic_18,SCORE_18,NMF_topic_19,SCORE_19
0,use,15.702796,product,10.859307,firm,9.503629,information,10.210022,project,9.878395,...,online,8.375184,user,12.021487,risk,6.457414,trust,8.485874,design,7.921679
1,study,3.149416,consumer,4.55154,performance,2.220888,privacy,1.130997,software,4.852932,...,review,3.621776,behavior,1.320081,decision,5.493375,study,2.00199,theory,2.795955
2,effect,2.519469,review,1.758293,capability,2.173323,security,0.721607,development,3.024308,...,community,3.115685,perceive,1.069223,security,2.469659,perceive,1.107102,base,1.207851
3,individual,1.756827,quality,1.407722,value,1.14933,organization,0.527114,control,2.803027,...,consumer,1.685983,effect,1.030635,investment,1.754728,relationship,1.071022,approach,1.140387
4,theory,1.637902,effect,0.994338,investment,1.086437,supply,0.432534,source,1.286781,...,effect,1.003725,content,0.906786,factor,1.181523,commerce,0.884172,support,0.932231


In [18]:
lsi_df = pd.DataFrame()
print("LSI being performed ...")
# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=n, n_iter = 5000)
lsi_Z = lsi_model.fit_transform(data_vectorized)
    
print("Generating LSI Model .....")
get_topics(lsi_model, vectorizer, "LSI", lsi_df)
lsi_df.head()
    

LSI being performed ...
Generating LSI Model .....


Unnamed: 0,LSI_topic_0,SCORE_0,LSI_topic_1,SCORE_1,LSI_topic_2,SCORE_2,LSI_topic_3,SCORE_3,LSI_topic_4,SCORE_4,...,LSI_topic_15,SCORE_15,LSI_topic_16,SCORE_16,LSI_topic_17,SCORE_17,LSI_topic_18,SCORE_18,LSI_topic_19,SCORE_19
0,information,0.33127,product,0.474889,firm,0.56604,information,0.689237,product,0.381326,...,community,0.358273,use,0.448408,use,0.266372,trust,0.472078,decision,0.386352
1,research,0.224602,consumer,0.304214,capability,0.18346,research,0.165567,project,0.362461,...,software,0.258746,software,0.319437,business,0.23083,network,0.2975,software,0.256153
2,study,0.204003,firm,0.302815,performance,0.174968,privacy,0.099086,software,0.191957,...,online,0.228945,study,0.164223,effect,0.207269,software,0.243792,review,0.244726
3,model,0.194725,online,0.211117,business,0.151584,security,0.086015,review,0.162741,...,model,0.217194,effect,0.104423,performance,0.198176,process,0.174697,process,0.152546
4,technology,0.177888,market,0.20544,project,0.127822,consumer,0.071696,information,0.16066,...,innovation,0.158267,quality,0.091533,review,0.193236,business,0.172973,risk,0.132635


Gensim is a very popular package for NLP tasks. Given below is an example of LDA using the Gensim package. Note that we don't have to explicitly vectorize the documents. Also, the corpus has to be a list of a list of words. The code has been adapted from the Gensim tutorials.

In [8]:
#Let us take a look at our cleaned corpus
cleaned_corpus[:2]

['role business intelligence communication technology organizational agility   configurational approach   study examine role business intelligence   bi   communication technology play firm achieve organizational sensing agility   decision make agility   act agility different organizational environmental context   base information processing view organization dynamic capability theory   suggest configurational analytic framework depart standard linear paradigm examine s effect agility embed configuration organizational environmental element   line approach   use fuzzy set qualitative comparative analysis   fsqca   analyze field survey datum diverse industry   finding suggest equifinal pathway organizational agility specific boundary condition middle range theory determine role bi communication technology play organization   achieve organizational agility   discuss implication theory practice discuss future research avenue',
 'operational failure   value destruction   board level governa

In [11]:
#let us convert it to a list of a list of words
cleaned_corpus = [txt.split() for txt in cleaned_corpus]
cleaned_corpus[:2]

[['role',
  'business',
  'intelligence',
  'communication',
  'technology',
  'organizational',
  'agility',
  'configurational',
  'approach',
  'study',
  'examine',
  'role',
  'business',
  'intelligence',
  'bi',
  'communication',
  'technology',
  'play',
  'firm',
  'achieve',
  'organizational',
  'sensing',
  'agility',
  'decision',
  'make',
  'agility',
  'act',
  'agility',
  'different',
  'organizational',
  'environmental',
  'context',
  'base',
  'information',
  'processing',
  'view',
  'organization',
  'dynamic',
  'capability',
  'theory',
  'suggest',
  'configurational',
  'analytic',
  'framework',
  'depart',
  'standard',
  'linear',
  'paradigm',
  'examine',
  's',
  'effect',
  'agility',
  'embed',
  'configuration',
  'organizational',
  'environmental',
  'element',
  'line',
  'approach',
  'use',
  'fuzzy',
  'set',
  'qualitative',
  'comparative',
  'analysis',
  'fsqca',
  'analyze',
  'field',
  'survey',
  'datum',
  'diverse',
  'industry',
 

In [12]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(cleaned_corpus)

# Filter out words that occur less than 10 documents, or more than 80% of the documents.
dictionary.filter_extremes(no_below=10, no_above=0.8)

In [14]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in cleaned_corpus]

In [15]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 20
chunksize = 3000
passes = 20
iterations = 5000
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [16]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -2.1325.
[([(0.035534427, 'research'),
   (0.029536864, 'design'),
   (0.017161876, 'system'),
   (0.014146009, 'approach'),
   (0.013011451, 'information'),
   (0.010492493, 'framework'),
   (0.010143331, 'method'),
   (0.01012634, 'theory'),
   (0.009775196, 'paper'),
   (0.009470124, 'model'),
   (0.009407007, 'decision'),
   (0.008342261, 'base'),
   (0.007372736, 'process'),
   (0.007258637, 'problem'),
   (0.0070692794, 'propose'),
   (0.0066523, 'provide'),
   (0.0064737415, 'develop'),
   (0.0060670474, 'science'),
   (0.0057143676, 'analysis'),
   (0.0055991714, 'study')],
  -1.3313410118493487),
 ([(0.031094925, 'system'),
   (0.022595163, 'use'),
   (0.021588111, 'knowledge'),
   (0.01676294, 'research'),
   (0.016703313, 'information'),
   (0.014610848, 'theory'),
   (0.01171822, 'user'),
   (0.010414423, 'work'),
   (0.009563938, 'idea'),
   (0.009200944, 'organizational'),
   (0.008331014, 'study'),
   (0.008191933, 'support'),
   (0.008168371, 'c

In [22]:
for i in range(20):
    print("Topic: " + str(i + 1) + ":")
    a, b = zip(*top_topics[i][0])
    for w in b:
        print(w, end=', ')
    print("\n")
    

Topic: 1:
research, design, system, approach, information, framework, method, theory, paper, model, decision, base, process, problem, propose, provide, develop, science, analysis, study, 

Topic: 2:
system, use, knowledge, research, information, theory, user, work, idea, organizational, study, support, community, organization, process, technology, individual, paper, base, new, 

Topic: 3:
user, technology, use, model, study, behavior, perceive, social, influence, research, effect, intention, individual, result, theory, information, adoption, usage, test, relationship, 

Topic: 4:
firm, market, investment, industry, cost, effect, value, product, model, high, s, competition, strategy, increase, price, find, technology, result, low, profit, 

Topic: 5:
innovation, research, information, model, construct, study, measure, system, business, measurement, process, technology, analysis, new, group, base, use, modeling, paper, result, 

Topic: 6:
firm, capability, performance, information, techn