# 1) Importing Libraries and reading the data.

In [1]:
import os
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import glob

In [2]:
DATA_PATH = input(r"Enter the path of the folder named train_txt: ")
print(os.listdir(DATA_PATH))

Enter the path of the folder named train_txt: C:\Users\umerb\IMS\PROJECTS\train_txt
['data01.txt', 'data02.txt', 'data03.txt', 'data04.txt', 'data05.txt', 'data06.txt', 'data07.txt', 'data08.txt', 'data09.txt', 'data10.txt', 'data11.txt', 'data12.txt', 'data13.txt', 'data14.txt', 'data15.txt', 'data16.txt', 'data17.txt', 'data18.txt', 'data19.txt', 'data20.txt']


# 2) Preprocessing the data.

In [3]:
folders = ["data{0:02}".format(i) for i in range(1,2)]
# Read all texts into a list.
papers = []
for folder in folders:
    file_names = os.listdir(DATA_PATH)
    for file_name in file_names:
        with open(DATA_PATH + '/' + file_name, encoding='utf-8', errors='ignore', mode='r+') as f:
            data = f.read()
        papers.append(data)
len(papers)

20

In [4]:
print(papers[19][:1000])

Article

Overcoming Barriers in Supply Chain Analytics—
Investigating Measures in LSCM Organizations
Tino T. Herden *, Benjamin Nitsche and Benno Gerlach
Chair of Logistics, Technische Universität Berlin, Straße des 17. Juni 135, 10623 Berlin, Germany
* Correspondence: herden@logistik.tu-berlin.de
Received: 23 December 2019; Accepted: 17 February 2020; Published: 26 February 2020

Abstract: While supply chain analytics shows promise regarding value, benefits, and increase in
performance for logistics and supply chain management (LSCM) organizations, those organizations
are often either reluctant to invest or unable to achieve the returns they aspire to. This article
systematically explores the barriers LSCM organizations experience in employing supply chain
analytics that contribute to such reluctance and unachieved returns and measures to overcome these
barriers. This article therefore aims to systemize the barriers and measures and allocate measures to
barriers in order to provide or

In [5]:
#Performing tokenizaation, lemmatization, stemming and removing stop words

stop_words = nltk.corpus.stopwords.words('english')
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
wnl = nltk.stem.wordnet.WordNetLemmatizer()

def normalize_corpus(papers):
    norm_papers = []
    for paper in papers:
        paper = paper.lower()
        paper_tokens = [token.strip() for token in wtk.tokenize(paper)]
        paper_tokens = [wnl.lemmatize(token) for token in paper_tokens if not token.isnumeric()]
        paper_tokens = [token for token in paper_tokens if len(token) > 1]
        paper_tokens = [token for token in paper_tokens if token not in stop_words]
        paper_tokens = list(filter(None, paper_tokens))
        if paper_tokens:
            norm_papers.append(paper_tokens)
            
    return norm_papers
    
norm_papers = normalize_corpus(papers)
print(len(norm_papers))

20


In [6]:
print(norm_papers[0][:50])

['agriculture', 'article', 'influence', 'specie', 'composition', 'management', 'biomass', 'production', 'missouri', 'ranjith', 'udawatta', 'clark', 'gantzer', 'timothy', 'reinbott', 'ray', 'wright', 'robert', 'pierce', 'ii', 'walter', 'wehtje', 'school', 'natural', 'resource', 'university', 'missouri', 'columbia', 'mo', 'usa', 'gantzerc', 'missouri', 'edu', 'piercer', 'missouri', 'edu', 'ii', 'wehtjew', 'missouri', 'edu', 'center', 'agroforestry', 'school', 'natural', 'resource', 'university', 'missouri', 'columbia', 'mo', 'usa']


# 3)  Feature Engineering

In [7]:
#Converting a collection of text documents to a matrix of token counts

cv = CountVectorizer(min_df=0.1, max_df=0.7, ngram_range=(1,2),
                     token_pattern=None, tokenizer=lambda doc: doc,
                     preprocessor=lambda doc: doc)
cv_features = cv.fit_transform(norm_papers)
cv_features.shape                     

(20, 5479)

In [8]:
vocabulary = np.array(cv.get_feature_names())
print('Total Vocabulary Size:', len(vocabulary))

Total Vocabulary Size: 5479


# 4) Topic modeling with Latent Dirichlet Allocation (LDA)

In [9]:
%%time

TOTAL_TOPICS = 20

lda_model = LatentDirichletAllocation(n_components =TOTAL_TOPICS, max_iter=500, max_doc_update_iter=50,
                                      learning_method='online', batch_size=5479, learning_offset=50., 
                                      random_state=42, n_jobs=16)
document_topics = lda_model.fit_transform(cv_features)

Wall time: 3min 6s


In [10]:
topic_terms = lda_model.components_

# 5) Terms per topic

In [11]:
#Keywords for the various topics
top_terms = 20
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
topic_keyterms = vocabulary[topic_key_term_idxs]
topics = [', '.join(topic) for topic in topic_keyterms]
pd.set_option('display.max_colwidth', -1)
topics_df = pd.DataFrame(topics,
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, TOTAL_TOPICS+1)])
topics_df


Unnamed: 0,Terms per Topic
Topic1,"energy, pig, growing, diet, content, fed, fiber, nutrient, adult, greater, animal, ge, corn, dm, acid, crossref, protein, meal, matter, stage"
Topic2,"inverter, output, group, apple, crossref, management, cost, power, welfare, animal, feature, wild, transport, current, october, sd, approach, controlled, error, measure"
Topic3,"analytics, measure, barrier, organization, blockchain, technical, chain, solution, fruit, supply, supply chain, business, efficiency, technical efficiency, initiative, smart, user, management, video, need"
Topic4,"proposed, error, accuracy, power, input, energy, part, carry, output, design, th, performance, scheme, reduction, operation, electronics, measure, prediction, ieee, analytics"
Topic5,"blockchain, group, crossref, cost, animal, score, smart, measure, higher, prediction, efficiency, density, feature, day, apple, treatment, application, analytics, ieee, approach"
Topic6,"image, bc, focus, group, fusion, crossref, proposed, treatment, mouse, image fusion, technique, region, map, il, animal, expression, score, intestinal, administration, multi"
Topic7,"output, power, design, voltage, current, electronics, core, density, proposed, equation, simulation, input, efficiency, frequency, ieee, loss, dc, energy, equivalent, circuit"
Topic8,"energy, cost, broiler, density, area, inverter, feature, crossref, animal, current, management, frequency, controlled, score, output, power, bird, production, distance, voltage"
Topic9,"analytics, yield, energy, blockchain, barrier, crossref, feature, measure, management, approach, treatment, growing, solution, pig, prediction, organization, algorithm, biomass, efficiency, technical efficiency"
Topic10,"wild, west, italy, sample, antibody, crossref, positive, serum, animal, european, presence, region, adult, mammal, dis, dis crossref, surveillance, human, pcr, bird"


# 6) Weight of each terms in a particular topic with respect to each document.


In [12]:
pd.options.display.float_format = '{:,.5f}'.format
dt_df = pd.DataFrame(document_topics, 
                     columns=['T'+str(i) for i in range(1, TOTAL_TOPICS+1)])
dt_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
T1,1e-05,2e-05,1e-05,2e-05,0.1014,0.03856,2e-05,3e-05,3e-05,0.99971,1e-05,2e-05,2e-05,2e-05,1e-05,1e-05,0.00063,2e-05,2e-05,1e-05
T2,1e-05,2e-05,1e-05,2e-05,2e-05,1e-05,2e-05,3e-05,3e-05,2e-05,1e-05,2e-05,2e-05,2e-05,1e-05,1e-05,0.00063,2e-05,2e-05,1e-05
T3,1e-05,0.76056,1e-05,2e-05,0.79294,1e-05,2e-05,3e-05,3e-05,2e-05,1e-05,2e-05,2e-05,2e-05,0.94582,1e-05,0.00063,0.31728,0.02729,0.99987
T4,1e-05,2e-05,1e-05,2e-05,2e-05,1e-05,2e-05,3e-05,3e-05,2e-05,1e-05,2e-05,2e-05,2e-05,1e-05,1e-05,0.00063,2e-05,2e-05,1e-05
T5,1e-05,2e-05,1e-05,2e-05,2e-05,1e-05,2e-05,3e-05,3e-05,2e-05,1e-05,2e-05,2e-05,2e-05,1e-05,1e-05,0.00063,2e-05,2e-05,1e-05
T6,1e-05,2e-05,1e-05,2e-05,0.02702,1e-05,0.99971,3e-05,3e-05,2e-05,1e-05,2e-05,0.07782,0.99962,0.02612,1e-05,0.00063,2e-05,2e-05,1e-05
T7,1e-05,2e-05,1e-05,2e-05,2e-05,1e-05,2e-05,3e-05,3e-05,2e-05,0.33265,0.99958,0.23881,2e-05,0.02781,1e-05,0.00063,2e-05,2e-05,1e-05
T8,1e-05,2e-05,1e-05,2e-05,2e-05,1e-05,2e-05,3e-05,3e-05,2e-05,1e-05,2e-05,2e-05,2e-05,1e-05,1e-05,0.00063,2e-05,2e-05,1e-05
T9,1e-05,2e-05,1e-05,2e-05,2e-05,1e-05,2e-05,3e-05,3e-05,2e-05,1e-05,2e-05,2e-05,2e-05,1e-05,1e-05,0.00063,2e-05,2e-05,1e-05
T10,1e-05,2e-05,1e-05,2e-05,2e-05,1e-05,2e-05,3e-05,0.99948,2e-05,1e-05,2e-05,2e-05,2e-05,1e-05,1e-05,0.00063,2e-05,2e-05,1e-05


# 7)  Dominant topic for each research paper.

In [13]:
pd.options.display.float_format = '{:,.5f}'.format
pd.set_option('display.max_colwidth', 200)

max_contrib_topics = dt_df.max(axis=0)
dominant_topics = max_contrib_topics.index
contrib_perc = max_contrib_topics.values
document_numbers = [dt_df[dt_df[t] == max_contrib_topics.loc[t]].index[0]
                       for t in dominant_topics]
documents = [papers[i] for i in document_numbers]

results_df = pd.DataFrame({'Dominant Topic': dominant_topics, 'Contribution %': contrib_perc,
                          'Paper Num': document_numbers, 'Topic': topics_df['Terms per Topic'], 
                          'Paper Name': documents})
results_df

Unnamed: 0,Dominant Topic,Contribution %,Paper Num,Topic,Paper Name
Topic1,T1,0.99971,9,"energy, pig, growing, diet, content, fed, fiber, nutrient, adult, greater, animal, ge, corn, dm, acid, crossref, protein, meal, matter, stage","animals\nArticle\n\nEffects of Different Crude Protein and Dietary Fiber\nLevels on the Comparative Energy and Nutrient\nUtilization in Sows and Growing Pigs\nWenxuan Dong, Gang Zhang, Zhongchao L..."
Topic2,T2,0.00063,16,"inverter, output, group, apple, crossref, management, cost, power, welfare, animal, feature, wild, transport, current, october, sd, approach, controlled, error, measure","Editorial\n\nAcknowledgement to Reviewers of Logistics in 2019\nLogistics Editorial Office\nMDPI, St. Alban-Anlage 66, 4052 Basel, Switzerland\nPublished: 30 January 2020\n\nThe editorial team gre..."
Topic3,T3,0.99987,19,"analytics, measure, barrier, organization, blockchain, technical, chain, solution, fruit, supply, supply chain, business, efficiency, technical efficiency, initiative, smart, user, management, vid...","Article\n\nOvercoming Barriers in Supply Chain Analytics—\nInvestigating Measures in LSCM Organizations\nTino T. Herden *, Benjamin Nitsche and Benno Gerlach\nChair of Logistics, Technische Univer..."
Topic4,T4,0.00063,16,"proposed, error, accuracy, power, input, energy, part, carry, output, design, th, performance, scheme, reduction, operation, electronics, measure, prediction, ieee, analytics","Editorial\n\nAcknowledgement to Reviewers of Logistics in 2019\nLogistics Editorial Office\nMDPI, St. Alban-Anlage 66, 4052 Basel, Switzerland\nPublished: 30 January 2020\n\nThe editorial team gre..."
Topic5,T5,0.00063,16,"blockchain, group, crossref, cost, animal, score, smart, measure, higher, prediction, efficiency, density, feature, day, apple, treatment, application, analytics, ieee, approach","Editorial\n\nAcknowledgement to Reviewers of Logistics in 2019\nLogistics Editorial Office\nMDPI, St. Alban-Anlage 66, 4052 Basel, Switzerland\nPublished: 30 January 2020\n\nThe editorial team gre..."
Topic6,T6,0.99971,6,"image, bc, focus, group, fusion, crossref, proposed, treatment, mouse, image fusion, technique, region, map, il, animal, expression, score, intestinal, administration, multi","animals\nArticle\n\nThe Prophylactic Use of Bovine Colostrum in a\nMurine Model of TNBS-Induced Colitis\nLaura Menchetti 1 , Giulio Curone 2 , Iulia Elena Filipescu 3 , Olimpia Barbato 1 ,\nLeonar..."
Topic7,T7,0.99958,11,"output, power, design, voltage, current, electronics, core, density, proposed, equation, simulation, input, efficiency, frequency, ieee, loss, dc, energy, equivalent, circuit","Article\n\nPower Density Maximization in Medium Frequency\nTransformers by Using Their Maximum Flux Density\nfor DC–DC Converters\nDante Ruiz-Robles 1,*, Edgar L. Moreno-Goytia 1, Vicente Venegas-..."
Topic8,T8,0.00063,16,"energy, cost, broiler, density, area, inverter, feature, crossref, animal, current, management, frequency, controlled, score, output, power, bird, production, distance, voltage","Editorial\n\nAcknowledgement to Reviewers of Logistics in 2019\nLogistics Editorial Office\nMDPI, St. Alban-Anlage 66, 4052 Basel, Switzerland\nPublished: 30 January 2020\n\nThe editorial team gre..."
Topic9,T9,0.00063,16,"analytics, yield, energy, blockchain, barrier, crossref, feature, measure, management, approach, treatment, growing, solution, pig, prediction, organization, algorithm, biomass, efficiency, techni...","Editorial\n\nAcknowledgement to Reviewers of Logistics in 2019\nLogistics Editorial Office\nMDPI, St. Alban-Anlage 66, 4052 Basel, Switzerland\nPublished: 30 January 2020\n\nThe editorial team gre..."
Topic10,T10,0.99948,8,"wild, west, italy, sample, antibody, crossref, positive, serum, animal, european, presence, region, adult, mammal, dis, dis crossref, surveillance, human, pcr, bird","animals\nArticle\n\nWest Nile Virus and Related Flavivirus in European\nWild Boar (Sus scrofa), Latium Region, Italy: A\nRetrospective Study\nAngela Petruccelli 1 , Tiziana Zottola 2 , Gianmarco F..."


# 8) Predicting Topics for New Research Papers


In [14]:
new_paper_files = glob.glob('test_txt.txt')
new_papers = []
for fn in new_paper_files:
    with open(fn, encoding='utf-8', errors='ignore', mode='r+') as f:
        data = f.read()
        new_papers.append(data)
              
print('Total Documents:', len(new_papers))

Total Documents: 1


In [15]:
norm_new_papers = normalize_corpus(new_papers)
cv_new_features = cv.transform(norm_new_papers)
cv_new_features.shape

(1, 5479)

In [16]:
print(norm_new_papers[0][:100])

['article', 'financial', 'spillover', 'effect', 'supply', 'chain', 'customer', 'supplier', 'really', 'benefit', 'erik', 'hofmann', 'yannick', 'sertori', 'institute', 'supply', 'chain', 'management', 'university', 'st', 'gallen', 'st', 'gallen', 'switzerland', 'correspondence', 'erik', 'hofmann', 'unisg', 'ch', 'received', 'february', 'accepted', 'march', 'published', 'march', 'abstract', 'study', 'shown', 'leading', 'supply', 'chain', 'company', 'associated', 'significantly', 'higher', 'company', 'financial', 'ratio', 'competitor', 'contrast', 'little', 'research', 'ha', 'focused', 'financial', 'performance', 'affiliated', 'supplier', 'customer', 'supply', 'chain', 'leader', 'scl', 'company', 'thus', 'central', 'purpose', 'paper', 'determine', 'financial', 'perspective', 'whether', 'supplier', 'customer', 'benefit', 'lose', 'participating', 'scl', 'network', 'called', 'financial', 'spillover', 'effect', 'company', 'ranked', 'gartner', 'supply', 'chain', 'top', 'selected', 'scls', 'sele

In [17]:
topic_predictions = lda_model.transform(cv_new_features)
best_topics = [[(topic, round(sc, 3)) 
                    for topic, sc in sorted(enumerate(topic_predictions[i]), 
                                            key=lambda row: -row[1])[:2]] 
                        for i in range(len(topic_predictions))]
best_topics

[[(2, 0.633), (11, 0.254)]]

In [21]:
results_df = pd.DataFrame()
results_df['Papers'] = range(1, len(new_papers)+1)
results_df['Dominant Topics'] = [[topic_num for topic_num, sc in item] for item in best_topics]
res = results_df.set_index(['Papers'])['Dominant Topics'].apply(pd.Series).stack().reset_index(level=1, drop=True)
results_df = pd.DataFrame({'Dominant Topics': res.values}, index=res.index)
results_df['Topic Score'] = [topic_sc for topic_list in 
                                        [[round(sc*100, 2) 
                                              for topic_num, sc in item] 
                                                 for item in best_topics] 
                                    for topic_sc in topic_list]

results_df['Topic Desc'] = [topics_df.iloc[t-1]['Terms per Topic'] for t in results_df['Dominant Topics'].values]
results_df['Paper Desc'] = [new_papers[i-1][:200] for i in results_df.index.values]

results_df

Unnamed: 0_level_0,Dominant Topics,Topic Score,Topic Desc,Paper Desc
Papers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2,63.3,"inverter, output, group, apple, crossref, management, cost, power, welfare, animal, feature, wild, transport, current, october, sd, approach, controlled, error, measure","Article\n\nFinancial Spillover Effects in Supply Chains:\nDo Customers and Suppliers Really Benefit?\nErik Hofmann * and Yannick Sertori\nInstitute of Supply Chain Management, University of St. Ga..."
1,11,25.4,"management, inverter, efficiency, energy, output, crossref, current, yield, group, controlled, technical, bc, farmer, technical efficiency, maize, score, transport, two three, like, signal","Article\n\nFinancial Spillover Effects in Supply Chains:\nDo Customers and Suppliers Really Benefit?\nErik Hofmann * and Yannick Sertori\nInstitute of Supply Chain Management, University of St. Ga..."


# 9) Visualizations

In [26]:
import pyLDAvis
import pyLDAvis.sklearn
import dill
import warnings

warnings.filterwarnings('ignore')
pyLDAvis.enable_notebook()

In [32]:

with open('LDA_model.pkl', 'wb') as f:
    dill.dump(lda_model, f)
with open('cv_features.pkl', 'wb') as f:
    dill.dump(cv_features, f)
with open('cv.pkl', 'wb') as f:
    dill.dump(cv, f)

In [34]:
with open('LDA_model.pkl', 'rb') as f:
    LDA_model = dill.load(f)
with open('cv_features.pkl', 'rb') as f:
    cv_features = dill.load(f)
with open('cv.pkl', 'rb') as f:
    cv = dill.load(f)

In [None]:
pyLDAvis.sklearn.prepare(LDA_model, cv_features, cv, mds='mmds')


# 10) Conclusion