In [1]:
# Import dataset
import numpy as np
import pandas as pd
import datetime
# Cosine 
from scipy import spatial
# KNN & TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
# spacy
import spacy
nlp = spacy.load('en_core_web_lg')
# Gensim
import gensim
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
# Plotting tools
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim
# Warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
#data importing
def read_trim():
    df = pd.read_excel ('koiosDatabase_Concepts_and_Definitions.xlsx')
    usefulAttributes = ['CONCEPT_ID', 'TERM_ID', 'DEFINITION_ID' ,'DEFINITION_CONTENT_ID','SYNONYMS_ID', 
    'CONCEPT_TYPE_ID', 'SYNONYM_VALUE', 'DEFINITION','DEF_FULL_SOURCE_TEXT','TERM_SOURCE_ID']
    for x in df.columns.values:
        if x not in usefulAttributes:
            df.drop(x, 1, inplace=True)  
    return df

In [3]:
#tokenizing a description
#return list of "cleaned" words
def remove_punct_stop(description):
    low = [str(t) for t in nlp(description) if t.is_alpha and not t.is_stop]
    return low

#create a dictionary of terms
def term_concepts(df):
    # passed as a list of lists of words for gensim bigram
    #descriptions = []
    
    #descriptions = numpy.empty(len(df))
    x = 0
    
    keys = set(df['SYNONYM_VALUE'])
    term_to_concepts = dict.fromkeys(keys)


    for index, row in df.iterrows():

        term = row['SYNONYM_VALUE']
        concept_id = row['CONCEPT_ID']
        concept_type = row['CONCEPT_TYPE_ID']
        definition = row['DEFINITION']


       # descriptions.append(remove_punct_stop(definition))

        if term_to_concepts[term] == None:
            term_to_concepts[term] = {}

        #use concept id as keys for the dictionary of each term
        term_to_concepts[term][concept_id] = (concept_type, definition)
        
    return term_to_concepts, df['DEFINITION'].values

In [4]:
#delete terms with only one concept
def deletekeys(term_to_concepts):
    for x in list(term_to_concepts.keys()):
        if len(term_to_concepts[x]) == 1: #len(
            del term_to_concepts[x]

In [13]:
#train and return the bigram generator
def trainBigram(description, min_count, threshold):
    phrases = Phrases(description, min_count = min_count, threshold = threshold)
    bigram = Phraser(phrases)
    return bigram

#used for topic modelling
def lemmatization(text):
    allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
    doc = nlp(" ".join(text))
    returnnewtext = [t.lemma_ if t.lemma_ not in ['-PRON-'] 
                     else '' for t in doc if t.pos_ in allowed_postags]
    return returnnewtext

#removes stopwords and punctuations from each description
#return a list of cleaned descriptions
#generate lists of words for tfidf and topic modelling
def cleared_text_lists(term_to_concepts, bigram, nr_descriptions):
    tfidf_text = [None] * nr_descriptions
    topicmodel_text = [[] for _ in range(nr_descriptions)]
    
    x = 0
    for term in term_to_concepts:
        for concept in term_to_concepts[term]:
            
            t, d = term_to_concepts[term][concept]
            desc = bigram[remove_punct_stop(d)]

            #call lemmatization for the word list used in Topic Modelling
            #topicmodel_text.append(lemmatization(desc))
            
            topicmodel_text[x] = desc
            tfidf_text[x]  = ' '.join(desc)
    
            #tfidf_text.append(tfidf_list)
            x = x + 1
    return tfidf_text, topicmodel_text

In [6]:
# method for training the tf-idf on all of the descriptions
def trainer(l):
    tfidf = TfidfVectorizer(ngram_range = (1,1), max_df = 1000, max_features = 30000 )
    tfidf.fit(l)
    return tfidf

#train tfidf on descriptions associated with a single term
# def trainer_w_term(term):
#     text = []
#     for concept in list(term_to_concepts[term]):
#             t, d = term_to_concepts[term][concept]
#             d = ' '.join([str(t) for t in nlp(d) if not t.is_stop | t.is_punct ])
#             text.append(d)
#     tfidf = TfidfVectorizer(ngram_range = (1,2))
#     tfidf.fit(text)
#     return tfidf

# Approach I
# Nearest Neighbours training method
def knn_trainer(tfidf,l):
    #samples = []
    descriptions = []
    
    features = numpy.empty(shape=(len(l),20000))
    x = 0

    for d in l:
        v = tfidf.transform([d]).toarray()
        features[x] = v[0]
#         samples.append(v[0])
        x = x+1
        del v
        descriptions.append(d)
        del d

   # s = np.array(samples)
    print(features.shape)
   # del samples
    nn = NearestNeighbors(metric = 'cosine')
    nn.fit(features)
    
    return nn, descriptions

#return method for KNN
def return_top_k(k, d,nn, tfidf, bigram):  
    sentence =' '.join(bigram[remove_punct_stop(d)])
    v = tfidf.transform([sentence]).toarray()
    return nn.kneighbors([v[0]],n_neighbors = k)
     

In [7]:
#RADU'S TFIDF and KNN Pipeline
def tfidf_knn(l1, bigram, text,k):
    
    #call the tfidf trainer method, return a trained tfidf
    print("STARTED training TF-IDF AT: ",datetime.datetime.now())
    tfidf = trainer(l1)
    
    
    print("FINISHED training TF-IDF AT: ",datetime.datetime.now())
    
    #call the knn trainer method, return a trained knn and adjacent list of descriptions
    #in order to visualise the results
    print("STARTED training KNN AT: ", datetime.datetime.now())
    knn, ds = knn_trainer(tfidf,l1)
    print("FINISHED training KNN AT: ",datetime.datetime.now())

    #call the method to return the top k -first parameter- most similar
    #descriptions to the one passed as the second parameter
    print("RETURN results for example: ", datetime.datetime.now())
    top = return_top_k(5,text, knn, tfidf, bigram)
    print("Finished at: ", datetime.datetime.now())

    #visualise the results
    x,pos = top
    print(top)
    for i in pos[0]:
        print(ds[i])

In [8]:
# Step 3: Topic Modelling - LDA Online Training

# Create document-term matrix from preprocessed text(tokenize words, remove stopwords, creating bigrams, and word lemmatization)
def prepare_corpus(preprocessedDescription):
    # Create Dictionary
    dictionary = corpora.Dictionary(preprocessedDescription)
    
    # Create corpus
    texts = preprocessedDescription
    
    # Term Document Frequency
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    return dictionary,corpus

# Build the LDA - Online Learning Topic Model
def create_lda_model(preprocessedDescription, nr_topics):
    dictionary, corpus = prepare_corpus(preprocessedDescription)

    # Online LDA training - processes the whole corpus in one pass, then updates the model, then another pass and so on (Faster Execution)
    #lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=nr_topics, random_state=100, update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=False)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=nr_topics, random_state=100,passes=10)
    # Different methods of printing key topics and associated key words with scores
    print(lda_model.print_topics())
    #print(lda_model.top_topics(document_term_matrix, dictionary, coherence = 'c_v', topn = words))
    #for i, topic in lda_model.show_topics(formatted=True, num_topics = 10, num_words = 5):
        #print(str(i) +": "+ topic)
        #print()

    return lda_model

# Compute perplexity score and coherence score - performance measurement metrics that give an indication of how good the topic model is
def compute_perplexity_coherence(data_lemmatized, lda_model, corpus, dictionary):
    print('\n Perplexity: ', lda_model.log_perplexity(corpus))

    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\n Coherence Score: ', coherence_lda)

# Compute cohrerence scores for multiple trained lda models and compute corresponding coherence scores to determine optimal number of topics
def compute_coherence_values(dictionary, corpus, preprocessedDescriptions, limit, start=2, step=3):
    # dictionary: Gensim dictionary
    # corpus: Gensim corpus
    # preprocessedDescriptions: list of processed descriptions
    # limit: maximum number of topics
    
    coherence_values = []
    model_list = []
    
    for numberTopics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus, num_topics = numberTopics, id2word = dictionary, passes = 1)
        model_list.append(model)
        coherence_model = CoherenceModel(model = model, texts = preprocessedDescriptions, dictionary = dictionary, coherence = 'c_v')
        coherence_values.append(coherence_model.get_coherence())
        
    return model_list, coherence_values

# Plot the above results in the form of a line graph.
def plot_graph(preprocessedDescription, start, limit, step):
    dictionary, corpus = prepare_corpus(preprocessedDescription)
    model_list, coherence_values = compute_coherence_values(dictionary, corpus, preprocessedDescription, limit, start, step)
    
    x = range(start, limit, step)
    plt.plot(x,coherence_values)
    #plt.xticks(x)
    plt.xlabel("Number of topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc = 'best')
    plt.show()
    
    for m, cv in zip(x, coherence_values):
        print("Number of Topics =", m, " has Coherence Value of", round(cv, 4))

In [9]:
#Abhi's Topic Modelling using LDA (Online Training)

def topic_modelling(l2, nr_topics):
    # Method call - prepare_corpus
    dictionary, corpus = prepare_corpus(l2)
    print(corpus[:1])

    # Human readable format of corpus (term-frequency) - Understanding the document-term matrix
    [[(dictionary[id], freq) for id, freq in corp] for corp in corpus[:1]]

    # Method call - lda model creation and training
    lda_model = create_lda_model(l2, nr_topics)

    # Method call - perplexity/coherence score computation
    compute_perplexity_coherence(l2, lda_model, corpus, dictionary)

    # Visualise potential optimum number of topics to determine the diversity of topics and identify closely similar topics, that can be merged into one large topic
    lda_model = create_lda_model(l2, nr_topics)
    dictionary,corpus = prepare_corpus(l2)
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
    vis
    
    # Method call - plotting coherence score vs. number of topics
    start,limit,step = 2,50,5
    plot_graph(l2, start, limit, step) 

In [10]:
def main():
    
   #STAGE I - initialise the dataframes, dictionaries, bigrams and list
    print("STAGE I")
    df = read_trim()
    print("Term-Concept Dictionary is being loaded...")
    term_to_concepts, description = term_concepts(df)
    print("Bigram model is being trained...")
    bigram = trainBigram(description,1,1)
    print("Deleting keys")
    deletekeys(term_to_concepts)
    print("Generate Lists")
    l1, l2 = cleared_text_lists(term_to_concepts, bigram)
    
   #STAGE II - Tfidf
    print("STAGE II")
    k = 5
    text = "A set of triple-in-line package outline styles with through-hole leads in standard form of which each outline style can be described with the same group of data element types"
    tfidf_knn(l1,bigram,text,k)
    
   #StageIII - Topic Modelling
    print("STAGE III")
    topic_modelling(l2)

    
if __name__ == "__main__":
    main()

STAGE I
Term-Concept Dictionary is being loaded...
Bigram model is being trained...
Deleting keys
Generate Lists
STAGE II
STARTED training TF-IDF AT:  2019-07-15 10:08:28.594123
FINISHED training TF-IDF AT:  2019-07-15 10:08:28.828562
STARTED training KNN AT:  2019-07-15 10:08:28.831556
(14546, 40519)
FINISHED training KNN AT:  2019-07-15 10:08:44.623082
RETURN results for example:  2019-07-15 10:08:44.626075
Finished at:  2019-07-15 10:08:51.048194
(array([[0.        , 0.06335632, 0.16286706, 0.16641799, 0.17559802]]), array([[ 2378, 12924,  2381,  2380,  2382]], dtype=int64))
set_triple line_package outline_styles hole_leads standard_form outline_style described_group data_element types
set_triple line_package outline_styles hole_leads outline_style described_group data_element types
set_quad line_package outline_styles hole_leads standard_form outline_style described_group data_element types
set_dual line_package outline_styles hole_leads standard_form outline_style described_group 

TypeError: topic_modelling() missing 1 required positional argument: 'nr_topics'

In [14]:
#STAGE I - initialise the dataframes, dictionaries, bigrams and list

print(datetime.datetime.now())
print("STAGE I")
df = read_trim()
print("Term-Concept Dictionary is being loaded...")
term_to_concepts, description = term_concepts(df)
print("Bigram model is being trained...")
bigram = trainBigram(description,1,1)
#print("Deleting keys")
#deletekeys(term_to_concepts)
print("Generate Lists")
l1, l2 = cleared_text_lists(term_to_concepts, bigram, len(description))
print(datetime.datetime.now())

2019-07-18 10:06:20.000512
STAGE I
Term-Concept Dictionary is being loaded...
Bigram model is being trained...
Generate Lists
2019-07-18 10:16:29.793377


In [11]:
del df
del term_to_concepts
del description

In [28]:
import numpy
#APPROACH III

#KNN trainer using features from both tfidf and topic modelling
def new_knnTrainer(tfidf,l,lda_model, corpus, no_topics):
    #descriptions = numpy.empty(shape=(len(l),))
    
    samples = numpy.empty(shape=(len(l),30050))
    x = 0
    
    for d in l:
        tm_features = numpy.zeros(no_topics)
        d2 = dictionary.doc2bow(d)
        
        for tuples in lda_model[d2]:
            topic, score = tuples
            tm_features[topic] = score
        
        del d2
        
        d = ' '.join(t for t in d)
        v = tfidf.transform([d]).toarray()
        samples[x] = numpy.concatenate((v[0], tm_features), axis=0)
        del v
        del tm_features
        #descriptions[x] = numpy.fromstring(d)
        x = x+1
    
    #s = np.array(samples)
    
    print(samples.shape)
    
    nn = NearestNeighbors(algorithm='kd_tree')
    nn.fit(samples)
    
    return nn#, descriptions

#Approach II
#KNN trainer for topic modelling
# numpy array used to store the scores each description has for each topic
def KNN_trainer_topicModelling(lda_model, corpus, no_topics):  
    no_descriptions = len(corpus)
    features = numpy.empty(shape=(no_descriptions,no_topics))
    
    x = 0

    for sentence in corpus:
        #print(sentence)
        numbers = numpy.zeros(no_topics)
        #print(len(numbers))
        values = lda_model[sentence]
        #print(values)
        for tuples in values:
            topic, score = tuples
            numbers[topic] = score
        features[x] = numbers
        x = x + 1

    #features = scale(features, axis = 1)
    nn_topic = NearestNeighbors(algorithm = 'auto',metric = 'cosine')#(metric = 'euclidean')
    nn_topic.fit(features)
    
    return nn_topic


In [13]:
dictionary, corpus = prepare_corpus(l2)

In [18]:
tfidf = trainer(l1)

In [15]:
lda = gensim.models.ldamodel.LdaModel(corpus =corpus, id2word = dictionary, num_topics = 50, random_state = 100, passes = 10)
#compute_perplexity_coherence(l2,lda, corpus, dictionary)

In [45]:
#Running the KNN trainers
del nn
#nn = (new_knnTrainer(tfidf,l2,lda,corpus,50))
#nn = KNN_trainer_topicModelling(lda,corpus,25)

In [18]:
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda, new_corpus, dictionary)
# vis

In [42]:
#lda.print_topics()

In [44]:
#simulate a new text given as input to the KNN model
#text = "A set of triple-in-line package outline styles with through-hole leads in standard form of which each outline style can be described with the same group of data element types"
text = 'A set of bead package outline styles with axial wire leads of wich each outline style can be described with the same group of data element types'
text = bigram[remove_punct_stop(text)]

text_vec = dictionary.doc2bow(lemmatization(text))
sentence = ' '.join(text)
v = tfidf.transform([sentence]).toarray()

#preprocess the new text
processed = numpy.zeros(50)
for tuples in lda[text_vec]:
    topic, score = tuples
    processed[topic] = score

#new = [processed]                             
new = numpy.concatenate((v[0], processed), axis=0)
del v
del processed

top = nn.kneighbors([new], n_neighbors = 20)#, algorithm = 'auto')    

del new

x,pos = top
print(top)
for i in pos[0]:
    print(l1[i])
#    print("l1 version of the sentence: ",l1[i])
    print("************************************************************************")

(array([[0.29300081, 0.65108409, 0.66925296, 0.67425509, 0.67746258,
        0.68158059, 0.79497501, 0.80258987, 0.86435089, 0.89286119,
        0.9006797 , 0.90609412, 0.91167522, 0.91188016, 1.01245985,
        1.04067674, 1.04882952, 1.05689774, 1.05787626, 1.05801902]]), array([[54837, 54835,  1333, 54838, 54831, 12470, 30123, 28537, 40548,
         1325, 54833,  1328, 12468, 12469,  1329, 40734, 59015, 22997,
        61143, 36357]], dtype=int64))
set_bead package_outline styles_axial wire_leads outline_style described_group data_element types
************************************************************************
set_cylindrical package_outline styles_axial wire_leads outline_style described_group data_element types
************************************************************************
set_bead package_outline styles_straight wire_leads outline_style described_group data_element types
************************************************************************
set_rectangular packa

In [36]:
# # return a scipy sparse matrix representing the tfidf weight for each term of each document
# # have to convert into corpus to be loaded for lda model

# new_l2 = []
# for x in l2:
#     sentence = ' '.join(x)
#     new_l2.append(sentence)


# tfidf = trainer(new_l2)
# inverse =tfidf.fit_transform(new_l2)
# #print(inverse)
# new_corpus = []
# for row in inverse:
#     zipped = zip(row.indices, row.data)
#     new_corpus.append(list(zipped))