In [None]:
import spacy
from gensim.models.word2vec import Word2Vec as W2V
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import string
import random
from time import sleep

import os.path
from gensim import corpora, similarities
from gensim.models import LsiModel, TfidfModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
import warnings

from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from matplotlib.pyplot import figure as fg
from bokeh.models import Label
from bokeh.io import output_notebook
import matplotlib.colors as mcolors
import matplotlib

warnings.simplefilter(action='ignore', category=FutureWarning)



tqdm.pandas()

# Dataset Load

Model Param

In [None]:
# Available document-text and query-text columns; entry 0 of each list is the one used.
df_col_names = ['stem_text', 'doc_text', 'lem_text']
df_query_col_names = ['cl_q', 'stem_q', 'lem_q']
df_col_select, df_query_col_select = df_col_names[0], df_query_col_names[0]

# test_dim is embedded in the ProcDocs/Split_<test_dim>/ directory name.
test_dim = 0.2

print(f"\nTEXT: {df_col_select}\nQUERY: {df_query_col_select}\nTEST_DIM: {test_dim}")

In [None]:
# Locations of the raw inputs: the Lucene retrievals and the dev data CSV.
path = "Docs/"
luc_retr = f"{path}raw_dev_Lucene_retrievals.csv"
g_truth_rank = f"{path}dev_data.csv"

In [None]:
# Pre-split document/query CSVs live in a directory named after the test fraction.
path_cl = f"ProcDocs/Split_{test_dim}/"
docs_test_path = f"{path_cl}docs_test.csv"
docs_train_path = f"{path_cl}docs_train.csv"
queries_test_path = f"{path_cl}queries_test.csv"
queries_train_path = f"{path_cl}queries_train.csv"

In [None]:
# Root directory for the LSI artefacts; the "param_opt" sub-directory below it
# receives the cached t-SNE projections and the per-k models.
model_path = "ProcDocs/LSI/"

In [None]:
# Train/test document splits, read from the CSV paths built above.
docs_train_df = pd.read_csv(docs_train_path)
docs_test_df = pd.read_csv(docs_test_path)

In [None]:
# Train/test query splits, read from the CSV paths built above.
queries_train_df = pd.read_csv(queries_train_path)
queries_test_df = pd.read_csv(queries_test_path)

In [None]:
# Raw Lucene retrievals and the dev data; g_truth_r is the ground-truth table
# that the evaluation functions filter by Query_number.
luc_retr_df = pd.read_csv(luc_retr)
g_truth_r = pd.read_csv(g_truth_rank)

# Training Function

In [None]:
def prepare_corpus(doc_clean):
    """
    Build the term dictionary and the TF-IDF weighted corpus for tokenised documents.

    Input  : doc_clean - list of documents, each one a list of tokens
    Output : (dictionary, corpus_tfidf, tfidf)
             dictionary   - gensim Dictionary mapping every unique term to an id
             corpus_tfidf - bag-of-words corpus re-weighted by the TF-IDF model
             tfidf        - the TfidfModel fitted on the bag-of-words corpus
    """
    # Every unique term of the corpus gets an integer id.
    term_index = corpora.Dictionary(doc_clean)
    # Bag-of-words (document-term) representation of each document.
    bow_corpus = list(map(term_index.doc2bow, doc_clean))
    # Fit TF-IDF on the raw counts and return the re-weighted corpus with the fitted model.
    tfidf_model = TfidfModel(bow_corpus)
    return term_index, tfidf_model[bow_corpus], tfidf_model

In [None]:
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """
    Train one LSI model per topic count and score each with u_mass coherence.

    Input   : dictionary      : Gensim dictionary
              doc_term_matrix : Gensim corpus used to train every model
              doc_clean       : List of tokenised input texts, passed to CoherenceModel
              stop            : Upper bound (exclusive) of the topic counts
              start, step     : First topic count and increment of the sweep
    Output  : model_list       : List of LSI models, one per topic count in range(start, stop, step)
              coherence_values : u_mass coherence of the model at the same position
    """
    models, scores = [], []
    for k in tqdm(range(start, stop, step)):
        lsi = LsiModel(doc_term_matrix, num_topics=k, id2word = dictionary)  # train model
        scorer = CoherenceModel(model=lsi, texts=doc_clean, dictionary=dictionary, coherence='u_mass')
        models.append(lsi)
        scores.append(scorer.get_coherence())
    return models, scores

# Train Model

In [None]:
# Whitespace-tokenise the selected text column of the training documents;
# str() makes non-string cells splittable as well.
clean_text_tok=list(docs_train_df[df_col_select].apply(lambda x: str(x).split()))

In [None]:
# Baseline LSI model with a fixed number of topics.
number_of_topics = 5
# doc_term_matrix is the TF-IDF weighted corpus returned by prepare_corpus, not raw counts.
dictionary,doc_term_matrix, tfidf=prepare_corpus(clean_text_tok)
model = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)

# Query Evaluation

In [None]:
def average_precision(q_num, df_result):
    """
    Precision-based score of a ranked result list for one query.

    Only the first R rows of ``df_result`` are inspected, where R is the number of
    rows in the global ``g_truth_r`` for ``q_num`` with ``label == 1``. The precision
    at every inspected rank that holds a relevant document is averaged over the
    number of hits found.

    Input  : q_num     - value matched against g_truth_r.Query_number
             df_result - ranked DataFrame (best match first) with a 'doc_number' column
    Output : 0 when none of the inspected rows is relevant, otherwise the mean of
             the precision values taken at the ranks of the hits
    """
    judged = g_truth_r.loc[(g_truth_r.Query_number == q_num), ['Query_number', 'doc_number', 'label']]
    relevant = judged[judged.label == 1]
    n_tot_corr = len(relevant)
    relevant_ids = set(relevant['doc_number'].tolist())

    # Slicing (instead of indexing rank by rank) stays safe when fewer than R
    # documents were retrieved.
    top_docs = df_result['doc_number'].iloc[:n_tot_corr]

    hits = 0
    precisions = []
    # Ranks are 1-based and include rank R itself. The previous range(1, R) never
    # scored the last inspected row, so a query with one relevant document always got 0.
    for rank, doc_id in enumerate(top_docs, start=1):
        if int(doc_id) in relevant_ids:
            hits += 1
            precisions.append(hits / rank)

    if not precisions:
        return 0
    return np.sum(precisions) / len(precisions)


In [None]:
def recall_at(q_num, df_result, rec_at=20):
    """
    Share of the first k ranks where the result list matches the ground-truth list.

    k is ``min(rec_at, number of g_truth_r rows for q_num)``. Rank i of ``df_result``
    counts as a hit when its doc_number equals the doc_number of the i-th
    ground-truth row for the query.

    NOTE(review): this is a position-by-position comparison, not a set-based
    recall (|relevant ∩ top-k| / |relevant|), and the ground truth is not filtered
    by label — confirm that this is the intended metric.

    Input  : q_num     - value matched against g_truth_r.Query_number
             df_result - ranked DataFrame (best match first) with a 'doc_number' column
             rec_at    - cut-off rank
    Output : 0 when no position matches, otherwise matches / k
    """
    comparison = g_truth_r.loc[(g_truth_r.Query_number == q_num), ['Query_number', 'doc_number']].reset_index(drop=True)
    n_tot_corr = min(rec_at, len(comparison))

    # Slices are zipped so that all k ranks are compared (the previous
    # range(1, k) skipped the last one) and a short result list cannot raise.
    retrieved = df_result['doc_number'].iloc[:n_tot_corr]
    expected = comparison['doc_number'].iloc[:n_tot_corr]
    score = [1 if int(found) == int(truth) else 0 for found, truth in zip(retrieved, expected)]

    if np.sum(score) == 0:
        return 0
    return np.sum(score)/n_tot_corr

In [None]:
def query_docs_ranker(q_num, model, rec_at = [10]):
    """
    Rank the judged test documents of one query in LSI space and score the ranking.

    Reads the notebook globals g_truth_r, queries_test_df, docs_test_df,
    df_col_select, df_query_col_select, dictionary and tfidf.

    Input  : q_num  - query number looked up in queries_test_df and g_truth_r
             model  - LSI model trained with the global `dictionary` / `tfidf`
             rec_at - cut-off ranks handed to recall_at, one recall value per entry
    Output : list with one recall_at value per entry of rec_at (same order),
             followed by the average_precision of the ranking
    """
    # Documents judged for the selected query.
    s_q_true = g_truth_r.loc[g_truth_r.Query_number == q_num, ['Query_number', 'doc_number']].reset_index(drop=True)
    # Text of the selected query.
    s_query = queries_test_df.loc[queries_test_df.Query_number == q_num].reset_index(drop=True)

    # Candidate documents: the judged documents that are part of the test split.
    used_docs_query = docs_test_df[docs_test_df.doc_number.isin(list(s_q_true.doc_number))].reset_index(drop=True)
    doc_num = list(used_docs_query['doc_number'])
    used_docs_tokens = used_docs_query[df_col_select].apply(lambda x: str(x).split())

    # The candidates must be vectorised with the SAME dictionary and TF-IDF model the
    # LSI model was trained with. Fitting a fresh dictionary on them (as before)
    # assigns unrelated token ids, so the projection into LSI space is meaningless.
    used_docs_bow = [dictionary.doc2bow(tokens) for tokens in used_docs_tokens]
    used_docs_matr = tfidf[used_docs_bow]

    # Convert the query to LSI space.
    doc = s_query[df_query_col_select].values[0]
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_lsi = model[tfidf[vec_bow]]

    # num_features is given explicitly so the index width does not depend on which
    # topic ids happen to occur in the candidate documents.
    index = similarities.MatrixSimilarity(model[used_docs_matr], num_features=model.num_topics)

    # Similarity of the query against every candidate, best match first.
    sims = index[vec_lsi]
    ranked = sorted(enumerate(sims), key=lambda item: -item[1])

    # Build the frame in one go; DataFrame.append per row is quadratic and no
    # longer exists in pandas 2.
    df_result = pd.DataFrame(
        [{'doc_number': int(doc_num[doc_position]), 'sim': doc_score} for doc_position, doc_score in ranked],
        columns=['doc_number', 'sim'],
    )

    # One recall per requested cut-off; the default rec_at=[10] used to fail on rec_at[1].
    recalls = [recall_at(q_num, df_result, rec_at=cutoff) for cutoff in rec_at]
    ai_OR = average_precision(q_num, df_result)

    return recalls + [ai_OR]

In [None]:
# Score every test query; each entry of tmp is [rec@5, rec@10, rec@20, avg precision].
tmp = queries_test_df.Query_number.progress_apply(lambda x: query_docs_ranker(x, model, rec_at=[5,10,20])).to_list()
score_df = pd.DataFrame(tmp)

# Report the mean of every metric column as a percentage.
for metric_pos, metric_name in enumerate(("rec@5", "rec@10", "rec@20", "avg_p")):
    metric_mean = np.mean(score_df.iloc[:, metric_pos].to_list()) * 100
    print(metric_name + ": " + str(metric_mean))

# Parameter Optimization

In [None]:
# Same tokenisation as in the "Train Model" section, repeated here —
# presumably so this section can be run without the earlier one.
clean_text_tok=list(docs_train_df[df_col_select].apply(lambda x: str(x).split()))

In [None]:
# Rebuild the dictionary, the TF-IDF corpus and the TF-IDF model from the training tokens.
dictionary,doc_term_matrix, tfidf=prepare_corpus(clean_text_tok)

In [None]:
# Per-k results of the sweep: number of topics -> LSI model / coherence value.
model_list_d = dict()
coherence_values_d = dict()

In [None]:
# Candidate numbers of topics: 3 to 40 inclusive.
k_test = list(range(3, 41))

In [None]:
# Train one LSI model per candidate k and record its coherence.
for x in tqdm(k_test):
    # The range (x, x+1, 1) makes compute_coherence_values evaluate exactly one topic count.
    start,stop,step=x, x+1, 1
    model_list, coherence_values = compute_coherence_values(dictionary, doc_term_matrix,clean_text_tok, stop, start, step)
    # Single-element results: keep the model and its coherence under key k.
    model_list_d[x] = model_list[0]
    coherence_values_d[x] = coherence_values[0]

In [None]:
# Use the 35-topic model from the sweep as the current model.
model = model_list_d[35]

## Verify num of topics with coherence

In [None]:
# Coherence values in the same order as k_test.
ordered_coherence_values_d = [coherence_values_d[k_value] for k_value in k_test]

fg(figsize=(11, 10), dpi=80)
plt.plot(k_test, ordered_coherence_values_d)

# Dashed guide lines and ticks from the first candidate k up to (not including) the last.
topic_ticks = np.arange(k_test[0], k_test[-1], 1)
plt.vlines(topic_ticks, -3, 0, color = 'grey', linestyle = "--")
plt.xticks(topic_ticks)

plt.xlabel("Number of Topics")
plt.ylabel("Coherence score")
plt.show()

## Verify number of topics with k-means

Code Source: https://www.geeksforgeeks.org/elbow-method-for-optimal-value-of-k-in-kmeans/

In [None]:
# One 2-D t-SNE projection per candidate k is appended here by the next cell.
# Re-run this cell before re-running that one, otherwise entries accumulate.
X_list = []

In [None]:
# For every candidate k: project the documents' topic weights to 2-D with t-SNE.
# Projections are appended in k_test order, so position p of X_list belongs to k_test[p].
for key in k_test:
    x = model_list_d[key]
    # Topic weights of every document under the k-topic model.
    topic_weights = []
    for i, row_list in enumerate(x[doc_term_matrix]):
        topic_weights.append([w for i, w in row_list])
    # Array of topic weights (rows with fewer entries are padded with 0)
    arr = pd.DataFrame(topic_weights).fillna(0).values
    # Keep the well separated points (optional)
    arr = arr[np.amax(arr, axis=1) > 0.10]
    # Dominant topic number in each doc
    topic_num = np.argmax(arr, axis=1)
    # tSNE Dimension Reduction
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
    tsne_lda = tsne_model.fit_transform(arr)
    # Re-pack the two t-SNE coordinates as an (n_points, 2) array.
    x1 = np.array(tsne_lda[:,0])
    x2 = np.array(tsne_lda[:,1])
    X = np.array(list(zip(x1, x2))).reshape(len(x1), 2)
    X_list.append(X)
# NOTE: topic_num and tsne_lda remain defined after the loop with the values of the last k.

In [None]:
# Sanity check: should equal len(k_test) when the t-SNE cell ran exactly once.
print(len(X_list))

## Save and load reduced dimension

In [None]:
# Cache the t-SNE projections so the expensive sweep does not have to be repeated.
end_fix = model_path+"param_opt"
# np.save does not create missing directories; make sure the target exists first.
os.makedirs(end_fix, exist_ok=True)
np.save(end_fix+"/red_dim.npy",np.array(X_list, dtype=object))

In [None]:
# Reload the cached t-SNE projections instead of recomputing them.
end_fix = model_path+"param_opt"
# NOTE(review): allow_pickle=True unpickles arbitrary objects — only load files written by this notebook.
X_list = list(np.load(model_path+"param_opt/red_dim.npy", allow_pickle = True))

In [None]:
# Elbow analysis: for every candidate k, cluster that k's t-SNE projection with
# k-means (k clusters) and record distortion and inertia.
distortions = []
inertias = []
mapping1 = {}
mapping2 = {}
K = k_test

# X_list was filled in k_test order, so zip pairs every k with its own projection
# without relying on a hard-coded "k-3" offset.
for k, points in zip(K, X_list):
    # Fit once; a fixed random_state makes the curves reproducible between runs.
    kmeanModel = KMeans(n_clusters=k, random_state=0).fit(points)

    # Distortion: mean Euclidean distance of every point to its closest centroid.
    distortion = sum(np.min(cdist(points, kmeanModel.cluster_centers_,
                                  'euclidean'), axis=1)) / points.shape[0]

    distortions.append(distortion)
    inertias.append(kmeanModel.inertia_)

    mapping1[k] = distortion
    mapping2[k] = kmeanModel.inertia_

In [None]:
fg(figsize=(13, 13), dpi=100)

# Light guide line at every integer k below the largest candidate.
guide_ks = np.arange(0, k_test[-1], 1)
plt.vlines(guide_ks, 0, 23, color = 'grey', linestyle = "--", alpha = 0.3)

# Marked elbows as (k, position in distortions); position = k - 3 because k_test starts at 3.
for elbow_k, elbow_pos in ((11, 8), (16, 13), (35, 32)):
    plt.vlines([elbow_k], 0, distortions[elbow_pos], color = 'red', linestyle = "--")

plt.plot(K, distortions, 'bx-')

# Straight lines through two points of the curve, one line per elbow.
for (k_a, pos_a), (k_b, pos_b), line_color in (
    ((11, 8), (14, 11), 'orange'),
    ((16, 13), (25, 22), 'green'),
    ((35, 32), (40, 37), 'brown'),
):
    plt.subplot().axline((k_a, distortions[pos_a]), (k_b, distortions[pos_b]),
                         color=line_color, label='by points', linestyle = "--")

plt.xticks(guide_ks)
plt.yticks(np.arange(0,23,1))
plt.xlabel('Values of K')
plt.ylabel('Distortion')
plt.title('The Elbow Method using Distortion')
plt.legend(["Distortion", "loc@11", "loc@16", "loc@35"])
plt.show()

In [None]:
# Largest inertia of the sweep, shown to judge the scale of the values.
max(inertias)

In [None]:
# Express the inertias in millions.
# NOTE: this overwrites `inertias`, so running the cell twice divides twice.
inertias = [value / 1000000 for value in inertias]

In [None]:
fg(figsize=(13, 13), dpi=100)

# Light guide line at every integer k below the largest candidate.
guide_ks = np.arange(0, k_test[-1], 1)
plt.vlines(guide_ks, 0, 5, color = 'grey', linestyle = "--", alpha = 0.3)

# Marked elbows as (k, position in inertias); position = k - 3 because k_test starts at 3.
for elbow_k, elbow_pos in ((11, 8), (16, 13), (35, 32)):
    plt.vlines([elbow_k], 0, inertias[elbow_pos], color = 'red', linestyle = "--")

plt.plot(K, inertias, 'bx-')

# Straight lines through two points of the curve, one line per elbow.
for (k_a, pos_a), (k_b, pos_b), line_color in (
    ((11, 8), (14, 11), 'orange'),
    ((16, 13), (25, 22), 'green'),
    ((35, 32), (40, 37), 'brown'),
):
    plt.subplot().axline((k_a, inertias[pos_a]), (k_b, inertias[pos_b]),
                         color=line_color, label='by points', linestyle = "--")

plt.xticks(guide_ks)
plt.yticks(np.arange(0,6,1))
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('The Elbow Method using Inertia')
plt.legend(["Ineritias", "loc@11", "loc@16", "loc@35"])
plt.show()

## Save Models

In [None]:
# Persist every model of the sweep under <model_path>param_opt/LSI_model_<k>/mod.
for x in tqdm(k_test):
    end_fix = model_path+"param_opt"
    end_fix = end_fix+"/LSI_model_"+str(x)
    # Keep the "do not overwrite" behaviour, but decide it on the saved file itself.
    if os.path.exists(end_fix + "/mod"):
        continue
    # makedirs also creates a missing "param_opt" parent. The previous os.mkdir inside
    # a bare except skipped the save silently whenever directory creation failed.
    os.makedirs(end_fix, exist_ok=True)
    model_list_d[x].save(end_fix + "/mod")

Load the models if they were previously saved

In [None]:
# Restore every saved model and recompute its u_mass coherence.
# Requires clean_text_tok and dictionary from the "Parameter Optimization" cells.
for x in tqdm(k_test):
    end_fix = model_path+"param_opt"
    end_fix = end_fix+"/LSI_model_"+str(x)
    model_list_d[x] = LsiModel.load(end_fix + "/mod")
    coherence_values_d[x] = CoherenceModel(model=model_list_d[x], texts=clean_text_tok, dictionary=dictionary, coherence='u_mass').get_coherence()

# Plot

In [None]:
# Use the 11-topic model for the plots below.
model = model_list_d[11]

TSNE Plot from: https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/

In [None]:
# Keep a copy of the current t-SNE result before the next cell overwrites it.
# NOTE(review): tsne_lda / topic_num hold whatever the most recently executed t-SNE
# cell produced; the "_16" suffix suggests a 16-topic run, but nothing here enforces
# that — confirm before relying on these copies.
tsne_lda_16 = tsne_lda
topic_num_16 = topic_num

In [None]:
# Get topic weights
# Topic weights of every document under the selected model
topic_weights = [[w for _, w in row_list] for row_list in model[doc_term_matrix]]

# Dense array of topic weights; rows with fewer entries are padded with 0
arr = pd.DataFrame(topic_weights).fillna(0).values

# Keep the well separated points (optional)
well_separated = np.amax(arr, axis=1) > 0.1
arr = arr[well_separated]

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

In [None]:
import seaborn as sns

In [None]:
# Plot the Topic Clusters using Bokeh
# Plot the topic clusters using Bokeh: one point per document, coloured by dominant topic.
output_notebook()
n_topics = 11
# The plotted model is an LSI model, so the title names LSI.
plot = figure(title="t-SNE Clustering of {} LSI Topics".format(n_topics), plot_width=600, plot_height=600)
# 40 hex colours, indexed by dominant topic number (the CSS4 colour list that was
# built here before was overwritten immediately and never used).
mycolors = np.array(sns.color_palette(n_colors=40).as_hex())
# NOTE: unseeded shuffle — the topic/colour assignment differs between runs.
random.shuffle(mycolors)
plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
show(plot)

In [None]:
# Elbow plot of the k-means inertias (same code as the inertia plot in the k-means section).
fg(figsize=(13, 13), dpi=100)
# Light guide line at every integer k below the largest candidate.
plt.vlines(np.arange(0,k_test[len(k_test)-1],1), 0, 5, color = 'grey', linestyle = "--", alpha = 0.3)
# Marked elbows at k = 11, 16, 35; list position = k - 3 because k_test starts at 3.
plt.vlines([11], 0, inertias[8], color = 'red', linestyle = "--")
plt.vlines([16], 0, inertias[13], color = 'red', linestyle = "--")
plt.vlines([35], 0, inertias[32], color = 'red', linestyle = "--")
plt.plot(K, inertias, 'bx-')
# Straight lines through two points of the curve, one per elbow.
plt.subplot().axline((11, inertias[8]), (14, inertias[11]), color='orange', label='by points', linestyle = "--")
plt.subplot().axline((16, inertias[13]), (25, inertias[22]), color='green', label='by points', linestyle = "--")
plt.subplot().axline((35, inertias[32]), (40, inertias[37]), color='brown', label='by points', linestyle = "--")
plt.xticks(np.arange(0,k_test[len(k_test)-1],1))
plt.yticks(np.arange(0,6,1))
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('The Elbow Method using Inertia')
plt.legend(["Ineritias", "loc@11", "loc@16", "loc@35"])
plt.show()

In [None]:
fg(figsize=(8, 8), dpi=80)
# Hard-coded average-precision percentages, one per model named on the x axis
# (presumably copied from earlier evaluation runs).
avg = [76.37, 76.80, 74.29, 59.30]
x = np.arange(4)
width = 0.5

# One bar per model
plt.bar(x, avg, width)
plt.xticks(x, ['Skip-Gram', 'CBOW', 'CADE-CBOW', 'LSI'])
plt.yticks(np.arange(0,101,10))
plt.ylabel('Avg Precision %')
plt.legend(["Average Precision"])

# Write every value at half the height of its bar.
for i, bar_value in enumerate(avg):
    plt.text(i-0.15, bar_value/2, str(bar_value) + '%', color='orange', fontweight='bold')

In [None]:
fg(figsize=(8, 8), dpi=80)
# Hard-coded recall percentages at cut-offs 5 and 10, one per model named on the x axis
# (presumably copied from earlier evaluation runs).
y5 = [10.1,9.1, 10.4, 3.5]
y10 = [7.90, 6.89, 7.51, 3.9]
x = np.arange(4)
width = 0.40

# Two bars per model, side by side
plt.bar(x-0.2, y5, width)
plt.bar(x+0.2, y10, width)
plt.xticks(x, ['Skip-Gram', 'CBOW', 'CADE-CBOW', 'LSI'])
plt.ylabel('Recall %')
plt.legend(["Rec@5", "Rec@10"])

# Write every value at half the height of its bar.
for i, (rec5_value, rec10_value) in enumerate(zip(y5, y10)):
    plt.text(i-0.31, rec5_value/2, str(rec5_value) + '%', color='orange', fontweight='bold')
    plt.text(i+0.06, rec10_value/2, str(rec10_value) + '%', color='blue', fontweight='bold')