# **Classification using Unified Similarity Measure**

In [1]:
import pandas as pd
import nltk
import gensim
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from gensim.parsing.preprocessing import STOPWORDS
import numpy as np
from gensim import corpora,models
import time
import pickle
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.metrics.pairwise import cosine_similarity
import warnings
from gensim.models import Word2Vec,FastText
from sklearn.decomposition import PCA
from scipy.spatial import distance
warnings.filterwarnings("ignore", category=DeprecationWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abdul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abdul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# !pip install glove_python

In [3]:
from glove import Glove, Corpus

In [4]:
#Mounting google drive
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
# open a file, where you stored the pickled data
f = open('CSV/bow_corpus.pickle', 'rb')
bow_corpus=pickle.load(f)

file = open('CSV/dictionary.pickle', 'rb')
dictionary=pickle.load(file)

# later on, load trained model from file
lda_model =  models.LdaModel.load('CSV/lda_model.model')

In [6]:
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 5:
            result.append(lemmatize(token))
    return result

In [7]:

# importing all the clusters created using LDA based topic modeling
for c in range(10):
  exec('topic_{} = pd.read_csv("CSV/topic_{}.csv")'.format(c,c))
  exec("topic_{}= topic_{}.drop(columns=['Unnamed: 0'])".format(c,c))
  exec("topic_{}['Description'] = topic_{}['Description'].map(preprocess)".format(c,c))

In [8]:
#import the duplicate reports for testing purpose
test = pd.read_csv('CSV/duplicate_reports.csv')
test = test.drop(columns=['Unnamed: 0'])

In [9]:
test['Description']= test['Description'].map(preprocess)

In [10]:
for mod in range(10):
  #import all the trained Word2Vec models
  exec('w2vmodel{} = Word2Vec.load("CSV/word2vec{}.model")'.format(mod, mod))

  #import all the trained FastText models
  exec('ftmodel{} = FastText.load("CSV/ftmodel{}.model")'.format(mod, mod))

  #import all the trained GloVe models
  exec('glove{} = Glove.load("CSV/glove{}.model")'.format(mod, mod))

In [11]:
warnings.filterwarnings("ignore", category=DeprecationWarning)

### **Selection of Top-n clusters**

In [12]:
#This will return the index of cluster in which the master report of duplicate report may reside
def sim_with_clusters_lda_topn(DR, n):
    vec_bow = dictionary.doc2bow(DR)
    x= lda_model[vec_bow]
    topic = np.asarray(x)
    # max_sim = int(topic[np.argmax(topic[:,1]),0]) 
    # max_sim
    sim=[]
    x= topic[np.argsort(topic[:,1])[-n:][::-1],0]
    for i in range(len(x)):
        sim.append(int(x[i]))
    # return max_sim
    return sim

In [13]:
# Convert multiple word embeddings into a single document vector by averaging the word embeddings by GloVe model

def average_word_vectors_glove(words, model, vocabulary, num_features):  
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.  

    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.word_vectors[model.dictionary[word]])

    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector
    


def averaged_word_vectorizer_glove(corpus, model, num_features):
    vocabulary = set(model.dictionary)
    if(any(isinstance(i, list) for i in corpus)):
      features = [average_word_vectors_glove(tokenized_sentence, model, vocabulary, num_features)
                      for tokenized_sentence in corpus]
      return np.array(features)
    else:
      features = average_word_vectors_glove(corpus, model, vocabulary, num_features)
      return np.array(features)


In [14]:
# Convert multiple word embeddings into a single document vector by averaging the word embeddings by FastText or Word2Vec model

def average_word_vectors_w2v(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector
    
def averaged_word_vectorizer_w2v(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    if(any(isinstance(i, list) for i in corpus)):
      features = [average_word_vectors_w2v(tokenized_sentence, model, vocabulary, num_features)
                      for tokenized_sentence in corpus]
      return np.array(features)
    else:
      features = average_word_vectors_w2v(corpus, model, vocabulary, num_features)
      return np.array(features)



### **Unified Similarity Measure**

In [15]:
# Similarity between two feature vectors using the average of cosine similarity and euclidean similarity
def sim(vec1, vec2): 
  sim1 = 1/(1+distance.euclidean(vec1, vec2))
  sim2 = cosine_similarity(vec1, vec2)
  sim=(sim1+sim2)/2 
  return sim

### **Multimodality Fusions**

In [16]:
# Different kinds of fusion of two master report feature vectors and two duplicate report feature vectors
def fusion(vec1, vec2, vec3, vec4, fusion_no):

  # fusion_no = 1 : concatenation of the vectors
  if (fusion_no == '1'):
    master = np.concatenate((vec1, vec2), axis=1)
    vec_duplicate = np.concatenate((vec3, vec4), axis=0)
    vec_duplicate=[vec_duplicate]
    return vec_duplicate, master

  #fusion_no = 2 : average of the vectors
  elif (fusion_no == '2'):
    vec3 = [vec3]
    vec4 = [vec4]
    avg1 = (np.add(vec1, vec2))/2
    avg2 = (np.add(vec3, vec4))/2
    return avg2, avg1

  #fusion_no = 3 : Dimensionality reduction using PCA on concatenation of the vectors
  elif (fusion_no == '3'):
    master = np.concatenate((vec1, vec2), axis=1)
    pca = PCA(n_components=100)
    avg_fit = pca.fit(master)
    master = pca.transform(master)
    vec_duplicate = np.concatenate((vec3, vec4), axis=0)
    vec_duplicate=[vec_duplicate]
    vec_duplicate = pca.transform(vec_duplicate)
    return vec_duplicate, master

  #fusion_no = 3 : Dimensionality reduction using PCA on average of the vectors
  elif (fusion_no == '4'):
    vec3 = [vec3]
    vec4 = [vec4]
    avg1 = (np.add(vec1, vec2))/2
    pca = PCA(n_components=100)
    avg_fit = pca.fit(avg1)
    master = pca.transform(avg1)
    avg2 = (np.add(vec3, vec4))/2
    vec_duplicate = pca.transform(avg2)
    return vec_duplicate, master

  else:
    raise ValueError('Invalid value for fusion')



### **Creation of Feature Vectors using Multimodality and Single modality Feature Extraction**

In [17]:
# creation of feature vectors by multimodality feature extraction
def feature_vectors_multi_modality(DR, corpus, model1, model2, fusion_no):
  master_ft1 = averaged_word_vectorizer_w2v(corpus=sent, model=model1, num_features=100)
  master_glove2 = averaged_word_vectorizer_glove(corpus=sent, model=model2, num_features=100)

  vec_duplicate1 = averaged_word_vectorizer_w2v(corpus=DR, model=model1, num_features=100)
  vec_duplicate2 = averaged_word_vectorizer_glove(corpus=DR, model=model2, num_features=100)

  #for fusion 1 and fusion 3 :
  # vec_duplicate, master= fusion_3(master_ft1, master_glove2, vec_duplicate1, vec_duplicate2)

  #for this for fusion 2 and 4:
  vec_duplicate , master= fusion(master_ft1, master_glove2, vec_duplicate1, vec_duplicate2, fusion_no)

  return vec_duplicate,master

In [18]:
# creation of feature vectors by singlemodality feature extraction
def feature_vectors_single_modality(DR, corpus, model1):
  master = averaged_word_vectorizer_w2v(corpus=sent, model=model1, num_features=100)

  vec_duplicate = averaged_word_vectorizer_w2v(corpus=DR, model=model1, num_features=100)

  vec_duplicate = [vec_duplicate]

  return vec_duplicate, master

### **Top-N Recommendations**

In [19]:
# Returns Top-N master reports

def compare_topn(model1, model2, cluster, sent, DR, topn, modal, fusion_no):
  similarity=[]

  if (modal == 'multi'):
  #create feature vectors for duplicate and master reports using multimodality
    vec_duplicate, master= feature_vectors_multi_modality(DR, sent, model1, model2, fusion_no)

  # #create feature vectors for duplicate and master reports using single modality
  elif (modal == 'single'):
    vec_duplicate, master= feature_vectors_single_modality(DR, sent, model1)

  else:
    raise ValueError('Invalid Modality entered')

  for doc in range(len(master)):
    vec_master = master[doc]
    vec_master= [vec_master]
    unified_sim = sim(vec_duplicate, vec_master)

    similarity.append(unified_sim)
  similarity = np.asarray(similarity)
  similarity= np.concatenate(similarity, axis=0 )
  similarity= np.concatenate(similarity, axis=0 )
  max_similar_reports=similarity.argsort()[-topn:][::-1]
  # # # for d,f in enumerate(max_similar_reports):
  # # #     similar_reports= similar_reports.append(cluster.loc[[f]])
  return(max_similar_reports)

### **Evalation of the Approach using Recall Rate**

In [20]:
# Recall Rate for Top-2.5K reports (Because Top-N where N = n * topn (2.5K = 3*833)) 
vec_acc=[]
t1 = time.time()
no_of_test_samples= int(200)
for i in range(no_of_test_samples):
  sample = test.Description[i] #The test sample (duplicate report)
  n = 3
  max_cluster =sim_with_clusters_lda_topn(sample, n)
  v=[]
  print(i)
  for max in max_cluster:
    exec('cluster = topic_{}'.format(max))              #The predicted cluster
    exec('model1 = ftmodel{}'.format(max))              #The trained FastText model for the predicted cluster   (can be changed to other model as well viz. glove or word2vec)
    exec('model2 = glove{}'.format(max))                #The trained Word2Vec model for the predicted cluster   (Doesn't count if using single modality)
    exec('sent = topic_{}.Description'.format(max))     #The vocabulary for the predicted cluster
    topn = 833                                          #The number of predicted master report for single predicted cluster
    fusion_no = '4'   #Doesn't count if single modality #The selection of fusion used to fuse the word embeddings of two different models  (4 gives the best results)
    modal = 'multi'                                    #Whether you want to use single feature extraction model or multi model ( for single, it'll consider just model1)
     #This will return the Top-N predicted master reports
    max_sim = compare_topn(model1, model2, cluster, sent, sample, topn, modal, fusion_no)
    t2 = time.time()

    #Comparing the predicted value to the ground truth
    for num in max_sim:
      if (cluster.Issue_id[num] == test.Duplicated_issue[i]):
          v.append("1")
      else:
          v.append("0")
  
  if(all(x==v[0] for x in v)):
    vec_acc.append("0")
  else:
    vec_acc.append("1")

#Evaluating the performance by Recall Rate
sum = 0
for i,num in enumerate(vec_acc):
    sum = sum + int(num)
recall_rate = (sum/len(vec_acc))*100
print("Recall Rate : {} %".format(recall_rate))
print("Time : ", (t2-t1)/60, "min")

0


TypeError: 'FastText' object is not subscriptable