## Syntactic similarity: TF-IDF

In [None]:
import numpy as np 
import scipy 
import sklearn
import re
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import spacy
import nltk
from nltk.corpus import stopwords

In [None]:
# Load the pipe-delimited ticket export. Malformed rows are dropped with a
# warning rather than aborting the load. `on_bad_lines="warn"` is the
# replacement for `error_bad_lines=False`, which pandas 2.0 no longer accepts.
tickets = pd.read_csv("customer_realted_issues_v1.2.csv", on_bad_lines="warn", delimiter="|")

In [None]:
# Preview the first rows of the loaded ticket table.
tickets.head()

In [None]:
# Keep only the columns used below, then narrow the table to defects raised
# against the 'POD Customer Deployment' project.
df = pd.DataFrame(
    tickets,
    columns=['JIRA ID', 'Summary', 'IssueType', 'Project', 'Components'],
)
defect_tickets = df[
    (df['IssueType'] == "Defect") & (df['Project'] == 'POD Customer Deployment')
]
defect_tickets.head()

In [None]:
# # Check for any nulls values
# defect_tickets.isnull().sum()

In [None]:
# Drop every row that contains a null in any of the selected columns.
defect_tickets = defect_tickets.dropna()

In [None]:
# Preview the defect tickets again after the rows with nulls were dropped.
defect_tickets.head()

In [None]:
# Inspect the first few ticket summaries. Iterating over a positional slice
# (instead of indexing rows 0-4 one by one) avoids an IndexError when fewer
# than five tickets are left after filtering.
for i, summary in enumerate(defect_tickets.Summary.iloc[:5]):
    print("Ticket #",i+1)
    print(summary)
    print()

In [None]:
# Lookup table mapping lower-case English contractions (written with a
# straight apostrophe) to their expanded forms. clean_text() replaces any
# whitespace-delimited token that exactly matches one of these keys.
# Source: http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [None]:
def clean_text(text, remove_stopwords = True):
    '''Normalise a piece of text before it is vectorised.

    The text is lower-cased, contractions are expanded, URLs / HTML remnants /
    punctuation are stripped and, optionally, English stop words are removed.

    Args:
        text: Raw string to clean.
        remove_stopwords: When True, drop the NLTK English stop words.

    Returns:
        The cleaned string. Runs of spaces left behind by removed characters
        are only collapsed when ``remove_stopwords`` is True (via split/join).
    '''

    # Convert words to lower case so they can match the lower-case
    # contraction keys.
    text = text.lower()

    # Replace contractions with their longer forms. Only whole
    # whitespace-delimited tokens are matched; this also collapses all
    # whitespace (including newlines) to single spaces.
    text = " ".join(contractions.get(word, word) for word in text.split())

    # Remove URLs. Newlines are already gone at this point, so the pattern
    # stops at the next whitespace; a greedy '.*' would also delete every
    # word that follows the URL.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    # Must run before the punctuation pass below: that pass strips '/', after
    # which '<br />' could never match.
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = " ".join(w for w in text.split() if w not in stops)

    return text

In [None]:
# # Clean the summaries and texts
# for summary in defect_tickets.Summary:
#     defect_tickets['CleanSummary'] = clean_text(summary, remove_stopwords=False)
# print("Summaries are complete.")

def _clean_summary_of(row):
    """Return the cleaned 'Summary' text of one ticket row (stop words kept)."""
    return clean_text(row['Summary'], remove_stopwords=False)


# Add a 'CleanSummary' column computed row by row from 'Summary'.
defect_tickets['CleanSummary'] = defect_tickets.apply(_clean_summary_of, axis=1)

In [None]:
# Preview the table now that it carries the 'CleanSummary' column.
defect_tickets.head()

In [None]:
# Inspect the first few cleaned summaries to check the cleaning worked.
# Iterating over a positional slice avoids an IndexError when fewer than five
# tickets are available.
for i, clean_summary in enumerate(defect_tickets.CleanSummary.iloc[:5]):
    print("Clean Review #",i+1)
    print(clean_summary)
    print()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectoriser with scikit-learn's default settings.
vectorizer = CountVectorizer()

# Learn the vocabulary from the cleaned summaries and build the document-term
# count matrix; astype('U') coerces every value to a unicode string first.
X = vectorizer.fit_transform(list(defect_tickets['CleanSummary'].values.astype('U')))
# print(X.shape)
# to print words in vocabulary
# print(vectorizer.get_feature_names())

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit a transformer with use_idf=False (no inverse-document-frequency
# weighting) on the count matrix and apply it to the same matrix.
tf_transfomer = TfidfTransformer(use_idf=False).fit(X)
X_tf = tf_transfomer.transform(X)
# X_tf.shape

In [None]:
# Fit a TF-IDF transformer (default settings) on the count matrix. X_tfidf is
# the matrix the cosine-similarity lookups below compare against.
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
def get_similar_issues(input_issue):
    """Print the stored tickets most similar to ``input_issue``.

    Similarity is the cosine similarity between TF-IDF vectors. The function
    relies on the notebook-level ``vectorizer``, ``tfidf_transformer``,
    ``X_tfidf`` and ``defect_tickets`` objects built in earlier cells.

    Args:
        input_issue: Issue text to look up.

    Returns:
        None. The matches are printed, most similar first, each followed by
        its similarity score.
    """
    # Project the query into the same TF-IDF space as the corpus; comparing
    # raw counts against TF-IDF rows would leave the query without the IDF
    # weighting.
    new_x = tfidf_transformer.transform(vectorizer.transform([input_issue]))

    cosine_similarity_predicted = cosine_similarity(new_x, X_tfidf)[0]
    # argsort is ascending, so the tail holds the best matches. The single
    # highest-scoring row is left out, as in the original slice — presumably
    # because it is the query ticket itself when the input comes from the
    # corpus (TODO confirm).
    related_docs_indices_predicted = cosine_similarity_predicted.argsort()[-5:-1]

    print("\nInput issue: " + input_issue + "\n")
    print("Similar Tickets:")
    # Walk the selection backwards so the most similar ticket is printed first.
    for doc_index in related_docs_indices_predicted[::-1]:
        print(defect_tickets.CleanSummary.iloc[doc_index], cosine_similarity_predicted[doc_index])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Ad-hoc check of the similarity lookup for one ticket summary.
# NOTE(review): the query is only count-vectorised (no TF-IDF transform) while
# X_tfidf is TF-IDF weighted — confirm this is intended.
new_x = vectorizer.transform(["[PECloud]Accolade - WWE - Multiple agents report getting kicked out of Genesys and some calls dropped on 03/26/2018"])
    
cosine_similarity_predicted = cosine_similarity(new_x, X_tfidf)
# Row positions ranked 2nd to 5th by similarity; the slice excludes the top one.
print(cosine_similarity_predicted[0].argsort()[-5:-1])

# Hard-coded row position to inspect — presumably copied from the output
# printed above (TODO confirm it still matches the current data).
index = 3451
print(defect_tickets.CleanSummary.iloc[index])
print(defect_tickets.Summary.iloc[index])
print(defect_tickets.Components.iloc[index])
print(defect_tickets.Project.iloc[index])
print(defect_tickets['JIRA ID'].iloc[index])

In [None]:
import pandas as pnd

# Scratch cell: build a frame indexed by the business days in the ten days
# following 2013-01-01 16:00 (time of day kept because normalize=False), with
# a single column 'a' filled with 6.
# NOTE: this rebinds the notebook-level name `df`.
d = pnd.Timestamp('2013-01-01 16:00')
dates = pnd.bdate_range(
    start=d,
    end=d + pnd.DateOffset(days=10),
    normalize=False,
)

df = pnd.DataFrame({'a': 6}, index=dates)
print(df)

In [None]:
# Placeholder: fill with the issue strings to look up.
test_issues = [] ## for test issues 

# Print the similar tickets for every issue in the list (nothing happens
# while the list is empty).
for test_issue in test_issues:
    get_similar_issues(test_issue)

## BEC Semantic similarity: WMD using the PuLP Python module

In [None]:
from itertools import product
from collections import defaultdict

import numpy as np
from scipy.spatial.distance import euclidean
import pulp
import gensim

# Original research paper link for word mover distance http://proceedings.mlr.press/v37/kusnerb15.pdf

In [None]:
def tokens_to_fracdict(tokens):
    """Map each distinct token to its share of the total token count.

    Args:
        tokens: Iterable of hashable tokens.

    Returns:
        A dict ``{token: count / total}`` in first-occurrence order; an empty
        dict when ``tokens`` is empty.
    """
    tally = {}
    for tok in tokens:
        tally[tok] = tally.get(tok, 0) + 1

    n_tokens = sum(tally.values())

    fractions = {}
    for tok, occurrences in tally.items():
        fractions[tok] = float(occurrences) / n_tokens
    return fractions

In [None]:
def word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel, lpFile=None):
    """Build and solve the Word Mover's Distance linear program for two token lists.

    Args:
        first_sent_tokens: List of tokens of the first sentence.
        second_sent_tokens: List of tokens of the second sentence.
        wvmodel: Object indexable by token (``wvmodel[token]``) that yields
            the token's embedding vector.
        lpFile: Optional path; when given, the LP is written there before
            solving.

    Returns:
        The solved ``pulp.LpProblem``; its objective is the total transport
        cost (flow times Euclidean distance between embeddings).
    """
    vocab = list(set(first_sent_tokens + second_sent_tokens))
    embeddings = {}
    for tok in vocab:
        embeddings[tok] = wvmodel[tok]

    # Normalised bag-of-words weight of every token in each sentence.
    src_mass = tokens_to_fracdict(first_sent_tokens)
    dst_mass = tokens_to_fracdict(second_sent_tokens)

    # One non-negative flow variable per ordered pair of vocabulary tokens.
    token_pairs = list(product(vocab, vocab))
    flow = pulp.LpVariable.dicts('T_matrix', token_pairs, lowBound=0)

    problem = pulp.LpProblem('WMD', sense=pulp.LpMinimize)
    transport_cost = [
        flow[src, dst] * euclidean(embeddings[src], embeddings[dst])
        for src, dst in token_pairs
    ]
    problem += pulp.lpSum(transport_cost)

    # Flow arriving at each second-sentence token must equal its weight ...
    for dst, mass in dst_mass.items():
        problem += pulp.lpSum([flow[src, dst] for src in src_mass]) == mass
    # ... and flow leaving each first-sentence token must equal its weight.
    for src, mass in src_mass.items():
        problem += pulp.lpSum([flow[src, dst] for dst in dst_mass]) == mass

    if lpFile is not None:
        problem.writeLP(lpFile)

    problem.solve()
    return problem

In [None]:
def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, lpFile=None):
    """Return the objective value of the solved WMD problem for two token lists.

    Arguments are forwarded unchanged to ``word_mover_distance_probspec``.
    """
    solved = word_mover_distance_probspec(
        first_sent_tokens,
        second_sent_tokens,
        wvmodel,
        lpFile=lpFile,
    )
    objective = solved.objective
    return pulp.value(objective)

In [None]:
#wvmodel = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)

In [None]:
# prob = word_mover_distance_probspec(['Obama', 'speaks', 'Chicago'], ['President', 'addresses', 'media', 'Illinois'], wvmodel)
# print(pulp.value(prob.objective))
# for v in prob.variables():
#     if v.varValue!=0:
#         print(v.name, '=', v.varValue)

In [None]:
def load_word2vec(file):
    """Load space-separated text embeddings into a ``{word: vector}`` dict.

    Each usable line is ``word v1 v2 ...``. Lines that split into fewer than
    10 items (e.g. a header line) are skipped.

    Args:
        file: Path of the text embedding file.

    Returns:
        Dict mapping each word to a 1-D ``numpy`` array of floats.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    word2vec = {}
    # The context manager closes the handle even if parsing fails.
    with open(file) as fin:
        for line in fin:
            items = line.replace('\r', '').replace('\n', '').split(' ')
            if len(items) < 10:
                continue
            word = items[0]
            # Skip only the empty strings produced by repeated or trailing
            # spaces; a length test would also drop valid one-character
            # components such as "0".
            word2vec[word] = np.array([float(i) for i in items[1:] if i])
    return word2vec

In [None]:
from glove import Corpus, Glove
import pickle

In [None]:
# Candidate model files:
# ./glove_embedding_300_all_summary_2.model
# ./glove_embedding_400_window_10.model

# Load the saved GloVe model from disk.
glove = Glove.load('./glove_embedding_400_window_10.model')

In [None]:
# ./gloVe_embedding_300_all_summary_2.pickle
# ./gloVe_embedding_400_windows_10.pickle
dictionary_file = "./gloVe_embedding_400_windows_10.pickle"

# Load the pickled dictionary; the context manager closes the file handle
# once it has been read.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open(dictionary_file, 'rb') as dictionary_fh:
    dictionary = pickle.load(dictionary_fh)

In [None]:
# Attach the unpickled dictionary to the loaded GloVe model.
glove.add_dictionary(dictionary)

In [None]:
# Sanity check: `dictionary` maps a word to its row in glove.word_vectors.
glove.word_vectors[dictionary['sip']]

In [None]:
import numpy as np

In [None]:
# Fallback embedding shared by every out-of-vocabulary word. It is a 1-D
# vector with the same width as the loaded GloVe vectors, so it can be
# compared with them via scipy's euclidean(). A hard-coded (1, 100) array is
# 2-D and only fits a model that happens to use 100 components.
# NOTE: unseeded, so distances involving unknown words change between sessions.
unknown_vector = np.random.rand(glove.word_vectors.shape[1])

In [None]:
# Display the fallback vector used for out-of-vocabulary words.
unknown_vector

In [None]:
def word_vector(word):
    """Return the GloVe vector for ``word``.

    Falls back to the shared ``unknown_vector`` when the word is not a key of
    ``dictionary``.
    """
    if word not in dictionary:
        return unknown_vector
    row = dictionary[word]
    return glove.word_vectors[row]

In [None]:
# Sanity check of the lookup helper on the token 'sip'.
word_vector('sip')

In [None]:
def word_mover_distance_probspec_glove(first_sent_tokens, second_sent_tokens, word_vector, lpFile=None):
    """Build and solve the Word Mover's Distance LP using a vector-lookup callable.

    Args:
        first_sent_tokens: List of tokens of the first sentence.
        second_sent_tokens: List of tokens of the second sentence.
        word_vector: Callable that takes a lower-cased token and returns its
            embedding vector.
        lpFile: Optional path; when given, the LP is written there before
            solving.

    Returns:
        The solved ``pulp.LpProblem``; its objective is the total transport
        cost (flow times Euclidean distance between embeddings).
    """
    all_tokens = list(set(first_sent_tokens+second_sent_tokens))
    # Flatten every vector: scipy's euclidean() expects 1-D input, and the
    # lookup may hand back a (1, N) array for unknown words.
    wordvecs = {token: np.ravel(word_vector(token.lower())) for token in all_tokens}

    # Normalised bag-of-words weight of every token in each sentence.
    first_sent_buckets = tokens_to_fracdict(first_sent_tokens)
    second_sent_buckets = tokens_to_fracdict(second_sent_tokens)

    # One non-negative flow variable per ordered pair of tokens.
    token_pairs = list(product(all_tokens, all_tokens))
    T = pulp.LpVariable.dicts('T_matrix', token_pairs, lowBound=0)

    prob = pulp.LpProblem('WMD', sense=pulp.LpMinimize)
    prob += pulp.lpSum([T[token1, token2]*euclidean(wordvecs[token1], wordvecs[token2])
                        for token1, token2 in token_pairs])
    # Flow arriving at each second-sentence token must equal its weight ...
    for token2 in second_sent_buckets:
        prob += pulp.lpSum([T[token1, token2] for token1 in first_sent_buckets])==second_sent_buckets[token2]
    # ... and flow leaving each first-sentence token must equal its weight.
    for token1 in first_sent_buckets:
        prob += pulp.lpSum([T[token1, token2] for token2 in second_sent_buckets])==first_sent_buckets[token1]

    if lpFile is not None:
        prob.writeLP(lpFile)

    prob.solve()

    return prob

In [None]:
# Smoke test on two short token lists: solve the WMD problem, print the
# objective value, then print every variable whose value is non-zero.
prob = word_mover_distance_probspec_glove(['unable', 'login'], ['cannot', 'login'], word_vector)
print(pulp.value(prob.objective))
for v in prob.variables():
    if v.varValue!=0:
        print(v.name, '=', v.varValue)

In [None]:
# Placeholder: fill with the raw issue strings to evaluate.
test_issues = [] ### test issues should be passed as a list of issue strings

In [None]:
# Clean each test issue once (stop words kept), echo it, and collect it.
cleaned_issues = []
for issue in test_issues:
    cleaned_issue = clean_text(issue, remove_stopwords=False)
    print(cleaned_issue)
    cleaned_issues.append(cleaned_issue)

In [None]:
def iterator(cleaned_issues, filename):
    """Score every cleaned issue against every stored summary with WMD.

    Each comparison is also written as one pipe-delimited row to ``filename``.

    Args:
        cleaned_issues: Iterable of already-cleaned issue strings.
        filename: Path of the report file; it is overwritten.

    Returns:
        A list with one ``(issue_tokens, matches)`` tuple per input issue,
        where ``matches`` is a list of ``(distance, summary_tokens)`` tuples
        sorted by ascending distance.

    Relies on the notebook-level ``clean_summaries`` sequence, which is not
    defined in the visible cells — presumably the cleaned ticket summaries
    (TODO confirm).
    """
    result = []
    # The context manager closes the report even if a comparison raises.
    with open(filename, 'w') as fh:
        # Header row, terminated so the first data row starts on its own line.
        fh.write("Input Issue|Similar Issue|Dismilarity Score\n")
        for issue in cleaned_issues:
            left_summary = issue.split()
            print(left_summary)
            result_summaries = []
            # An empty token list has no weight to transport, which makes the
            # LP infeasible, so such issues and summaries are not scored.
            if left_summary:
                # No index-based self-comparison guard here: the input issues
                # are a separate list, so their positions have no relation to
                # positions in clean_summaries.
                for summary in clean_summaries:
                    right_summary = summary.split()
                    if not right_summary:
                        continue
                    prob = word_mover_distance_probspec_glove(left_summary, right_summary, word_vector)
                    distance = pulp.value(prob.objective)
                    result_summaries.append((distance, right_summary))
                    fh.write(str(issue) + "|" + ' '.join(right_summary) + "|" + str(distance) + "\n")
            result_summaries.sort(key=lambda element: element[0])
            result.append((left_summary, result_summaries))
    return result

In [None]:
# Score the cleaned test issues, write the report file, and print the
# in-memory result.
result = iterator(cleaned_issues, 'semantic_wmd_test_run_3_with_new_embedding.csv')
print(result)