In [1]:
# GloVe embedding: imports and logging setup
import warnings
# Ignore UserWarnings attributed to gensim modules. This is registered before
# `import gensim` below, so the filter is already active when gensim is loaded.
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from glove import Corpus, Glove
import itertools
import gensim 
import logging
import pandas as pd

# Root logger at INFO level; each line is "<time> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Load the ticket export from CSV and keep only the rows whose issue type
# is "Defect"; long text cells are truncated to 50 characters when displayed.
pd.set_option("display.max_colwidth", 50)
df = pd.read_csv("all_tickets.csv")
df = df[df['issueType'].eq("Defect")]
df.head(5)

Unnamed: 0,jiraId,summary,description,components,issueType,project,projectCategory
68,WRTCMS-111,Investigate Packet Loss with Opus Codec,See linked issue for details. Michail Epikt...,WebRTC MS,Defect,WebRTC Media Service,R&D - Customer Engagement - Inbound
72,WRTCMS-107,PSTN call dropped in staging tenant 21-56,I am having a problem with PSTN call (dialing ...,WebRTC MS,Defect,WebRTC Media Service,R&D - Customer Engagement - Inbound
75,WRTCMS-104,Tenant2063:Pstn call to RP->PlayTreatment->Ans...,Pstn call from 16504661008 to RP (15714492709)...,WebRTC MS,Defect,WebRTC Media Service,R&D - Customer Engagement - Inbound
76,WRTCMS-103,Tenant2063:No media stream on outbound call to...,3010 3pcc make call to +16505346093. Call answ...,WebRTC MS,Defect,WebRTC Media Service,R&D - Customer Engagement - Inbound
79,WRTCMS-100,When record started -call members can not hear...,record=true is set on Extension 3011 3010 rtc...,WebRTC MS,Defect,WebRTC Media Service,R&D - Customer Engagement - Inbound


In [3]:
# Number of missing values per column of the defect tickets.
df.isna().sum()

jiraId                 0
summary                0
description        16289
components         12728
issueType              0
project                0
projectCategory        0
dtype: int64

In [4]:
# Lookup table mapping an English contraction to its expanded form.
# Source: http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# All keys are lower-case and use the plain ASCII apostrophe, so a word has to
# be lower-cased before it is looked up here.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
}

In [5]:
def clean_text(text, remove_stopwords = True):
    '''Normalise one piece of ticket text before it is tokenised for embedding training.

    Steps, in order: lower-case, expand contractions via the module-level
    ``contractions`` table, strip URLs and HTML remnants, replace punctuation
    with spaces and, optionally, drop English stop words.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the NaN pandas uses for a
        missing cell) is printed for diagnosis and treated as empty.
    remove_stopwords : bool, default True
        When true, words found in NLTK's English stop-word list are removed.

    Returns
    -------
    str
        The cleaned text; ``""`` for non-string input. Runs of spaces left
        behind by removed punctuation are only collapsed when
        ``remove_stopwords`` is true (because of the split/join).
    '''
    # Imported here so the function does not depend on which notebook cells
    # have already been executed.
    import re

    if not isinstance(text, str):
        # Keep the diagnostic print, but return an empty string instead of
        # None so that a later str(...) cannot turn it into the word "none".
        print(text)
        return ""

    text = text.lower()

    # Replace contractions with their longer forms
    text = " ".join(contractions.get(word, word) for word in text.split())

    # Remove URLs. The text is a single line at this point, so only the URL
    # token itself is matched; a greedy ".*" would also delete every word
    # that follows the URL.
    text = re.sub(r'https?://\S*', '', text)

    # HTML remnants have to go before the punctuation pass below, which would
    # otherwise rewrite the "/" and "&" that these patterns rely on.
    text = re.sub(r'<a href', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'&amp;', '', text)

    # Replace punctuation and other unwanted characters
    text = re.sub(r'[_"\-;%()|+&=*.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'ƒ', '', text)

    # Optionally, remove stop words
    if remove_stopwords:
        from nltk.corpus import stopwords
        stops = set(stopwords.words("english"))
        text = " ".join(w for w in text.split() if w not in stops)
    return text

In [6]:
import re
# Run every ticket summary through clean_text, keeping stop words.
clean_summaries = [
    clean_text(raw_summary, remove_stopwords=False)
    for raw_summary in df['summary']
]
print("Summaries are complete.")

Summaries are complete.


In [7]:
# Pre-processing of the data frame: drop the "Unnamed" index column(s) left
# over from the CSV export and normalise the summary text.
unnamed_columns = df.columns[df.columns.str.contains('unnamed', case=False)]
df = df.drop(columns=unnamed_columns)
print(list(df.columns.values))

# Lower-case the summary and strip commas, '@' and 'ƒ'.
summary_normalised = df['summary'].str.lower()
for unwanted in (',', '@', 'ƒ'):
    summary_normalised = summary_normalised.str.replace(unwanted, '')
df['summary'] = summary_normalised

# summary_description currently mirrors the summary only; the description
# column is not merged in.
df['summary_description'] = df['summary']

# Inspect the first five cleaned summaries to ensure they have been cleaned well
for review_no in range(1, 6):
    print("Clean Review #", review_no)
    print(clean_summaries[review_no - 1])
    print()

['jiraId', 'summary', 'description', 'components', 'issueType', 'project', 'projectCategory']
Clean Review # 1
investigate packet loss with opus codec

Clean Review # 2
pstn call dropped in staging tenant 21 56

Clean Review # 3
tenant2063 pstn call to rp >playtreatment >answer by rtp  no media path

Clean Review # 4
tenant2063 no media stream on outbound call to pstn number

Clean Review # 5
when record started  call members can not hear each other



In [8]:
# Tokenising each text; the results are collected into a list further down.

def read_input(desc):
    """Lazily tokenise every item of ``desc``.

    Each item is converted with ``str()``, encoded as UTF-8 and handed to
    ``gensim.utils.simple_preprocess``.

    Parameters
    ----------
    desc : iterable
        The texts to tokenise.

    Yields
    ------
    list
        The list of words produced by ``simple_preprocess`` for one item.
    """
    for raw_text in desc:
        utf8_bytes = str(raw_text).encode("utf-8")
        tokens = gensim.utils.simple_preprocess(utf8_bytes)
        yield tokens

# Materialise the tokenised summaries: one list of tokens per summary.
documents = [tokens for tokens in read_input(clean_summaries)]
logging.info("Done reading data file")

# documents is a list of token lists; show the first 100 of them
print(documents[0:100])

2018-07-13 17:39:29,815 : INFO : Done reading data file


[['investigate', 'packet', 'loss', 'with', 'opus', 'codec'], ['pstn', 'call', 'dropped', 'in', 'staging', 'tenant'], ['tenant', 'pstn', 'call', 'to', 'rp', 'playtreatment', 'answer', 'by', 'rtp', 'no', 'media', 'path'], ['tenant', 'no', 'media', 'stream', 'on', 'outbound', 'call', 'to', 'pstn', 'number'], ['when', 'record', 'started', 'call', 'members', 'can', 'not', 'hear', 'each', 'other'], ['monitoring', 'of', 'internal', 'call', 'no', 'media'], ['pcc', 'does', 'not', 'work'], ['media', 'connection', 'problem', 'after', 'pcc', 'ss', 'transfer', 'on', 'internal', 'dns'], ['ice', 'renegotiation', 'does', 'not', 'correctly', 'work', 'with', 'webrtc', 'endpoint'], ['pcc', 'hold', 'on', 'internal', 'call', 'does', 'not', 'work'], ['singlestep', 'transfer', 'of', 'internal', 'call', 'the', 'destination', 'can', 'not', 'hear', 'caller'], ['pcc', 'does', 'not', 'work', 'with', 'wrtc', 'media', 'service'], ['test'], ['handle', 'direct', 'calls', 'ts', 'only', 'config', 'handle', 'direct', 'c

In [9]:
# Training the model
# Fit the corpus on the tokenised documents (context window of 5), then train
# 100-dimensional GloVe vectors on the resulting corpus.matrix.
corpus = Corpus()
corpus.fit(documents, window=5)
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=100, no_threads=4, verbose=True)

# Attach the vocabulary BEFORE saving, so the file written below contains it
# and a model restored with Glove.load(...) can resolve words on its own.
glove.add_dictionary(corpus.dictionary)

# NOTE(review): the file name says "300" but the model is trained with
# no_components=100 -- confirm which one is intended. The name is left
# unchanged here.
glove.save('glove_embedding_300_defect_summary_windows_5.model')

Performing 100 training epochs with 4 threads
Epoch 0
Test done
Epoch 1
Test done
Epoch 2
Test done
Epoch 3
Test done
Epoch 4
Test done
Epoch 5
Test done
Epoch 6
Test done
Epoch 7
Test done
Epoch 8
Test done
Epoch 9
Test done
Epoch 10
Test done
Epoch 11
Test done
Epoch 12
Test done
Epoch 13
Test done
Epoch 14
Test done
Epoch 15
Test done
Epoch 16
Test done
Epoch 17
Test done
Epoch 18
Test done


KeyboardInterrupt: 

In [None]:
from itertools import islice

# Register the corpus dictionary with the trained model.
glove.add_dictionary(corpus.dictionary)

# Peek at the first five dictionary entries and report the vocabulary size.
first_entries = list(islice(corpus.dictionary, 5))
print(first_entries)
vocabulary_size = len(corpus.dictionary)
print("\n\n total words: " + str(vocabulary_size))

In [None]:
import pickle

print(type(glove))
print(type(corpus.dictionary))

# Only the corpus dictionary is written to the pickle file below, not the
# trained model itself.
trained_data = corpus.dictionary
with open('gloVe_embedding_300_defect_summary_windows_5.pickle', 'wb') as pickle_file:
    pickle.dump(trained_data, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

# Quick check of the model: the 10 nearest neighbours of 'sip'.
glove.most_similar('sip', number=10)

In [None]:
# Creating test dataset

from collections import Counter
# removing stop_words
import string
from nltk.corpus import stopwords

def clean_intoList(listoflist):
    """Flatten a list of token lists into one list without English stop words.

    Parameters
    ----------
    listoflist : iterable of iterables
        The tokenised documents.

    Returns
    -------
    list
        Every token of every inner list, in the original order, except those
        present in NLTK's English stop-word list.
    """
    stop_words = set(stopwords.words('english'))
    return [
        token
        for token_list in listoflist
        for token in token_list
        if token not in stop_words
    ]

# Frequency of every non-stop-word token across all documents. The result is
# bound to its own name so that the imported `Counter` class is not
# overwritten by an instance.
word_counts = Counter(clean_intoList(documents))
most_occur = word_counts.most_common(10)

# Hand-picked words to inspect, followed by the ten most frequent words.
probe_words = ['sip', 'sipfs', 'sipvm','wwe', 'workspace', 'designer', 'call', 'server', 'agent', 'error']
frequent_words = [frequent_word for frequent_word, _count in most_occur]

# Finding the 20 most similar words for each hand-picked word and then for
# each of the most frequent words.
for word in probe_words + frequent_words:
    print(word)
    sim = glove.most_similar(word, number=20)
    print(sim)
    print("\n")