In [1]:
import pandas as pd
import numpy as np
# LDA, tSNE
from sklearn.manifold import TSNE
from gensim.models.ldamodel import LdaModel
# NLTK
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import re
# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
%matplotlib inline
import seaborn as sns
# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider
from bokeh.layouts import column
from bokeh.palettes import all_palettes,Viridis256,Category20 
output_notebook()



In [2]:
xlsx = pd.ExcelFile('incident_mar_april2018.xlsx')
#fields = ['Short description', 'Assignment group']

data_sheets = []
for sheet in xlsx.sheet_names:
    data_sheets.append(pd.read_excel(xlsx,sheet, usecols=[1,3,4,5]))
df = pd.concat(data_sheets)
df=df.reset_index(drop=True)
print("Input Dataframe Shape(rows,cols):",df.shape)

Input Dataframe Shape(rows,cols): (71430, 4)


In [4]:
df.columns = [c.replace(' ', '_') for c in df.columns]

In [111]:
df1= df.sample(frac=0.2)
df1.shape

(14286, 6)

In [57]:
df['month'] = pd.to_datetime(df.Created).dt.month

In [59]:
df1.head()

Unnamed: 0,Created,Priority,Short_description,Category,tokens,month
47594,2018-04-02 10:10:41,4 - Low,WOW / Workstation on Wheels || WOW / Workstati...,End User Services,"[wow, workstat, wheel, wow, workstat, wheel, i...",4
14471,2018-03-10 17:00:13,3 - Moderate,'mmr-pweb-001.chw.edu':CPU Utilization is '96'...,Server,"[mmr, pweb, chw, edu, cpu, util, threshold, se...",3
62734,2018-04-12 11:39:41,4 - Low,Java pop up -,HelpDesk,"[java, pop]",4
32260,2018-03-22 00:17:52,3 - Moderate,nmscust02-2 : CHW-as1-2west-dsc : 10.1.5.6 : S...,Network,"[nmscust, chw, west, dsc, syslog, critic]",3
57201,2018-04-09 08:58:47,4 - Low,Allscript - not able to log in ||,Application,"[allscript, abl, log]",4


In [112]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop = stopwords.words('english')
snowball = SnowballStemmer("english")  
lemmer=WordNetLemmatizer()

def rem_dup_word(nltk_tokens):
    ordered_tokens = set()
    result = []
    for word in nltk_tokens:
        if word not in ordered_tokens:
            ordered_tokens.add(word)
            result.append(word)
    return result

def preprocess(X):
    X = X.map(lambda x: re.sub(r'\d+', '', str(x)))
    X = X.map(lambda x: x.lower())
    
    X = X.map(lambda x: RegexpTokenizer(r'\w+').tokenize(x))
    #stem
    #X = X.map(lambda x: [snowball.stem(token) for token in x])
    #lemm
    X = X.map(lambda x: [lemmer.lemmatize(token) for token in x])
    #stop words
    X = X.map(lambda x: [t for t in x if t not in stop])
    X = X.map(lambda x: rem_dup_word(x))
    #remove all "extremely short" words (that have less than 2 characters):
    X = X.map(lambda x: [t for t in x if len(t) > 1])
    return X

df1['tokens'] =preprocess(df1['Short_description'])


In [82]:
#print(df.loc[:5, ['tokens','Short_description']])

In [113]:
#df['tokens'] =preprocess(df['Short_description'])
#print(df1['tokens'][:20],df1['Short_description'][:20])

36324           [account, unable, login, user, id, schung]
42493    [jabber, working, took, remote, helped, user, ...
57786    [nmscust, chw, agg, mdf, main, sjmc, syslog, c...
21455    [invision, unable, login, user, id, awiley, de...
67831    [nmscust, chw, wc, mdf, main, sjmc, ap, associ...
16334                                     [desktop, issue]
31617          [notification, text, amcore, date, mshdadm]
62664                                       [phone, issue]
18918           [cerner, careb, access, ha, changed, rgil]
7981     [need, help, finding, correct, group, request,...
45705               [active, job, completed, exit, status]
34017    [phx, vctx, chw, edu, utilization, threshold, ...
63810                   [wow, bar, code, scanner, working]
64696    [cerner, powerchart, unable, access, username,...
38477           [journey, unable, login, username, jvalle]
40128                                     [desktop, issue]
37685    [network, account, unable, login, locked, user.

In [None]:
from gensim import corpora, models
np.random.seed(2017)
texts = df1['tokens'].values
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = models.ldamodel.LdaModel(corpus, id2word=dictionary,  num_topics=15,update_every=5, chunksize=1000, passes=100)

In [None]:
for idx, topic in ldamodel.show_topics(formatted=False, num_words= 20):
    print('Topic: {} \nWords: {}'.format(idx, [w[0] for w in topic]))

In [None]:
hm = np.array([[y for (x,y) in ldamodel[corpus[i]]] for i in range(len(corpus))])

In [None]:
tsne = TSNE(random_state=2017, perplexity=30)
tsne_embedding = tsne.fit_transform(hm)
tsne_embedding = pd.DataFrame(tsne_embedding, columns=['x','y'])
tsne_embedding['hue'] = hm.argmax(axis=1)

In [109]:
source = ColumnDataSource(
        data=dict(
            x = tsne_embedding.x,
            y = tsne_embedding.y,
            colors = [Category20[20][i] for i in tsne_embedding.hue],
            description = df1.tokens,
            year = df1.month,
            alpha = [0.9] * tsne_embedding.shape[0],
            size = [7] * tsne_embedding.shape[0]
        )
    )
hover_tsne = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Tokens</span>
            <span style="font-size: 12px">@description</span>

        </div>
    </div>
    """)

#            <span style="font-size: 12px; font-weight: bold;">Month:</span>
#            <span style="font-size: 12px">@year</span>
#
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(plot_width=700, plot_height=700, tools=tools_tsne, title='Incident')
plot_tsne.circle('x', 'y', size='size', fill_color='colors', 
                 alpha='alpha', line_alpha=0, line_width=0.01, source=source, name="df")

callback = CustomJS(args=dict(source=source), code="""
    var data = source.data;
    var f = cb_obj.value
    x = data['x']
    y = data['y']
    colors = data['colors']
    alpha = data['alpha']
    title = data['title']
    year = data['year']
    size = data['size']
    for (i = 0; i < x.length; i++) {
        if (year[i] <= f) {
            alpha[i] = 0.9
            size[i] = 7
        } else {
            alpha[i] = 0.05
            size[i] = 4
        }
    }
    source.trigger('change');
""")

slider = Slider(start=df1.month.min(), end=df1.month.max(), value=3, step=1, title="month")
slider.js_on_change('value', callback)

layout = column( plot_tsne)  #slider

In [110]:
show(layout)