In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline
from numpy import argmax
from sklearn.preprocessing import OneHotEncoder
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [13]:
xlsx = pd.ExcelFile('user_incidents_dumps_60days.xlsx')
#fields = ['Short description', 'Assignment group']

data_sheets = []
for sheet in xlsx.sheet_names:
    data_sheets.append(pd.read_excel(xlsx,sheet, usecols=[3,6]))
df = pd.concat(data_sheets)

print("Input Dataframe Shape(rows,cols):",df.shape)

Input Dataframe Shape(rows,cols): (62502, 2)


In [14]:
#replace space in header with _
df.columns = [c.replace(' ', '_') for c in df.columns]

In [15]:
df.head()

Unnamed: 0,Short_description,Assignment_group
0,Network Account || Unable to login || Account ...,DH-Enterprise IT Service Cntr
1,Ms4 - phxasp01 - job needs to be killed / job...,DH-Enterprise IT Service Cntr
2,Printer || Unable to print the strips through...,DH-NC-EUS Stockton
3,Clairiva - unable to login / user id: ltimpog ...,DH-Enterprise IT Service Cntr
4,WOW || Unable to turn on || Black Screen || De...,DH-SC-EUS Bakersfield MSH


In [16]:
#drop duplicate rows
df2 = df.drop_duplicates()
print(df2.shape)

(47373, 2)


In [17]:
#get data with at leaset count(assignment group)>200
df2=df2.groupby("Assignment_group").filter(lambda x: len(x) > 200)
print("dataframe size after filtering(rows,cols):",df2.shape)

dataframe size after filtering(rows,cols): (39121, 2)


In [58]:
#print count of assignment groups
df2.Assignment_group.value_counts()


DH-Enterprise IT Service Cntr        24901
DHE-HR Tier 2 - Talent Management     1164
DH-SW-EUS StJoseph                     923
DH-Helpdesk RRE                        791
DH-ClinApps NAS                        748
DH-MPS Kyocera                         659
DH-SW-EUS Chandler                     602
DH-NC-EUS Redding                      513
DH-SC-EUS StJohnRMC                    471
DHE-SecAdmin                           461
DH-SW-EUS Phoenix                      442
DH-NC-EUS Stockton                     440
DH-SC-EUS Bakersfield BMH              399
DH-GB-EUS Dominican                    374
DH-NC-EUS Sac MET                      365
DH-SC-EUS NLA NrthrdgRoscoe            363
DHE-RCM-AppOpSupport-MS4               360
DH-ClinApps CPOE                       354
DH-NC-EUS Sac MGH                      338
DH-SW-EUS Gilbert                      328
DH-SC-EUS SLA CalifornHospMC           293
DH-SW-EUS StRoseSiena                  282
DH-NC-Telcom Sacramento                277
DH-ClinApps

In [41]:
#No. of unique assignment groups =  num of output classes
num_classes = df2.Assignment_group.nunique()
print("Number of output classes: ",num_classes)

Number of output classes:  37


In [99]:
#save data to pickle format if need to use later
df1.to_pickle("snow_v1_43k_20out_0910.pkl")
#df = pd.read_pickle("snow_v1_dataframe.pkl")

In [18]:
#Create input and output data sets
X = df2.Short_description
Y = df2.Assignment_group
#print(X.shape,Y.shape)

In [19]:
stop = stopwords.words('english')
stemmer = SnowballStemmer("english")

In [20]:
#word count
sum([len(s.split()) for s in X])

465446

In [21]:
#remove stop words
X1 = X.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
print(sum([len(s.split()) for s in X1]))


400329


In [22]:
#remove special chars
pat = r'[^A-Za-z0-9 ]+'
X2 = X1.str.replace(pat, ' ',regex=True)
print(sum([len(s.split()) for s in X2]))

351291


In [23]:
#stemming words
X3 = X2.apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
print(sum([len(s.split()) for s in X3]))

351291


In [25]:
print(X2.shape,X3.shape,X1.shape,X.shape)

(39121,) (39121,) (39121,) (39121,)


In [32]:
print("ORIG:     ",X[0])
print("STOPWORD: ",X1[0])
print("SP. CHAR: ",X2[0])
print("STEM:     ",X3[0])

ORIG:      Network Account || Unable to login || Account Locked || unlocked Account || Customer able to login
STOPWORD:  Network Account || Unable login || Account Locked || unlocked Account || Customer able login
SP. CHAR:  Network Account   Unable login   Account Locked   unlocked Account   Customer able login
STEM:      network account unabl login account lock unlock account custom abl login


In [33]:
#Encode outputs into N- dim one hot encoded matrix
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(Y)

onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
#print(onehot_encoded)
# invert first example to return original, must for return api --how to inverse transform post deploy
#inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
#print(inverted)

In [34]:
print("Size of one hot encoded output vector: ",onehot_encoded.shape)

Size of one hot encoded output vector:  (39121, 37)


In [36]:
#max word length of input dataset 'short description colun' 
df_col_len = int(X3.str.split().str.len().max())
print(df_col_len)


27


In [53]:
#Split into train and test(15%)  --x1-after stop words
X_train,X_test,Y_train,Y_test = train_test_split(X1,onehot_encoded,test_size=0.15)
print("Training samples: ",X_train.shape)
print("Test samples: ",X_test.shape)

Training samples:  (33252,)
Test samples:  (5869,)


In [78]:
#Split into train and test(15%)  --X2
X_train,X_test,Y_train,Y_test = train_test_split(X2,onehot_encoded,test_size=0.15)
print("Training samples: ",X_train.shape)
print("Test samples: ",X_test.shape)

Training samples:  (33252,)
Test samples:  (5869,)


In [79]:
# Tokenize and pad , init max sizes...should find way to eff value max words 
max_words = 10000
max_len = 20 #df_col_len #35
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)
print(sequences_matrix.shape)

(33252, 20)


In [39]:
#define LSTM  model
#tweak parameters/layers to inc. eff, smtimes 1D conv also used
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs) #50 dim
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(num_classes,name='out_layer')(layer)  #num_classes=# of outputs
    layer = Activation('softmax')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [70]:
#MODEL 2 
#tweak parameters/layers to inc. eff, smtimes 1D conv also used
def RNN2():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs) #50 dim
    layer = LSTM(100, dropout=0.2, recurrent_dropout=0.2)(layer)
    #layer = Dense(256,name='FC1')(layer)
    #layer = Activation('relu')(layer)
    #layer = Dropout(0.5)(layer)
    layer = Dense(num_classes,name='out_layer')(layer)  #num_classes=# of outputs
    layer = Activation('softmax')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

model = RNN2()
#model.summary()
model.compile(loss='categorical_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

In [76]:
#MODEL 3 
#tweak parameters/layers to inc. eff, smtimes 1D conv also used
from keras.layers import Bidirectional
def RNN2():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs) #50 dim    
    layer = Bidirectional(LSTM(64))(layer)
    layer = Dropout(0.2)(layer)
    #layer = Dense(256,name='FC1')(layer)
    #layer = Activation('relu')(layer)
    #layer = Dropout(0.5)(layer)
    layer = Dense(num_classes,name='out_layer')(layer)  #num_classes=# of outputs
    layer = Activation('softmax')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

model = RNN2()
#model.summary()
model.compile(loss='categorical_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

In [77]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 20)                0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 20, 50)            500000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               58880     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
out_layer (Dense)            (None, 37)                4773      
_________________________________________________________________
activation_5 (Activation)    (None, 37)                0         
Total params: 563,653
Trainable params: 563,653
Non-trainable params: 0
_________________________________________________________________


In [42]:
#compile model with optimizer -  can use RMSProp or Adam
model = RNN()
model.summary()
model.compile(loss='categorical_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 20)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 20, 50)            500000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                29440     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_2 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 37)                9509      
__________

In [80]:
# RUN on MODEL 3 -Bidirectional LSTM
#run after X2 stop words and sp char, vocab size 10k, out=37
#execute the model, early stopping if model stops converging, batch size can be tweaked. 20% data for validation each epoch
history = model.fit(sequences_matrix,Y_train,batch_size=64,epochs=15,
          validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Train on 29926 samples, validate on 3326 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15


In [71]:
# RUN on MODEL 2 - LSTM 100
#run after X1 stop words, vocab size 10k, out=37
#execute the model, early stopping if model stops converging, batch size can be tweaked. 20% data for validation each epoch
history = model.fit(sequences_matrix,Y_train,batch_size=64,epochs=15,
          validation_split=0.15,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Train on 29926 samples, validate on 3326 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15


In [55]:
#run after X1 stop words, vocab size 10k, out=37
#execute the model, early stopping if model stops converging, batch size can be tweaked. 20% data for validation each epoch
history = model.fit(sequences_matrix,Y_train,batch_size=64,epochs=15,
          validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Train on 29926 samples, validate on 3326 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15


In [49]:
#run after X3 stemming, vocab size 10k, out=37
#execute the model, early stopping if model stops converging, batch size can be tweaked. 20% data for validation each epoch
history = model.fit(sequences_matrix,Y_train,batch_size=64,epochs=15,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Train on 26601 samples, validate on 6651 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15


In [46]:
#run after X2 - stop words and remove special chars, vocab size 10k, out=37
#execute the model, early stopping if model stops converging, batch size can be tweaked. 20% data for validation each epoch
history = model.fit(sequences_matrix,Y_train,batch_size=64,epochs=15,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Train on 26601 samples, validate on 6651 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15


In [43]:
#run on Orig X
#execute the model, early stopping if model stops converging, batch size can be tweaked. 20% data for validation each epoch
history = model.fit(sequences_matrix,Y_train,batch_size=64,epochs=15,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Train on 26601 samples, validate on 6651 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15


In [72]:
#save model to file
model.save('model_snow_v1_1_lstm100_37out_0911.h5')

In [None]:
##notes
#>200: input=24k, output classes=27,BS=64,RMS,LSTM(64,256,),dropout 0.5,epochs=9, eff=75%; if BS=128,epoch=2,eff same
#>200: input=55kk,output classes=57,BS=64,RMS,LSTM(64,256,),drop 0.5,epoch 5, eff=64%
#>500: input=43k,out classes=20,BS=64,RMS,LSTM(64,256,),drop 0.5,epoch 9, eff=80.8%
#drop duplicates,>200: input=39k,out=37,BS=64,RMS,LSTM(64,256,),drop 0.5,epoch 8, eff=73%, test=.1, val=.15
#drop duplicates,>300: input=35k,out=20,BS=64,RMS,LSTM(64,256,),drop 0.5,epoch 6, eff=80%

In [81]:
#Create sequences for test data
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [82]:
#Calculate accuracy on test data
accr = model.evaluate(test_sequences_matrix,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3%}'.format(accr[0],accr[1]))

Test set
  Loss: 1.080
  Accuracy: 74.629%


In [47]:
#load unseen data for validation
df_val = read_excel('validate2018.xlsx', sheet_name = 'Sheet1')
Xnew=df_val.Description
Ynew_orig = df_val.Group
print(Xnew.shape,Ynew_orig.shape)

(13, 2)


In [115]:
#convert unseen data to word tokens
Xnew_2 = sequence.pad_sequences(tok.texts_to_sequences(Xnew),maxlen=max_len)

In [116]:
#predict output on unseen data
ynew = model.predict(Xnew_2)

In [117]:
#Print unseen data, predicted value, original value
for i in range(len(Xnew)):
	print("X=%s,\nPredicted=%s,\nOriginal=%s\n" % (Xnew[i], label_encoder.inverse_transform([argmax(ynew[i, :])]),Ynew_orig[i]))

X=zOther Clinical Application Issue || Helpdesk - Clinical Application Issue,
Predicted=['DH-Helpdesk HDAG'],
Original=DH-Helpdesk HDAG

X=Monitor ||  How to increase the brightness || ,
Predicted=['DH-Enterprise IT Service Cntr'],
Original=DH-Enterprise IT Service Cntr

X=Outlook || Outlook Application Issue,
Predicted=['DH-Helpdesk RRE'],
Original=DH-Helpdesk RRE

X=Phone || Phone Issue,
Predicted=['DH-NC-Telcom Sacramento'],
Original=DH-NC-Telcom Sacramento

X=Kyocera Printer Issue || Kyocera Printer Issue,
Predicted=['DH-MPS Kyocera'],
Original=DH-MPS Kyocera

X=Emergency Account Disablement || user ID : aguise001 || Need to be disabled Immediately ,
Predicted=['DHE-SecAdmin'],
Original=DHE-SecAdmin

X=Network, account lock, username : jfunk002,
Predicted=['DH-Enterprise IT Service Cntr'],
Original=DH-Enterprise IT Service Cntr

X=printer || patient data not printing up from cerner ,
Predicted=['DH-ClinApps NAS'],
Original=DH-ClinApps HIM

X=network account  || password reset || us