### Using GPU

In [96]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 1394149687539972430
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3155650150
locality {
  bus_id: 1
  links {
  }
}
incarnation: 6982757343084970374
physical_device_desc: "device: 0, name: GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


In [97]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [98]:
import tensorflow as tf
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

In [99]:
import keras
config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 8} ) 
sess = tf.Session(config=config) 
keras.backend.set_session(sess)

#### Importing the necessary libraries, modules

In [100]:

import pandas as pd # Pandas library for reading '.csv' files as dataframes
import numpy as np  # Numpy library for creating and modifying arrays.
import tensorflow
from keras.layers import Dense, SimpleRNN, GRU, LSTM, Embedding # Import layers from Keras
from keras.models import Sequential

In [101]:
raw_data = pd.read_csv('Train.csv', encoding='latin-1') # Read the data as a DataFrame using Pandas
raw_test_data = pd.read_csv('Validation.csv', encoding='latin-1')

print(raw_data.shape) # Print the dimensions of train DataFrame
print(raw_data.columns) # Print the column names of the DataFrame
print('\n')
raw_data.head(5) # Print the top few records

(43694, 9)
Index(['title', 'body', 'ticket_type', 'category', 'sub_category1',
       'sub_category2', 'business_service', 'urgency', 'impact'],
      dtype='object')




Unnamed: 0,title,body,ticket_type,category,sub_category1,sub_category2,business_service,urgency,impact
0,car allowance record,october pm allowance record hello think july seems incorrect allowance record amount effective st july inserted about employees sheet attached applies employees possible records corrected by script incorrect each employees record thanks,1,4,3,0,40,3,4
1,project resources decommission write,october pm resources decommission hello please log several calls resources decommission please log call every resource needed closed questions please let thank best regards senior engineer,1,4,2,87,4,3,4
2,access to the internal,thursday hello writing ask question regarding right zone awards application station please provide urgent because preparing demo lot application functionalities based kind regards developer,1,6,22,7,41,3,4
3,new project code fusion,code hi please create code commercial kicking off client code requested vice president,1,4,3,7,70,3,4
4,password reset for,re available has assigned hi guys did till receive also work please status hello since then forward order per procedure please continue follow instructions dear please follow procedure unlock help her ahead best regards senior engineer tuesday pm available has assigned hi did remitted by yourself works nowhere also,1,4,2,88,4,3,4


In [102]:
raw_data.shape
#raw_data.describe

(43694, 9)

In [103]:
# import pandas as pd
# import numpy as np

# corr = raw_data.corr()
# corr.style.background_gradient(cmap='coolwarm')
# # 'RdBu_r' & 'BrBG' are other good diverging colormaps

In [104]:
raw_test_data.shape

(4855, 9)

### Printing the unique classes and their counts/frequencies

In [105]:
# Print the unique classes and their counts/frequencies
classes = np.unique(raw_data['urgency'], return_counts=True) # np.unique returns a tuple with class names and counts
print(classes[0]) #Print the list of unique classes
print(classes[1]) #Print the list of frequencies of the above classes

[0 1 2 3]
[ 1487  6073  4975 31159]


In [106]:
pd.value_counts(raw_data['urgency'])

3    31159
1    6073 
2    4975 
0    1487 
Name: urgency, dtype: int64

### Converting unstructured text to structured numeric form
This includes:
1. Tokenizing
2. Converting sequence of words to sequence of word indeces
3. Converting varing length sequences to fixed length sequences through padding

In [107]:
max_num_words = 10000
seq_len = 50
embedding_size = 100

In [108]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(raw_data.body)


In [109]:
x_train = tokenizer.texts_to_sequences(raw_data.body)

In [110]:
x_train[0]

[36,
 2,
 1974,
 450,
 6,
 264,
 24,
 238,
 967,
 1974,
 450,
 687,
 1417,
 199,
 24,
 2566,
 129,
 444,
 579,
 33,
 2489,
 444,
 114,
 876,
 1836,
 21,
 1263,
 967,
 141,
 444,
 450,
 9]

In [111]:
pd.set_option('display.max_colwidth', -1)

In [112]:
raw_data.head()

Unnamed: 0,title,body,ticket_type,category,sub_category1,sub_category2,business_service,urgency,impact
0,car allowance record,october pm allowance record hello think july seems incorrect allowance record amount effective st july inserted about employees sheet attached applies employees possible records corrected by script incorrect each employees record thanks,1,4,3,0,40,3,4
1,project resources decommission write,october pm resources decommission hello please log several calls resources decommission please log call every resource needed closed questions please let thank best regards senior engineer,1,4,2,87,4,3,4
2,access to the internal,thursday hello writing ask question regarding right zone awards application station please provide urgent because preparing demo lot application functionalities based kind regards developer,1,6,22,7,41,3,4
3,new project code fusion,code hi please create code commercial kicking off client code requested vice president,1,4,3,7,70,3,4
4,password reset for,re available has assigned hi guys did till receive also work please status hello since then forward order per procedure please continue follow instructions dear please follow procedure unlock help her ahead best regards senior engineer tuesday pm available has assigned hi did remitted by yourself works nowhere also,1,4,2,88,4,3,4


In [113]:
tokenizer.word_index

{'please': 1,
 'pm': 2,
 'hi': 3,
 'regards': 4,
 'thank': 5,
 'hello': 6,
 'you': 7,
 're': 8,
 'thanks': 9,
 'for': 10,
 'sent': 11,
 'kind': 12,
 'help': 13,
 'tuesday': 14,
 'wednesday': 15,
 'dear': 16,
 'thursday': 17,
 'friday': 18,
 'best': 19,
 'have': 20,
 'by': 21,
 'with': 22,
 'can': 23,
 'july': 24,
 'engineer': 25,
 'error': 26,
 'has': 27,
 'ext': 28,
 'issue': 29,
 'log': 30,
 'be': 31,
 'let': 32,
 'attached': 33,
 'date': 34,
 'change': 35,
 'october': 36,
 'information': 37,
 'we': 38,
 'senior': 39,
 'also': 40,
 'november': 41,
 'add': 42,
 'form': 43,
 'details': 44,
 'name': 45,
 'order': 46,
 'your': 47,
 'analyst': 48,
 'access': 49,
 'leaver': 50,
 'update': 51,
 'december': 52,
 'number': 53,
 'code': 54,
 'could': 55,
 'officer': 56,
 'if': 57,
 'site': 58,
 'provide': 59,
 'leave': 60,
 'work': 61,
 'march': 62,
 'client': 63,
 'create': 64,
 'high': 65,
 'report': 66,
 'issues': 67,
 'si': 68,
 'or': 69,
 'did': 70,
 'but': 71,
 'days': 72,
 'february': 7

In [114]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=max_num_words) #Tokenizer is used to tokenize text
tokenizer.fit_on_texts(raw_data.body) #Fit this to our corpus

x_train = tokenizer.texts_to_sequences(raw_data.body) #'text to sequences converts the text to a list of indices
x_train = pad_sequences(x_train, maxlen=50) #pad_sequences makes every sequence a fixed size list by padding with 0s 
x_test = tokenizer.texts_to_sequences(raw_test_data.body) 
x_test = pad_sequences(x_test, maxlen=50)

x_train.shape, x_test.shape # Check the dimensions of x_train and x_test  

((43694, 50), (4855, 50))

##### Preparing the target vectors for the network

In [115]:
unique_labels = list(raw_data.urgency.unique())
print(unique_labels)

[3, 2, 1, 0]


In [116]:
from keras.utils import to_categorical # This convert the labels to one-hot vectors(Dummies)

y_train = np.array([unique_labels.index(i) for i in raw_data.urgency]) # Convert the word labels to indeces
y_train = to_categorical(y_train) # Dummify the labels
y_test = np.array([unique_labels.index(i) for i in raw_test_data.urgency])
y_test = to_categorical(y_test)

In [117]:
y_train

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]], dtype=float32)

##### Building and training an LSTM model

In [118]:
# Building an LSTM model
model_lstm = Sequential() # Call Sequential to initialize a network
model_lstm.add(Embedding(input_dim = max_num_words, 
                    input_length = seq_len, 
                    output_dim = embedding_size)) # Add an embedding layer which represents each unique token as a vector
model_lstm.add(LSTM(30, return_sequences=True)) # Add an LSTM layer
model_lstm.add(LSTM(10, return_sequences=True)) # Add an LSTM layer
model_lstm.add(LSTM(5, return_sequences=False))
model_lstm.add(Dense(4, activation='softmax')) # Add an ouput layer. Since classification, 3 nodes for 3 classes.

In [119]:
model_lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 100)           1000000   
_________________________________________________________________
lstm_7 (LSTM)                (None, 50, 30)            15720     
_________________________________________________________________
lstm_8 (LSTM)                (None, 50, 10)            1640      
_________________________________________________________________
lstm_9 (LSTM)                (None, 5)                 320       
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 24        
Total params: 1,017,704
Trainable params: 1,017,704
Non-trainable params: 0
_________________________________________________________________


In [120]:
from keras.optimizers import Adam
adam = Adam(lr=0.001)

In [121]:
# Mention the optimizer, Loss function and metrics to be computed
model_lstm.compile(optimizer=adam,                  # 'Adam' is a variant of gradient descent technique
              loss='categorical_crossentropy', # categorical_crossentropy for multi-class classification
              metrics=['accuracy'])            # These metrics are computed for evaluating and stored in history

# model_lstm.fit(x_train, y_train, epochs=5, validation_split=0.25)

In [122]:
import keras

# model_lstm.save('model_lstm_V1.hdf5')
model_lstm_V1_20_epochs = keras.models.load_model('model_lstm_V1.hdf5')

### Prediction and evaluation on test data
1. Check the network output on test data. What do these values represent?
2. Predict the class labels on test data
2. Evaluate the model on test data

Hint: Check model.predict, model.predict_classes, model.evaluate in keras

In [123]:
model_lstm_V1_20_epochs.input_shape

(None, 50)

In [124]:
test_prob = model_lstm_V1_20_epochs.predict(x_test)
test_prob.shape
print(test_prob)

[[9.99947667e-01 1.13099723e-05 1.62411179e-05 2.47946173e-05]
 [9.92705100e-06 9.21769977e-01 6.01206236e-02 1.80995911e-02]
 [9.99944806e-01 1.21708099e-05 1.73272347e-05 2.57321353e-05]
 ...
 [9.99945045e-01 1.19486995e-05 1.68534534e-05 2.61700134e-05]
 [9.99921560e-01 1.71057891e-05 2.11598908e-05 4.00836689e-05]
 [9.99937773e-01 1.39852982e-05 1.90127885e-05 2.92914883e-05]]


In [125]:
test_prob[:5]

array([[9.9994767e-01, 1.1309972e-05, 1.6241118e-05, 2.4794617e-05],
       [9.9270510e-06, 9.2176998e-01, 6.0120624e-02, 1.8099591e-02],
       [9.9994481e-01, 1.2170810e-05, 1.7327235e-05, 2.5732135e-05],
       [9.9995041e-01, 1.0544163e-05, 1.6460259e-05, 2.2691620e-05],
       [1.5985581e-05, 8.9685416e-01, 8.0560334e-02, 2.2569416e-02]],
      dtype=float32)

In [126]:
test_classes = model_lstm_V1_20_epochs.predict_classes(x_test)
test_classes.shape

(4855,)

In [127]:
test_classes = np.argmax(test_prob, axis=1)
test_classes.shape

(4855,)

In [128]:
test_classes[:11]

array([0, 1, 0, 0, 1, 0, 1, 0, 0, 2, 0], dtype=int64)

### Evaluation

#### Loss and Accuracy

In [129]:
score = model_lstm_V1_20_epochs.evaluate(x_test, y_test)
print('Test Loss:', score[0])
print('Test Accuracy:', score[1])

Test Loss: 0.3764079353571644
Test Accuracy: 0.862821833161689


#### Confusion Matrix

In [130]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(np.argmax(y_test, axis=1), test_classes))

[[3451    1   10    0]
 [   9  311  213   20]
 [   4  240  392   39]
 [   4   55   71   35]]


### Approach 2: Considering body, impact and catery for predicting the urgency 

In [131]:
id(x_train)

1771780292976

In [132]:
x_train_approach2 = x_train.copy()

In [133]:
id(x_train_approach2)

1776735919840

In [134]:
x_test_approach2 = x_test.copy()

In [135]:
id(x_test_approach2)

1776735919920

#### 1. Impact 

    a. Dummifying the unique labels in impact column and concatenating the same with x_train

In [136]:
from keras.utils import to_categorical

unique_labels_impact = list(raw_data.impact.unique())
#print(unique_labels_impact)

impactDummy = np.array([unique_labels_impact.index(i) for i in raw_data.impact]) # Convert the word labels to indeces
impactDummy = to_categorical(impactDummy) # Dummify the labels
print(type(impactDummy))

x_train_approach2.shape, x_test_approach2.shape # Check the dimensions of x_train_approach2 and x_test_approach2  

<class 'numpy.ndarray'>


((43694, 50), (4855, 50))

In [137]:
x_train_approach2 = np.concatenate((x_train_approach2, impactDummy), axis = 1)

In [138]:
x_train_approach2.shape, x_test_approach2.shape # Check the dimensions of x_train and x_test 

((43694, 55), (4855, 50))

    b. Dummifying the unique labels in impact column and concatenating the same with x_test

In [139]:
unique_test_labels_impact = list(raw_test_data.impact.unique())
#print(unique_labels_impact)

impactDummy = np.array([unique_test_labels_impact.index(i) for i in raw_test_data.impact]) # Convert the word labels to indeces
impactDummy = to_categorical(impactDummy) # Dummify the labels
type(impactDummy)

numpy.ndarray

In [140]:
x_test_approach2 = np.concatenate((x_test_approach2, impactDummy), axis = 1)

In [141]:
x_train_approach2.shape, x_test_approach2.shape # Check the dimensions of x_train and x_test 

((43694, 55), (4855, 55))

#### 2. Category
    a. Dummifying the unique labels in category column and concatenating the same with x_train

In [142]:
unique_labels_category = list(raw_data.category.unique())
#print(unique_labels_category)

category_Dummy = np.array([unique_labels_category.index(i) for i in raw_data.category]) # Convert the word labels to indeces
category_Dummy = to_categorical(category_Dummy) # Dummify the labels
type(category_Dummy)

numpy.ndarray

In [143]:
x_train_approach2 = np.concatenate((x_train_approach2, category_Dummy), axis = 1)

    b. Dummifying the unique labels in category column and concatenating the same with x_test

In [144]:
unique_test_labels_category = list(raw_test_data.category.unique())
#print(unique_labels_category)

category_Dummy = np.array([unique_test_labels_category.index(i) for i in raw_test_data.category]) # Convert the word labels to indeces
category_Dummy = to_categorical(category_Dummy) # Dummify the labels
type(category_Dummy)

numpy.ndarray

In [145]:
x_test_approach2 = np.concatenate((x_test_approach2, category_Dummy), axis = 1)

In [146]:
x_train_approach2.shape, x_test_approach2.shape # Check the dimensions of x_train and x_test 

((43694, 68), (4855, 67))

#### Using pad_sequence so that x_train and x_test will have same length

In [147]:
x_train_approach2 = pad_sequences(x_train_approach2, maxlen=70)
x_test_approach2 = pad_sequences(x_test_approach2, maxlen=70)

In [148]:
x_train_approach2.shape, x_test_approach2.shape # Check the dimensions of x_train and x_test 

((43694, 70), (4855, 70))

### Approach2: Building and training an LSTM model

In [149]:
max_num_words = 10000
seq_len_approach2 = 70
embedding_size = 100

In [150]:
# Building an LSTM model
model_lstm_approach2 = Sequential() # Call Sequential to initialize a network
model_lstm_approach2.add(Embedding(input_dim = max_num_words, 
                    input_length = seq_len_approach2, 
                    output_dim = embedding_size)) # Add an embedding layer which represents each unique token as a vector
model_lstm_approach2.add(LSTM(30, return_sequences=True)) # Add an LSTM layer
model_lstm_approach2.add(LSTM(10, return_sequences=True)) # Add an LSTM layer
model_lstm_approach2.add(LSTM(5, return_sequences=False))
model_lstm_approach2.add(Dense(4, activation='softmax')) # Add an ouput layer. Since classification, 3 nodes for 3 classes.

In [151]:
model_lstm_approach2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 70, 100)           1000000   
_________________________________________________________________
lstm_10 (LSTM)               (None, 70, 30)            15720     
_________________________________________________________________
lstm_11 (LSTM)               (None, 70, 10)            1640      
_________________________________________________________________
lstm_12 (LSTM)               (None, 5)                 320       
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 24        
Total params: 1,017,704
Trainable params: 1,017,704
Non-trainable params: 0
_________________________________________________________________


In [152]:
from keras.optimizers import Adam
adam = Adam(lr=0.001)

In [153]:
# Mention the optimizer, Loss function and metrics to be computed
model_lstm_approach2.compile(optimizer=adam,                  # 'Adam' is a variant of gradient descent technique
              loss='categorical_crossentropy', # categorical_crossentropy for multi-class classification
              metrics=['accuracy'])            # These metrics are computed for evaluating and stored in history



In [154]:
# model_lstm_approach2.fit(x_train_approach2, y_train, epochs=3, validation_split=0.25)

In [155]:
import keras
# model_lstm_approach2.save('model_lstm_V3.hdf5')
model_lstm_V2_with_impact_category_3_epochs = keras.models.load_model('model_lstm_V3.hdf5')

### Approach2: Prediction and evaluation on test data
1. Check the network output on test data. What do these values represent?
2. Predict the class labels on test data
2. Evaluate the model on test data

Hint: Check model.predict, model.predict_classes, model.evaluate in keras

In [156]:
model_lstm_V2_with_impact_category_3_epochs.input_shape

(None, 70)

In [157]:
test_prob_approach2 = model_lstm_V2_with_impact_category_3_epochs.predict(x_test_approach2)
test_prob_approach2.shape
print(test_prob_approach2)

[[9.9829572e-01 7.1178738e-04 7.9652417e-04 1.9602239e-04]
 [2.2686180e-03 3.9962882e-01 4.8040229e-01 1.1770027e-01]
 [9.9829572e-01 7.1171846e-04 7.9647213e-04 1.9601904e-04]
 ...
 [9.9829584e-01 7.1170466e-04 7.9646008e-04 1.9601402e-04]
 [9.9829608e-01 7.1157143e-04 7.9635507e-04 1.9599704e-04]
 [9.9829441e-01 7.1234017e-04 7.9704478e-04 1.9618784e-04]]


In [158]:
test_prob_approach2[:5]

array([[9.98295724e-01, 7.11787376e-04, 7.96524168e-04, 1.96022389e-04],
       [2.26861797e-03, 3.99628818e-01, 4.80402291e-01, 1.17700271e-01],
       [9.98295724e-01, 7.11718458e-04, 7.96472130e-04, 1.96019042e-04],
       [9.98294652e-01, 7.12210662e-04, 7.96940469e-04, 1.96138499e-04],
       [2.25384627e-03, 3.96106422e-01, 4.84644324e-01, 1.16995454e-01]],
      dtype=float32)

In [159]:
test_classes_approach2 = model_lstm_V2_with_impact_category_3_epochs.predict_classes(x_test_approach2)
test_classes_approach2.shape

(4855,)

In [160]:
test_classes_approach2 = np.argmax(test_prob_approach2, axis=1)
test_classes_approach2.shape

(4855,)

In [161]:
test_classes_approach2[:11]

array([0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0], dtype=int64)

### Approach2: Evaluation

#### Loss and Accuracy

In [162]:
score_approach2 = model_lstm_V2_with_impact_category_3_epochs.evaluate(x_test_approach2, y_test)
print('Test Loss:', score_approach2[0])
print('Test Accuracy:', score_approach2[1])

Test Loss: 0.30524546695234356
Test Accuracy: 0.8543769309130315


#### Confusion Matrix

In [163]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(np.argmax(y_test, axis=1), test_classes_approach2))

[[3459    3    0    0]
 [   8   52  493    0]
 [  10   28  637    0]
 [   7   11  147    0]]


In [164]:
print("Class predicted by first model for first 30 tickets: \n", test_classes[:30], "\n")

print("Class predicted by second model (approach2) for first 30 tickets: \n", test_classes_approach2[:30])


Class predicted by first model for first 30 tickets: 
 [0 1 0 0 1 0 1 0 0 2 0 2 0 0 0 0 0 2 0 0 0 0 1 0 0 0 0 2 1 2] 

Class predicted by second model (approach2) for first 30 tickets: 
 [0 2 0 0 2 0 2 0 0 2 0 2 0 0 0 0 0 2 0 0 0 0 2 0 0 0 0 1 1 2]


In [165]:
print("Class wise probabilities for first 10 tickets predicted by first model: \n", test_prob[:10], "\n")
print()
print("Class wise probabilities for first 10 tickets predicted by second model (approach2): \n", test_prob_approach2[:10])

Class wise probabilities for first 10 tickets predicted by first model: 
 [[9.99947667e-01 1.13099723e-05 1.62411179e-05 2.47946173e-05]
 [9.92705100e-06 9.21769977e-01 6.01206236e-02 1.80995911e-02]
 [9.99944806e-01 1.21708099e-05 1.73272347e-05 2.57321353e-05]
 [9.99950409e-01 1.05441632e-05 1.64602588e-05 2.26916200e-05]
 [1.59855808e-05 8.96854162e-01 8.05603340e-02 2.25694161e-02]
 [9.99944806e-01 1.21456605e-05 1.83488646e-05 2.47250282e-05]
 [1.11780892e-05 8.65991831e-01 1.10134572e-01 2.38624923e-02]
 [9.99944091e-01 1.19399165e-05 1.73598237e-05 2.65845138e-05]
 [9.99938011e-01 1.30618937e-05 1.89897055e-05 2.99402582e-05]
 [2.13179865e-05 6.86056241e-02 9.27409410e-01 3.96369537e-03]] 


Class wise probabilities for first 10 tickets predicted by second model (approach2): 
 [[9.98295724e-01 7.11787376e-04 7.96524168e-04 1.96022389e-04]
 [2.26861797e-03 3.99628818e-01 4.80402291e-01 1.17700271e-01]
 [9.98295724e-01 7.11718458e-04 7.96472130e-04 1.96019042e-04]
 [9.98294652e-01

In [166]:
import pandas as pd

# Result

In [167]:
data = {'Model': ['First model' , 'Second model'], \
        'Accuracy': [score[1], score_approach2[1]]}
Final_Output =pd.DataFrame.from_dict(data)#, orient='index')
Final_Output

Unnamed: 0,Model,Accuracy
0,First model,0.862822
1,Second model,0.854377
