In [1]:
import string
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
df=pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0,message\t,category
0,RAS KERNEL INFO instruction cache parity error...,cache.error
1,RAS KERNEL INFO instruction cache parity error...,cache.error
2,RAS KERNEL INFO instruction cache parity error...,cache.error
3,RAS KERNEL INFO 63543 double-hummer alignment ...,alignment.exception
4,RAS KERNEL FATAL data storage interrupt\t,data.error


In [4]:
message_lines=list()
lines = df['message\t'].values.tolist()

In [5]:
for line in lines:
    tokens=word_tokenize(line)
    #lowecase
    tokens=[w.lower() for w in tokens]
    #remove punctuations
    table=str.maketrans('', '', string.punctuation)
    stripped= [w.translate(table) for w in tokens]
    #removing remaining tokens that are not albhabets
    words= [word for word in stripped if word.isalpha]
    #filter stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    message_lines.append(words)

In [6]:
len(message_lines)

149

In [7]:
import gensim
EMBEDDING_DIM=100
#Train word2vec model
model = gensim.models.Word2Vec(sentences=message_lines, vector_size=EMBEDDING_DIM, window=5, workers=4, min_count=1)
#vocab size
words=list(model.wv.key_to_index)
print("Vocabulary size: %d" % len(words))

Vocabulary size: 40


In [8]:
#similar words(semantic info)
model.wv.most_similar('error')

[('message', 0.25165846943855286),
 ('0', 0.20777487754821777),
 ('info', 0.19602042436599731),
 ('purpose', 0.19170905649662018),
 ('spent', 0.18937692046165466),
 ('fatal', 0.18103456497192383),
 ('microseconds', 0.16546376049518585),
 ('exception', 0.12770980596542358),
 ('ciostream', 0.12643198668956757),
 ('parity', 0.11647110432386398)]

In [9]:
#save model
filename = 'message_embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [10]:
import os

embeddings_index={}
f = open(os.path.join('', 'message_embedding_word2vec.txt'), encoding = "utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close    

<function TextIOWrapper.close()>

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
#vectorise the text samples into a 2D tensor
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(message_lines)
sequences = tokenizer_obj.texts_to_sequences(message_lines)

#max_length=max(len(s.split() for s in message_lines))

#pad sequences
word_index = tokenizer_obj.word_index
print("Found %s unique tokens" %len(word_index))

message_pad = pad_sequences(sequences, padding='pre')
category=df['category'].values
print('Shape of message tensor:', message_pad.shape)
print('Shape of category tensor:', category.shape)

Found 40 unique tokens
Shape of message tensor: (149, 11)
Shape of category tensor: (149,)


In [12]:
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
category=lb.fit_transform(category)

In [13]:
category

array([1, 1, 1, 0, 3, 6, 3, 1, 3, 3, 0, 0, 0, 3, 3, 1, 3, 0, 1, 6, 6, 6,
       1, 6, 1, 6, 0, 0, 3, 3, 1, 3, 3, 0, 0, 1, 3, 3, 3, 0, 0, 0, 3, 3,
       5, 3, 1, 1, 1, 1, 1, 5, 6, 4, 6, 4, 6, 5, 6, 5, 5, 6, 6, 6, 5, 5,
       6, 5, 5, 5, 6, 4, 5, 5, 6, 6, 5, 5, 5, 5, 6, 8, 8, 8, 8, 8, 4, 8,
       4, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 4, 8, 8, 8, 8, 8, 8,
       8, 7, 2, 2, 2, 2, 2, 7, 7, 7, 2, 2, 2, 7, 7, 7, 7, 7, 7, 7, 2, 2,
       7, 7, 7, 7, 2, 2, 7, 7, 7, 2, 2, 2, 2, 2, 2, 2, 2])

In [14]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [15]:
print(num_words)

41


In [16]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers import Embedding
from keras.initializers import Constant

#define model

model = Sequential()
embedding_layer = Embedding(num_words, EMBEDDING_DIM,
                          # embeddings_initializer=Constant(embedding_matrix),
                           trainable=False)

model.add(embedding_layer)
model.add(GRU(units=40,dropout=0.2))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         4100      
                                                                 
 gru (GRU)                   (None, 40)                17040     
                                                                 
Total params: 21,140
Trainable params: 17,040
Non-trainable params: 4,100
_________________________________________________________________
None


In [17]:
VALIDATION_SPLIT =0.2

indices = np.arange(message_pad.shape[0])
np.random.shuffle(indices)
message_pad= message_pad[indices]
category=category[indices]
num_validation_samples=int(VALIDATION_SPLIT * message_pad.shape[0])

X_train_pad = message_pad[: -num_validation_samples]
y_train = category[:-num_validation_samples]
X_test_pad = message_pad[-num_validation_samples:]
y_test = category[-num_validation_samples:]

print('Shape of X_train_pad tensor:', X_train_pad.shape)
print('Shape of y_train tensor:', y_train.shape)

print('Shape of X_test_pad tensor:', X_test_pad.shape)
print('Shape of y_test tensor:', y_test.shape)




Shape of X_train_pad tensor: (120, 11)
Shape of y_train tensor: (120,)
Shape of X_test_pad tensor: (29, 11)
Shape of y_test tensor: (29,)


In [18]:
print("Training model...")


model.fit(X_train_pad, y_train,batch_size=18, epochs =10, validation_data=(X_test_pad, y_test), verbose=2)

Training model...
Epoch 1/10
7/7 - 2s - loss: 7.5546 - accuracy: 0.1833 - val_loss: 5.4267 - val_accuracy: 0.1379 - 2s/epoch - 322ms/step
Epoch 2/10
7/7 - 0s - loss: 4.7346 - accuracy: 0.2667 - val_loss: 3.5947 - val_accuracy: 0.1379 - 89ms/epoch - 13ms/step
Epoch 3/10
7/7 - 0s - loss: 3.0995 - accuracy: 0.4000 - val_loss: 3.2632 - val_accuracy: 0.3448 - 85ms/epoch - 12ms/step
Epoch 4/10
7/7 - 0s - loss: 2.7931 - accuracy: 0.4583 - val_loss: 3.0729 - val_accuracy: 0.4483 - 77ms/epoch - 11ms/step
Epoch 5/10
7/7 - 0s - loss: 2.5891 - accuracy: 0.5500 - val_loss: 2.9032 - val_accuracy: 0.5517 - 98ms/epoch - 14ms/step
Epoch 6/10
7/7 - 0s - loss: 2.3764 - accuracy: 0.6750 - val_loss: 2.6916 - val_accuracy: 0.5517 - 85ms/epoch - 12ms/step
Epoch 7/10
7/7 - 0s - loss: 2.1617 - accuracy: 0.7583 - val_loss: 2.4385 - val_accuracy: 0.5862 - 81ms/epoch - 12ms/step
Epoch 8/10
7/7 - 0s - loss: 1.8965 - accuracy: 0.7833 - val_loss: 2.1032 - val_accuracy: 0.7586 - 87ms/epoch - 12ms/step
Epoch 9/10
7/7 

<keras.callbacks.History at 0x1ea6968f880>

In [19]:
score, acc = model.evaluate(X_test_pad, y_test,
                        verbose=2)

print('Test accuracy:', acc)

1/1 - 0s - loss: 0.6368 - accuracy: 0.8621 - 34ms/epoch - 34ms/step
Test accuracy: 0.8620689511299133


In [20]:
prediction= model.predict(X_test_pad)
predictions = np.argmax(prediction, axis = 1)



In [None]:
model.predict()

In [21]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, predictions))

[[4 0 0 0 0 0 0 0 0 0]
 [0 3 0 0 0 0 0 0 0 0]
 [0 0 3 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 4]
 [0 0 0 0 5 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 4 0 0 0]
 [0 0 0 0 0 0 0 3 0 0]
 [0 0 0 0 0 0 0 0 2 0]
 [0 0 0 0 0 0 0 0 0 0]]


In [22]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.8620689655172413

In [23]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         3
           2       1.00      1.00      1.00         3
           3       0.00      0.00      0.00         4
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         4
           7       1.00      1.00      1.00         3
           8       1.00      1.00      1.00         2
          28       0.00      0.00      0.00         0

    accuracy                           0.86        29
   macro avg       0.80      0.80      0.80        29
weighted avg       0.86      0.86      0.86        29



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
