In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

from nlpaug.util import Action
import nlpaug.augmenter.word as word_augmenter

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, Dropout, Flatten, Bidirectional, LSTM, GRU


2023-08-19 03:34:44.925902: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-19 03:34:44.958820: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-19 03:34:44.959925: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
train_url = '/home/thinker/NLP/NLP/FromScratch/RNN_LSTM_GRU/error detection/data/data/train_data.csv'
test_url = '/home/thinker/NLP/NLP/FromScratch/RNN_LSTM_GRU/error detection/data/data/test_data.csv'
df_train = pd.read_csv(train_url)
df_test = pd.read_csv(test_url)
stop_words_df = pd.read_excel('/home/thinker/NLP/NLP/FromScratch/RNN_LSTM_GRU/error detection/data/stopwords_bangla.xlsx',index_col=False)

In [5]:
STOPWORDS = set([word.strip() for word in stop_words_df['words']])

In [6]:
import re
def preprocess(x):
    html_pattern = re.compile('<.*?>')
    x = html_pattern.sub(r'', x)
    x = " ".join([word for word in str(x).split() if word not in STOPWORDS])
    return x
df_train['Comment'] = df_train['Comment'].apply(lambda x: preprocess(x))
df_test['Comment'] = df_test['Comment'].apply(lambda x:preprocess(x))

In [8]:
message_train = df_train['Comment'].values
message_test = df_test['Comment'].values
category_train = df_train['Error'].values
category_test = df_test['Error'].values


In [10]:
mx = 0
for i in df_train['Comment']:
    mx = max(mx,len(i))
for i in df_test['Comment']:
    mx = max(mx,len(i))
VOCAB_SIZE = 1000
MAX_LEN = mx
EMBEDDING_VECTOR_SIZE = 6

In [11]:
encoded_message_train = [one_hot(msg,VOCAB_SIZE) for msg in message_train]
encoded_message_test = [one_hot(msg,VOCAB_SIZE) for msg in message_test]

In [12]:
padded_message_train = pad_sequences(encoded_message_train, maxlen=MAX_LEN, padding='post')
padded_message_test = pad_sequences(encoded_message_test, maxlen=MAX_LEN, padding='post')

In [13]:
X_train = padded_message_train
X_test = padded_message_test
y_train = category_train
y_test = category_test

In [14]:
model = Sequential()
model.add(Embedding(VOCAB_SIZE, EMBEDDING_VECTOR_SIZE, input_length = MAX_LEN, name='Embedding'))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(GRU(64)))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))


In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Embedding (Embedding)       (None, 317, 6)            6000      
                                                                 
 bidirectional (Bidirection  (None, 317, 256)          138240    
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               123648    
 onal)                                                           
                                                                 
 flatten (Flatten)           (None, 128)               0         
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 32)                4

In [16]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [17]:
history = model.fit(X_train, y_train, validation_data = (X_test,y_test), epochs=15, batch_size=100)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [18]:
loss, acc = model.evaluate(X_test, y_test)



In [19]:
acc = np.round(acc, 2) * 100 
loss = np.round(loss, 2) * 100

print(f"Accuracy on unseen data is: { acc } %")
print(f'Loss on unseen data is: { loss } %')

Accuracy on unseen data is: 61.0 %
Loss on unseen data is: 70.0 %


In [20]:
pred = (model.predict(X_test) > 0.5).astype("int32")



In [21]:
print(classification_report(y_test, pred, target_names = ['Correct','Incorrect']))

              precision    recall  f1-score   support

     Correct       0.67      0.66      0.67      1167
   Incorrect       0.54      0.55      0.55       844

    accuracy                           0.61      2011
   macro avg       0.61      0.61      0.61      2011
weighted avg       0.62      0.61      0.62      2011

