In [51]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

# from nlpaug.util import Action
# import nlpaug.augmenter.word as word_augmenter

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, Dropout, Flatten, Bidirectional, LSTM, GRU


In [52]:
train_url = '/kaggle/input/revised-corrector-dataset/train_corr.csv'
test_url = '/kaggle/input/revised-corrector-dataset/test_corr.csv'
df_train = pd.read_csv(train_url)
df_test = pd.read_csv(test_url)
stop_words_df = pd.read_excel('/kaggle/input/bangla-stopwords/stopwords_bangla.xlsx',index_col=False)

In [53]:
STOPWORDS = set([word.strip() for word in stop_words_df['words']])

In [54]:
import re
def preprocess(x):
    html_pattern = re.compile('<.*?>')
    x = html_pattern.sub(r'', x)
    x = " ".join([word for word in str(x).split() if word not in STOPWORDS])
    return x
df_train['Comment'] = df_train['Comment'].apply(lambda x: preprocess(x))
df_test['Comment'] = df_test['Comment'].apply(lambda x:preprocess(x))

In [55]:
message_train = df_train['Comment'].values
message_test = df_test['Comment'].values
category_train = df_train['Error'].values
category_test = df_test['Error'].values


In [56]:
mx = 0
for i in df_train['Comment']:
    mx = max(mx,len(i))
for i in df_test['Comment']:
    mx = max(mx,len(i))
VOCAB_SIZE = 1000
MAX_LEN = mx
EMBEDDING_VECTOR_SIZE = 6

In [57]:
encoded_message_train = [one_hot(msg,VOCAB_SIZE) for msg in message_train]
encoded_message_test = [one_hot(msg,VOCAB_SIZE) for msg in message_test]

In [58]:
padded_message_train = pad_sequences(encoded_message_train, maxlen=MAX_LEN, padding='post')
padded_message_test = pad_sequences(encoded_message_test, maxlen=MAX_LEN, padding='post')

In [59]:
X_train = padded_message_train
X_test = padded_message_test
y_train = category_train
y_test = category_test

In [60]:
model = Sequential()
model.add(Embedding(VOCAB_SIZE, EMBEDDING_VECTOR_SIZE, input_length = MAX_LEN, name='Embedding'))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))


In [61]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Embedding (Embedding)       (None, 1007, 6)           6000      
                                                                 
 bidirectional_4 (Bidirectio  (None, 1007, 256)        138240    
 nal)                                                            
                                                                 
 flatten_2 (Flatten)         (None, 257792)            0         
                                                                 
 dropout_2 (Dropout)         (None, 257792)            0         
                                                                 
 dense_4 (Dense)             (None, 32)                8249376   
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                      

In [62]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [63]:
history = model.fit(X_train, y_train, validation_data = (X_test,y_test), epochs=5, batch_size=100)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [64]:
loss, acc = model.evaluate(X_test, y_test)



In [65]:
acc = np.round(acc, 2) * 100 
loss = np.round(loss, 2) * 100

print(f"Accuracy on unseen data is: { acc } %")
print(f'Loss on unseen data is: { loss } %')

Accuracy on unseen data is: 60.0 %
Loss on unseen data is: 69.0 %


In [66]:
pred = (model.predict(X_test) > 0.5).astype("int32")



In [67]:
print(classification_report(y_test, pred, target_names = ['Correct','Incorrect']))

              precision    recall  f1-score   support

     Correct       0.47      0.47      0.47      1910
   Incorrect       0.67      0.68      0.67      3112

    accuracy                           0.60      5022
   macro avg       0.57      0.57      0.57      5022
weighted avg       0.60      0.60      0.60      5022

