In [12]:


import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))



In [13]:
def text_to_word_list(text):
    text = text.split()
    return text

def replace_strings(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\u00C0-\u017F"          #latin
                           u"\u2000-\u206F"          #generalPunctuations
                               
                           "]+", flags=re.UNICODE)
    english_pattern=re.compile('[a-zA-Z0-9]+', flags=re.I)
    #latin_pattern=re.compile('[A-Za-z\u00C0-\u00D6\u00D8-\u00f6\u00f8-\u00ff\s]*',)
    
    text=emoji_pattern.sub(r'', text)
    text=english_pattern.sub(r'', text)

    return text

def remove_punctuations(my_str):
    # define punctuation
    punctuations = '''````£|¢|Ñ+-*/=EROero৳০১২৩৪৫৬৭৮৯012–34567•89।!()-[]{};:'"“\’,<>./?@#$%^&*_~‘—॥”‰⚽️✌�￰৷￰'''
    
    no_punct = ""
    for char in my_str:
        if char not in punctuations:
            no_punct = no_punct + char

    # display the unpunctuated string
    return no_punct



def joining(text):
    out=' '.join(text)
    return out

def preprocessing(text):
    out=remove_punctuations(replace_strings(text))
    return out

In [16]:
train_url = '/kaggle/input/rev-corr/train8020.csv'
test_url = '/kaggle/input/rev-corr/test8020.csv'
df_train = pd.read_csv(train_url)
df_test = pd.read_csv(test_url)
stop_words_df = pd.read_excel('/kaggle/input/bangla-stopwords/stopwords_bangla.xlsx',index_col=False)
STOPWORDS = set([word.strip() for word in stop_words_df['words']])

In [17]:
df_train['Comment'] = df_train.Comment.apply(lambda x: preprocessing(str(x)))
df_test['Comment'] = df_test.Comment.apply(lambda x:preprocessing(str(x)))
df = pd.concat([df_train,df_test],ignore_index = True)

In [18]:
set(df['Category'].values)

{'Code Switching', 'Correct', 'Grammatical', 'Multiple Errors', 'Spelling'}

In [20]:
def encode(s):
    d = {
        "Code Switching":0,
        "Grammatical":1,
        "Multiple Errors":2,
        "Spelling":3,
        "Correct":4
    }
    if s in d:
        return d[s]
    else:
        return 4
df['Category'] = df.Category.apply(lambda x: encode(x))
df_train['Category'] = df_train.Category.apply(lambda x: encode(x))
df_test['Category'] = df_test.Category.apply(lambda x: encode(x))

In [22]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
data = df_train['Comment']
labels = df_train['Category']
# Tokenize text data
max_words = 1000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['Comment'])
sequences = tokenizer.texts_to_sequences(data)
word_index = tokenizer.word_index
max_sequence_length = 100  # Adjust this based on your data

# Pad sequences
X = pad_sequences(sequences, maxlen=max_sequence_length)

# One-hot encode labels
Y = tf.keras.utils.to_categorical(labels, num_classes=5)

# Create CNN-LSTM model
model = Sequential()
model.add(Embedding(max_words, 100, input_length=max_sequence_length))
model.add(Conv1D(128, 5, activation='relu'))
model.add(LSTM(64))
model.add(Dense(64, activation='relu'))
model.add(Dense(5, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, Y, epochs=5, batch_size=16, validation_split=0.2)

# Evaluate the model on test data (replace with your test data)
test_data = df_test['Comment'] # Replace with your actual test data
test_labels = df_test['Category']  # Replace with your actual test labels

test_sequences = tokenizer.texts_to_sequences(test_data)
X_test = pad_sequences(test_sequences, maxlen=max_sequence_length)
Y_test = tf.keras.utils.to_categorical(test_labels, num_classes=5)

loss, accuracy = model.evaluate(X_test, Y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 1.4261767864227295, Test Accuracy: 0.4635603427886963


In [23]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
y_pred = model.predict(X_test)

# Calculate macro recall
macro_recall = recall_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
macro_precision = precision_score(Y_test.argmax(axis=1), y_pred.argmax(axis=1), average='macro')
# Print the results
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n  Macro Recall: {:0.3f}\n  Macro Precision: {:0.3f}'.format(loss, accuracy, macro_recall, macro_precision))

Test set
  Loss: 1.426
  Accuracy: 0.464
  Macro Recall: 0.310
  Macro Precision: 0.349
