In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
train_url = '/kaggle/input/revised-corrector-dataset/train_corr.csv'
test_url = '/kaggle/input/revised-corrector-dataset/test_corr.csv'
df_train = pd.read_csv(train_url)
df_test = pd.read_csv(test_url)
stop_words_df = pd.read_excel('/kaggle/input/bangla-stopwords/stopwords_bangla.xlsx')

In [3]:
STOPWORDS = set([word.strip() for word in stop_words_df['words']])

In [4]:
import re
def preprocess(x):
    html_pattern = re.compile('<.*?>')
    x = html_pattern.sub(r'', x)
    x = " ".join([word for word in str(x).split() if word not in STOPWORDS])
    return x
df_train['Comment'] = df_train['Comment'].apply(lambda x: preprocess(x))
df_test['Comment'] = df_test['Comment'].apply(lambda x:preprocess(x))

In [5]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(df_train.Error)
Test_Y = Encoder.fit_transform(df_test.Error)

In [6]:
df_all  = pd.concat([df_train, df_test], ignore_index=True)
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df_all['Comment'])
Train_X_Tfidf = Tfidf_vect.transform(df_train['Comment'])
Test_X_Tfidf = Tfidf_vect.transform(df_test['Comment'])

In [8]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
predictions_NB = Naive.predict(Test_X_Tfidf)

In [9]:
print(metrics.classification_report(Test_Y, predictions_NB,digits = 4))

              precision    recall  f1-score   support

           0     0.5198    0.1099    0.1815      1910
           1     0.6319    0.9377    0.7550      3112

    accuracy                         0.6229      5022
   macro avg     0.5758    0.5238    0.4682      5022
weighted avg     0.5893    0.6229    0.5369      5022



In [14]:
from sklearn.pipeline import Pipeline

# Create a pipeline with TfidfVectorizer and MultinomialNB
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', naive_bayes.MultinomialNB())
])

# Parameter grid for the pipeline
params = {
    'tfidf__max_features': [5000, 10000, 20000],
    'nb__alpha': [0.1, 0.5, 1.0],
    # Add other parameters to tune
}

# Grid search for the pipeline
grid_search = GridSearchCV(pipeline, params, cv=5)
grid_search.fit(df_all['Comment'], df_all['Error'])

# Best parameters
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Transform the test data using the best model
Test_X_Tfidf = best_model.named_steps['tfidf'].transform(df_test['Comment'])

# Make predictions
predictions_NB = best_model.named_steps['nb'].predict(Test_X_Tfidf)

# Evaluate performance
print(metrics.classification_report(Test_Y, predictions_NB, digits=4))


              precision    recall  f1-score   support

           0     0.7919    0.1953    0.3133      1910
           1     0.6623    0.9685    0.7866      3112

    accuracy                         0.6744      5022
   macro avg     0.7271    0.5819    0.5500      5022
weighted avg     0.7116    0.6744    0.6066      5022



In [15]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import pandas as pd

# Assuming df_train and df_test are defined
df_all = pd.concat([df_train, df_test], ignore_index=True)

# Create a pipeline with TfidfVectorizer and MultinomialNB
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

# Parameter grid for the pipeline
params = {
    'tfidf__max_features': [5000, 10000, 20000, 30000],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__sublinear_tf': [True, False],
    'tfidf__smooth_idf': [True, False],
    'tfidf__norm': ['l1', 'l2'],
    'nb__alpha': [0.1, 0.5, 1.0, 1.5],
    # Add other parameters to tune
}

# Grid search for the pipeline
grid_search = GridSearchCV(pipeline, params, cv=5)
grid_search.fit(df_all['Comment'], df_all['Error'])

# Best parameters
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Transform the test data using the best model
Test_X_Tfidf = best_model.named_steps['tfidf'].transform(df_test['Comment'])

# Make predictions
predictions_NB = best_model.named_steps['nb'].predict(Test_X_Tfidf)

# Evaluate performance
print(metrics.classification_report(Test_Y, predictions_NB, digits=4))

              precision    recall  f1-score   support

           0     0.9422    0.3330    0.4921      1910
           1     0.7069    0.9875    0.8240      3112

    accuracy                         0.7386      5022
   macro avg     0.8246    0.6602    0.6580      5022
weighted avg     0.7964    0.7386    0.6977      5022

