Step 1: Import Required Libraries



In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK 'punkt' tokenizer data. As the last expression in the cell,
# the call's return value is echoed as the cell output.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Step 2: Load the Dataset

In [None]:
# Load the labelled sentences: one "<label> <sentence>" pair per line, UTF-16 encoded
test_data_path = '/content/test_dataset.txt'

with open(test_data_path, 'r', encoding='utf-16') as file:
    # Iterate the handle directly instead of materialising readlines(), and skip
    # blank lines: they would otherwise become rows with an empty label and make
    # the integer conversion below raise ValueError.
    stripped_lines = (line.strip() for line in file)
    # Split only on the first space so the sentence keeps its internal spaces
    sentences = [text.split(" ", 1) for text in stripped_lines if text]

# Build the DataFrame (pandas is already imported in the Step 1 cell)
df = pd.DataFrame(sentences, columns=["label", "sentence"])
df["label"] = df["label"].astype(int)

# Check the first few rows of the dataframe to ensure it's loaded correctly
df.head()

Unnamed: 0,label,sentence
0,0,මම යති
1,0,මම යත්වා
2,0,මම යනවා
3,0,මම යනවාලා
4,0,මම යනු


Step 3: Handle Missing Values


In [None]:
# Swap any missing sentence text for an empty string
df['sentence'] = df['sentence'].fillna(value='')

# Report the per-column count of nulls that remain
print("Missing values handled. Dataset preview:")
print(df.isna().sum())


Missing values handled. Dataset preview:
label       0
sentence    0
dtype: int64


Step 4: Split Data into Features and Labels


In [None]:
# Features are the raw sentences; the target is the label (1 = correct, 0 = incorrect)
X, y = df['sentence'], df['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Train-test split completed.")
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")


Train-test split completed.
Training samples: 13536, Testing samples: 3384


Step 5: Text Vectorization Using CountVectorizer



In [None]:
# Tokenizer data looked up by word_tokenize on newer NLTK releases.
# nltk is already installed and imported in the Step 1 cell, so no
# `pip install` or re-import is needed here.
nltk.download('punkt_tab')

# Bag-of-words vectorizer that tokenizes with NLTK's word_tokenize.
# token_pattern=None because the default regex pattern is unused once a
# custom tokenizer is supplied.
vectorizer = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)

# Learn the vocabulary on the training sentences only, then reuse it for the test set
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

print("Vectorization completed. Feature matrix size:")
print(f"Training data: {X_train_vectorized.shape}, Testing data: {X_test_vectorized.shape}")



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Vectorization completed. Feature matrix size:
Training data: (13536, 195), Testing data: (3384, 195)


Step 6: Train the Random Forest Classifier


In [None]:
# Build a 100-tree random forest with a fixed seed and fit it on the
# vectorized training sentences in one step (fit returns the estimator itself)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_vectorized, y_train
)

print("Model training completed.")


Model training completed.


Step 7: Evaluate the Model


In [None]:
# Label the held-out sentences with the trained model
y_pred = model.predict(X_test_vectorized)

# Fraction of held-out sentences whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.96


Step 8: Predict and Correct Sentences


In [None]:
# Define a function to predict and correct sentences
def grammar_checker(sentence):
    # Vectorize the input sentence
    vectorized_sentence = vectorizer.transform([sentence])

    # Predict whether the sentence is correct or incorrect
    prediction = model.predict(vectorized_sentence)[0]

    if prediction == 1:
        print(f"Input Sentence: {sentence}")
        print("Sentence is Correct.")
    else:
        print(f"Input Sentence: {sentence}")
        print("Sentence is Incorrect.")

        # Suggest a correction (mock correction here for demonstration)
        # In practice, this could involve further analysis
        print("Correct Sentence: (Correction logic required)")

# Smoke-test the single-argument checker on one sample sentence
sample_sentence = "මම යනවා"
grammar_checker(sample_sentence)


Input Sentence: මම යනවා
Sentence is Incorrect.
Correct Sentence: (Correction logic required)


Final Pipeline Code



In [None]:
def grammar_checker(sentence, vectorizer, model, df):
    """
    Classify a sentence and, when it is judged incorrect, print the most
    similar correct sentence from the dataset.

    Parameters:
    - sentence: The input sentence to check.
    - vectorizer: Fitted vectorizer exposing ``transform``.
    - model: Fitted classifier exposing ``predict``; label 1 means correct.
    - df: DataFrame with ``label`` and ``sentence`` columns. Rows labelled 1
      form the pool of candidate corrections.

    Prints the verdict (and a suggestion, if any); returns nothing.
    """
    predicted_label = model.predict(vectorizer.transform([sentence]))[0]

    print(f"Input Sentence: {sentence}")
    if predicted_label == 1:
        print("Sentence is Correct.")
        return

    print("Sentence is Incorrect.")

    # Score each candidate by how many whitespace-separated words it shares
    # with the input. The first candidate reaching the highest non-zero
    # count is kept.
    input_words = set(sentence.split())
    best_match, best_overlap = None, 0
    for candidate in df.loc[df['label'] == 1, 'sentence']:
        overlap = len(input_words & set(candidate.split()))
        if overlap > best_overlap:
            best_match, best_overlap = candidate, overlap

    if not best_match:
        print("Correct Sentence: No suggestion found.")
    else:
        print(f"Correct Sentence: {best_match}")


In [None]:
# Sample sentence to run through the four-argument checker
sample_sentence = "මම යනවා"

# Classify it with the baseline model; df supplies the pool of correct sentences
grammar_checker(sample_sentence, vectorizer, model, df)


Input Sentence: මම යනවා
Sentence is Incorrect.
Correct Sentence: මම නැටුම් නටද්දී නුබ යනවා


In [None]:
# Run the four-argument checker over several sentences, printing a gap after each.
# NOTE(review): this rebinds `sentences`, which the Step 2 cell used for the parsed rows.
sentences = ["මම යනවා", "මම යත්වා", "මම ගියෙමි"]
for sentence in sentences:
    grammar_checker(sentence, vectorizer, model, df)
    print("\n")


Input Sentence: මම යනවා
Sentence is Incorrect.
Correct Sentence: මම නැටුම් නටද්දී නුබ යනවා


Input Sentence: මම යත්වා
Sentence is Incorrect.
Correct Sentence: මම නැටුම් නටද්දී ළමයා යත්වා


Input Sentence: මම ගියෙමි
Sentence is Correct.




Experiment with Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 3 values for each of 4 hyperparameters = 81 candidate forests
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 3-fold search over the grid using all available cores, with progress logging
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_vectorized, y_train)

# Keep the best-scoring configuration for the later cells
best_model = grid_search.best_estimator_


Fitting 3 folds for each of 81 candidates, totalling 243 fits


Evaluate Using Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold accuracy on the training split.
# NOTE(review): this scores the baseline `model`, not the grid-searched `best_model` — confirm that is intended.
scores = cross_val_score(
    estimator=model,
    X=X_train_vectorized,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print(f"Cross-Validation Accuracy: {scores.mean():.2f}")


Cross-Validation Accuracy: 0.95


Update Grammar Checker for Accuracy

In [None]:
def grammar_checker(sentence, vectorizer, model):
    """Return ``"Correct"`` when ``model`` labels ``sentence`` as 1, otherwise ``"Incorrect"``.

    ``vectorizer`` must expose ``transform`` and ``model`` must expose
    ``predict``; the sentence is vectorized as a one-element batch.
    """
    features = vectorizer.transform([sentence])
    if model.predict(features)[0] == 1:
        return "Correct"
    return "Incorrect"

# Classify the same three sentences, this time with the grid-searched best_model,
# and print one "sentence -> verdict" line per input
sentences = ["මම යනවා", "මම යත්වා", "මම ගියෙමි"]
for sentence in sentences:
    result = grammar_checker(sentence, vectorizer, best_model)
    print(f"Sentence: {sentence} -> {result}")


Sentence: මම යනවා -> Incorrect
Sentence: මම යත්වා -> Incorrect
Sentence: මම ගියෙමි -> Correct


Use Confusion Matrix for Insights

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# NOTE(review): predictions come from the baseline `model`; the grid-searched
# `best_model` is not scored on the test set here — confirm that is intended.
y_test_pred = model.predict(X_test_vectorized)

# Confusion matrix first, then per-class precision / recall / F1
print(confusion_matrix(y_true=y_test, y_pred=y_test_pred))
print(classification_report(y_true=y_test, y_pred=y_test_pred))


[[2920    0]
 [ 123  341]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      2920
           1       1.00      0.73      0.85       464

    accuracy                           0.96      3384
   macro avg       0.98      0.87      0.91      3384
weighted avg       0.97      0.96      0.96      3384



In [None]:
# Probe best_model with three more sentences.
# NOTE(review): this passes three arguments, so it depends on the three-argument
# grammar_checker being the definition currently in effect.
sentences = ["අපි යනු", "අපි යමු", "අපි යවමි"]
for sentence in sentences:
    result = grammar_checker(sentence, vectorizer, best_model)
    print(f"Sentence: {sentence} -> {result}")

Sentence: අපි යනු -> Incorrect
Sentence: අපි යමු -> Correct
Sentence: අපි යවමි -> Incorrect


In [33]:
def grammar_checker(sentence, vectorizer, model, df):
    """
    Classify a sentence and, when it is judged incorrect, print the correct
    dataset sentence with the highest Jaccard word similarity.

    Parameters:
    - sentence: The input sentence to check.
    - vectorizer: Fitted vectorizer exposing ``transform``.
    - model: Fitted classifier exposing ``predict``; label 1 means correct.
    - df: DataFrame with ``label`` and ``sentence`` columns. Rows labelled 1
      form the pool of candidate corrections; rows whose sentence is missing
      are skipped.

    Prints the verdict (and a suggestion, if any); returns nothing.
    """
    # Vectorize the input as a one-element batch and take its predicted label
    vectorized_sentence = vectorizer.transform([sentence])
    prediction = model.predict(vectorized_sentence)[0]

    print(f"Input Sentence: {sentence}")
    if prediction == 1:
        print("Sentence is Correct.")
        return

    print("Sentence is Incorrect.")

    # The input's word set is the same for every candidate, so build it once
    input_words = set(sentence.split())
    most_similar = None
    max_similarity = 0

    # dropna() skips candidates with no sentence text, which have no .split()
    for correct_sentence in df.loc[df['label'] == 1, 'sentence'].dropna():
        candidate_words = set(correct_sentence.split())
        all_words = input_words | candidate_words
        # Jaccard similarity: shared words / distinct words across both sentences
        similarity = len(input_words & candidate_words) / len(all_words) if all_words else 0

        # Strict '>' keeps the first candidate among equally similar ones
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar = correct_sentence

    if most_similar:
        print(f"Correct Sentence: {most_similar}")
    else:
        print("Correct Sentence: No suggestion found.")


# Try the Jaccard-based checker on three sentences, using the fitted vectorizer,
# the grid-searched best_model, and df as the pool of correct sentences
sentences = ["අපි යනු", "අපි යමු", "අපි යවමි"]
for sentence in sentences:
    grammar_checker(sentence, vectorizer, best_model, df)
    print("\n")

Input Sentence: අපි යනු
Sentence is Incorrect.
Correct Sentence: අපි යන්නෙමු


Input Sentence: අපි යමු
Sentence is Correct.


Input Sentence: අපි යවමි
Sentence is Incorrect.
Correct Sentence: මම යවමි




In [32]:
# Longer, multi-word sentences run through the four-argument checker with best_model.
# NOTE(review): the fourth string starts with a space, which also shows up in the
# printed "Input Sentence:" line — confirm whether that is intentional.
sentences = ["මම ඔහුගෙන් පොතක් ගත්තෙමි", "නුබ ගියෙහි", "ළමයි වේගයෙන් ගියෝය" , " මම ගෙදර යමු", "නුබලා ගෙදර වේගයෙන් ගියෙහි","ළමයා පොත බලා ගෙදර ගියෙහු"]
for sentence in sentences:
    grammar_checker(sentence, vectorizer, best_model, df)
    print("\n")

Input Sentence: මම ඔහුගෙන් පොතක් ගත්තෙමි
Sentence is Correct.


Input Sentence: නුබ ගියෙහි
Sentence is Correct.


Input Sentence: ළමයි වේගයෙන් ගියෝය
Sentence is Correct.


Input Sentence:  මම ගෙදර යමු
Sentence is Incorrect.
Correct Sentence: මම ගෙදර යන්නෙමි


Input Sentence: නුබලා ගෙදර වේගයෙන් ගියෙහි
Sentence is Incorrect.
Correct Sentence: නුබ ගෙදර වේගයෙන් ගියෙහි


Input Sentence: ළමයා පොත බලා ගෙදර ගියෙහු
Sentence is Incorrect.
Correct Sentence: නුබලා පොත බලා ගෙදර ගියෙහු


