**EC9640 - Artificial Intelligence Project**(2020/E/031,2020/E/076)

Grammar checker for Tamil

In [1]:
!pip install stanza
import stanza
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Load Tamil language model for Stanza
stanza.download('ta')
nlp = stanza.Pipeline('ta')


Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.10.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.14.0 stanza-1.10.1


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: ta (Tamil) ...


Downloading https://huggingface.co/stanfordnlp/stanza-ta/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/ta/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ta (Tamil):
| Processor | Package      |
----------------------------
| tokenize  | ttb          |
| mwt       | ttb          |
| pos       | ttb_nocharlm |
| lemma     | ttb_nocharlm |
| depparse  | ttb_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


In [3]:
# Mount google drive
from google.colab import drive
drive.mount('/content/drive')

# Load dataset with error handling
dataset_path = '/content/drive/MyDrive/tamil_grammar_dataset2.csv'
data = pd.read_csv(dataset_path)

Mounted at /content/drive


In [5]:
# Print original columns for inspection
print("Original Dataset Columns:", data.columns)
data.columns = data.columns.str.strip()
print("Dataset Columns:", data.columns)
if 'Error Sentence' not in data.columns or 'Corrected Sentence' not in data.columns:
    raise ValueError("Dataset must contain 'Error Sentence' and 'Corrected Sentence' columns.")


Original Dataset Columns: Index(['Error Sentence', 'Corrected Sentence'], dtype='object')
Dataset Columns: Index(['Error Sentence', 'Corrected Sentence'], dtype='object')


In [6]:
# Split dataset
X = data['Error Sentence']
y = data['Corrected Sentence']

# Convert text data to numerical features
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_vectorized = vectorizer.fit_transform(X)

# Train a logistic regression model
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

# Function to predict subject-verb agreement errors using ML
def predict_errors_ml(sentence):
    sentence_vector = vectorizer.transform([sentence])
    prediction = model.predict(sentence_vector)[0]
    return prediction

# Function to process a paragraph and return corrected version and accuracy
def process_paragraph_ml(paragraph):
    sentences = paragraph.split('. ')
    corrected_paragraph = []
    original_sentences = []
    for idx, sentence in enumerate(sentences, start=1):
        if sentence.strip():
            original_sentences.append(sentence.strip())
            corrected_sentence = predict_errors_ml(sentence.strip())
            corrected_paragraph.append(corrected_sentence)

    # Display results
    print("\nOriginal Paragraph:")
    print(". ".join(original_sentences) + ".")
    print("\nCorrected Paragraph:")
    print(". ".join(corrected_paragraph) + ".")

    # Calculate accuracy
    matches = sum([1 for original, corrected in zip(original_sentences, corrected_paragraph) if original == corrected])
    accuracy = (matches / len(original_sentences)) * 100
    print(f"\nAccuracy of Model Suggestion: {accuracy:.2f}%")

# User interface for paragraphs
paragraph = input("Enter the paragraph: ")
process_paragraph_ml(paragraph)


Enter the paragraph: நூலகம் அவர்கள் சென்றாய். வேலை நான் செய்தாய். பாட்டு அவர்கள் பாடினாய். பாட்டு அவர்கள் பாடியது.

Original Paragraph:
நூலகம் அவர்கள் சென்றாய். வேலை நான் செய்தாய். பாட்டு அவர்கள் பாடினாய். பாட்டு அவர்கள் பாடியது..

Corrected Paragraph:
அவர்கள் நூலகம் சென்றார்கள்.  நான் பாட்டு பாடினேன்.  அவர்கள் பாட்டு பாடினார்கள்.  அவர்கள் பாட்டு பாடினார்கள்.

Accuracy of Model Suggestion: 0.00%
