<b>"PART – A "</b>

<b>Task 1: Grammar Error Correction Application </b>

Objective:
Develop a grammar error correction application that can detect and correct grammatical errors in written text. 
This system should be capable of handling a wide range of common errors, including but not limited to sentence structure mistakes, 
subject-verb agreement errors, punctuation issues, and incorrect word usage.
The application should include both a web-based interface and backend processing using Flask and Python libraries.

Requirements:
Web Interface (3 Marks)
Front-end Development:
•	Create a web-based front-end using HTML and JavaScript.
•	Provide a text input area for users to enter the text they wish to have corrected, along with a file upload option that enables them to upload a text file for batch correction.
User Input:
•	Display the corrected word immediately when manually entered.
•	Correct grammar errors in the uploaded text file and generate a corrected version.


In [None]:
# Solutions apporach:
grammar_error_correction/
├── app.py                   # Flask backend
├── templates/
│   └── index.html           # Frontend HTML
├── static/
│   ├── script.js            # JavaScript for frontend
│   └── styles.css           # CSS for styling
├── requirements.txt         # Python dependencies



SyntaxError: invalid character '├' (U+251C) (3397285747.py, line 3)

In [None]:
# 1. app.py

from flask import Flask, request, jsonify, render_template
import language_tool_python
import os

# Initialize Flask app
app = Flask(__name__)
app.config["UPLOAD_FOLDER"] = "uploads"

# Initialize LanguageTool
tool = language_tool_python.LanguageTool("en-US")

def correct_grammar(text):
    """Correct grammar using LanguageTool."""
    matches = tool.check(text)
    corrected_text = language_tool_python.utils.correct(text, matches)
    return corrected_text

@app.route("/")
def home():
    """Render the homepage."""
    return render_template("index.html")

@app.route("/correct", methods=["POST"])
def correct_text():
    """Correct a single text input."""
    data = request.json
    input_text = data.get("text", "")
    corrected_text = correct_grammar(input_text)
    return jsonify({"corrected_text": corrected_text})

@app.route("/upload", methods=["POST"])
def upload_file():
    """Handle file upload for batch grammar correction."""
    file = request.files["file"]
    if file and file.filename.endswith(".txt"):
        file_path = os.path.join(app.config["UPLOAD_FOLDER"], file.filename)
        file.save(file_path)

        # Read file content and correct grammar line by line
        with open(file_path, "r") as f:
            lines = f.readlines()
        corrected_lines = [correct_grammar(line.strip()) for line in lines]

        # Save corrected file
        corrected_file_path = os.path.join(app.config["UPLOAD_FOLDER"], "corrected_" + file.filename)
        with open(corrected_file_path, "w") as f:
            f.write("\n".join(corrected_lines))

        return jsonify({"corrected_file": corrected_file_path})
    else:
        return jsonify({"error": "Invalid file type. Please upload a .txt file."}), 400

if __name__ == "__main__":
    if not os.path.exists(app.config["UPLOAD_FOLDER"]):
        os.makedirs(app.config["UPLOAD_FOLDER"])
    app.run(debug=True)


In [None]:
pip install language_tool_python

In [None]:
# 2. templates/index.html

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Grammar Correction</title>
    <link rel="stylesheet" href="/static/styles.css">
    <script src="/static/script.js" defer></script>
</head>
<body>
    <h1>Grammar Correction Application</h1>

    <div class="manual-correction">
        <h2>Manual Text Correction</h2>
        <textarea id="inputText" placeholder="Type your text here..."></textarea><br>
        <button onclick="correctText()">Correct Grammar</button>
        <h3>Corrected Text:</h3>
        <p id="outputText">The corrected text will appear here.</p>
    </div>

    <div class="batch-correction">
        <h2>Batch File Correction</h2>
        <form id="fileForm" enctype="multipart/form-data">
            <input type="file" id="fileInput" name="file" accept=".txt"><br>
            <button type="button" onclick="uploadFile()">Upload and Correct</button>
        </form>
        <h3>Corrected File:</h3>
        <p id="fileOutput"></p>
    </div>
</body>
</html>


In [None]:
# 3. static/script.js

async function correctText() {
    const inputText = document.getElementById("inputText").value;

    // Send user input to the backend
    const response = await fetch("/correct", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ text: inputText }),
    });

    // Display the corrected text
    const data = await response.json();
    document.getElementById("outputText").innerText = data.corrected_text || "No corrections made.";
}

async function uploadFile() {
    const fileInput = document.getElementById("fileInput");
    const formData = new FormData();
    formData.append("file", fileInput.files[0]);

    // Send file to the backend
    const response = await fetch("/upload", {
        method: "POST",
        body: formData,
    });

    // Display the link to the corrected file
    const data = await response.json();
    if (data.corrected_file) {
        document.getElementById("fileOutput").innerHTML = `File corrected successfully! Download <a href="/${data.corrected_file}" target="_blank">here</a>.`;
    } else {
        document.getElementById("fileOutput").innerText = data.error || "Error processing the file.";
    }
}


In [None]:
# 4. Static/style.css

body {
    font-family: Arial, sans-serif;
    margin: 20px;
    padding: 20px;
    background-color: #f9f9f9;
}

h1, h2, h3 {
    color: #333;
}

textarea {
    width: 100%;
    height: 100px;
    margin-bottom: 10px;
    padding: 10px;
    font-size: 14px;
    border: 1px solid #ccc;
    border-radius: 5px;
}

button {
    padding: 10px 20px;
    font-size: 14px;
    background-color: #007BFF;
    color: #fff;
    border: none;
    border-radius: 5px;
    cursor: pointer;
}

button:hover {
    background-color: #0056b3;
}

.manual-correction, .batch-correction {
    margin-top: 20px;
    padding: 20px;
    border: 1px solid #ccc;
    border-radius: 10px;
    background-color: #fff;
}

p {
    font-size: 14px;
    color: #555;
}


In [None]:
# 5. Install dependencies
pip install -r requirements.txt

Flask
language-tool-python
Werkzeug
nltk
pandas
matplotlib
numpy
transformers
datasets
# evaluate rouge score
rouge
torch
pytorch-lightning
datasets 
tqdm 
pandas
sentencepiece
transformers
wandb

In [None]:
# Process flow: Create a workspace in VS with all folder structure as mentioned above and 

#     1. run app.py
#     2. http://127.0.0.1:5000/
#     3. Test the application:

# Enter text for manual correction.
# Upload a .txt file for batch correction.



<b>Grammar Error Correction Application (3 Marks)</b>

Backend Implementation:

Use Flask to develop the backend for handling user requests and responses.
Use any Python library, to create grammar error correction with given text as corpus.
Use text from below source as corpus for calculating probabilities.
Use https://www.kaggle.com/datasets/dariocioni/c4200mLinks to an external site. dataset. ( This dataset is huge so use 1/4th of the dataset).
Integration: (2 Marks)

Integrate the front-end and back-end components to ensure seamless functionality.
Process user input effectively, perform necessary corrections, and present results in a clear and user-friendly manner on the web page.

<b>Integration: (2 Marks)</b>

Integrate the front-end and back-end components to ensure seamless functionality.
Process user input effectively, perform necessary corrections, and present results in a clear and user-friendly manner on the web page.

In [None]:
grammar_error_correction/
├── preprocess_dataset.py      # Preprocess dataset (create incorrect-correct pairs)
├── train_model.py             # Fine-tune T5 model
├── evaluate_model.py          # Evaluate model performance
├── app.py                     # Flask backend
├── templates/
│   └── index.html             # Frontend HTML
├── static/
│   ├── script.js              # JavaScript
│   └── styles.css             # CSS for styling
├── data/
│   ├── C4_200M_sampled.csv    # Sampled dataset
│   ├── grammar_dataset.csv    # Preprocessed dataset
│   └── corrected_file.txt     # Corrected text file (generated after file upload)
├── models/
│   └── grammar_model/         # Fine-tuned T5 model
├── requirements.txt           # Python dependencies

In [None]:
# 1. Preprocessing the data
# preprocess_dataset.py
import dask.dataframe as dd
from nltk.tokenize import word_tokenize
import nltk
import re
import  tqdm
from tqdm import tqdm
import time

# Example loop with a progress bar
for i in tqdm(range(100), desc="Processing Items"):
    time.sleep(0.1)  # Simulating work

# Download required NLTK resources
nltk.download("punkt")

# File paths
file_path = r"D:\BITS Pilani Sem 3\NLP Applications\VS-Code 1\grammar_error_correction\data\C4_200M.tsv-00000-of-00010"  # Full dataset file
output_sample_file = r"D:\BITS Pilani Sem 3\NLP Applications\VS-Code 1\grammar_error_correction\data\C4_200M_sampled.csv"
output_tokenized_file = r"D:\BITS Pilani Sem 3\NLP Applications\VS-Code 1\grammar_error_correction\data\tokenized_corpus.txt"

# Define the fraction for sampling (1/4th of the dataset)
sampling_fraction = 0.01

# Load dataset with Dask
print("Loading dataset with Dask...")
ddf = dd.read_csv(file_path, sep="\t", header=None)

# Sample 1/4th of the dataset
print("Sampling 1/4th of the dataset...")
sampled_ddf = ddf.sample(frac=sampling_fraction, random_state=42)

# Save sampled data to a CSV
print(f"Saving sampled dataset to '{output_sample_file}'...")
sampled_ddf = sampled_ddf.compute()  # Convert Dask DataFrame to Pandas DataFrame
sampled_ddf.columns = ["id", "text"]  # Assign column names if necessary
sampled_ddf.to_csv(output_sample_file, index=False)

# Tokenization and preprocessing
def preprocess_text(text):
    # Remove URLs and special characters
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s.,!?']", "", text)
    return text

print("Tokenizing the sampled dataset...")
tokenized_corpus = []
for text in sampled_ddf["text"].dropna().astype(str):
    preprocessed_text = preprocess_text(text)
    tokens = word_tokenize(preprocessed_text.lower())
    tokenized_corpus.extend(tokens)

# Save the tokenized corpus to a file
with open(output_tokenized_file, "w") as f:
    f.write(" ".join(tokenized_corpus))
print(f"Tokenized corpus saved to '{output_tokenized_file}'.")


In [None]:
import pandas as pd
import random
import re

# File paths
input_file = r"D:\BITS Pilani Sem 3\NLP Applications\VS-Code 1\grammar_error_correction\data\C4_200M_sampled.csv"  # Input sampled dataset
output_file = r"D:\BITS Pilani Sem 3\NLP Applications\VS-Code 1\grammar_error_correction\data\grammar_dataset.csv"  # Preprocessed dataset for training

# Function to introduce synthetic grammar errors
def introduce_errors(sentence):
    errors = {
        "is": "are",
        "was": "were",
        "he": "him",
        "she": "her",
        "their": "there",
        "your": "you're",
        "has": "have",
    }
    words = sentence.split()
    for i in range(len(words)):
        if words[i] in errors and random.random() < 0.3:  # 30% chance of replacing
            words[i] = errors[words[i]]
    return " ".join(words)

# Load dataset
print("Loading dataset...")
data = pd.read_csv(input_file)
data = data.dropna(subset=["text"])  # Ensure no missing text
data = data.reset_index(drop=True)

# Create incorrect-correct pairs
print("Generating incorrect-correct pairs...")
pairs = []
for text in data["text"]:
    correct_sentence = text.strip()
    incorrect_sentence = introduce_errors(correct_sentence)
    if incorrect_sentence != correct_sentence:
        pairs.append({"incorrect": incorrect_sentence, "correct": correct_sentence})

# Save grammar dataset
print(f"Saving grammar dataset to '{output_file}'...")
grammar_dataset = pd.DataFrame(pairs)
grammar_dataset.to_csv(output_file, index=False)
print("Done!")


In [None]:
# train model
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm
import os
import pandas as pd
# Dataset class
class GrammarDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_len=128):
        self.data = pd.read_csv(file_path).sample(1000)
        self.data = self.data.dropna(subset=["incorrect", "correct"])
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        incorrect = str(self.data.iloc[idx]["incorrect"])
        correct = str(self.data.iloc[idx]["correct"])

        input_tokens = self.tokenizer.encode_plus(
            incorrect,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        target_tokens = self.tokenizer.encode_plus(
            correct,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            "input_ids": input_tokens["input_ids"].squeeze(),
            "attention_mask": input_tokens["attention_mask"].squeeze(),
            "labels": target_tokens["input_ids"].squeeze(),
        }

# Load model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Dataset and DataLoader
file_path = r"D:\BITS Pilani Sem 3\NLP Applications\VS-Code 1\grammar_error_correction\data\grammar_dataset.csv"
dataset = GrammarDataset(file_path, tokenizer)
data_loader = DataLoader(dataset, batch_size=8, shuffle=True)

# Training configuration
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    progress_bar = tqdm(data_loader, desc=f"Epoch {epoch + 1}/{epochs}")
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch + 1}/{epochs}, Total Loss: {epoch_loss:.4f}")

# Save the fine-tuned model
model_dir = "models/grammar_model"
os.makedirs(model_dir, exist_ok=True)
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)
print(f"Model saved to '{model_dir}'.")


In [None]:
pip install -r requirements.txt

In [None]:
!pip install rouge_score

In [3]:
# evaluate model

from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
import pandas as pd

# Load dataset
file_path = r"D:\BITS Pilani Sem 3\NLP Applications\VS-Code 1\grammar_error_correction\data\grammar_dataset.csv"
data = pd.read_csv(file_path)

# Initialize scorers
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# BLEU scores
references = [[row["correct"].split()] for _, row in data.iterrows()]
candidates = [row["incorrect"].split() for _, row in data.iterrows()]
bleu_score = corpus_bleu(references, candidates)
print(f"Corpus BLEU Score: {bleu_score:.4f}")

# ROUGE scores
rouge1, rouge2, rougeL = 0, 0, 0
for _, row in data.iterrows():
    correct = row["correct"]
    incorrect = row["incorrect"]
    scores = scorer.score(correct, incorrect)
    rouge1 += scores["rouge1"].fmeasure
    rouge2 += scores["rouge2"].fmeasure
    rougeL += scores["rougeL"].fmeasure

n = len(data)
print(f"Average ROUGE-1: {rouge1 / n:.4f}")
print(f"Average ROUGE-2: {rouge2 / n:.4f}")
print(f"Average ROUGE-L: {rougeL / n:.4f}")


Corpus BLEU Score: 0.9089
Average ROUGE-1: 0.9459
Average ROUGE-2: 0.8896
Average ROUGE-L: 0.9459


In [None]:
!pip install SentencePiece

In [5]:
# app.py

from flask import Flask, request, jsonify, render_template
from transformers import T5ForConditionalGeneration, T5Tokenizer
import os
import torch

# Load fine-tuned model and tokenizer
model_name = r"D:\BITS Pilani Sem 3\NLP Applications\Grammar_Err\models\grammar_model"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Initialize Flask app
app = Flask(__name__)
app.config["UPLOAD_FOLDER"] = "data"

@app.route("/")
def home():
    """Serve the main webpage."""
    return render_template("index.html")

@app.route("/correct", methods=["POST"])
def correct_text():
    """Corrects a single text string."""
    data = request.json
    input_text = data.get("text", "")

    # Tokenize input and generate corrected text
    input_tokens = tokenizer.encode(input_text, return_tensors="pt", max_length=128, truncation=True).to(device)
    outputs = model.generate(input_tokens, max_length=128, num_beams=4, early_stopping=True)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return jsonify({"corrected_text": corrected_text})

@app.route("/upload", methods=["POST"])
def upload_file():
    """Handles file uploads for batch grammar correction."""
    file = request.files["file"]
    if file and file.filename.endswith(".txt"):
        file_path = os.path.join(app.config["UPLOAD_FOLDER"], file.filename)
        file.save(file_path)

        # Read the file and correct line by line
        with open(file_path, "r") as f:
            lines = f.readlines()
        corrected_lines = []
        for line in lines:
            input_tokens = tokenizer.encode(line.strip(), return_tensors="pt", max_length=128, truncation=True).to(device)
            outputs = model.generate(input_tokens, max_length=128, num_beams=4, early_stopping=True)
            corrected_line = tokenizer.decode(outputs[0], skip_special_tokens=True)
            corrected_lines.append(corrected_line)

        # Save corrected lines to a new file
        corrected_file_path = os.path.join(app.config["UPLOAD_FOLDER"], "corrected_" + file.filename)
        with open(corrected_file_path, "w") as f:
            f.write("\n".join(corrected_lines))
        
        return jsonify({"corrected_file": corrected_file_path})
    else:
        return jsonify({"error": "Invalid file type. Please upload a .txt file."}), 400

if __name__ == "__main__":
    app.run(debug=True)


 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on


 * Running on http://127.0.0.1:5000
 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
INFO:werkzeug:[33mPress CTRL+C to quit[0m
 * Restarting with watchdog (windowsapi)
INFO:werkzeug: * Restarting with watchdog (windowsapi)


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
