<a href="https://colab.research.google.com/github/NikhithaU/Projects/blob/main/Text_%26_Grammar_Correction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string

# Fetch the NLTK data packages this notebook needs: the punkt tokenizer,
# the averaged-perceptron POS tagger, and the WordNet corpus.
for _resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(_resource)

def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Only the first character of ``tag`` is inspected, case-insensitively:
    ``J`` -> ``wordnet.ADJ``, ``N`` -> ``wordnet.NOUN``,
    ``V`` -> ``wordnet.VERB``, ``R`` -> ``wordnet.ADV``.

    Args:
        tag: POS tag string, e.g. the second element of a pair returned by
            ``pos_tag``.

    Returns:
        The WordNet constant for the tag. ``wordnet.NOUN`` is returned when
        the first character is not one of the four mapped letters, or when
        ``tag`` is empty or ``None``.
    """
    if not tag:
        # An empty tag has no first character to inspect; use the same noun
        # fallback as for unmapped tags instead of raising IndexError.
        return wordnet.NOUN

    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag[0].upper(), wordnet.NOUN)

def correct_text(text):
    """Normalise ``text`` sentence by sentence and return the rebuilt string.

    The text is split into sentences; each sentence is tokenized and
    POS-tagged, every token is reduced to its WordNet lemma (using the part
    of speech chosen by ``get_wordnet_pos``), and the tokens are detokenized
    back into a sentence. The sentences are joined with single spaces.

    NOTE(review): because every token is lemmatized, inflections are
    dropped -- the notebook's recorded run turns "bought ... apples" into
    "buy ... apple". Confirm that this is the intended "correction".

    Args:
        text: Input string; may contain several sentences.

    Returns:
        The re-assembled string.
    """
    # Build the lemmatizer and detokenizer once per call instead of once per
    # token / once per sentence.
    lemmatizer = nltk.WordNetLemmatizer()
    detokenizer = TreebankWordDetokenizer()

    # A set gives true single-character membership. `tok in string.punctuation`
    # is a substring test on a str, so multi-character tokens such as "./"
    # or "<=" would also be treated as punctuation marks.
    punctuation_marks = set(string.punctuation)

    corrected_sentences = []
    for sentence in sent_tokenize(text):
        tagged_words = pos_tag(word_tokenize(sentence))

        corrected_words = []
        for i, (word, pos) in enumerate(tagged_words):
            # When the next token is a punctuation mark, drop punctuation
            # still attached to the end of this word.
            is_followed_by_punct = (
                i + 1 < len(tagged_words)
                and tagged_words[i + 1][0] in punctuation_marks
            )
            if is_followed_by_punct:
                stripped = word.rstrip(string.punctuation)
                # A token made only of punctuation would strip down to "",
                # silently deleting it from the output; keep it unchanged.
                if stripped:
                    word = stripped

            corrected_words.append(
                lemmatizer.lemmatize(word, get_wordnet_pos(pos)))

        corrected_sentences.append(detokenizer.detokenize(corrected_words))

    return ' '.join(corrected_sentences)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
!pip install --upgrade nltk




In [3]:
# Demo: run correct_text on a sample sentence that has no space after the comma.
text = "Shila bought a basket of apples,and then she started eating"
corrected_text = correct_text(text)
print(corrected_text)
# Recorded output: "Shila buy a basket of apple, and then she start eat"
# (the comma spacing is normalised, and every word is reduced to its lemma,
# so "bought", "apples", "started" and "eating" lose their inflections).


Shila buy a basket of apple, and then she start eat
