In [None]:
import io
import pdfminer
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from spellchecker import SpellChecker
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        str: Text of all pages, in page order, each page appearing once.
    """
    with open(pdf_path, 'rb') as file, io.StringIO() as text_buffer:
        resource_manager = PDFResourceManager()
        converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            for page in PDFPage.get_pages(file, check_extractable=True):
                page_interpreter.process_page(page)
            # The buffer accumulates across pages, so it must be read exactly
            # once after the loop; reading it per page and concatenating would
            # repeat every earlier page again for each later page.
            return text_buffer.getvalue()
        finally:
            # Release the converter even when a page fails to parse.
            converter.close()

# Compiled once at definition time. A token counts as a URL when, after any
# leading whitespace, it starts with an http/https/ftp scheme or a "www."
# prefix followed by at least one non-space character.
_URL_PATTERN = re.compile(r"\s*(?:(?:https?|ftp)://|www\.)\S", re.IGNORECASE)


def is_url(word):
    """Return True when ``word`` looks like a URL.

    Recognition is prefix-based (``http://``, ``https://``, ``ftp://`` or
    ``www.``) and case-insensitive, so it is not restricted to a fixed list of
    top-level domains.

    Args:
        word: The token to test (a string).

    Returns:
        bool: True if the token starts like a URL, False otherwise.
    """
    return _URL_PATTERN.match(word) is not None


def spell_check_document(pdf_path):
    """Print the words of a PDF that the spell checker does not recognise.

    The document text is tokenized, then stop words, URL-like tokens and any
    token that is not purely alphabetic are dropped before spell checking.
    Unknown words are printed one per line in sorted order so the output is
    the same on every run.

    Args:
        pdf_path: Path of the PDF file to check.

    Returns:
        None. Results are written to standard output.
    """
    spell = SpellChecker()
    document_text = extract_text_from_pdf(pdf_path)

    stop_words = set(stopwords.words('english'))

    # Single filtering pass over the tokens; the cheap isalpha() test runs
    # first and already rejects anything containing digits or punctuation.
    words = [
        word
        for word in word_tokenize(document_text)
        if word.isalpha() and word.lower() not in stop_words and not is_url(word)
    ]

    # spell.unknown() yields an unordered collection; sort it so repeated
    # runs print the words in the same order.
    misspelled_words = sorted(spell.unknown(words))

    if misspelled_words:
        print("Misspelled words:")
        for misspelled_word in misspelled_words:
            print(misspelled_word)
    else:
        print("No misspelled words found.")

# Example usage: spell-check the sample PDF. The path is relative, so it is
# resolved against the kernel's current working directory.
pdf_path = './DATA/book.pdf'
spell_check_document(pdf_path)


In [None]:
import io
import pdfminer
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from langdetect import detect
from language_tool_python import LanguageTool

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        str: Text of all pages, in page order, each page appearing once.
    """
    with open(pdf_path, 'rb') as file, io.StringIO() as text_buffer:
        resource_manager = PDFResourceManager()
        converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            for page in PDFPage.get_pages(file, check_extractable=True):
                page_interpreter.process_page(page)
            # The buffer accumulates across pages, so it must be read exactly
            # once after the loop; reading it per page and concatenating would
            # repeat every earlier page again for each later page.
            return text_buffer.getvalue()
        finally:
            # Release the converter even when a page fails to parse.
            converter.close()

def detect_grammatical_errors(pdf_path):
    """Print the grammar issues LanguageTool reports for a PDF's text.

    The document language is detected from the extracted text and a
    LanguageTool instance for that language performs the check.

    Args:
        pdf_path: Path of the PDF file to analyse.

    Returns:
        None. Results are written to standard output.
    """
    document_text = extract_text_from_pdf(pdf_path)
    language = detect(document_text)

    # Initialize LanguageTool for the detected language
    tool = LanguageTool(language)
    try:
        # Perform grammatical error detection
        errors = tool.check(document_text)
    finally:
        # Always release the tool (and whatever backend it started), even if
        # the check raises; otherwise each call leaves one behind.
        tool.close()

    if errors:
        print("Grammatical errors:")
        for error in errors:
            print(error)
    else:
        print("No grammatical errors found.")

# Example usage: run the grammar check on the sample PDF. The path is
# relative, so it is resolved against the kernel's current working directory.
pdf_path = './DATA/book.pdf'
detect_grammatical_errors(pdf_path)


In [None]:
import PyPDF2
from transformers import pipeline

import io
import pdfminer
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from langdetect import detect
from language_tool_python import LanguageTool

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        str: Text of all pages, in page order, each page appearing once.
    """
    with open(pdf_path, 'rb') as file, io.StringIO() as text_buffer:
        resource_manager = PDFResourceManager()
        converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            for page in PDFPage.get_pages(file, check_extractable=True):
                page_interpreter.process_page(page)
            # The buffer accumulates across pages, so it must be read exactly
            # once after the loop; reading it per page and concatenating would
            # repeat every earlier page again for each later page.
            return text_buffer.getvalue()
        finally:
            # Release the converter even when a page fails to parse.
            converter.close()

def process_chunks(text, chunk_size, prompt, max_new_tokens=100):
    """Generate a GPT-2 continuation for each fixed-size chunk of ``text``.

    The text is cut into consecutive slices of ``chunk_size`` characters (the
    last one may be shorter); ``prompt`` is prepended to each slice and the
    result is passed to a "text-generation" pipeline.

    Args:
        text: The full text to process.
        chunk_size: Number of characters per chunk; must be positive.
        prompt: String prepended to every chunk before generation.
        max_new_tokens: Upper bound on the number of tokens generated per
            chunk, not counting the input. Defaults to 100.

    Returns:
        list[str]: One generated string per chunk, in chunk order. Empty when
        ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    if not chunks:
        # Nothing to generate, so skip loading the model entirely.
        return []

    nlp = pipeline("text-generation", model="gpt2")

    # Bound the generated tokens rather than the total length: a total-length
    # cap also counts the prompt and chunk, which can already exceed it and
    # leave no room for any output.
    return [
        nlp(prompt + chunk, max_new_tokens=max_new_tokens, num_return_sequences=1)[0]['generated_text']
        for chunk in chunks
    ]

# Example usage: extract the sample PDF and generate one continuation per
# 500-character chunk.
pdf_path = './DATA/book.pdf'
chunk_size = 500
prompt = "Please generate a response for the following text:\n\n"

text = extract_text_from_pdf(pdf_path)
generated_texts = process_chunks(text, chunk_size, prompt)

# Print the generated texts. The loop variable deliberately has its own name
# so the extracted document held in `text` is not overwritten.
for index, generated_text in enumerate(generated_texts, start=1):
    print(f"Generated Text {index}:")
    print(generated_text)
    print("---")


In [None]:
import PyPDF2
from transformers import pipeline

import io
import pdfminer
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from langdetect import detect
from language_tool_python import LanguageTool

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        str: Text of all pages, in page order, each page appearing once.
    """
    with open(pdf_path, 'rb') as file, io.StringIO() as text_buffer:
        resource_manager = PDFResourceManager()
        converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            for page in PDFPage.get_pages(file, check_extractable=True):
                page_interpreter.process_page(page)
            # The buffer accumulates across pages, so it must be read exactly
            # once after the loop; reading it per page and concatenating would
            # repeat every earlier page again for each later page.
            return text_buffer.getvalue()
        finally:
            # Release the converter even when a page fails to parse.
            converter.close()

def process_chunks(text, chunk_size, prompt, max_new_tokens=100):
    """Generate a GPT-2 continuation for each fixed-size chunk of ``text``.

    The text is cut into consecutive slices of ``chunk_size`` characters (the
    last one may be shorter); ``prompt`` is prepended to each slice and the
    result is passed to a "text-generation" pipeline.

    Args:
        text: The full text to process.
        chunk_size: Number of characters per chunk; must be positive.
        prompt: String prepended to every chunk before generation.
        max_new_tokens: Upper bound on the number of tokens generated per
            chunk, not counting the input. Defaults to 100.

    Returns:
        list[str]: One generated string per chunk, in chunk order. Empty when
        ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    if not chunks:
        # Nothing to generate, so skip loading the model entirely.
        return []

    nlp = pipeline("text-generation", model="gpt2")

    # Bound the generated tokens rather than the total length: a total-length
    # cap also counts the prompt and chunk, which can already exceed it and
    # leave no room for any output.
    return [
        nlp(prompt + chunk, max_new_tokens=max_new_tokens, num_return_sequences=1)[0]['generated_text']
        for chunk in chunks
    ]

# Example usage: extract the sample PDF and generate one continuation per
# 500-character chunk.
pdf_path = './DATA/book.pdf'
chunk_size = 500
prompt = "Please generate a response for the following text:\n\n"

text = extract_text_from_pdf(pdf_path)
generated_texts = process_chunks(text, chunk_size, prompt)

# Print the generated texts. The loop variable deliberately has its own name
# so the extracted document held in `text` is not overwritten.
for index, generated_text in enumerate(generated_texts, start=1):
    print(f"Generated Text {index}:")
    print(generated_text)
    print("---")


In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Checkpoint identifier passed to both from_pretrained() calls below.
model_name = 'deep-learning-analytics/GrammarCorrector'
# Use the GPU when torch reports one is available, otherwise the CPU.
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Tokenizer and model are module-level globals; correct_grammar() reads them.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def correct_grammar(input_text,num_return_sequences):
    """Generate grammar-corrected candidates for ``input_text``.

    Uses the module-level ``tokenizer``, ``model`` and ``torch_device``. The
    input is truncated/padded to 164 tokens, so text beyond that limit is not
    seen by the model.

    Args:
        input_text: The text to correct.
        num_return_sequences: How many candidate corrections to return.

    Returns:
        list[str]: The decoded candidate corrections.
    """
    # Beam search cannot return more sequences than it has beams; keep the
    # original width of 4 and widen only when more candidates are requested.
    num_beams = max(4, num_return_sequences)
    batch = tokenizer(
        [input_text],
        truncation=True,
        padding='max_length',
        max_length=164,
        return_tensors="pt",
    ).to(torch_device)
    # No `temperature` argument: sampling is not enabled here, and temperature
    # only affects sampling-based decoding, so passing it had no effect.
    translated = model.generate(
        **batch,
        max_length=164,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

# Test the model on a sample passage.
# NOTE(review): correct_grammar() tokenizes with truncation=True and
# max_length=164, and this passage looks considerably longer than that, so
# presumably only its beginning is actually corrected — confirm, or split the
# passage into sentences before calling.
text = ''' I want to dedicate this book to my family. Rachel, Connor, and Ben, you are my life. To my 
wonderful mother, Susan, and my sister, Kristy, - thank you for the love and support. I also 
would like to thank the faculty, staff, and students at both Notre Dame of Maryland 
University and Johns Hopkins University for allowing me to fulfill my passion for teaching 
the teachers of today and tomorrow. -Ryan L. Schaaf 
 
I want to dedicate this book to my parents, who always promoted the value of a strong 
education. From installing my pretend classroom in our basement and pretending to be my 
first students, to encouraging me throughout college and graduate school, I am forever 
grateful. To my husband, Nick, thank you for being my first editor and always allowing me to 
bounce ideas off of you. We are a team. Thank you for always supporting me. Ian and Ryan, 
thank you for this unbelievable opportunity and all that you have taught me throughout this 
experience. -Becky Zayas 
 
This book is intended to celebrate the exceptional dedication and courage educators have 
exhibited, and to acknowledge their demonstrated capacity to adapt and innovate in 
extraordinarily challenging and uncertain conditions. Now is the time for us to recognize the 
exceptional role they play, and to empower them with the training, professional development, 
support, and working conditions needed to effectively deploy their talents. For the education 
system to recover from the COVID pandemic requires sustained investment in the well-
being, training, professional development and working conditions of the world’s 71 million 
educators. Education recovery will only be successful if it is conducted hand-in-hand with 
teachers, giving them both voice and agency to participate in the critical change process. - Ian 
Jukes 
    
 '''

# Request two candidate corrections and print the resulting list.
print(correct_grammar(text, num_return_sequences=2))

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

def load_model(model_name="bert-base-uncased"):
    """Load a BERT tokenizer and a two-label sequence classifier.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
            Defaults to "bert-base-uncased", the value previously hard-coded.

    Returns:
        tuple: ``(model, tokenizer)``, in that order.

    NOTE(review): "bert-base-uncased" is a base checkpoint; presumably its
    classification head is freshly initialised, in which case predictions are
    not meaningful until the model is fine-tuned — confirm, or pass a
    checkpoint fine-tuned for grammatical acceptability (and check that its
    label order matches the mapping used by the caller).
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
    return model, tokenizer

def classify_grammar(sentence, model, tokenizer):
    """Label a sentence (or a list of sentences) as grammatical or not.

    Args:
        sentence: A single string, or a list of strings to classify together.
        model: Callable accepting the tokenizer's output as keyword arguments
            and returning an object with a ``logits`` tensor whose last
            dimension indexes the two classes.
        tokenizer: Callable producing "pt" tensors for ``model``.

    Returns:
        str: The label for a single string input.
        list[str]: One label per sentence when a list is passed.
    """
    label_mapping = {0: "Grammatically Correct", 1: "Grammatically Incorrect"}

    inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt")

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Take the arg-max per row. A flat arg-max over the whole tensor is only
    # correct for a batch of one; softmax is omitted because it does not
    # change which class scores highest.
    predicted_indices = torch.argmax(logits, dim=-1).tolist()
    labels = [label_mapping[index] for index in predicted_indices]

    return labels[0] if isinstance(sentence, str) else labels

# Example usage: classify one sample sentence and print the predicted label.
sentence = "She don't likes ice cream.."

# Other sample sentences to try:
# "She don't likes ice cream."
# "He plays the piano and tennis."
# "My dogs eat cookies and eats cake."

# Load the model and tokenizer, then classify the sentence.
model, tokenizer = load_model()
predicted_label = classify_grammar(sentence, model, tokenizer)
print(f"Sentence: {sentence}")
print(f"Predicted Label: {predicted_label}")
