In [None]:
!pip install transformers torch fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
import torch
import re
from transformers import T5ForConditionalGeneration, T5Tokenizer
from google.colab import drive

# Attach Google Drive to the Colab filesystem (triggers an authorization prompt).
mount_point = '/content/drive'
print("Mounting Google Drive... Please authorize.")
drive.mount(mount_point)

Mounting Google Drive... Please authorize.
Mounted at /content/drive


In [None]:
# 1. Model location and loading

# Checkpoint directory on the mounted Drive; it has to point at the
# checkpoint written by the training run.
MODEL_PATH = "/content/drive/MyDrive/final-punctuation-model/checkpoint-31074"

print(f"Loading model from: {MODEL_PATH}")

try:
    # Restore the tokenizer and the fine-tuned weights from the checkpoint.
    tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH)
    model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH)

    # 2. Choose the device (GPU when available) and switch to eval mode.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model.to(device)
    model.eval()

    print("Model loaded successfully and moved to device:", device)

except OSError:
    # Best-effort: report the failure and carry on. The correction functions
    # check for a global `model` before running, so they refuse to run if
    # loading never succeeded.
    print(
        f"Error: Could not find model at {MODEL_PATH}",
        "Please make sure the path is correct and training was completed.",
        sep="\n",
    )

# 3. Define the Prediction Function

def correct(text):
    """
    Run the fine-tuned T5 model on one piece of text and return its output.

    The input is lowercased, stripped of every character that is neither a
    word character nor whitespace, prefixed with "correct: ", tokenized
    (truncated to 512 tokens) and decoded from a 4-beam search.

    Returns the decoded string, or None (after printing a notice) when no
    global `model` has been loaded.
    """
    if 'model' not in globals():
        print("Model is not loaded. Cannot run correction.")
        return None

    # Normalise: lowercase and drop punctuation, then add the task prefix.
    stripped = re.sub(r'[^\w\s]', '', text.lower())
    prompt = "correct: " + stripped

    # Encode the prompt and move the tensors to the model's device.
    # (The original note states 512 must match the training max_length.)
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)

    # Beam search, output capped at 512 tokens, stopping once beams finish.
    generation_options = dict(max_length=512, num_beams=4, early_stopping=True)
    with torch.no_grad():
        generated = model.generate(encoded["input_ids"], **generation_options)

    # Only the first returned sequence is decoded.
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

Loading model from: /content/drive/MyDrive/final-punctuation-model/checkpoint-31074
Model loaded successfully and moved to device: cpu


In [None]:
import math
import re

# We'll re-use the `correct()` function from Cell 3.
# Make sure Cell 3 has been run.

def correct_large_paragraph(long_text, chunk_size=70):
    """
    Correct a long text by processing it in fixed-size word chunks.

    The text is split on whitespace, grouped into consecutive chunks of at
    most `chunk_size` words, each chunk is passed to `correct()`, and the
    corrected chunks are joined with single spaces.

    Args:
        long_text: Raw text to correct.
        chunk_size: Maximum number of words per chunk. Defaults to 70, the
            value that used to be hard-coded here. Must be >= 1.

    Returns:
        The joined corrected text, "" when the input contains no words, or
        None (after printing a notice) when no global `model` is loaded.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    if 'model' not in globals():
        print("Model is not loaded. Cannot run correction.")
        return None

    # 1. Split the *original* text by whitespace; punctuation stripping is
    # left to `correct()`, which does it per chunk.
    words = long_text.split()

    if not words:
        return ""

    # 2. Group the words into chunks of at most `chunk_size` words.
    # `correct()` truncates its input at 512 tokens; the small default is
    # presumably meant to keep every chunk comfortably inside that limit.
    chunks = [
        words[start:start + chunk_size]
        for start in range(0, len(words), chunk_size)
    ]
    num_chunks = len(chunks)

    print(f"Input is long. Splitting into {num_chunks} chunks...")

    # 3. Correct each chunk independently with the single-pass function.
    corrected_chunks = []
    for chunk_number, chunk_of_words in enumerate(chunks, start=1):
        print(f"--- Correcting chunk {chunk_number}/{num_chunks} ---")
        corrected_chunks.append(correct(" ".join(chunk_of_words)))

    # 4. Join the corrected chunks back together.
    return " ".join(corrected_chunks)

def check(sentence):
    """
    Route text to the appropriate correction function based on word count.

    Inputs with more than 70 whitespace-separated words are handled by
    `correct_large_paragraph`; everything else goes straight to `correct`.
    Returns whatever the selected function returns.
    """
    word_limit = 70
    word_count = len(sentence.split())
    handler = correct_large_paragraph if word_count > word_limit else correct
    return handler(sentence)

In [None]:
import re
# Rule-based post-processing

# --- Rule 1: Spacing Fixes ---
def refine_spacing(text):
    """
    Cleans up spacing errors around punctuation.
    - 'word .' -> 'word.'
    - 'word,word' -> 'word, word'
    - 'word  word' -> 'word word'
    - 'word..' -> 'word.'

    The spacing rules apply to '.', ',', '?' and '!'. The '!' is included so
    these rules agree with the terminal-punctuation set ('.', '?', '!') used
    by the collapse rule below.

    Returns the cleaned string with leading/trailing whitespace removed.
    """
    # Remove whitespace *before* punctuation
    text = re.sub(r'\s+([.,?!])', r'\1', text)

    # Ensure a space *after* punctuation (only if directly followed by a letter)
    text = re.sub(r'([.,?!])([a-zA-Z])', r'\1 \2', text)

    # Collapse any run of whitespace into a single space and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()

    # Collapse a run of terminal punctuation into one mark; the group keeps
    # its last repetition, so the final mark of the run is the one retained.
    text = re.sub(r'([.?!]){2,}', r'\1', text)

    return text


# --- Rule 2: Contraction Fixes ---
def refine_contractions(text):
    r"""
    Restores the apostrophe in common contractions written without one.

    Each listed word is matched as a whole word (regex \b boundaries, so
    "itself" is untouched) and case-insensitively. If the matched word
    started with a capital letter, the replacement does too, so a
    sentence-initial "Dont" becomes "Don't" rather than "don't".

    Returns the text with all listed words replaced.
    """
    # NOTE(review): "its" (possessive) and "id" are valid words on their own
    # and are rewritten as well; kept as-is for backward compatibility.
    replacements = (
        ("its", "it's"),
        ("dont", "don't"),
        ("cant", "can't"),
        ("wont", "won't"),
        ("isnt", "isn't"),
        ("arent", "aren't"),
        ("wasnt", "wasn't"),
        ("werent", "weren't"),
        ("hes", "he's"),
        ("shes", "she's"),
        ("theyre", "they're"),
        ("youre", "you're"),
        ("im", "I'm"),
        ("ive", "I've"),
        ("id", "I'd"),
    )

    def keep_leading_capital(match, fixed):
        # Mirror the capitalisation of the word being replaced.
        if match.group(0)[0].isupper():
            return fixed[0].upper() + fixed[1:]
        return fixed

    for bare, fixed in replacements:
        text = re.sub(
            rf"\b{bare}\b",
            # Bind `fixed` as a default so each pass uses its own replacement.
            lambda match, fixed=fixed: keep_leading_capital(match, fixed),
            text,
            flags=re.IGNORECASE,
        )

    return text


# --- Rule 3: "Lone I" Fix ---
def refine_lone_i(text):
    """
    Capitalizes the standalone pronoun 'i' (e.g. "and i think" -> "and I think"),
    including contractions such as "i'm" / "i've".

    A lowercase 'i' is treated as standalone when it is not preceded by a
    word character or hyphen and not followed by one. This covers positions
    the whitespace-based rule missed: after a comma or period, at the very
    start or end of the text, and directly before punctuation. An 'i'
    followed by ".<letter>" (as in "i.e.") is left alone.

    Returns the text with every standalone 'i' replaced by 'I'.
    """
    # Lookarounds (rather than matching the surrounding spaces) keep the
    # neighbours out of the match, so adjacent occurrences such as "i i"
    # are both handled.
    lone_i = r"(?<![\w-])i(?![\w-]|\.\w)"
    return re.sub(lone_i, "I", text)


# --- Rule 4: Capitalization After Sentence End ---
def refine_capitalization(text):
    """
    Upper-cases the first letter that follows a sentence terminator.

    A lowercase ASCII letter is capitalized when it comes right after
    '.', '?' or '!' plus at least one whitespace character. Letters that
    follow the punctuation with no whitespace in between are left alone.
    """
    # Group 1: terminator and the whitespace after it (kept unchanged).
    # Group 2: the lowercase letter that starts the next sentence.
    sentence_start = re.compile(r'([.?!]\s+)([a-z])')
    return sentence_start.sub(
        lambda found: found.group(1) + found.group(2).upper(),
        text,
    )


# --- The Main "Pipeline" Function ---
def post_process_refinement(model_output_text):
    """
    Runs the model's output through the full pipeline of
    rule-based refinement functions.

    The order matters:
    1. Fix contractions/lone 'i' first.
    2. Fix spacing (this also trims leading/trailing whitespace).
    3. Fix capitalization after sentence ends (so it can use the new periods).
    4. Capitalize the very first character last, so that no earlier rule
       can lowercase it again and leading whitespace is already gone.

    Args:
        model_output_text: The model's output string. May be None or empty,
            e.g. when the correction step could not run.

    Returns:
        The refined string. None and "" are returned unchanged.
    """
    # Nothing to refine; the regex-based rules would raise on None.
    if not model_output_text:
        return model_output_text

    text = refine_contractions(model_output_text)
    text = refine_lone_i(text)
    text = refine_spacing(text)
    text = refine_capitalization(text)

    # Whitespace-only input is empty by now, hence the guard.
    if text:
        text = text[0].upper() + text[1:]

    return text


In [None]:
# Demo runs: each test prints the raw input, the model output returned by
# `check()`, and that output after rule-based refinement.

# Test 1: Simple
text_1 = "hello my name is shahaan what is yours"
print(f"Input:    '{text_1}'")
model_out = check(text_1)
print(f"Model Output:   '{model_out}'")
print(f"Polished Output: '{post_process_refinement(model_out)}'")
print("-" * 20)


# Test 2: Longer Paragraph (more than 70 words, so `check()` uses chunking)
text_2 = "the quick brown fox jumps over the lazy dog this is a classic sentence used for typing practice but it also serves as a good test for our model i wonder if it will know where to put the period and how to capitalize the word 'this' in the middle of the text it's a non-trivial task because the model has to understand context not just individual words for example will it know what to do with a sentence like this what do you think the final output will be i am very excited to see the results"
print(f"Input: '{text_2}'")
model_out = check(text_2)
print(f"Model Output:   '{model_out}'")
print(f"Polished Output: '{post_process_refinement(model_out)}'")
print("-" * 20)


# Test 3: Your Own Text
text_3 = "type any text you want here i hope this works"
# Show the text that is actually being corrected (this used to echo text_2).
print(f"Input: '{text_3}'")
model_out = check(text_3)
print(f"Model Output:   '{model_out}'")
print(f"Polished Output: '{post_process_refinement(model_out)}'")
print("-" * 20)

Input:    'hello my name is shahaan what is yours'
Model Output:   'Hello, my name is Shahaan, what is yours?'
Polished Output: 'Hello, my name is Shahaan, what is yours?'
--------------------
Input: 'the quick brown fox jumps over the lazy dog this is a classic sentence used for typing practice but it also serves as a good test for our model i wonder if it will know where to put the period and how to capitalize the word 'this' in the middle of the text it's a non-trivial task because the model has to understand context not just individual words for example will it know what to do with a sentence like this what do you think the final output will be i am very excited to see the results'
Input is long. Splitting into 2 chunks...
--- Correcting chunk 1/2 ---
--- Correcting chunk 2/2 ---
Model Output:   'The quick brown fox jumps over the lazy dog. This is a classic sentence used for typing practice, but it also serves as a good test for our model. I wonder if it will know where to put the

In [None]:
# Longer end-to-end run on a paragraph that appears to be deliberately
# mis-punctuated; `model_out` is reused by the similarity check further down.
my_text = """The complexity of human language presents! a profound challenge for artificial intelligence It's not merely a structured system of; vocabulary; rather it's a dynamic entity deeply intertwined with? context, culture and subtle intention Natural Language Processing (NLP;) models often built on vast statistical analysis strive to? comprehend and generate text with human-like fluency However a fascinating; inverse problem exists modeling human error Simulating a typo isn't just random substitution it involves understanding keyboard layouts common transpositions, and phonetic similarities More advanced simulations, such as semantic errors require a sophisticated grasp of how the, human mind retrieves and associates words This "reverse engineering" of mistakes purposefully? creating plausible incorrectness is? not only a creative exercise but also a powerful method for! building and evaluating more robust, correction systems"""

print(f"Input:    '{my_text}'")

model_out = check(my_text)
print(f"Model Output:   '{model_out}'")

polished_out = post_process_refinement(model_out)
print(f"Polished Output: '{polished_out}'")

Input:    'The complexity of human language presents! a profound challenge for artificial intelligence It's not merely a structured system of; vocabulary; rather it's a dynamic entity deeply intertwined with? context, culture and subtle intention Natural Language Processing (NLP;) models often built on vast statistical analysis strive to? comprehend and generate text with human-like fluency However a fascinating; inverse problem exists modeling human error Simulating a typo isn't just random substitution it involves understanding keyboard layouts common transpositions, and phonetic similarities More advanced simulations, such as semantic errors require a sophisticated grasp of how the, human mind retrieves and associates words This "reverse engineering" of mistakes purposefully? creating plausible incorrectness is? not only a creative exercise but also a powerful method for! building and evaluating more robust, correction systems'
Input is long. Splitting into 2 chunks...
--- Correctin

In [None]:
from fuzzywuzzy import fuzz

def check_similarity(str1, str2):
    """
    Return `fuzz.ratio(str1, str2)`, fuzzywuzzy's edit-distance-based
    similarity score for the two strings (the caller prints it as a percentage).
    """
    return fuzz.ratio(str1, str2)

# Reference version of the paragraph, compared against the refined output
# of the most recent `check()` call stored in `model_out`.
correct_text = '''The complexity of human language presents a profound challenge for artificial intelligence. It's not merely a structured system of vocabulary; rather, it's a dynamic entity deeply intertwined with context, culture, and subtle intention. Natural Language Processing (NLP) models, often built on vast statistical analysis, strive to comprehend and generate text with human-like fluency. However, a fascinating inverse problem exists: modeling human error. Simulating a typo isn't just random substitution; it involves understanding keyboard layouts, common transpositions, and phonetic similarities. More advanced simulations, such as semantic errors, require a sophisticated grasp of how the human mind retrieves and associates words. This "reverse engineering" of mistakes—purposefully creating plausible incorrectness—is not only a creative exercise but also a powerful method for building and evaluating more robust correction systems.'''

# Refine once and reuse the result for both scoring and display.
polished_text = post_process_refinement(model_out)
score = check_similarity(correct_text, polished_text)

print(f"\nStr1: {correct_text}")
print(f"Str2: {polished_text}")
print(f"Similarity Score: {score}%")



Str1: The complexity of human language presents a profound challenge for artificial intelligence. It's not merely a structured system of vocabulary; rather, it's a dynamic entity deeply intertwined with context, culture, and subtle intention. Natural Language Processing (NLP) models, often built on vast statistical analysis, strive to comprehend and generate text with human-like fluency. However, a fascinating inverse problem exists: modeling human error. Simulating a typo isn't just random substitution; it involves understanding keyboard layouts, common transpositions, and phonetic similarities. More advanced simulations, such as semantic errors, require a sophisticated grasp of how the human mind retrieves and associates words. This "reverse engineering" of mistakes—purposefully creating plausible incorrectness—is not only a creative exercise but also a powerful method for building and evaluating more robust correction systems.
Str2: The complexity of human language presents a profo

