In [1]:
# Cell 1 — Imports and basic setup for text normalization pipeline
import re
import string
import pandas as pd
from difflib import SequenceMatcher
from google.colab import files
import math

# Try to use num2words/unidecode if available; otherwise fall back to simple functions
try:
    from num2words import num2words
    _HAS_NUM2WORDS = True
except Exception:
    _HAS_NUM2WORDS = False

try:
    import unidecode
    def to_ascii(s): return unidecode.unidecode(s)
except Exception:
    def to_ascii(s): return s


In [2]:
# Cell 2 — Upload your CSV (choose text_normalization_dataset_small.csv or ..._large.csv)
print("Upload the CSV file (the uploader will appear).")
uploaded = files.upload()  # select the CSV you generated earlier

# The result is a mapping keyed by file name. It can be empty (for example when
# the upload dialog is dismissed), so fail with a clear message rather than an
# IndexError from indexing an empty list.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and select a CSV file.")

# Only the first uploaded file is used.
fname = next(iter(uploaded))
print("Uploaded file:", fname)

df = pd.read_csv(fname)
print("Rows:", len(df))
display(df.head(5))


Upload the CSV file (the uploader will appear).


Saving text_normalization_dataset_large.csv to text_normalization_dataset_large.csv
Uploaded file: text_normalization_dataset_large.csv
Rows: 200000


Unnamed: 0,raw_text,expected_normalized_text
0,i am so happy and excited,i am so happy and excited
1,lol that is so funny,lol that is so funny
2,they are playing football outside,they are playing football outside
3,project submission is due tonight,project submission is due tonight
4,textt norrmmallizaattiioon is verry iimporttant,text normalization is very important


In [3]:
# Cell 3 — Define contractions, slang maps, and helper functions for normalization
# (this cell defines the normalization rules used below)

# Contraction -> expanded form. Keys are lowercase and listed alphabetically;
# the dict keeps this insertion order.
CONTRACTIONS = dict((
    ("ain't", "is not"),
    ("aren't", "are not"),
    ("can't", "cannot"),
    ("cant", "cannot"),
    ("couldn't", "could not"),
    ("didn't", "did not"),
    ("doesn't", "does not"),
    ("don't", "do not"),
    ("gonna", "going to"),
    ("gotta", "got to"),
    ("hadn't", "had not"),
    ("hasn't", "has not"),
    ("haven't", "have not"),
    ("he's", "he is"),
    ("i'd", "i would"),
    ("i'll", "i will"),
    ("i'm", "i am"),
    ("i've", "i have"),
    ("isn't", "is not"),
    ("it's", "it is"),
    ("let's", "let us"),
    ("she's", "she is"),
    ("shouldn't", "should not"),
    ("that's", "that is"),
    ("there's", "there is"),
    ("they're", "they are"),
    ("we're", "we are"),
    ("weren't", "were not"),
    ("what's", "what is"),
    ("where's", "where is"),
    ("who's", "who is"),
    ("won't", "will not"),
    ("wouldn't", "would not"),
    ("y'all", "you all"),
    ("you're", "you are"),
    ("you've", "you have"),
))

# Common slang / chat shorthand mapping (lowercase shorthand -> expansion).
# Each key appears exactly once; a repeated key in a dict literal silently
# overwrites the earlier entry, which hides typos.
# Note: "gonna" is also listed in CONTRACTIONS with the same expansion.
SLANG_MAP = {
    "u": "you",
    "ur": "your",
    "r": "are",
    "tho": "though",
    "thx": "thanks",
    "tx": "thanks",
    "pls": "please",
    "plz": "please",
    "asap": "as soon as possible",
    "b4": "before",
    "gr8": "great",
    "2day": "today",
    "2morrow": "tomorrow",
    "2moro": "tomorrow",
    "2nite": "tonight",
    "tmrw": "tomorrow",
    "gonna": "going to",
    "wanna": "want to",
    "gimme": "give me",
    "cuz": "because",
    "cuz.": "because",
    "idk": "i do not know",
    "imo": "in my opinion",
    "imho": "in my humble opinion",
    "thru": "through",
    "b/c": "because",
    "w/": "with",
    "d": "the",   # in some texting 'd' stands for 'the' (use cautiously)
    "yw": "you are welcome",
    "brb": "be right back",
    "ttyl": "talk to you later",
    "asap!": "as soon as possible",
}

# Spelled-out words for the integers 0 through 20, keyed by the integer itself.
# Used as the fallback when num2words is not available.
NUM_WORDS = dict(enumerate((
    "zero", "one", "two", "three", "four", "five",
    "six", "seven", "eight", "nine", "ten",
    "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen", "twenty",
)))

# Helper: expand contractions.
# One alternation over every CONTRACTIONS key, delimited by word boundaries.
_contraction_re = re.compile(
    r'\b(' + '|'.join(map(re.escape, CONTRACTIONS)) + r')\b'
)

def expand_contractions(text):
    """Replace every contraction listed in CONTRACTIONS with its expansion.

    Matching is case-sensitive and the keys are lowercase, so ``text`` should
    already be lowercased.
    """
    def _expansion_for(match):
        return CONTRACTIONS[match.group(0)]

    return _contraction_re.sub(_expansion_for, text)

# Helper: replace slang
# Tokens are delimited with lookarounds instead of \b: \b after a key that ends
# in punctuation (such as "w/") only matches when a word character follows, so
# "w/ you" was left alone while "w/o" became "witho". With (?<!\w) / (?!\w) a
# key matches only when it is not glued to a neighbouring word character.
# Longest keys are tried first so "cuz." and "asap!" win over "cuz" and "asap".
_slang_re = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(re.escape(k) for k in sorted(SLANG_MAP, key=len, reverse=True))
    + r')(?!\w)'
)

def replace_slang(text):
    """Replace every standalone SLANG_MAP key in ``text`` with its expansion.

    Matching is case-sensitive and the keys are lowercase, so ``text`` should
    already be lowercased. For keys that start and end with a letter or digit
    the result is the same as word-boundary matching.
    """
    return _slang_re.sub(lambda m: SLANG_MAP[m.group(0)], text)

# Helper: attempt to convert numeric tokens to words
# Words for the multiples of ten used by the fallback spelling of 21..99.
_TENS_WORDS = {
    20: "twenty", 30: "thirty", 40: "forty", 50: "fifty",
    60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety",
}

def number_to_words_token(match):
    """Spell out the integer matched by ``match``; meant as an ``re.sub`` callback.

    Args:
        match: a regex match whose whole text (group 0) is the number token.

    Returns:
        The number in words when it can be spelled out. If the token is not an
        integer, the original token is returned. If num2words fails, or if the
        fallback does not cover the value (anything outside 0..99), the integer
        is returned as a decimal string.
    """
    num_str = match.group(0)
    try:
        n = int(num_str)
    except (TypeError, ValueError):
        # Not parseable as an integer: leave the token untouched.
        return num_str

    if _HAS_NUM2WORDS:
        try:
            # num2words returns hyphenated words for tens, we remove hyphens
            return num2words(n).replace('-', ' ')
        except Exception:
            # Best effort: keep the number as digits if it cannot be spelled.
            # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
            return str(n)

    # Fallback for small numbers.
    if n in NUM_WORDS:
        return NUM_WORDS[n]
    if 20 < n < 100:
        tens, ones = n // 10 * 10, n % 10
        if ones == 0:
            return _TENS_WORDS[tens]
        return _TENS_WORDS[tens] + " " + NUM_WORDS[ones]
    # Negative values and values of 100 or more are not spelled out here.
    return str(n)

# Helper: reduce repeated characters (soooo -> so)
def reduce_repeated_chars(word, keep=1):
    """Collapse runs of three or more identical characters in ``word``.

    Digits are never collapsed: shortening "1000" to "10" would change the
    number. Runs of exactly two characters are left as they are.

    Args:
        word: the text to clean (normally a single whitespace-free token).
        keep: how many copies of the repeated character to keep. The default of
            1 turns "soooo" into "so"; 2 turns "happppy" into "happy".

    Returns:
        ``word`` with every qualifying run shortened to at most ``keep`` copies.

    Raises:
        ValueError: if ``keep`` is smaller than 1.
    """
    if keep < 1:
        raise ValueError("keep must be at least 1")

    def _shorten(run):
        # Never lengthen a run, even if keep exceeds its current length.
        return run.group(1) * min(keep, len(run.group(0)))

    # [^\d\n] means any character except digits; newlines are excluded to match
    # the behaviour of '.' in the previous pattern.
    return re.sub(r'([^\d\n])\1{2,}', _shorten, word)

# Utility: similarity ratio
def similarity(a, b):
    """Return the difflib ``SequenceMatcher`` ratio between ``a`` and ``b``.

    The ratio is a float between 0 and 1, where 1.0 means the sequences are
    identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()


In [4]:
# Cell 4 — The main normalize_text function (combine all rules here)
def normalize_text(text):
    """Run the full normalization pipeline on one piece of text.

    Steps, in order: ASCII-fold, lowercase and strip; unify curly apostrophes;
    expand contractions; replace slang; spell out standalone digit tokens;
    replace everything except a-z, 0-9 and whitespace with a space; shorten
    repeated characters per token; collapse whitespace; replace slang once more.

    Args:
        text: any value; it is converted with ``str``. Missing values
            (as judged by ``pd.isna``) give an empty string.

    Returns:
        The normalized string.
    """
    if pd.isna(text):
        return ""

    # ASCII fold + lowercase + strip
    cleaned = to_ascii(str(text)).lower().strip()

    # Curly apostrophes become the plain ' that the contraction keys use.
    cleaned = cleaned.replace("’", "'").replace("‘", "'")

    # Contractions go first (so "i'm" -> "i am"), then slang / short forms.
    cleaned = replace_slang(expand_contractions(cleaned))

    # Standalone digit tokens are handed to number_to_words_token.
    cleaned = re.sub(r'\b\d+\b', number_to_words_token, cleaned)

    # Punctuation, emojis and remaining apostrophes are replaced by spaces.
    cleaned = re.sub(r'[^a-z0-9\s]', ' ', cleaned)

    # Shorten repeated characters token by token.
    tokens = [reduce_repeated_chars(token) for token in cleaned.split()]

    # Re-join and collapse any run of whitespace to a single space.
    cleaned = re.sub(r'\s+', ' ', ' '.join(tokens)).strip()

    # Second slang pass catches tokens exposed by the clean-up above.
    return replace_slang(cleaned)


In [5]:
# Cell 5 — Apply normalize_text to df and compare to expected_normalized_text
print("Applying normalization to dataset...")
# normalize_text converts values with str() and maps missing values to "".
# Casting the column with astype(str) first would turn NaN into the text "nan"
# and bypass that handling, so the raw column is passed as-is.
df['normalized'] = df['raw_text'].apply(normalize_text)

# Per-row similarity, computed once and reused for both the average and the
# worst-match table (SequenceMatcher is the expensive part of this cell).
df['sim'] = [
    similarity(str(normalized), str(expected))
    for normalized, expected in zip(df['normalized'], df['expected_normalized_text'])
]

# Basic statistics
total = len(df)
exact_matches = (df['normalized'] == df['expected_normalized_text']).sum()
avg_similarity = df['sim'].mean()
# Avoid a ZeroDivisionError when the frame is empty.
exact_match_pct = exact_matches / total * 100 if total else 0.0

print(f"Total rows: {total}")
print(f"Exact matches: {exact_matches} ({exact_match_pct:.2f}%)")
print(f"Average similarity (0..1): {avg_similarity:.4f}")

# Show a sample of normalized vs expected
display(df[['raw_text','expected_normalized_text','normalized']].head(15))

# Show top 20 worst matches by similarity (for debugging)
worst = df.sort_values('sim').head(20)
print("\nTop 20 worst matches (lowest similarity):")
display(worst[['raw_text','expected_normalized_text','normalized','sim']])


Applying normalization to dataset...
Total rows: 200000
Exact matches: 171919 (85.96%)
Average similarity (0..1): 0.9775


Unnamed: 0,raw_text,expected_normalized_text,normalized
0,i am so happy and excited,i am so happy and excited,i am so happy and excited
1,lol that is so funny,lol that is so funny,lol that is so funny
2,they are playing football outside,they are playing football outside,they are playing football outside
3,project submission is due tonight,project submission is due tonight,project submission is due tonight
4,textt norrmmallizaattiioon is verry iimporttant,text normalization is very important,textt norrmmallizaattiioon is verry iimporttant
5,Proojeecctt suubbmmisssioon iis dduee ttoo...,project submission is due tonight,proojeecctt suubbmmisioon iis dduee ttoonniightt
6,i need to finish my homework!!! 😂,i need to finish my homework,i need to finish my homework
7,the movie will be released next friday,the movie will be released next friday,the movie will be released next friday
8,what are u doing right now,what are you doing right now,what are you doing right now
9,the weather forecast predicts heavy rain,the weather forecast predicts heavy rain,the weather forecast predicts heavy rain



Top 20 worst matches (lowest similarity):


Unnamed: 0,raw_text,expected_normalized_text,normalized,sim
189569,idk wwhhaatt tto saay,idk what to say,i do not know wwhhaatt tto saay,0.434783
119443,ii hhadd brreaakkfaastt aallreeaddyy,i had breakfast already,ii hhadd brreaakkfaastt aallreeaddyy,0.440678
198394,aaree yyoouu frreee tthhiis eeveniinngg,are you free this evening,aaree yyoouu frre tthhiis eeveniinngg,0.548387
127516,aaree yyoou ffrreeee tthhiis eveenniing,are you free this evening,aaree yyoou ffrre tthhiis eveenniing,0.557377
169172,aaree yyoouu frreee tthhis evenninngg 😎,are you free this evening,aaree yyoouu frre tthhis evenninngg,0.566667
42139,aaree yyoou ffreeee thhiis evveenningg,are you free this evening,aaree yyoou ffre thhiis evveenningg,0.566667
5951,aaree yyoouu ffreee thhiis eeveeninng,are you free this evening,aaree yyoouu ffre thhiis eeveeninng,0.566667
123441,aaree yoouu ffrreee tthhis eveeniinng,are you free this evening,aaree yoouu ffrre tthhis eveeniinng,0.566667
198978,arree yoouu ffrreee thhiiss eevveenniing 😭,are you free this evening,arree yoouu ffrre thhiiss eevveenniing,0.571429
110185,arree yyoouu ffrreee thhiis eveenniinngg,are you free this evening,arree yyoouu ffrre thhiis eveenniinngg,0.571429


In [6]:
# Cell 6 — Save normalized CSV and provide download link
# Strip only a trailing ".csv": str.replace would also delete ".csv" occurring
# anywhere else in the file name.
csv_suffix = '.csv'
base_name = fname[:-len(csv_suffix)] if fname.endswith(csv_suffix) else fname
out_name = base_name + '_normalized.csv'
df.to_csv(out_name, index=False)
print("Saved normalized CSV as:", out_name)

# Trigger browser download in Colab
files.download(out_name)


Saved normalized CSV as: text_normalization_dataset_large_normalized.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Cell 7 — Interactive normalizer: enter any sentence and get normalized output
# Read one line from the user, run it through normalize_text, and print the result.
s = input("Type a sentence to normalize (or paste messy text):\n> ")
normalized_result = normalize_text(s)
print("\nNormalized result:\n", normalized_result)
