In [1]:
# 🚀 Text Normalization Dataset Generator with Expanded Sentences
import random
import re

import pandas as pd
from google.colab import files

# Base sentences: the clean targets of the dataset (70 examples, 10 per category).
# Every entry is lowercase and free of punctuation; generate_dataset writes the
# chosen sentence unchanged into the "expected_normalized_text" column.
base_sentences = [
    # Greetings & small talk
    "hello how are you doing today",
    "good morning have a nice day",
    "good night sweet dreams",
    "hi there long time no see",
    "see you soon my friend",
    "thanks a lot for your help",
    "happy birthday to you",
    "congratulations on your success",
    "have a safe journey",
    "take care and stay safe",

    # Casual conversation
    "what are you doing right now",
    "i will meet you at the park tomorrow",
    "let us go for a walk",
    "cannot wait for the weekend",
    "i am so happy and excited",
    "weather is great today",
    "this movie was fantastic",
    "the exam was really hard",
    "python is my favorite language",
    "i love natural language processing",

    # Work/study related
    "please send me the report asap",
    "i need to finish my homework",
    "the deadline is next monday",
    "let us schedule a meeting for tomorrow",
    "i will call you after the class",
    "project submission is due tonight",
    "machine learning is awesome",
    "data science requires a lot of practice",
    "artificial intelligence is the future",
    "text normalization is very important",

    # Questions & answers
    "where are you going now",
    "do you know the answer",
    "can you please help me",
    "what time is the train",
    "is it going to rain today",
    "are you free this evening",
    "when will you be back",
    "did you complete the task",
    "who is your best friend",
    "why are you so late",

    # Social media/chat slang
    "this song is lit",
    "that was a savage reply",
    "lol that is so funny",
    "brb need to grab food",
    "idk what to say",
    "btw did you check instagram",
    "omg that is crazy",
    "smh i cannot believe it",
    "yolo lets do this",
    "fyi the meeting is cancelled",

    # News/event style
    "the president will speak tonight",
    "the football match starts at 9 pm",
    "the new iphone was just announced",
    "the economy is growing faster this year",
    "the weather forecast predicts heavy rain",
    "scientists discovered a new species",
    "the concert tickets are sold out",
    "traffic is heavy on the main road",
    "the movie will be released next friday",
    "the festival begins tomorrow",

    # Mixed pronouns & tenses
    "she is reading a book",
    "he went to the market yesterday",
    "they are playing football outside",
    "we will travel to paris next month",
    "i had breakfast already",
    "you should drink more water",
    "i was watching tv when you called",
    "he is working from home today",
    "they will arrive in an hour",
    "we are learning python programming",
]

# Variations to mess up text. Each entry is a callable that takes a string and
# returns a noisier version of it (or the same string when the rule does not apply).
def _swap_word(word, replacement):
    """Return a transform that replaces whole-word occurrences of ``word``.

    A plain ``str.replace`` also rewrites the inside of longer words
    ("care" -> "cr", "forecast" -> "4ecast", "before" -> "be4e"), which is
    noise nobody would actually type. Anchoring the match on word boundaries
    restricts the swap to the word itself.
    """
    pattern = re.compile(r"\b" + re.escape(word) + r"\b")
    # A callable replacement keeps ``replacement`` literal (no escape/group expansion).
    return lambda s: pattern.sub(lambda _match: replacement, s)


def _stutter(s):
    """Randomly double individual characters; each one has a 50% chance."""
    return ''.join(random.choice([c, c + c]) for c in s)


variations = [
    lambda s: s.upper(),
    lambda s: s.capitalize(),
    _swap_word("you", "u"),
    _swap_word("are", "r"),
    lambda s: s + "!!!",
    lambda s: "   " + s + "   ",
    _swap_word("today", "2day"),
    _swap_word("to", "2"),
    _swap_word("and", "&"),
    _swap_word("asap", "ASAP"),
    _swap_word("great", "gr8"),
    _swap_word("excited", "excitedddd"),
    _swap_word("cannot", "can't"),
    _swap_word("thanks", "thx"),
    _swap_word("morning", "mornin"),
    _swap_word("for", "4"),
    _swap_word("with", "w/"),
    _swap_word("because", "cuz"),
    _swap_word("people", "ppl"),
    _swap_word("before", "b4"),
    _swap_word("please", "plz"),
    _swap_word("love", "luv"),
    _swap_word("very", "vry"),
    _swap_word("okay", "ok"),
    _swap_word("message", "msg"),
    _swap_word("tomorrow", "tmrw"),
    lambda s: s + random.choice([" 😂", " 😎", " 🤔", " 😭"]),
    _stutter,  # extra letters
]

# Function to mess up text randomly
def mess_up(sentence, transforms=None):
    """Apply one to three random corruptions to ``sentence`` and return the result.

    Args:
        sentence: The clean input string.
        transforms: Optional list of ``str -> str`` callables to draw from.
            Defaults to the module-level ``variations`` list.

    Returns:
        The corrupted string. It equals ``sentence`` only when no transform
        in ``transforms`` is able to change it.

    Most transforms are keyword swaps that do nothing unless the keyword is
    present, so picking one blindly often leaves the text untouched. Each round
    therefore tries the transforms in random order and keeps the first one that
    actually changes the text.
    """
    if transforms is None:
        transforms = variations
    messy = sentence
    for _ in range(random.randint(1, 3)):
        # random.sample over the full length yields a shuffled copy without
        # mutating the caller's list.
        for transform in random.sample(transforms, len(transforms)):
            candidate = transform(messy)
            if candidate != messy:
                messy = candidate
                break
    return messy

# Function to generate dataset and download
def generate_dataset(num_rows, file_name, seed=None):
    """Build a noisy/clean sentence-pair dataset, save it as CSV and download it.

    Each row pairs a randomly chosen entry of ``base_sentences``
    (column ``expected_normalized_text``) with the output of ``mess_up`` on
    that entry (column ``raw_text``).

    Args:
        num_rows: Number of rows to generate.
        file_name: Path of the CSV file to write.
        seed: Optional seed for the global ``random`` generator. Passing the
            same seed reproduces the same dataset. ``None`` (the default)
            leaves the generator state untouched.

    Side effects:
        Writes ``file_name``, prints a confirmation line and calls
        ``files.download(file_name)``.
    """
    if seed is not None:
        # The variations draw from the module-level generator, so seeding it
        # here is what makes a run repeatable.
        random.seed(seed)
    rows = []
    for _ in range(num_rows):
        sent = random.choice(base_sentences)
        messy = mess_up(sent)
        rows.append({"raw_text": messy, "expected_normalized_text": sent})
    # Explicit columns keep the CSV header even when no rows were generated.
    df = pd.DataFrame(rows, columns=["raw_text", "expected_normalized_text"])
    df.to_csv(file_name, index=False)
    print(f"✅ Dataset saved as {file_name} with {num_rows} rows.")
    files.download(file_name)

# Generate datasets
# The 50,000-row variant is disabled; uncomment the next line to produce it as well.
#generate_dataset(50000, "text_normalization_dataset_small.csv")
# Writes 200,000 rows to the CSV named below, then hands the file to files.download.
generate_dataset(200000, "text_normalization_dataset_large.csv")


✅ Dataset saved as text_normalization_dataset_large.csv with 200000 rows.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>