In [1]:
import pandas as pd 
import re 

# Mock dataset: five short texts, each paired with a numeric id.
# The texts mix upper-case words, a URL, digits and repeated punctuation.
data = dict(
    id=[1, 2, 3, 4, 5],
    text=[
        "Natural Language Processing is FUN!",
        "I LOVE AI and NLP.",
        "Visit https://example.com for FREE info.",
        "Numbers 123 are here.",
        "Spam!!! spam spam...",
    ],
)



In [5]:
# Example 21: Walk over the rows and report the character count of each text
df = pd.DataFrame(data)

# Series.items() yields (index label, value) pairs, so only the "text"
# column is visited instead of materializing a full row per iteration.
for index, text in df["text"].items():
    print(f"index {index},text length is {len(text)}")
    

index 0,text length is 35
index 1,text length is 18
index 2,text length is 40
index 3,text length is 21
index 4,text length is 20


In [6]:
# Example 22: Lowercase all texts and store them in a new column
# (str.lower is mapped over the column and materialized as a list).
df["text_lower"] = list(map(str.lower, df["text"]))
df["text_lower"]

0         natural language processing is fun!
1                          i love ai and nlp.
2    visit https://example.com for free info.
3                       numbers 123 are here.
4                        spam!!! spam spam...
Name: text_lower, dtype: object

In [7]:
# Example 23: Remove punctuation with a list comprehension
# The pattern matches any character that is neither a word character (\w)
# nor whitespace (\s); every match is replaced with the empty string.
punct_re = re.compile(r'[^\w\s]')
df["text_no_punc"] = [punct_re.sub("", raw) for raw in df["text"]]
print(df["text_no_punc"])

0     Natural Language Processing is FUN
1                      I LOVE AI and NLP
2    Visit httpsexamplecom for FREE info
3                   Numbers 123 are here
4                         Spam spam spam
Name: text_no_punc, dtype: object


In [10]:
# Example 24: Tokenize the punctuation-free text by splitting on whitespace
token_lists = []
for cleaned in df["text_no_punc"]:
    # str.split() with no argument splits on any run of whitespace
    token_lists.append(cleaned.split())
df["tokens"] = token_lists
print(df["tokens"])


0     [Natural, Language, Processing, is, FUN]
1                      [I, LOVE, AI, and, NLP]
2    [Visit, httpsexamplecom, for, FREE, info]
3                    [Numbers, 123, are, here]
4                           [Spam, spam, spam]
Name: tokens, dtype: object


In [11]:
# Example 25: Build a dictionary mapping id → tokens
# The "id" and "tokens" columns are paired up row by row.
id_token_dict = {
    row_id: row_tokens
    for row_id, row_tokens in zip(df["id"], df["tokens"])
}
print(id_token_dict)

{1: ['Natural', 'Language', 'Processing', 'is', 'FUN'], 2: ['I', 'LOVE', 'AI', 'and', 'NLP'], 3: ['Visit', 'httpsexamplecom', 'for', 'FREE', 'info'], 4: ['Numbers', '123', 'are', 'here'], 5: ['Spam', 'spam', 'spam']}


In [13]:
# Example 26: Use a lambda to trim leading/trailing whitespace from each text
df["text_stripped"] = df["text"].map(lambda raw: raw.strip())
print(df.loc[:, ["id", "text_stripped"]])

   id                             text_stripped
0   1       Natural Language Processing is FUN!
1   2                        I LOVE AI and NLP.
2   3  Visit https://example.com for FREE info.
3   4                     Numbers 123 are here.
4   5                      Spam!!! spam spam...


In [14]:
# Example 27: Use a lambda to swap URLs for a <URL> placeholder
# The pattern matches "http" followed by every non-whitespace character
# after it, so trailing punctuation glued to the URL is consumed as well.
url_re = re.compile(r'http\S+')
df["text_no_url"] = df["text"].map(lambda raw: url_re.sub("<URL>", raw))
print(df.loc[:, ["id", "text_no_url"]])

   id                          text_no_url
0   1  Natural Language Processing is FUN!
1   2                   I LOVE AI and NLP.
2   3           Visit <URL> for FREE info.
3   4                Numbers 123 are here.
4   5                 Spam!!! spam spam...


In [15]:
# Example 28: map() to count tokens in each text
# The token lists live in the "tokens" column built in Example 24; the
# previous lookup of df["token"] raised KeyError on a fresh kernel.
# len() of each list is the number of tokens in that row.
df["token_count"] = df["tokens"].map(len)
print(df[["id","token_count"]])


   id  token_count
0   1            5
1   2            5
2   3            5
3   4            4
4   5            3


In [16]:
# Example 29: map() to flag texts that contain the keyword 'nlp'
def mentions_nlp(lowered):
    """Return True when the substring 'nlp' occurs in ``lowered``."""
    return "nlp" in lowered


# The check runs on the lowercased column, so "NLP" in the raw text matches too.
df["contain_nlp"] = df["text_lower"].map(mentions_nlp)
print(df["contain_nlp"])

0    False
1     True
2    False
3    False
4    False
Name: contain_nlp, dtype: bool


In [17]:
# Example 30: Lambda to normalize numbers (every digit becomes '0')
# Works on the punctuation-free column; each digit is replaced one-for-one,
# so the length of a number is preserved (e.g. "123" -> "000").
digit_re = re.compile(r'\d')
df["text_num_normalized"] = df["text_no_punc"].map(
    lambda cleaned: digit_re.sub("0", cleaned)
)
print(df.loc[:, ["id", "text_num_normalized"]])

   id                  text_num_normalized
0   1   Natural Language Processing is FUN
1   2                    I LOVE AI and NLP
2   3  Visit httpsexamplecom for FREE info
3   4                 Numbers 000 are here
4   5                       Spam spam spam


Tokenization, aggregation, and advanced regex — this is where text becomes features for models.

