###  tokenization + aggregation + advanced regex — this is where text becomes features for models.



In [1]:
import pandas as pd
import re

data = {
    "id": [1, 2, 3, 4, 5, 6],
    "category": ["tech", "tech", "sports", "sports", "news", "news"],
    "text": [
        "NLP is amazing for AI.",
        "Transformers dominate NLP benchmarks.",
        "Football match on 2023-05-14!",
        "Cricket WorldCup 2024 schedule announced.",
        "Breaking news: Email leaks admin@domain.com",
        "Protests continue!!! Huge crowd in New York."
    ]
}

In [2]:
df  = pd.DataFrame(data)
df 

Unnamed: 0,id,category,text
0,1,tech,NLP is amazing for AI.
1,2,tech,Transformers dominate NLP benchmarks.
2,3,sports,Football match on 2023-05-14!
3,4,sports,Cricket WorldCup 2024 schedule announced.
4,5,news,Breaking news: Email leaks admin@domain.com
5,6,news,Protests continue!!! Huge crowd in New York.


In [3]:
# Example 31: Tokenize text column
import re 
df["tokens"] =[re.sub(r'[^\w\s]',"",text.lower()).split() for text in df["text"]]
print(df[["id","text","tokens"]])


   id                                          text  \
0   1                        NLP is amazing for AI.   
1   2         Transformers dominate NLP benchmarks.   
2   3                 Football match on 2023-05-14!   
3   4     Cricket WorldCup 2024 schedule announced.   
4   5   Breaking news: Email leaks admin@domain.com   
5   6  Protests continue!!! Huge crowd in New York.   

                                             tokens  
0                       [nlp, is, amazing, for, ai]  
1         [transformers, dominate, nlp, benchmarks]  
2                   [football, match, on, 20230514]  
3    [cricket, worldcup, 2024, schedule, announced]  
4    [breaking, news, email, leaks, admindomaincom]  
5  [protests, continue, huge, crowd, in, new, york]  


In [4]:
# Example 32: Explode tokens into separate rows
df_exploded = df.explode("tokens")
print(df_exploded)


   id category                                          text          tokens
0   1     tech                        NLP is amazing for AI.             nlp
0   1     tech                        NLP is amazing for AI.              is
0   1     tech                        NLP is amazing for AI.         amazing
0   1     tech                        NLP is amazing for AI.             for
0   1     tech                        NLP is amazing for AI.              ai
1   2     tech         Transformers dominate NLP benchmarks.    transformers
1   2     tech         Transformers dominate NLP benchmarks.        dominate
1   2     tech         Transformers dominate NLP benchmarks.             nlp
1   2     tech         Transformers dominate NLP benchmarks.      benchmarks
2   3   sports                 Football match on 2023-05-14!        football
2   3   sports                 Football match on 2023-05-14!           match
2   3   sports                 Football match on 2023-05-14!              on

In [5]:
# Example 33: Count token frequency

token_freq = df_exploded["tokens"].value_counts()
print(token_freq)

tokens
nlp               2
schedule          1
new               1
in                1
crowd             1
huge              1
continue          1
protests          1
admindomaincom    1
leaks             1
email             1
news              1
breaking          1
announced         1
2024              1
is                1
worldcup          1
cricket           1
20230514          1
on                1
match             1
football          1
benchmarks        1
dominate          1
transformers      1
ai                1
for               1
amazing           1
york              1
Name: count, dtype: int64


In [6]:
# Example 34: Remove rare tokens (appear once)
rare_tokens = token_freq[token_freq==1].index
df_filtered = df_exploded[~df_exploded["tokens"].isin(rare_tokens)]

print(df_filtered)


   id category                                   text tokens
0   1     tech                 NLP is amazing for AI.    nlp
1   2     tech  Transformers dominate NLP benchmarks.    nlp


In [7]:
# 35 
df["word_count"] = df["text"].str.split().str.len()
avg_word_count = df.groupby("category")["word_count"].sum()
print(avg_word_count)


category
news      12
sports     9
tech       9
Name: word_count, dtype: int64


In [8]:
# Example 36: Group by category and concatenate all texts
all_text_by_cat = df.groupby("category")["text"].apply(lambda x:" ".join(x))
print(all_text_by_cat)

category
news      Breaking news: Email leaks admin@domain.com Pr...
sports    Football match on 2023-05-14! Cricket WorldCup...
tech      NLP is amazing for AI. Transformers dominate N...
Name: text, dtype: object


In [9]:
# Example 37: Group by category and count messages
msg_count_by_cat = df.groupby("category")["id"].count()
print(msg_count_by_cat)

category
news      2
sports    2
tech      2
Name: id, dtype: int64


In [10]:
df

Unnamed: 0,id,category,text,tokens,word_count
0,1,tech,NLP is amazing for AI.,"[nlp, is, amazing, for, ai]",5
1,2,tech,Transformers dominate NLP benchmarks.,"[transformers, dominate, nlp, benchmarks]",4
2,3,sports,Football match on 2023-05-14!,"[football, match, on, 20230514]",4
3,4,sports,Cricket WorldCup 2024 schedule announced.,"[cricket, worldcup, 2024, schedule, announced]",5
4,5,news,Breaking news: Email leaks admin@domain.com,"[breaking, news, email, leaks, admindomaincom]",5
5,6,news,Protests continue!!! Huge crowd in New York.,"[protests, continue, huge, crowd, in, new, york]",7


In [11]:
# Example 38: Extract mentions (@username)
df["mentions"] = df["text"].str.extractall(r'@(\w+)').groupby(level=0)[0].agg(list)
print(df[["id", "mentions"]])


   id  mentions
0   1       NaN
1   2       NaN
2   3       NaN
3   4       NaN
4   5  [domain]
5   6       NaN


In [12]:
## Example 39: Extract dates (YYYY-MM-DD or YYYY format)
df['dates'] = df['text'].str.extract(r'(\d{4}(?:-\d{2}-\d{2})?)')
print(df[["id","dates"]])


   id       dates
0   1         NaN
1   2         NaN
2   3  2023-05-14
3   4        2024
4   5         NaN
5   6         NaN


In [13]:
# Example 40: Flag texts with >3 exclamation marks

df["high_exclaim"] = df["text"].str.contains(r'!{3,}')
print(df[["id","high_exclaim"]])


   id  high_exclaim
0   1         False
1   2         False
2   3         False
3   4         False
4   5         False
5   6          True
