###  Real NLP Cleaning Pipeline (Examples 41–50 combined)

In [14]:
import pandas as pd 
import re 


In [15]:
# Step 1: Mock dataset (like scraped/social media data)
# ---------------------------
data = {
    "id": [1, 2, 3, 4, 5],
    "text": [
        "WIN a FREE iPhone!!! Visit https://promo.com @user1",
        "Python 3.9 released on 2020-10-05. Amazing update!",
        "Email leaks: admin@company.com shared details.",
        "Weather in New York!!! Flood alerts.",
        "Sports Update: WorldCup2024 starts soon!!!"
    ],
    "category": ["promo", "tech", "news", "news", "sports"]
}


In [16]:
df  = pd.DataFrame(data)
df 

Unnamed: 0,id,text,category
0,1,WIN a FREE iPhone!!! Visit https://promo.com @...,promo
1,2,Python 3.9 released on 2020-10-05. Amazing upd...,tech
2,3,Email leaks: admin@company.com shared details.,news
3,4,Weather in New York!!! Flood alerts.,news
4,5,Sports Update: WorldCup2024 starts soon!!!,sports


In [17]:
# Step 2 (Example 41-42): Clean basic text (lowercase, remove extra spaces)

df["clean_text"] = (
    df["text"].str.lower().str.replace(r"\s+"," ",regex=True).str.strip()
    
)
print(df[["id","text","clean_text"]])


   id                                               text  \
0   1  WIN a FREE iPhone!!! Visit https://promo.com @...   
1   2  Python 3.9 released on 2020-10-05. Amazing upd...   
2   3     Email leaks: admin@company.com shared details.   
3   4               Weather in New York!!! Flood alerts.   
4   5         Sports Update: WorldCup2024 starts soon!!!   

                                          clean_text  
0  win a free iphone!!! visit https://promo.com @...  
1  python 3.9 released on 2020-10-05. amazing upd...  
2     email leaks: admin@company.com shared details.  
3               weather in new york!!! flood alerts.  
4         sports update: worldcup2024 starts soon!!!  


In [18]:
# Step 3 (Example 43): Remove URLs
df["clean_text"] = df["clean_text"].str.replace(r'http\S+',"<URL",regex=True)
df["clean_text"]

0               win a free iphone!!! visit <URL @user1
1    python 3.9 released on 2020-10-05. amazing upd...
2       email leaks: admin@company.com shared details.
3                 weather in new york!!! flood alerts.
4           sports update: worldcup2024 starts soon!!!
Name: clean_text, dtype: object

In [19]:
# 44 : remove email address 
df["clean_text"] = df["text"].str.replace(r'[\w\.-]+@[\w\.-]+', '<EMAIL>', regex=True)
print(df["clean_text"])


0    WIN a FREE iPhone!!! Visit https://promo.com @...
1    Python 3.9 released on 2020-10-05. Amazing upd...
2                 Email leaks: <EMAIL> shared details.
3                 Weather in New York!!! Flood alerts.
4           Sports Update: WorldCup2024 starts soon!!!
Name: clean_text, dtype: object


In [20]:
# Step 5 (Example 45): Normalize numbers
df["clean_text"] = df["clean_text"].str.replace(r'\d',"<NUM>",regex=True)
df["clean_text"] 

0    WIN a FREE iPhone!!! Visit https://promo.com @...
1    Python <NUM>.<NUM> released on <NUM><NUM><NUM>...
2                 Email leaks: <EMAIL> shared details.
3                 Weather in New York!!! Flood alerts.
4    Sports Update: WorldCup<NUM><NUM><NUM><NUM> st...
Name: clean_text, dtype: object

In [21]:
# Step 6 (Example 46): Remove punctuation except placeholders
df["clean_text"] = df["clean_text"].str.replace(r'[^\w\s<>]',"",regex=True)

print(df["clean_text"])


0      WIN a FREE iPhone Visit httpspromocom user<NUM>
1    Python <NUM><NUM> released on <NUM><NUM><NUM><...
2                   Email leaks <EMAIL> shared details
3                     Weather in New York Flood alerts
4    Sports Update WorldCup<NUM><NUM><NUM><NUM> sta...
Name: clean_text, dtype: object


In [22]:
# Step 7 (Example 47): Tokenize

df["tokens"] = df["clean_text"].str.split()
print(df["tokens"])

0    [WIN, a, FREE, iPhone, Visit, httpspromocom, u...
1    [Python, <NUM><NUM>, released, on, <NUM><NUM><...
2             [Email, leaks, <EMAIL>, shared, details]
3              [Weather, in, New, York, Flood, alerts]
4    [Sports, Update, WorldCup<NUM><NUM><NUM><NUM>,...
Name: tokens, dtype: object


In [23]:
# Step 8 (Example 48): Remove stopwords using list comprehension
stopwords = {'a', 'the', 'is', 'in', 'on', 'at', 'and', 'or', 'to', 'for', 'of', 'soon'}
df["tokens"] =df["tokens"].apply(lambda tokens:[t for t in tokens if t not in stopwords])
df["tokens"]


0    [WIN, FREE, iPhone, Visit, httpspromocom, user...
1    [Python, <NUM><NUM>, released, <NUM><NUM><NUM>...
2             [Email, leaks, <EMAIL>, shared, details]
3                  [Weather, New, York, Flood, alerts]
4    [Sports, Update, WorldCup<NUM><NUM><NUM><NUM>,...
Name: tokens, dtype: object

In [24]:
# Step 9 (Example 49): Join tokens back to clean string

df["final_cleaned_dataset"] = df["tokens"].apply(lambda tokens:" ".join(tokens))
df["final_cleaned_dataset"] 

0        WIN FREE iPhone Visit httpspromocom user<NUM>
1    Python <NUM><NUM> released <NUM><NUM><NUM><NUM...
2                   Email leaks <EMAIL> shared details
3                        Weather New York Flood alerts
4    Sports Update WorldCup<NUM><NUM><NUM><NUM> starts
Name: final_cleaned_dataset, dtype: object

In [26]:
# Step 10 (Example 50): Export cleaned dataset

df[["id","category","final_cleaned_dataset"]].to_csv("cleaned_nlp_dataset.csv",index=False)

print(df[["id","category","final_cleaned_dataset"]])


   id category                              final_cleaned_dataset
0   1    promo      WIN FREE iPhone Visit httpspromocom user<NUM>
1   2     tech  Python <NUM><NUM> released <NUM><NUM><NUM><NUM...
2   3     news                 Email leaks <EMAIL> shared details
3   4     news                      Weather New York Flood alerts
4   5   sports  Sports Update WorldCup<NUM><NUM><NUM><NUM> starts
