In [1]:
import pandas as pd 
import re 


In [None]:
# Toy corpus: five ids paired with short text snippets; the last text is missing on purpose.
data = {
    "id": list(range(1, 6)),
    "text": [
        "I love NLP and AI!",
        "Win FREE prizes!!!",
        "Contact us: info@example.com",
        "Python > Java??",
        None,
    ],
}

df = pd.DataFrame(data)

print(df.head())




   id                          text
0   1            I love NLP and AI!
1   2            Win FREE prizes!!!
2   3  Contact us: info@example.com
3   4               Python > Java??
4   5                          None


In [4]:
# Step 2: select the "text" column on its own and show it.
text_column = df["text"]
print(text_column)


0              I love NLP and AI!
1              Win FREE prizes!!!
2    Contact us: info@example.com
3                 Python > Java??
4                            None
Name: text, dtype: object


In [None]:
# Step 3: replace missing text entries with the placeholder "Unknown".
missing_text = df["text"].isna()
df.loc[missing_text, "text"] = "Unknown"
print(df)

   id                          text
0   1            I love NLP and AI!
1   2            Win FREE prizes!!!
2   3  Contact us: info@example.com
3   4               Python > Java??
4   5                       Unknown


In [None]:
# Example 4: keep only the rows whose text mentions "NLP"
# (na=False makes missing values count as "no match").
mentions_nlp = df["text"].str.contains("NLP", na=False)
nlp_rows = df.loc[mentions_nlp]
print(nlp_rows)

   id                text
0   1  I love NLP and AI!


In [7]:
# Step 5: renumber the filtered rows from 0; drop=True discards the old index
# labels instead of keeping them as a new column.
nlp_rows = nlp_rows.reset_index(drop=True)
print(nlp_rows)




   id                text
0   1  I love NLP and AI!


In [8]:
# Step 6: store a lower-cased version of the text in a new "text_lower" column.
lowered_text = df["text"].str.lower()
df["text_lower"] = lowered_text
print(df.loc[:, ["id", "text_lower"]])


   id                    text_lower
0   1            i love nlp and ai!
1   2            win free prizes!!!
2   3  contact us: info@example.com
3   4               python > java??
4   5                       unknown


In [9]:
# Example 7: remove punctuation — delete every character that is neither a
# word character (\w) nor whitespace (\s) from the lower-cased text.
df["text_clean"] = df["text_lower"].str.replace(pat=r'[^\w\s]', repl="", regex=True)
df["text_clean"]

0            i love nlp and ai
1              win free prizes
2    contact us infoexamplecom
3                 python  java
4                      unknown
Name: text_clean, dtype: object

In [10]:
# Extract an email-like token (chars@chars) from each text; rows without one get NaN.
email_pattern = r'([\w\.-]+@[\w\.-]+)'
df["emails"] = df["text"].str.extract(email_pattern)

print(df.loc[:, ["id", "emails"]])

   id            emails
0   1               NaN
1   2               NaN
2   3  info@example.com
3   4               NaN
4   5               NaN


In [11]:
# Extract a run of digits from each text; rows without any digit get NaN.
digit_pattern = r'(\d+)'
df["numbers"] = df["text"].str.extract(digit_pattern)
print(df["numbers"])

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
Name: numbers, dtype: object


In [12]:
# Step 10: collect, per row, the words made of one capital letter followed only
# by lowercase letters (so "Win" and "I" match, all-caps tokens such as "NLP" do not).
capitalised_word = re.compile(r'\b[A-Z][a-z]*\b')
df["capital_words"] = df["text"].apply(capitalised_word.findall)

df[["id", "capital_words"]]

Unnamed: 0,id,capital_words
0,1,[I]
1,2,[Win]
2,3,[Contact]
3,4,"[Python, Java]"
4,5,[Unknown]


### Conditional Filtering + functions

In [6]:
import pandas as pd 
import re
# Second toy corpus: six ids whose texts include a digits-only entry and an empty string.
data_2 = {
    "id": list(range(1, 7)),
    "text": [
        "Win FREE prizes now!",
        "Python is great for NLP.",
        "What is your name?",
        "123456",
        "Contact: admin@domain.com",
        "",
    ],
}
df_2 = pd.DataFrame(data_2)
print(df_2)

   id                       text
0   1       Win FREE prizes now!
1   2   Python is great for NLP.
2   3         What is your name?
3   4                     123456
4   5  Contact: admin@domain.com
5   6                           


In [7]:
# Replace any missing text entries with the placeholder "Unknown".
missing_text_2 = df_2["text"].isna()
df_2.loc[missing_text_2, "text"] = "Unknown"


In [8]:
# Example 11: keep the rows whose text is longer than 10 characters.
is_long = df_2["text"].str.len() > 10
long_texts = df_2.loc[is_long]
print(long_texts)

   id                       text
0   1       Win FREE prizes now!
1   2   Python is great for NLP.
2   3         What is your name?
4   5  Contact: admin@domain.com


In [9]:
# Example 12: flag a row as spam when its text contains "free" or "win",
# ignoring letter case. Note this is a substring test, so longer words that
# merely contain those letters are flagged too.
#
# The alternation uses a NON-capturing group (?:...): with a capturing group,
# str.contains emits a "pattern has match groups" UserWarning even though the
# boolean result is the same.
df_2["is_spam"] = df_2["text"].str.contains(r'(?:free|win)', flags=re.IGNORECASE, na=False)
print(df_2[["id", "text", "is_spam"]])


   id                       text  is_spam
0   1       Win FREE prizes now!     True
1   2   Python is great for NLP.    False
2   3         What is your name?    False
3   4                     123456    False
4   5  Contact: admin@domain.com    False
5   6                               False


  df_2["is_spam"] = df_2["text"].str.contains(r'(free|win)',flags=re.IGNORECASE,na=False)


In [10]:
# Example 13: keep only the rows whose text contains at least one ASCII letter.
has_letter = df_2["text"].str.contains(r'[A-Za-z]', na=False)
df_2_alpha = df_2.loc[has_letter]
print(df_2_alpha)

   id                       text  is_spam
0   1       Win FREE prizes now!     True
1   2   Python is great for NLP.    False
2   3         What is your name?    False
4   5  Contact: admin@domain.com    False


In [11]:
# Step 14: keep only the rows whose text is digits from its first character
# through to the end of the string.
digits_only = df_2["text"].str.match(r'\d+$')
df_2_num = df_2.loc[digits_only]
print(df_2_num)


   id    text  is_spam
3   4  123456    False


In [12]:
# Step 15: drop rows whose text is empty or whitespace-only.
# The explicit .copy() makes the filtered result an independent frame. Later
# cells add new columns to df_2; without the copy, whether pandas complains
# about writing to a slice of another frame depends on whether the parent
# frame is still referenced somewhere in the kernel.
has_text = df_2["text"].str.strip() != ""
df_2 = df_2.loc[has_text].copy()
df_2

Unnamed: 0,id,text,is_spam
0,1,Win FREE prizes now!,True
1,2,Python is great for NLP.,False
2,3,What is your name?,False
3,4,123456,False
4,5,Contact: admin@domain.com,False


In [15]:
# Step 16: helper that strips special symbols from a string.
_SYMBOL_PATTERN = re.compile(r'[^A-Za-z0-9\s]')


def clean_symbols(text):
    """Return ``text`` with every character removed except ASCII letters, digits and whitespace."""
    return _SYMBOL_PATTERN.sub('', text)

# Run the symbol stripper over every text and show the cleaned result per id.
cleaned = df_2["text"].apply(clean_symbols)
df_2["clean_text"] = cleaned
print(df_2.loc[:, ["id", "clean_text"]])


   id               clean_text
0   1      Win FREE prizes now
1   2  Python is great for NLP
2   3        What is your name
3   4                   123456
4   5   Contact admindomaincom


In [17]:
# Step 17: helper that counts the words in a string.
def word_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

# Count the words of the symbol-free text and show the total per id.
token_totals = df_2["clean_text"].apply(word_count)
df_2["word_count"] = token_totals
print(df_2.loc[:, ["id", "word_count"]])

   id  word_count
0   1           4
1   2           5
2   3           4
3   4           1
4   5           2


In [19]:
# Example 18: helper that detects questions.
def is_question(text):
    """Return True when ``text``, ignoring surrounding whitespace, ends with "?"."""
    trimmed = text.strip()
    # Slicing (rather than indexing) keeps the empty string safe: "" yields "".
    return trimmed[-1:] == "?"
question_flags = df_2["text"].apply(is_question)
df_2["is_qn"] = question_flags
print(df_2.loc[:, ["id", "text", "is_qn"]])

   id                       text  is_qn
0   1       Win FREE prizes now!  False
1   2   Python is great for NLP.  False
2   3         What is your name?   True
3   4                     123456  False
4   5  Contact: admin@domain.com  False


In [21]:
# Example 19: helper that approximates named entities with simple capitalised words.

def extract_capital_words(text):
    """Return the capitalised words found in ``text``, in order of appearance.

    A word qualifies when it is one uppercase ASCII letter followed only by
    lowercase ASCII letters, so all-caps tokens are not returned.
    """
    capital_word = re.compile(r'\b[A-Z][a-z]*\b')
    return [hit.group(0) for hit in capital_word.finditer(text)]

entity_lists = df_2["text"].apply(extract_capital_words)
df_2["named_entities"] = entity_lists

print(df_2.loc[:, ["id", "text", "named_entities"]])

   id                       text named_entities
0   1       Win FREE prizes now!          [Win]
1   2   Python is great for NLP.       [Python]
2   3         What is your name?         [What]
3   4                     123456             []
4   5  Contact: admin@domain.com      [Contact]


In [23]:
# Example 20: Function to expand contractions (mock example)
contractions = {"don't": "do not", "can't": "cannot", "won't": "will not"}

def expand_contractions(text, mapping=None):
    """Replace each known contraction in ``text`` with its expanded form.

    Matching ignores letter case and only fires on whole tokens (a contraction
    embedded inside a longer word is left alone). All keys are handled in a
    single pass, so the output of one expansion is never re-matched by another.

    Parameters
    ----------
    text : str
        The string to process.
    mapping : dict[str, str], optional
        Contraction -> expansion table. Defaults to the module-level
        ``contractions`` dictionary.

    Returns
    -------
    str
        ``text`` with every matched contraction replaced by its expansion.
    """
    table = contractions if mapping is None else mapping
    if not table:
        # An empty alternation would match the empty string everywhere.
        return text

    lookup = {key.lower(): value for key, value in table.items()}
    # Longest keys first so a short key cannot pre-empt a longer one sharing its prefix.
    ordered_keys = sorted(lookup, key=len, reverse=True)
    # re.escape: keys are literal text, not regex syntax.
    alternatives = "|".join(re.escape(key) for key in ordered_keys)
    # Lookarounds instead of \b so keys that start/end with punctuation still match.
    pattern = re.compile(r"(?<!\w)(?:" + alternatives + r")(?!\w)", flags=re.IGNORECASE)

    def _expand(hit):
        matched = hit.group(0)
        # Fall back to the matched text if case-folding produced an unknown key.
        return lookup.get(matched.lower(), matched)

    # A function replacement also avoids backslash-template processing of the expansion.
    return pattern.sub(_expand, text)

expanded = df_2["text"].apply(expand_contractions)
df_2["expanded_text"] = expanded

print(df_2.loc[:, ["id", "text", "expanded_text"]])


   id                       text              expanded_text
0   1       Win FREE prizes now!       Win FREE prizes now!
1   2   Python is great for NLP.   Python is great for NLP.
2   3         What is your name?         What is your name?
3   4                     123456                     123456
4   5  Contact: admin@domain.com  Contact: admin@domain.com
