🟦 1. Import libraries

In [1]:
import pandas as pd

🟦 2. Create a sample dataset

In [2]:
# Deliberately messy sample records: stray leading/trailing/internal
# whitespace, inconsistent casing, and one row with a missing Name and Email.
data = dict(
    Name=["  Alice Smith ", "BOB johnson", "Charlie   Brown", None, "david lee"],
    Email=["alice@gmail.com", "bob@yahoo.com", "charlie@gmail.com", None, "david@outlook.com"],
    City=["Toronto", "  NEW york", "los angeles ", "Chicago", "Toronto"],
    Code=["A-123", "B-456", "C-789", "A-000", "B-999"],
)

# One column per key, one row per list position.
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Email,City,Code
0,Alice Smith,alice@gmail.com,Toronto,A-123
1,BOB johnson,bob@yahoo.com,NEW york,B-456
2,Charlie Brown,charlie@gmail.com,los angeles,C-789
3,,,Chicago,A-000
4,david lee,david@outlook.com,Toronto,B-999


🟦 3. Cleaning Text

In [3]:
# Trim leading/trailing whitespace once, then derive both case variants from
# the trimmed text. Internal whitespace is left as-is; a missing name stays
# missing in every derived column.
name_trimmed = df["Name"].str.strip()

df["Name_clean"] = name_trimmed
df["Name_lower"] = name_trimmed.str.lower()
df["Name_upper"] = name_trimmed.str.upper()

df.loc[:, ["Name", "Name_clean", "Name_lower", "Name_upper"]]


Unnamed: 0,Name,Name_clean,Name_lower,Name_upper
0,Alice Smith,Alice Smith,alice smith,ALICE SMITH
1,BOB johnson,BOB johnson,bob johnson,BOB JOHNSON
2,Charlie Brown,Charlie Brown,charlie brown,CHARLIE BROWN
3,,,,
4,david lee,david lee,david lee,DAVID LEE


🟦 4. Finding & Replacing Text

4.1 Replace domain names

In [4]:
# Swap the gmail.com domain for "edu". The pattern is anchored to the "@"
# and to the end of the string so that only the domain part is rewritten: an
# address that merely contains "gmail.com" somewhere else (for example in the
# username) is left untouched. case=False because domain names are
# case-insensitive. Missing emails stay missing.
df["Email_new"] = df["Email"].str.replace(
    r"@gmail\.com$", "@edu", case=False, regex=True
)
df[["Email", "Email_new"]]

Unnamed: 0,Email,Email_new
0,alice@gmail.com,alice@edu
1,bob@yahoo.com,bob@yahoo.com
2,charlie@gmail.com,charlie@edu
3,,
4,david@outlook.com,david@outlook.com


4.2 Remove dashes from codes

In [5]:
# Remove every literal "-" from the code. regex=False means the "-" is
# matched as plain text, with no pattern syntax involved.
code_without_dashes = df["Code"].str.replace("-", "", regex=False)
df["Code_clean"] = code_without_dashes
df

Unnamed: 0,Name,Email,City,Code,Name_clean,Name_lower,Name_upper,Email_new,Code_clean
0,Alice Smith,alice@gmail.com,Toronto,A-123,Alice Smith,alice smith,ALICE SMITH,alice@edu,A123
1,BOB johnson,bob@yahoo.com,NEW york,B-456,BOB johnson,bob johnson,BOB JOHNSON,bob@yahoo.com,B456
2,Charlie Brown,charlie@gmail.com,los angeles,C-789,Charlie Brown,charlie brown,CHARLIE BROWN,charlie@edu,C789
3,,,Chicago,A-000,,,,,A000
4,david lee,david@outlook.com,Toronto,B-999,david lee,david lee,DAVID LEE,david@outlook.com,B999


🟦 5. Extracting Patterns from Text

5.1 Extract letter from Code

In [7]:
# Capture the first uppercase ASCII letter found in each code.
# expand=False returns a Series instead of a one-column DataFrame.
code_letter = df["Code"].str.extract(r"([A-Z])", expand=False)
df["Code_letter"] = code_letter
df[["Code", "Code_letter"]]

Unnamed: 0,Code,Code_letter
0,A-123,A
1,B-456,B
2,C-789,C
3,A-000,A
4,B-999,B


5.2 Extract numbers

In [8]:
# Capture the first run of digits found in each code. The captured value is
# text, not a number, so leading zeros are preserved.
# expand=False returns a Series instead of a one-column DataFrame.
code_digits = df["Code"].str.extract(r"(\d+)", expand=False)
df["Code_number"] = code_digits
df[["Code", "Code_number"]]


Unnamed: 0,Code,Code_number
0,A-123,123
1,B-456,456
2,C-789,789
3,A-000,000
4,B-999,999


🟦 6. Checking if Text Contains Pattern

6.1 Check if email is Gmail

In [9]:
# Flag addresses whose domain is exactly gmail.com. Anchoring on "@" and the
# end of the string avoids false positives when "gmail" merely appears in the
# username or inside a different domain. case=False because domain names are
# case-insensitive; na=False makes a missing email count as "not Gmail"
# instead of propagating a missing value into the boolean column.
df["Is_Gmail"] = df["Email"].str.contains(r"@gmail\.com$", case=False, na=False)
df[["Email", "Is_Gmail"]]

Unnamed: 0,Email,Is_Gmail
0,alice@gmail.com,True
1,bob@yahoo.com,False
2,charlie@gmail.com,True
3,,False
4,david@outlook.com,False


6.2 Filter only Gmail users

In [10]:
# Keep only the rows whose email domain is exactly gmail.com. The pattern is
# anchored on "@" and the end of the string so that "gmail" appearing in a
# username or in another domain does not match; case=False because domain
# names are case-insensitive. na=False turns missing emails into False so the
# mask is purely boolean and can be used for row selection.
is_gmail = df["Email"].str.contains(r"@gmail\.com$", case=False, na=False)
gmail_users = df[is_gmail]
gmail_users

Unnamed: 0,Name,Email,City,Code,Name_clean,Name_lower,Name_upper,Email_new,Code_clean,Code_letter,Code_number,Is_Gmail
0,Alice Smith,alice@gmail.com,Toronto,A-123,Alice Smith,alice smith,ALICE SMITH,alice@edu,A123,A,123,True
2,Charlie Brown,charlie@gmail.com,los angeles,C-789,Charlie Brown,charlie brown,CHARLIE BROWN,charlie@edu,C789,C,789,True


🟦 7. Splitting Strings into Multiple Columns

In [14]:
# Split each trimmed name into first/last at the first run of whitespace.
# Leaving `pat` unset splits on any whitespace run, so "Charlie   Brown"
# yields Last_Name "Brown"; splitting on a single " " would instead leave the
# extra spaces at the front of the last name ("  Brown").
# n=1 keeps everything after the first name together in Last_Name.
# A missing name produces missing values in both new columns.
df[["First_Name", "Last_Name"]] = df["Name_clean"].str.split(n=1, expand=True)
df[["Name_clean", "First_Name", "Last_Name"]]

Unnamed: 0,Name_clean,First_Name,Last_Name
0,Alice Smith,Alice,Smith
1,BOB johnson,BOB,johnson
2,Charlie Brown,Charlie,Brown
3,,,
4,david lee,david,lee


In [12]:
# Split each address into username and domain at the LAST "@" only.
# Limiting the split to one cut (n=1) caps the result at two columns, so an
# address containing an extra "@" cannot produce a third column and break the
# two-column assignment; splitting from the right keeps the domain intact.
# A missing email produces missing values in both new columns.
df[["Username", "Domain"]] = df["Email"].str.rsplit("@", n=1, expand=True)
df[["Email", "Username", "Domain"]]


Unnamed: 0,Email,Username,Domain
0,alice@gmail.com,alice,gmail.com
1,bob@yahoo.com,bob,yahoo.com
2,charlie@gmail.com,charlie,gmail.com
3,,,
4,david@outlook.com,david,outlook.com


🟦 8. Advanced Examples

8.1 Extract city initial

In [15]:
# Take the first character of each city name after trimming surrounding
# whitespace. The character keeps its original casing.
city_trimmed = df["City"].str.strip()
df["City_initial"] = city_trimmed.str[0]
df[["City", "City_initial"]]

Unnamed: 0,City,City_initial
0,Toronto,T
1,NEW york,N
2,los angeles,l
3,Chicago,C
4,Toronto,T


8.2 Count length of each name

In [17]:
# Character count of each trimmed name; internal spaces are counted too.
# A missing name has no length, so that row holds a missing value.
name_length = df["Name_clean"].str.len()
df["Name_length"] = name_length
df[["Name_clean", "Name_length"]]

Unnamed: 0,Name_clean,Name_length
0,Alice Smith,11.0
1,BOB johnson,11.0
2,Charlie Brown,15.0
3,,
4,david lee,9.0


🟦 9. Handling Missing Values in String Operations

In [18]:
# Lower-case the trimmed names, then substitute the placeholder "unknown"
# for rows where the name is missing.
lowered_name = df["Name_clean"].str.lower()
df["Safe_lower_name"] = lowered_name.fillna("unknown")
df[["Name_clean", "Safe_lower_name"]]

Unnamed: 0,Name_clean,Safe_lower_name
0,Alice Smith,alice smith
1,BOB johnson,bob johnson
2,Charlie Brown,charlie brown
3,,unknown
4,david lee,david lee


🟦 10. Multiple String Transformations Chain

In [22]:
# Build an underscore-separated, lower-case version of the raw name in
# named steps rather than one long chain.

# Step 1: trim the ends and normalise the casing.
name_normalised = df["Name"].str.strip().str.lower()

# Step 2: collapse every run of whitespace into a single space.
name_single_spaced = name_normalised.str.replace(r"\s+", " ", regex=True)

# Step 3: turn the remaining single spaces into underscores.
df["Final_Name"] = name_single_spaced.str.replace(" ", "_", regex=False)

df[["Name", "Final_Name"]]

Unnamed: 0,Name,Final_Name
0,Alice Smith,alice_smith
1,BOB johnson,bob_johnson
2,Charlie Brown,charlie_brown
3,,
4,david lee,david_lee


# ✅ Summary of Methods Covered


| Method                          | Purpose                                   |
| ------------------------------- | ----------------------------------------- |
| `.str.strip()`                  | Remove leading/trailing whitespace        |
| `.str.lower()` / `.str.upper()` | Case formatting                           |
| `.str.replace()`                | Replace text or regex patterns            |
| `.str.extract()`                | Extract values using regex capture groups |
| `.str.contains()`               | Check whether text matches a pattern      |
| `.str.split()`                  | Break text into components                |
| `.str.len()`                    | Get string length                         |
| `.str[]`                        | Character indexing and slicing            |
