# **String Manipulation**

In [1]:
import pandas as pd

In [2]:
# Deliberately messy sample records: names with stray padding and mixed case,
# an inconsistently-cased e-mail domain, and prices stored as formatted text.
data = dict(
    Name=['  ali  ', 'NOman', 'sARA ', 'Ahmed'],
    Email=['ali@gmail.com', 'noman@YAHOO.com', 'sara@gmail.com', 'ahmed@outlook.com'],
    Price=['$1,000', '$2,500', '$1,200', '$3,000'],
)
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Email,Price
0,ali,ali@gmail.com,"$1,000"
1,NOman,noman@YAHOO.com,"$2,500"
2,sARA,sara@gmail.com,"$1,200"
3,Ahmed,ahmed@outlook.com,"$3,000"


### 1. The `.str` Accessor

In [3]:
# `.str` exposes vectorized string methods on a Series.
# This only displays a lowercased copy: nothing is assigned, so df is unchanged.
df["Name"].str.lower()

0      ali  
1      noman
2      sara 
3      ahmed
Name: Name, dtype: str

### 2. Changing Case

In [4]:
# Convert to lowercase and write the result back into the 'Name' column,
# overwriting the mixed-case values the frame was created with.
df['Name'] = df['Name'].str.lower()
df.head()

Unnamed: 0,Name,Email,Price
0,ali,ali@gmail.com,"$1,000"
1,noman,noman@YAHOO.com,"$2,500"
2,sara,sara@gmail.com,"$1,200"
3,ahmed,ahmed@outlook.com,"$3,000"


In [5]:
# Convert to uppercase. 'Name' is overwritten again, so this operates on
# whatever the column currently holds rather than on the original sample data.
df['Name'] = df['Name'].str.upper()
df.head()

Unnamed: 0,Name,Email,Price
0,ALI,ali@gmail.com,"$1,000"
1,NOMAN,noman@YAHOO.com,"$2,500"
2,SARA,sara@gmail.com,"$1,200"
3,AHMED,ahmed@outlook.com,"$3,000"


In [6]:
# Convert to title case (first letter of each word capitalized, the rest
# lowercased). Only letter case changes here; whitespace is handled separately.
df['Name'] = df['Name'].str.title()
df.head()

Unnamed: 0,Name,Email,Price
0,Ali,ali@gmail.com,"$1,000"
1,Noman,noman@YAHOO.com,"$2,500"
2,Sara,sara@gmail.com,"$1,200"
3,Ahmed,ahmed@outlook.com,"$3,000"


### 3. Cleaning Whitespace

In [7]:
# Remove leading and trailing whitespace from every name and store the
# cleaned values back in the 'Name' column.
df['Name'] = df['Name'].str.strip()
df.head()

Unnamed: 0,Name,Email,Price
0,Ali,ali@gmail.com,"$1,000"
1,Noman,noman@YAHOO.com,"$2,500"
2,Sara,sara@gmail.com,"$1,200"
3,Ahmed,ahmed@outlook.com,"$3,000"


In [8]:
# Remove only leading (left) whitespace.
# NOTE: when run after the str.strip() cell this changes nothing, because both
# sides of 'Name' have already been stripped; it is shown for the API only.
df['Name'] = df['Name'].str.lstrip()
df.head()

Unnamed: 0,Name,Email,Price
0,Ali,ali@gmail.com,"$1,000"
1,Noman,noman@YAHOO.com,"$2,500"
2,Sara,sara@gmail.com,"$1,200"
3,Ahmed,ahmed@outlook.com,"$3,000"


In [9]:
# Remove only trailing (right) whitespace.
# NOTE: when run after the str.strip() cell this changes nothing, because both
# sides of 'Name' have already been stripped; it is shown for the API only.
df['Name'] = df['Name'].str.rstrip()
df.head()

Unnamed: 0,Name,Email,Price
0,Ali,ali@gmail.com,"$1,000"
1,Noman,noman@YAHOO.com,"$2,500"
2,Sara,sara@gmail.com,"$1,200"
3,Ahmed,ahmed@outlook.com,"$3,000"


### 4. Splitting and Replacing

### Replace Substrings
Useful for cleaning currency symbols or fixing typos.

In [10]:
# Snapshot of the frame before cleaning: 'Price' still carries the '$' sign
# and the ',' thousands separator.
df.head()


Unnamed: 0,Name,Email,Price
0,Ali,ali@gmail.com,"$1,000"
1,Noman,noman@YAHOO.com,"$2,500"
2,Sara,sara@gmail.com,"$1,200"
3,Ahmed,ahmed@outlook.com,"$3,000"


In [11]:
# Remove the '$' sign and the ',' thousands separator from the prices.
# regex=False states explicitly that both patterns are literal characters:
# as a regular expression '$' is the end-of-string anchor, and the default
# value of `regex` has changed between pandas releases, so the call should
# not depend on it.
# The result is still a column of strings; convert it (for example with
# pd.to_numeric) before doing arithmetic on it.
df['Price'] = (
    df['Price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)
df.head()

Unnamed: 0,Name,Email,Price
0,Ali,ali@gmail.com,1000
1,Noman,noman@YAHOO.com,2500
2,Sara,sara@gmail.com,1200
3,Ahmed,ahmed@outlook.com,3000


In [12]:
# Swap the '.com' ending of each e-mail address for '.org'.
# The pattern is an explicit regular expression with the dot escaped and a
# '$' anchor, so only a trailing '.com' is rewritten. An unanchored
# '.com' -> '.org' replacement would also rewrite '.com' occurring earlier in
# the address (e.g. 'a.company@x.com'), and with regex matching an unescaped
# '.' would match any character.
df['Email'] = df['Email'].str.replace(r'\.com$', '.org', regex=True)
df.head()

Unnamed: 0,Name,Email,Price
0,Ali,ali@gmail.org,1000
1,Noman,noman@YAHOO.org,2500
2,Sara,sara@gmail.org,1200
3,Ahmed,ahmed@outlook.org,3000


### Split Strings
Split a column into multiple parts (e.g., splitting "Firstname Lastname").

In [13]:
# Sample employee records for the split example: 'Full_Name' holds
# "Firstname Lastname" in a single column.
# The variable name (spelled 'emloyee') is kept as-is because the split cell
# refers to it by this name.
df_emloyee = pd.DataFrame(
    dict(
        Full_Name=["John Doe", "Jane Doe", "Bob Smith", "Alice Smith"],
        Email=["john.doe@example.com", "jane.doe@example.com", "bob.smith@example.com", "alice.smith@example.com"],
        Phone=["123-456-7890", "234-567-8901", "345-678-9012", "456-789-0123"],
    )
)
df_emloyee

Unnamed: 0,Full_Name,Email,Phone
0,John Doe,john.doe@example.com,123-456-7890
1,Jane Doe,jane.doe@example.com,234-567-8901
2,Bob Smith,bob.smith@example.com,345-678-9012
3,Alice Smith,alice.smith@example.com,456-789-0123


In [14]:
# Split 'Full_Name' into first and last name at the first space.
# n=1 limits the split to a single cut, so the result never has more than two
# columns: a name such as "Mary Ann Smith" becomes 'Mary' / 'Ann Smith'.
# Without the limit it would expand to three columns and the assignment to
# two target columns would raise.
df_emloyee[['First_Name', 'Last_Name']] = df_emloyee['Full_Name'].str.split(' ', n=1, expand=True)

df_emloyee.head()

Unnamed: 0,Full_Name,Email,Phone,First_Name,Last_Name
0,John Doe,john.doe@example.com,123-456-7890,John,Doe
1,Jane Doe,jane.doe@example.com,234-567-8901,Jane,Doe
2,Bob Smith,bob.smith@example.com,345-678-9012,Bob,Smith
3,Alice Smith,alice.smith@example.com,456-789-0123,Alice,Smith


### 5. Filtering with String Methods
Filter rows based on text patterns.

In [15]:
# Sample staff table for the filtering examples, written one person per row.
df_data = pd.DataFrame(
    [
        ("John", "john@example.com", "123-456-7890", "Software Engineer", "Engineering", "john.pdf"),
        ("Jane", "jane@example.com", "234-567-8901", "Data Scientist", "Data Science", "jane.pdf"),
        ("Bob", "bob@example.com", "345-678-9012", "Project Manager", "Project Management", "bob.pdf"),
        ("Alice", "alice@example.com", "456-789-0123", "Marketing Specialist", "Marketing", "alice.pdf"),
    ],
    columns=["Name", "Email", "Phone", "Job Title", "Department", "File Name"],
)
df_data.head()

Unnamed: 0,Name,Email,Phone,Job Title,Department,File Name
0,John,john@example.com,123-456-7890,Software Engineer,Engineering,john.pdf
1,Jane,jane@example.com,234-567-8901,Data Scientist,Data Science,jane.pdf
2,Bob,bob@example.com,345-678-9012,Project Manager,Project Management,bob.pdf
3,Alice,alice@example.com,456-789-0123,Marketing Specialist,Marketing,alice.pdf


In [17]:
# Find rows where 'Job Title' contains the text "Manager".
# regex=False matches the text literally instead of treating it as a regular
# expression; na=False counts a missing job title as "no match", so the mask
# is always strictly boolean and safe to index with.
managers = df_data[df_data['Job Title'].str.contains("Manager", regex=False, na=False)]
managers.head()

Unnamed: 0,Name,Email,Phone,Job Title,Department,File Name
2,Bob,bob@example.com,345-678-9012,Project Manager,Project Management,bob.pdf


In [18]:
# Keep the rows whose 'Department' begins with "Sales".
# None of the sample departments start with that text, so the result is an
# empty frame with the same columns.
sales_dept = df_data.loc[df_data['Department'].str.startswith("Sales")]
sales_dept.head()

Unnamed: 0,Name,Email,Phone,Job Title,Department,File Name


In [19]:
# Keep the rows whose 'File Name' ends with the ".pdf" extension
# (every row of the sample data qualifies).
pdf_files = df_data.loc[df_data['File Name'].str.endswith(".pdf")]
pdf_files.head()

Unnamed: 0,Name,Email,Phone,Job Title,Department,File Name
0,John,john@example.com,123-456-7890,Software Engineer,Engineering,john.pdf
1,Jane,jane@example.com,234-567-8901,Data Scientist,Data Science,jane.pdf
2,Bob,bob@example.com,345-678-9012,Project Manager,Project Management,bob.pdf
3,Alice,alice@example.com,456-789-0123,Marketing Specialist,Marketing,alice.pdf
