In [None]:
import pandas as pd
import re



## Strings, Regex, and Pandas String Methods

1. Introduction to Strings
- Basic string operations in Python (str methods)
- Concatenation: "Hello" + " World"
- String formatting: f"Hello {name}"
- String slicing: "Hello"[1:4]
- Common methods: .lower(), .upper(), .strip(), .replace(), .split(), .join()

 Let's try it out

In [None]:
# Sample string reused by the string-method examples in the following cells
text = "Data Science and Economics"

In [None]:
# Convert to lower case (returns a new string; text itself is unchanged)
print(text.lower())  


In [None]:

# Convert to upper case and print the result
print(text.upper())

In [None]:
# Concatenate: + joins two strings end to end into a new string
print(text + " is a great field to work in.")

In [None]:
print(text  * 3)  # repetition: str * n gives the string repeated n times back to back



In [None]:
# Slicing: text[start:stop] takes characters from index start up to, but not including, stop
print(text[0:4])  # first four characters: "Data"
print(text[0:13] + text[17:])  # "Data Science " + "Economics", i.e. the text with "and " dropped

In [None]:
print(text.split(" "))  # split on each single space character; returns a list of the pieces

In [None]:

print(text.replace("Data", "Big Data"))  # replace every occurrence of "Data" with "Big Data"


In [None]:
# With no argument, split() separates on any run of whitespace (spaces, tabs, newlines)
# and never produces empty strings, unlike split(" ") on text with repeated spaces.
print(text.split())  

In [None]:
# Small one-column table of full names for trying out the pandas .str accessor
df = pd.DataFrame(
    {
        'names': [
            'Ani Adhikari',
            'Emi Nakamura',
            'Ted Miguel',
            'David Card',
        ],
    }
)
df

In [None]:
# Split each name on whitespace into a list of words, then take the last word as the last name.
# .str[-1] indexes into each row's list, just like [-1] on a plain Python list.
df['last_name'] = df['names'].str.split().str[-1]
df

In [None]:
# Pull the first run of word characters (the first name) out of each full name.
# expand=False makes a single capture group come back as a Series rather than a
# one-column DataFrame, which is the natural shape for assigning to one column.
df['first_name'] = df['names'].str.extract(r'(\w+)', expand=False)
df

Breakdown of r'(\w+)'
- r'' (Raw String Prefix)
- The r before the string makes it a raw string, meaning backslashes (\) are treated literally.
- This prevents Python from interpreting \ as the start of an escape sequence, so \w reaches the regex engine unchanged.
	
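(\w+) (Main Pattern)
- \w → Matches any single word character: a letter, digit, or underscore ([a-zA-Z0-9_] for ASCII text; in Python 3 it also matches Unicode letters and digits).
- + → Matches one or more of the preceding token, so \w+ matches a run of one or more word characters.
- () → Capturing group, meaning str.extract returns the text matched inside the parentheses. Only the first match in each string is returned, which here is the first name.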

In [None]:
text2 = "My phone number is 415-555-1234."
# re.search scans the string and returns a match object for the first place the pattern fits
match = re.search(
    r"\d{3}-\d{3}-\d{4}",  # three digits, dash, three digits, dash, four digits
    text2,
)
print(match.group(0))  # group 0 is the whole matched text: 415-555-1234

In [None]:
# A sentence containing two SSN-shaped substrings
text3 = "My social security number is 123-45-6789 umm, or maybe it’s 321-45-6789."
# Three digits, dash, two digits, dash, four digits
pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
# findall returns every non-overlapping match as a list of strings, in order of appearance
re.findall(pattern=pattern, string=text3)

In [None]:
ssn_pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
# SSN Pattern: 3 digits, dash, 2 digits, dash, 4 digits

In [None]:
# Deliberately messy free-text "SSN" column: some entries are a single well-formed SSN,
# some contain two SSNs inside a sentence, some have letters where digits belong
# (e.g. "abc-12-3456"), and some contain no number at all.
data = {
    "SSN": ["000-00-0000", "error", "876-54-3210 invalid or 210-54-3210", "111-11-1111", 
            "I refuse to provide", "456-78-9012", "abc-12-3456", "Nice try, FBI agent","777-77-7777", 
    "seventy-seven", "987-65-4321 please or 876-54-3210", "222-22-2222", "classified information", 
            "135-79-2468", "xyz-98-7654", "I only give my SSN to aliens","555-55-5555", "fifty-five", 
            "345-67-8901 not sure or 789-01-2345", "333-33-3333", "I'm a robot", "678-90-1234", 
            "aaa-11-2222", "Ask my cat for the SSN", "444-44-4444", "forty-four", 
            "234-56-7890 maybe or 876-54-3210", "444-44-4444", "top secret", "789-01-2345", 
            "bbb-22-3333", "I only speak in emojis 🤖", "666-66-6666", "sixty-six", 
            "432-10-9876 not applicable or 765-43-2109", "555-55-5555", "I'm a ghost", "012-34-5678", 
            "ccc-33-4444", "SSN? What's that?", "111-11-1111", "eleven", 
            "543-21-0987 avoid or 890-12-3456", "666-66-6666", "I'm on the run", "345-67-8901", 
            "ddd-44-5555", "My SSN is a secret code", "999-99-9999", "ninety-nine", 
            "654-32-1098 dodge or 210-98-7654", "777-77-7777", "I'm a spy", "567-89-0123", 
            "eee-55-6666", "I'll give you my SSN if you beat me in a dance-off", "123-45-6789", 
            "twelve", "765-43-2109 sidestep or 098-76-5432", "888-88-8888", "I'm an alien", 
            "678-90-1234", "fff-66-7777", "SSN? I speak in riddles", "234-56-7890", "twenty-four", 
            "876-54-3210 escape or 765-43-2109", "999-99-9999", "I'm a time traveler", "789-01-2345", 
            "ggg-77-8888", "My SSN is a password", "345-67-8901", "thirty-four", 
            "987-65-4321 run or 876-54-3210", "000-00-0000", "I'm a superhero", 
            "012-34-5678", "hhh-88-9999", "You'll never guess my SSN"]
}
# One row per raw entry, in a single column named "SSN"
ssn_data = pd.DataFrame(data)
ssn_data

In [None]:
# For each row, list every substring that matches ssn_pattern;
# rows with no SSN-shaped text give an empty list, rows with two SSNs give both.
ssn_data["SSN"].str.findall(ssn_pattern)


In [None]:
# Same SSN shape as before, but each of the three digit groups is wrapped in a capturing group
capturing_ssn_pattern = r"([0-9]{3})-([0-9]{2})-([0-9]{4})"
ssn_data["SSN"].str.extract(capturing_ssn_pattern) 
# Note: this extracts only the first match in each row, returning one column per capturing group.
# Rows with no match come back as NaN; use .str.extractall to get every match in a row.