# Overview 

Within this notebook, we focus on different aspects of regular expressions with the aim of improving string handling.

## Diagnostics


In [1]:
#@title (Hidden) Diagnostic Check
import os
import sys
import pandas as pd
import re

# Report the interpreter and library versions this notebook was executed with.
print(f"python: {sys.version}")
print(f"pandas: {pd.__version__}")
print(f"re: {re.__version__}")

# Detect Google Colab by probing for its helper package. Only ImportError is
# caught, so any other failure is surfaced instead of being silently reported
# as "running locally".
try:
  from google.colab import drive  # imported purely as a presence probe
  is_google_colab = True
except ImportError:
  is_google_colab = False

if is_google_colab:
  print("Notebook is on Google CoLab")
else:
  print("Notebook is being run locally or through another source.")


python: 3.10.4 | packaged by conda-forge | (main, Mar 24 2022, 17:43:32) [Clang 12.0.1 ]
pandas: 1.4.2
re: 2.2.1
Notebook is being run locally or through another source.


# Regular expressions


In [2]:
import re

# Sample sentences reused by the matching examples that follow
x = [
    "did you lie to me?",
    "all lies!",
    "are you lying?",
    "lying on the couch",
]

# search() yields a Match object for the first hit in a string, or None
you_pattern = re.compile(r'you')
print(list(map(you_pattern.search, x)))

[<re.Match object; span=(4, 7), match='you'>, None, <re.Match object; span=(4, 7), match='you'>, None]


## Conditional Matching

In [3]:
# Alternation: `|` accepts either "lie" or "you" at each position
either_word = re.compile(r'lie|you')
print([either_word.findall(sentence) for sentence in x])

# Same alternation with each term in its own capture group; findall then
# returns one tuple per hit, with '' in the slot of the group that did not match
grouped_words = re.compile(r'(lie)|(you)')
print([grouped_words.findall(sentence) for sentence in x])

[['you', 'lie'], ['lie'], ['you'], []]
[[('', 'you'), ('lie', '')], [('lie', '')], [('', 'you')], []]


# Your Turn: Identifying Strings
    1. Find all cities that reside in the state of Illinois or have the acronym IL


In [4]:
import pandas as pd
# City/state strings to filter
y = ["Chicago, IL", "San Fran, CA", "Iowa City, IA", "Urbana, IL", "Wheaton, IL", "Myrtle Beach, SC"]

# Word boundaries keep the state code from matching inside a longer
# upper-case token (a bare 'IL' would also hit e.g. "ILION, NY").
il_pattern = r'\bIL\b'

# findall lists every occurrence of the pattern in each string
print([re.findall(il_pattern, string) for string in y])

# identify strings that match: a Match object on a hit, None otherwise
print([re.search(il_pattern, string) for string in y])

# force response to be either True or False; computed once and reused as the mask
mask_il = [bool(re.search(il_pattern, string)) for string in y]
print(mask_il)

# Boolean-mask the Series to keep only the Illinois entries
pd.Series(y)[mask_il]

[['IL'], [], [], ['IL'], ['IL'], []]
[<re.Match object; span=(9, 11), match='IL'>, None, None, <re.Match object; span=(8, 10), match='IL'>, <re.Match object; span=(9, 11), match='IL'>, None]
[True, False, False, True, True, False]


0    Chicago, IL
3     Urbana, IL
4    Wheaton, IL
dtype: object

# 2. Find all instances of either UIUC or UofI

In [5]:
# Campus abbreviations to scan
y = ["UNR", "UNC", "UofI", "UIUC", "UI"]

# Grouped alternation: each hit is a (UIUC, UofI) tuple, with '' in the slot
# of the group that did not match
campus_groups = re.compile(r'(UIUC)|(UofI)')
print(list(map(campus_groups.findall, y)))

[[], [], [('', 'UofI')], [('UIUC', '')], []]


In [18]:
# Campus abbreviations to scan
y = ["UNR", "UNC", "UofI", "UIUC", "UI"]

# Run the search once per string and reuse the results for both printouts
uiuc_or_uofi = re.compile(r'UIUC|UofI')
campus_hits = [uiuc_or_uofi.search(name) for name in y]

print(campus_hits)

# A Match object is always truthy, so "is not None" gives the same booleans
print([hit is not None for hit in campus_hits])

[None, None, <re.Match object; span=(0, 4), match='UofI'>, <re.Match object; span=(0, 4), match='UIUC'>, None]
[False, False, True, True, False]


## Escaping Metacharacters

In [19]:
# `?` is a regex metacharacter, so it is escaped to match a literal question mark
literal_question = re.compile(r'\?')
print([literal_question.search(sentence) for sentence in x])

[<re.Match object; span=(17, 18), match='?'>, None, <re.Match object; span=(13, 14), match='?'>, None]


# Find all strings with either a `+` or `\`.

In [27]:
# NOTE: this rebinds `x`; from here on it holds these strings instead of the
# sample sentences used in the earlier examples.
# "\\" below is one literal backslash ("\o" is not a valid escape sequence).
x = ["3 + 4 = 7", "1/4 = 0.25", "2*4 = 8", "3 *4 ",
     "Algebra is fun?", "Green Eggs and\\or Ham"]

# Plain string literal: each backslash is doubled once for Python, so '\\+'
# reaches the regex engine as \+ (a literal plus) and '\\\\' as \\ (a literal
# backslash).
print(
 [re.search('\\+|\\\\', string) for string in x])

# The r prefix makes a raw string, so the pattern is written exactly as the
# regex engine should see it.
print(
 [re.search(r'\+|\\', string) for string in x]
)

# Same escaped (non-raw) pattern as the first example, repeated for comparison.
print(
 [re.search('\\+|\\\\', string) for string in x]
)

[<re.Match object; span=(2, 3), match='+'>, None, None, None, None, <re.Match object; span=(14, 15), match='\\'>]
[<re.Match object; span=(2, 3), match='+'>, None, None, None, None, <re.Match object; span=(14, 15), match='\\'>]
[<re.Match object; span=(2, 3), match='+'>, None, None, None, None, <re.Match object; span=(14, 15), match='\\'>]


### Your Turn: Phone Numbers

Write a regex that matches a telephone number given as:

```
###-###-####
```

Hint: Use the range feature of character classes

In [63]:
phone_nums = ["(217) 333-2167",
              "217-333-2167",
              "217 244-7190"]

# Code for output 1: spell out every digit with an explicit [0-9] range.
# The last group has four digits so the pattern really describes ###-###-####
# (with only three, "217-333-216" would be accepted as well).
explicit_ranges = r'[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
print(
 [bool(re.search(explicit_ranges, string)) for string in phone_nums])

# Code for output 2: same shape using the \d digit shorthand.
digit_shorthand = r'\d\d\d-\d\d\d-\d\d\d\d'
print(
 [bool(re.search(digit_shorthand, string)) for string in phone_nums])

# Code for output 3: same shape using {n} repetition counts.
counted_digits = r'\d{3}-\d{3}-\d{4}'
print(
 [bool(re.search(counted_digits, string)) for string in phone_nums])




[False, True, False]
[False, True, False]
[False, True, False]


### Your Turn: Phone Number Jenga

1. Retrieve the different portions of a phone number

2. Change the area code of the phone number to 888

In [101]:
phone_nums = ["(217) 333-2167",
              "217-333-2167",
              "217 244-7190"]

# Swap the area code only: the pattern is anchored at the start of the string
# (allowing an optional opening parenthesis, which is kept via group 1) so a
# later "217" inside the rest of the number is left untouched.
area_code_217 = r'^(\(?)217'
print([re.sub(area_code_217, r'\g<1>888', number) for number in phone_nums])

# Check that each entry is a valid phone number; the three capture groups
# hold the area code, the next three digits and the last four digits.
phone_parts = r'\(?(\d{3})\)?[- ]?(\d{3})-?(\d{4})'
[re.search(phone_parts, string) for string in phone_nums]

['(888) 333-2167', '888-333-2167', '888 244-7190']


[<re.Match object; span=(0, 14), match='(217) 333-2167'>,
 <re.Match object; span=(0, 12), match='217-333-2167'>,
 <re.Match object; span=(0, 12), match='217 244-7190'>]

In [165]:
## Intermediate result: standardize every phone number to ###-###-####
pattern = r'\(?(\d{3})\)?[- ]?(\d{3})-?(\d{4})'
replacement = r'\1-\2-\3'

# The three captured digit groups are re-joined with dashes
x = list(map(lambda number: re.sub(pattern, replacement, number), phone_nums))


In [166]:
# Replace the leading three digits (and an optional opening parenthesis) with 888
pattern = r'^\(?\d{3}'
replacement = r'888'

final_phone_number = [re.sub(pattern, replacement, number) for number in x]
final_phone_number

['888-333-2167', '888-333-2167', '888-244-7190']

### Your Turn: Character Classes

1.	Find all matches of the word "i" / "I".
2.	Remove the word "not".
3.	Change the word "Green" to be "Blue".

In [104]:
green_eggs = ["I do not like them",
              "Sam-I-am.", "I do not like",
              "Green eggs and ham."]

# 1. Whole-word "i" / "I": the \b boundaries stop the pattern from matching
#    the letter inside other words such as "like".
word_i = r"\b[iI]\b"
word_i_flags = [bool(re.search(word_i, string)) for string in green_eggs]
print(word_i_flags)

# 2. Remove the word "not" together with the whitespace before it.
#    Raw string: "\s" in a normal literal is an invalid escape sequence.
#    The trailing \b keeps longer words such as "nothing" intact.
pattern = r"\snot\b"
replacement = ""
without_not = [re.sub(pattern, replacement, line) for line in green_eggs]
print(without_not)

# 3. Change "Green" to "Blue".
pattern = "Green"
replacement = "Blue"
blue_eggs = [re.sub(pattern, replacement, line) for line in green_eggs]
print(blue_eggs)

[True, True, True, False]
['I do like them', 'Sam-I-am.', 'I do like', 'Green eggs and ham.']
['I do not like them', 'Sam-I-am.', 'I do not like', 'Blue eggs and ham.']


## Greedy vs. Lazy

**Greedy:** Match a pattern as many times as possible. (Default)

This will cause it to search until pattern cannot be found.

In [105]:
# Greedy `.*` consumes as much as it can, so the match runs to the last "o"
greedy_match = re.search("s.*o", "stackoverflow")
greedy_match.group()

'stackoverflo'

**Lazy:** Match a pattern as few times as possible.
Stop searching once the pattern is found.

In [106]:
# Lazy `.*?` consumes as little as it can, so the match stops at the first "o"
lazy_match = re.search("s.*?o", "stackoverflow")
lazy_match.group()

'stacko'

### Your Turn: What kind of quantifier is present?

Determine whether a greedy or a lazy quantifier is present in each pattern.
How does the quantifier affect the results?

In [107]:
# HTML text data: a <span> nested inside another <span>, so the text contains
# two closing </span> tags for the quantifier examples to stop at.
html_txt = "<span class='val'> <span> <b> Hi </b> </span> </span>"

In [108]:
# Lazy capture: the group ends at the first closing </span> after "<span>"
lazy_span = re.compile("<span>(.*?)</span>")
lazy_span.findall(html_txt)

[' <b> Hi </b> ']

In [109]:
# Greedy capture: the group extends to the last closing </span> in the text
greedy_span = re.compile("<span>(.*)</span>")
greedy_span.findall(html_txt)

[' <b> Hi </b> </span> ']

# Regex with Pandas

Frequently, we'll want to apply a regular expression to an entire pandas column rather than to a single string. To that end, we have:

```python
df['new_variable']=df['variable_of_interest'].str.regex_function(pattern)
```

In [147]:
import pandas as pd

In [148]:
# Load the housing data with default parsing options; the file is looked up
# relative to the current working directory.
df = pd.read_csv("House_details.csv")

In [149]:
# Preview the first rows to check how the file was parsed
df.head()

Unnamed: 0,House_Address,Price,Bedrooms,Bathrooms,SqFT,lot,yearbuilt,lastsoldifavailable,heattype,zillowdays,cooling,parking,basement,fireplace,floorcover
0,House_Address,Price,Bedrooms,Bathrooms,SqFT,lot,yearbuilt,lastsoldifavailable,heattype,zillowdays,cooling,parking,basement,fireplace,floorcover
1,"3751 Harbor Estates Ln , Champaign, IL 61822","$82,500",2 beds,2 baths,"1,149 sq ft",Contact for details,2009,"Jul 2012 for $82,500",Contact for details,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
2,"618 Lauterbur Ln , Champaign, IL 61822","$112,000",2 beds,2 baths,"1,001 sq ft",Contact for details,2008,"Jul 2012 for $112,000",Contact for details,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
3,"3728 Balcary Bay , Champaign, IL 61822","$82,000",2 beds,2 baths,"1,070 sq ft",Contact for details,2008,"Mar 2012 for $82,000",Forced air,Central,Unknown,Unknown,Unknown,Unknown,Unknown
4,"503 Corey Ln , Champaign, IL 61822","$103,000",2 beds,2 baths,995 sq ft,51.9 acres,2007,"Jul 2012 for $103,000",Contact for details,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown


In [150]:
# Inspect the last rows as well
df.tail()

Unnamed: 0,House_Address,Price,Bedrooms,Bathrooms,SqFT,lot,yearbuilt,lastsoldifavailable,heattype,zillowdays,cooling,parking,basement,fireplace,floorcover
3722,"2505 S Cottage Grove Ave , Urbana, IL 61801","$134,500",3 beds,2 baths,"1,481 sq ft",Contact for details,1969,"May 2013 for $134,500",Forced air,Central,Garage - Attached,Unknown,Unknown,"Carpet, Hardwood, Linoleum / Vinyl",Unknown
3723,"1702 E Michigan Ave , Urbana, IL 61802","$72,000",4 beds,2 baths,"1,550 sq ft","1,900 sqft",1969,"Mar 2012 for $72,000",Contact for details,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
3724,"2310 Slayback St , Urbana, IL 61802","$110,000",3 beds,2 baths,"1,426 sq ft","10,350 sqft",1969,"Jul 2013 for $110,000",Contact for details,Unknown,Garage - Attached,Unknown,Yes,Unknown,Unknown
3725,"1412 S Kinch St , Urbana, IL 61802","$136,500",4 beds,2 baths,"2,003 sq ft","2,003 sqft",1969,"Jun 2013 for $136,500",Forced air,Central,Garage - Attached,Partial,No,"Carpet, Hardwood, Laminate, Tile",No
3726,"1105 Brighton Dr , Urbana, IL 61801","$124,000",3 beds,2 baths,"1,740 sq ft","9,462 sqft",1968,"Jul 2012 for $124,000",Baseboard,Central,Garage - Attached,Unknown,Yes,"Carpet, Laminate, Tile",Unknown


From `head` and `tail`, we can see that the header row appears twice in the file. Skipping the first line when reading the file fixes that issue.

In [151]:
# Re-read the file, skipping its first line (line index 0) so that the
# repeated header on the following line is the one parsed as column names.
df = pd.read_csv("House_details.csv", skiprows=[0])
df.head()

Unnamed: 0,House_Address,Price,Bedrooms,Bathrooms,SqFT,lot,yearbuilt,lastsoldifavailable,heattype,zillowdays,cooling,parking,basement,fireplace,floorcover
0,"3751 Harbor Estates Ln , Champaign, IL 61822","$82,500",2 beds,2 baths,"1,149 sq ft",Contact for details,2009,"Jul 2012 for $82,500",Contact for details,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
1,"618 Lauterbur Ln , Champaign, IL 61822","$112,000",2 beds,2 baths,"1,001 sq ft",Contact for details,2008,"Jul 2012 for $112,000",Contact for details,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
2,"3728 Balcary Bay , Champaign, IL 61822","$82,000",2 beds,2 baths,"1,070 sq ft",Contact for details,2008,"Mar 2012 for $82,000",Forced air,Central,Unknown,Unknown,Unknown,Unknown,Unknown
3,"503 Corey Ln , Champaign, IL 61822","$103,000",2 beds,2 baths,995 sq ft,51.9 acres,2007,"Jul 2012 for $103,000",Contact for details,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
4,"605 Lauterbur Ln , Champaign, IL 61822","$112,500",2 beds,2 baths,"1,001 sq ft",50 acres,2007,"Aug 2013 for $112,500","Forced air, Heat pump","Central, Other",Garage - Attached,Unknown,Unknown,"Carpet, Linoleum / Vinyl",Unknown


Verify the second header is removed.

In [152]:
# Show the first addresses to confirm that no header text remains in the data
df['House_Address'].head()

0    3751 Harbor Estates Ln , Champaign, IL 61822
1          618 Lauterbur Ln , Champaign, IL 61822
2          3728 Balcary Bay , Champaign, IL 61822
3              503 Corey Ln , Champaign, IL 61822
4          605 Lauterbur Ln , Champaign, IL 61822
Name: House_Address, dtype: object

Convert the addresses to lowercase.

In [153]:
# Lower-case every address so the later patterns only need lower-case letters
lowercase_addresses = df['House_Address'].str.lower()
df['House_Address'] = lowercase_addresses

In [154]:
# Confirm the addresses are now lower-case
df['House_Address'].head()

0    3751 harbor estates ln , champaign, il 61822
1          618 lauterbur ln , champaign, il 61822
2          3728 balcary bay , champaign, il 61822
3              503 corey ln , champaign, il 61822
4          605 lauterbur ln , champaign, il 61822
Name: House_Address, dtype: object

Retrieve street address information

In [155]:
# Street address: delete everything from the first " ," separator to the end.
# replace() returns a new Series, so House_Address itself is left untouched.
street_only = df['House_Address'].replace(' ,.*', '', regex=True)
df['St_Address'] = street_only

In [156]:
# Preview the extracted street addresses
df['St_Address'].head()

0    3751 harbor estates ln
1          618 lauterbur ln
2          3728 balcary bay
3              503 corey ln
4          605 lauterbur ln
Name: St_Address, dtype: object

In [157]:
 # The full address column still holds the complete addresses
 df['House_Address'].head()

0    3751 harbor estates ln , champaign, il 61822
1          618 lauterbur ln , champaign, il 61822
2          3728 balcary bay , champaign, il 61822
3              503 corey ln , champaign, il 61822
4          605 lauterbur ln , champaign, il 61822
Name: House_Address, dtype: object

City information

In [175]:
# Encode each address as a city code (kept as a string at this point).
# Every pattern covers a whole address containing ", <city>, ", so a matching
# address is replaced entirely by its code; other addresses pass through as-is.
city_code_rules = [
    ('.*, champaign, .*', '0'),
    ('.*, urbana, .*', '1'),
    ('.*, savoy, .*', '2'),
    ('.*, royal, .*', '3'),
    ('.*, saint joseph, .*', '4'),
]

city_codes = df['House_Address']
for address_pattern, code in city_code_rules:
    city_codes = city_codes.replace(address_pattern, code, regex=True)
df['city'] = city_codes

In [176]:
# Keep only the captured city name: a run of lower-case letters and spaces
# sitting between two ", " separators in the address
city_capture = '.*, ([a-z ]+), .*'
df['cityname'] = df['House_Address'].replace(city_capture, r'\1', regex=True)

# Number of listings per city name
df.groupby('cityname').size()

cityname
champaign       2620
royal              2
saint joseph       2
savoy              6
urbana          1088
dtype: int64

In [168]:
# Keep every row whose city value is not the literal "house_address"
# (presumably a leftover of the duplicated header row -- verify against the data).
mask_city = ~df['city'].isin(["house_address"])

# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df = df[mask_city].copy()

# Number of listings per city value
df.groupby('city').size()

city
champaign       2620
royal              2
saint joseph       2
savoy              6
urbana          1088
dtype: int64

In [169]:
# Peek at the first few city values
df['city'].head()

0    champaign
1    champaign
2    champaign
3    champaign
4    champaign
Name: city, dtype: object

In [170]:
# List the distinct city values present
df['city'].unique()

array(['champaign', 'urbana', 'savoy', 'royal', 'saint joseph'],
      dtype=object)

In [177]:
# Convert the city column to a numeric dtype. With its default settings
# pd.to_numeric raises on any value it cannot parse, so every entry must
# already be a numeric string here.
df['city'] = pd.to_numeric(df['city'])

## Your Turn

Drop the city row that has "house_address" and then convert city to numeric with `pd.to_numeric`.


## Zipcode

In [146]:
# Delete everything up to and including ", il " so only the trailing ZIP code
# is left; the result stays a string column
zip_prefix = ".* , .*, il "
df['zipcode'] = df['House_Address'].replace(zip_prefix, "", regex=True)
df['zipcode'].head()

0    61822
1    61822
2    61822
3    61822
4    61822
Name: zipcode, dtype: object