# Programming for Business Analytics (Python)

### In this session, we will learn more about strings.
    Useful string methods
    Pattern matching with regular expressions

In [1]:
# This code appears in every demonstration Notebook.
# By default, when you run a cell, only the value of its last expression is displayed.
# The setting below makes the notebook display the value of every expression in a cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Useful string methods
    We work with strings all the time. Python provides many useful methods. For a complete list of the methods, you may go to https://www.w3schools.com/python/python_ref_string.asp.

In [None]:
# Strings may contain digits, letters and other characters. Sometimes we need to separate them or treat them differently.
# isX methods can help with that.
# isalpha(): check whether the string contains only letters.
# isdigit(), isdecimal(), isnumeric(): check whether the string contains only digits. These three are different only in very rare cases.
# isalnum(): check whether the string contains only letters and digits.
# Keep prompting until the user supplies a password made only of letters and digits.
while True:
    print("Create a new password (letters and numbers only)")
    userpw = input()
    if not userpw.isalnum():
        # Invalid input (including an empty string): explain why and ask again.
        print("Passwords can only have letters and numbers.")
        continue
    break

In [4]:
# Sometimes we need to check uppercase and lowercase of strings
# islower(), isupper(), istitle()
myStr = 'Apple'
# The three checks below each return a bool describing the casing of myStr.
myStr.islower()
myStr.isupper()
myStr.istitle()

# lower(), upper() and title() return a converted copy; myStr itself is not modified.
myStr1, myStr2 = myStr.lower(), myStr.upper()
print(myStr1, myStr2)

# These methods make content matching more flexible. For instance, if an input may arrive in either uppercase or lowercase,
# use lower() to convert it to lowercase before comparing.

False

False

True

apple APPLE


In [None]:
# We often need to search strings using certain criteria. Some string methods can help.
# For instance, startswith() and endswith()
# For example, our data has a price column, where some prices are recorded as "$500", which turns numerical price into a string.
# We need to turn them into numerical values.
def money(amount):
    """Convert a price string such as "$500" or "500" to an int.

    amount: a string holding a whole number, optionally starting with "$".
    Returns the numeric value as an int. int() raises ValueError when the
    remaining text is not a valid integer.
    """
    if not amount.startswith("$"):
        return int(amount)
    # strip() removes the specified character from both the beginning and the end.
    return int(amount.strip("$"))
x = money("$56")
x

In [None]:
# challenge: change this money() to a lambda function
# Choose the text to convert first (with or without the "$"), then make a single int() call.
moneylbd = lambda x: int(x.strip('$') if x.startswith('$') else x)
moneylbd('$78')

In [11]:
# lstrip() removes characters from the beginning of a string; rstrip() removes them from the end.
# The argument is a set of characters to remove, not a prefix: every leading character
# found in the set is dropped, in any order, until a character outside the set appears.
testStr = ",,,,,ssaaww.....banana"

# The leading ',', 's', 'a', 'w' and '.' characters are removed; stripping stops at 'b', which is not in the set.
cleanStr = testStr.lstrip(",.asw")

print(cleanStr)

banana


In [12]:
# We are familiar with split() which turns a string into a list. There is also join() to turn an iterable into a string.
# For example,
myList = [1, 2, 3, 4, 5]
# '..'.join(myList) would raise a TypeError: join() only accepts strings, not numbers.
# Convert every element to a string first, then join.
newList = list(map(str, myList))
'--'.join(newList)

# A string is also an iterable.

'1--2--3--4--5'

### Pattern matching with regular expressions
String searching is probably one of the most common operations we encounter. A regular expression is a sequence of characters that specifies a search pattern. It has a specific, standard textual syntax for representing patterns that match text. Each character in a regular expression is either a metacharacter, which has a special meaning, or a regular character, which has a literal meaning.<br>
    Programming languages usually provide functions to find, or find and replace, the strings that match a regular expression. In Python, these functions are provided by the 're' package.

In [2]:
#Find phone number in the format of XXX-XXX-XXXX
message = "Call me at 415-555-1011 tomorrow. 415-555-9999 is my office number."

def isPhoneNumber(text):
    """Return True if text is exactly a phone number of the form XXX-XXX-XXXX.

    text: the candidate string.
    Returns True only when text is 12 characters long, has '-' at index 3
    and index 7, and every other character is a decimal digit; otherwise False.
    """
    if len(text) != 12:
        return False
    if text[3] != '-' or text[7] != '-':
        return False
    # isdecimal() accepts only decimal digits, the same characters the regex \d
    # matches. isnumeric() would also accept characters such as superscripts
    # and fractions, which are not valid phone-number digits.
    digits = text[:3] + text[4:7] + text[8:]
    return digits.isdecimal()

In [3]:
# To find phone numbers in a string using the isPhoneNumber function

# Slide a 12-character window across the message, one start position at a time,
# and keep each window that has the XXX-XXX-XXXX shape.
phonenums = []
for i in range(len(message)):
    chunk = message[i:i + 12]
    if not isPhoneNumber(chunk):
        continue
    phonenums.append(chunk)

print(phonenums)

['415-555-1011', '415-555-9999']


In [4]:
# If we use regular expressions.
# We first import re package.
import re

# Create the regular expression object using compile().
# The pattern: three digits - dash - three digits - dash - four digits.
# r'' marks a raw string: Python leaves the backslashes untouched, so \d reaches the regex engine as written.
# \d matches a digit, \s a whitespace character, \w a letter, digit or the underscore.
# The uppercase \D, \W, \S match anything except what the lowercase version matches.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

# Use findall() to find all non-overlapping matches. They are returned as a list of strings.
numlist = phoneNumRegex.findall(message)
numlist

['415-555-1011', '415-555-9999']

In [5]:
# The trailing backslash continues the string literal on the next line without inserting a newline.
itemList = "12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 swans,\
6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 patridge"

# The pattern: digits-space-letters
# The number of digits and letters is flexible, so quantifier metacharacters are needed:
# \d+ is one or more digits, \s is one whitespace character, \w+ is one or more word characters.
xmasRegex = re.compile(r'\d+\s\w+')
xmasRegex.findall(itemList)

# The ? matches zero or one of the preceding group.
# The * matches zero or more of the preceding group.
# The + matches one or more of the preceding group.
# The {n} matches exactly n of the preceding group.
# The {n,} matches n or more of the preceding group.
# The {,m} matches 0 to m of the preceding group.
# The {n,m} matches at least n and at most m of the preceding group.

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '7 swans',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 patridge']

In [6]:
# The ways people keep phone numbers can be flexible. For example, the area code is optional.
message1 = "Call me at 415-555-1011 tomorrow. 555-9999 is my office number."

# A regular expression can mark a part as optional. Optional means the part occurs either 0 or 1 times.
# First, we need to put the optional part into a group using (), then follow the group with ?

# Because the pattern contains one capturing group, findall() returns only that group's text
# for each match (an empty string when the optional area code is absent), not the whole phone number.
phoneNumRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
numlist = phoneNumRegex.findall(message1)
numlist

# The ? matches zero or one of the preceding group.
# The * matches zero or more of the preceding group.
# The + matches one or more of the preceding group.
# The {n} matches exactly n of the preceding group.
# The {n,} matches n or more of the preceding group.
# The {,m} matches 0 to m of the preceding group.
# The {n,m} matches at least n and at most m of the preceding group.

['415-', '']

In [8]:
# Once a pattern contains groups, findall() returns only the text captured by the groups, not the whole match.
# Here every part is in its own group, so findall() returns one tuple per match with one entry per group.
# A group that did not take part in the match (the optional area code) appears as an empty string.
phoneNumRegex1 = re.compile(r'(\d\d\d-)?(\d\d\d-)(\d\d\d\d)')
numlist = phoneNumRegex1.findall(message1)
numlist
# How can we get the output as phone numbers?

[('415-', '555-', '1011'), ('', '555-', '9999')]

In [9]:
# search() returns a Match object for the first match in the string (or None when nothing matches).
# group(n) returns the text captured by the n-th group; group() with no argument returns the entire match.
mo = phoneNumRegex1.search(message1)
mo.group(1)
mo.group(2)
mo.group()

'415-'

'555-'

'415-555-1011'

In [10]:
# We can keep adding flexibility to the phone number format.
# What if the format is (415) 555-4242?
# Parentheses are metacharacters (they create groups), so literal ones must be escaped as \( and \).
# \d{3} means exactly three digits and \d{4} exactly four digits.
# "555-6789" in the message has no "(XXX) " prefix, so this pattern does not match it.
message2 = "Call me at (415) 555-1011 tomorrow. (415) 555-9999 is my 555-6789 office number."
phoneNumRegex2 = re.compile(r'\(\d{3}\) \d{3}-\d{4}')
numlist = phoneNumRegex2.findall(message2)
numlist

['(415) 555-1011', '(415) 555-9999']

In [29]:
# '.' is the wildcard character: it matches any single character except a newline.
# It stands for exactly one character, which is why 'flat' yields 'lat' rather than 'flat'.
atRegex = re.compile(r'.at')
atRegex.findall("The cat in the hat sat on the flat mat.")

['cat', 'hat', 'sat', 'lat', 'mat']

In [11]:
# Sometimes we would like to search for all strings within a context.
# For example, all names that are listed with "First Name: XXX Last Name: XXX"
names = '''
First Name: Messi Last Name: Messi
First Name: Right Last Name: Left
First Name: Ling Last Name: Ge
'''
# (.*) captures any run of characters. Since '.' does not match a newline,
# a match cannot span lines, so each line of the text is matched separately.
# With two groups, findall() returns one (first name, last name) tuple per match.
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
nameList = nameRegex.findall(names)
nameList
# Alternative to try: search() on a single line and inspect the groups one by one.
#mo = nameRegex.search("First Name: Ling Last Name: Ge")
#mo.group()
#mo.group(1)
#mo.group(2)
#nameRegex.findall("First Name: Ling Last Name: Ge")

[('Messi', 'Messi'), ('Right', 'Left'), ('Ling', 'Ge')]

In [13]:
# re.I (short for re.IGNORECASE) makes the regex case-insensitive.
# search() returns None when nothing matches, so chaining .group() directly is only safe when a match is certain.
robocop = re.compile(r'robocop', re.I)
robocop.search('Robocop is scary').group()

'Robocop'

In [14]:
# sub(replacement, text) returns a new string in which every match of the pattern is replaced.
# 'Agent \w+' matches the word "Agent", a space, and the word that follows it.
namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the document to Agent Bob.')

'CENSORED gave the document to CENSORED.'

In [34]:
# re.VERBOSE makes the regex engine ignore whitespace and '#' comments inside the pattern,
# so a long pattern can be spread over several lines and annotated part by part.
# NOTE(review): the first separator group is mandatory, so a number without an area code only
# matches when it is preceded by whitespace, '-' or '.' - confirm this is intended.
phoneRegexfull = re.compile(r'''(
      (\d{3}|\(\d{3}\))? # area code can be optional
      (\s|-|\.) # separator
      (\d{3}) # first three digits
      (\s|-|\.) # separator
      (\d{4}) # last 4 digits
      (\s*(ext|x)\s*\d{2,5})? # extension
                        )''', re.VERBOSE)
# findall() returns one tuple per match holding every group: the outer group
# (the whole phone number) first, then the inner groups in order.
phoneRegexfull.findall("Call me at 415-555-8789 ext 555")
# Other formats to try:
# (415).555.8789
# 415.555.8789
# 415-555-8789

[('415-555-8789 ext 555', '415', '-', '555', '-', '8789', ' ext 555', 'ext')]

In [16]:
bat = 'Batman is Bruce Wayne. Batwoman is Babara Gordon.'
# .{0,2} allows zero to two arbitrary characters between 'bat' and 'man',
# so both 'Batman' and 'Batwoman' match; re.I makes the match case-insensitive.
batRegex = re.compile(r'bat.{0,2}man', re.I)
bathuman = batRegex.findall(bat)
bathuman

['Batman', 'Batwoman']