In [1]:
# Compute the arithmetic mean of a collection of marks.
def avg_marks(marks_list):
    """Return the arithmetic mean of ``marks_list``.

    Raises ZeroDivisionError when ``marks_list`` is empty, because the
    total is then divided by a length of zero.
    """
    total = sum(marks_list)
    count = len(marks_list)
    return total / count

In [2]:
# Happy path: average of a non-empty list of marks.
marks = [76, 67, 98, 81, 40, 37, 44, 55]
avg_marks(marks)

62.25

In [3]:
# Edge case: with an empty list len(marks_list) is 0, so the division fails.
marks = []
avg_marks(marks)

ZeroDivisionError: division by zero

### Assertion example
Reminder: <br>
use `assert` for **errors that should never happen** <br>
use `try, except` for **errors that can conceivably happen**

Format is:

```python
# never write assert(condition, message): the parentheses build a non-empty tuple, which is always truthy, so the assertion can never fail
assert condition, exception
```

In [8]:
#a function to calculate the average of marks
def avg_marks(marks_list):
    """Return the arithmetic mean of the marks in ``marks_list``.

    Args:
        marks_list: a non-empty ``list`` of numbers.

    Returns:
        ``sum(marks_list) / len(marks_list)``.

    Raises:
        AssertionError: if ``marks_list`` is not a list, or if it is empty.
    """
    # Check the type first: len() on an object without a length (e.g. an int)
    # would raise TypeError before the type assertion could report the problem.
    # isinstance also accepts subclasses of list, unlike an exact type() match.
    assert isinstance(marks_list, list), "Must provide list"
    assert len(marks_list) != 0, "The list is empty"
    #try, except can be used
    #if, else
    return sum(marks_list)/len(marks_list)

In [10]:
# A tuple is not a list, so the type assertion inside avg_marks fails.
marks = (23,45,67)
avg_marks(marks)

AssertionError: Must provide list

In [7]:
# Catch the AssertionError raised for an empty list and report its arguments.
marks = []
try:
    avg_marks(marks)
except AssertionError as err:
    message = "There is an error {}".format(err.args)
    print(message)

There is an error ('The list is empty',)


## Regular Expressions

Regular expressions are text-matching patterns described with a formal syntax. You'll often hear regular expressions referred to as 'regex' or 'regexp' in conversation (and yes, you have regexp in ES). Regular expressions can include a variety of rules, from finding repetition to text matching and much more. As you advance in Python you'll see that a lot of your parsing problems can be solved with regular expressions.

### Basic Patterns

       
Pattern    | Match      
:----------|:-----------
a, W, 9, < |  ordinary characters match themselves exactly
.          |a period matches any single character except newline
\w | matches a "word" character: a letter, digit or underscore [a-zA-Z0-9_]
\W | matches any non-word character
\b | boundary between word and non-word
\s | a single whitespace character – space, newline, return, tab, form feed [ \n\r\t\f]
\S | matches any non-whitespace character
\t | tab
\n | newline
\r | return
\d | decimal digit [0-9]
\D | non-digit character
^ | circumflex (top hat) matches the start of a string
$ | dollar matches the end of a string
\ | inhibits the "specialness" of a character. So, for example, use \. to match a period

```python
'''
These are some of the most commonly used methods
'''
re.search(pattern,text,flags)
re.findall(pattern,text,flags)
re.sub(existing_pattern,replace_with,text)
re.compile(pattern) #compiles the pattern => especially useful if pattern is used multiple times
```

In [11]:
import re

In [12]:
# Search for the literal text 'abcd!!' inside the string.
string = '@@!abcd!!'
match = re.search('abcd!!', string)

# re.search returns None when the pattern occurs nowhere in the string.
if match is None:
    print("No Match")
else:
    print("Match found: ",match.group())

Match found:  abcd!!


In [14]:
# Look for 4 word characters followed by 2 non-word characters (here: cghz!!).
string = '@@!cghz!!'

# Raw string: in a normal string literal \w and \W are invalid escape
# sequences, which newer Python versions warn about.
match = re.search(r'\w\w\w\w\W\W', string)

if match:
    print("Match found: ",match.group())
else:
    print("No Match")

Match found:  cghz!!


In [15]:
# Same search as before, written with repetition counts: {4} word characters
# followed by {2} non-word characters.
string = '@@!cghz!!'

# Raw string: in a normal string literal \w and \W are invalid escape
# sequences, which newer Python versions warn about.
match = re.search(r'\w{4}\W{2}', string)

if match:
    print("Match found: ",match.group())
else:
    print("No Match")

Match found:  cghz!!


In [16]:
# re.findall returns a list with every match of the pattern in the text.
text = "How many tshirts are there in this tshirt text"
re.findall('tshirt',text)

['tshirt', 'tshirt']

In [17]:
# str.find gives only the index of the first occurrence of the substring.
text.find('tshirt')

9

In [18]:
# Quantifiers:
#   +  one or more occurrences of the preceding item (greedy)
#   *  zero or more occurrences of the preceding item (greedy)
#   ?  zero or one occurrence of the preceding item

match = re.search(r'pi?','piiigiiii')

# '?' takes at most one 'i', so only 'pi' is matched.
if match is None:
    print("No Match")
else:
    print("Match found: ",match.group())


Match found:  pi


In [20]:
# Quantifiers:
#   +  one or more occurrences of the preceding item (greedy)
#   *  zero or more occurrences of the preceding item (greedy)
#   ?  zero or one occurrence of the preceding item

match = re.search(r'pi*','piiigiiii')

# '*' is greedy, so it takes every consecutive 'i' after the 'p'.
if match is None:
    print("No Match")
else:
    print("Match found: ",match.group())

Match found:  piii


In [21]:
# Goal: extract '2 3456xx' from the string.
string = 'piiig12 3456xx'
# digit, one or more whitespace, digit, optional whitespace, digit, then the rest
match = re.search(r'\d\s+\d\s*\d.+', string)

if match is None:
    print("No Match")
else:
    print("Match found: ",match.group())

Match found:  2 3456xx


In [31]:
# Goal: extract '2  3456xx' -- note the string now contains TWO spaces.
string = 'piiig12  3456xx'
# \s? accepts at most one whitespace character, so with two spaces between
# the digits this pattern finds nothing; \s+ would be needed instead.
match = re.search(r'\d{1}\s?\d{4}\w+', string)

if match is None:
    print("No Match")
else:
    print("Match found: ",match.group())

No Match


In [23]:
# Raw string so the backslashes are printed unchanged without relying on
# invalid escape sequences such as \d being passed through as-is.
print(r'\d{1}\s+\d{4}\w+')

\d{1}\s+\d{4}\w+


In [26]:
number = 34
# Two equivalent ways to interpolate a value: str.format and an f-string.
print("The number is {}".format(number))
print(f'The number is {number}')

The number is 34
The number is 34


In [36]:
# Find the first run of word characters that starts with 'f'.
foo = 'this is a foobar'
match = re.search(r'f\w+',foo)

if match is None:
    print("No Match")
else:
    print("Match found: ",match.group())
    

Match found:  foobar


In [40]:
# First attempt at an e-mail pattern: word characters around an '@'.
email = 'purple alice-07@gmail.com monkey dishwasher'
# \w matches neither '-' nor '.', so only part of the address is captured.
match = re.search(r'\w+@\w+', email)

if match is None:
    print("No Match")
else:
    print("Match found: ",match.group())
    

Match found:  07@gmail


### Square Brackets

Square brackets can be used to indicate a set of chars, so [abc] matches 'a' or 'b' or 'c'. The codes \w, \s etc. work inside square brackets too with the one exception that dot (.) just means a literal dot. For the emails problem, the square brackets are an easy way to add '.' and '-' to the set of chars which can appear around the @ with the pattern r'[\w.-]+@[\w.-]+' to get the whole email address:

```python
email = 'purple alice-b@google.com monkey dishwasher'
match = re.search(r'[\w.-]+@[\w.-]+', email)

## 'alice-b@google.com'
if match:                      
    print('Found:', match.group()) 
else:
    print('Did not find')
```

(More square-bracket features) You can also use a dash to indicate a range, so [a-z] matches all lowercase letters. To use a dash without indicating a range, put the dash last, e.g. [abc-]. An up-hat (^) at the start of a square-bracket set inverts it, so [^ab] means any char except 'a' or 'b'.

In [42]:
# The character set [\w.-] also accepts '.' and '-' on both sides of the '@'.
email = 'purple alice_b@google.com monkey dishwasher'
match = re.search(r'[\w.-]+@[\w.-]+', email)

if match is None:
    print("No Match")
else:
    print("Match found: ",match.group())
    

Match found:  alice_b@google.com


In [43]:
# Parentheses split the pattern into groups: group 1 is the part before the
# '@' (username) and group 2 the part after it (domain).
email = 'purple alice_b@google.com monkey dishwasher'
match = re.search(r'([\w.-]+)@([\w.-]+)', email)

if match is None:
    print("No Match")
else:
    print("Match found: ",match.group())
    print("Username is: ",match.group(1))
    print("Domain is: ",match.group(2))

Match found:  alice_b@google.com
Username is:  alice_b
Domain is:  google.com


In [47]:
# group(2) is the text captured by the second pair of parentheses.
match.group(2)

'google.com'

## findall

In [51]:
# A text that contains several e-mail addresses.
s = 'purple alice_b@google.com, blah monkey bob-07@abc.com blah dishwasher'

# Compile the pattern once, then collect every address it matches.
email_pattern = re.compile(r'[\w.-]+@[\w.-]+')
emails = email_pattern.findall(s)

for email in emails:
    print(email)

alice_b@google.com
bob-07@abc.com


## Question 1

Given the following paragraph, find the IP addresses and replace them with 127.0.0.1.

> On most computer systems, localhost resolves to the IP address 10.100.11.121, which is the most commonly used IPv4 loopback address, and to the IPv6 loopback address. The localhost IP address is 192.168.11.10.

In [62]:
# Paragraph for Question 1; it contains two dotted IPv4 addresses.
string = '''On most computer systems, localhost resolves to the IP address 10.100.11.121, which is the most commonly used IPv4 loopback address, and to the IPv6 loopback address. The localhost IP address is 192.168.11.10.'''

In [63]:
#METHOD 1
# Four runs of digits separated by literal dots; inside [] a dot is a plain
# dot, not "any character". re.sub replaces every match in the string.
re.sub(r'\d+[.]\d+[.]\d+[.]\d+','127.0.0.1', string)

'On most computer systems, localhost resolves to the IP address 127.0.0.1, which is the most commonly used IPv4 loopback address, and to the IPv6 loopback address. The localhost IP address is 127.0.0.1.'

In [64]:
#METHOD 2
# Three repetitions of "1-3 digits plus a literal dot", then a final run of
# 1-3 digits.
re.sub(r'(\d{1,3}[.]){3}(\d{1,3})','127.0.0.1', string)

'On most computer systems, localhost resolves to the IP address 127.0.0.1, which is the most commonly used IPv4 loopback address, and to the IPv6 loopback address. The localhost IP address is 127.0.0.1.'

## Question 2
Here are the combinations of possible phone numbers to be parsed. 

We should be able to get the area code 415, the trunk 867, and the rest of the phone number 5309. 

* 415-867-5309
* 415 657 5039
* 415.567.5467

Use findall to get the required data.

In [71]:
number_list = ['415-867-5309','415 657 5039','415.567.5467']

def parse_phone_numbers(number_list):
    """Print the area code, trunk and extension of each phone number.

    Every entry of ``number_list`` is searched for three digits, an optional
    non-word separator, three more digits, another optional separator and
    one or more digits. The three captured parts are printed; an entry
    without such a sequence prints "No Match". Nothing is returned.
    """
    phone_pattern = re.compile(r'(\d{3})\W?(\d{3})\W?(\d+)')
    labels = ("Area code: ", "Trunk: ", "Ext: ")
    for idx, value in enumerate(number_list):
        print(f"Looking at index number: {idx} with phone number {value}")
        found = phone_pattern.search(value)
        if found is None:
            print("No Match")
            continue
        # groups() yields the three captured parts in pattern order.
        for label, part in zip(labels, found.groups()):
            print(label, part)
        print("\n")
    

In [72]:
# Run the parser on the sample numbers; the results are printed.
parse_phone_numbers(number_list)

Looking at index number: 0 with phone number 415-867-5309
Area code:  415
Trunk:  867
Ext:  5309


Looking at index number: 1 with phone number 415 657 5039
Area code:  415
Trunk:  657
Ext:  5039


Looking at index number: 2 with phone number 415.567.5467
Area code:  415
Trunk:  567
Ext:  5467


