# Data Cleansing
 ## Regular Expressions
https://docs.python.org/3/library/re.html

https://realpython.com/regex-python/

https://realpython.com/regex-python-part-2/

https://developers.google.com/edu/python/regular-expressions

 ## Python's re module

In [1]:
import re

In [2]:
# Separator: one or more consecutive non-word characters.
pattern = r'\W+'

# The text that will be broken apart.
original = 'Words, words, words.'

# Split on every separator run; the final '.' leaves an empty last item.
result = re.compile(pattern).split(original)

# Show the resulting list.
print(result)

['Words', 'words', 'words', '']


In [3]:
# Same separator as before, but wrapped in a capturing group.
pattern = r'(\W+)'

# The text that will be broken apart.
original = 'Words, words, words.'

# With a capturing group the separators themselves are kept in the output.
result = re.split(pattern=pattern, string=original)

# Show the resulting list.
print(result)

['Words', ', ', 'words', ', ', 'words', '.', '']


In [4]:
re.split(r'\W+', 'Words, words, words.', 1)

['Words', 'words, words.']

In [5]:
re.split('[a-f]+', '0a3B9', flags=re.IGNORECASE)

['0', '3', '9']

# Real Python

In [6]:
'abccba' == 'abccba'

True

In [7]:
'abccba' == 'cbaabc'

False

In [8]:
'abc' in 'cbaabc'

True

In [9]:
'cbaabc'.index('a')

2

In [10]:
'cbaabc'[2]

'a'

In [11]:
'cbaaabc'.find('aa')

2

In [12]:
s = 'foo123bar'

re.search('123', s)

<re.Match object; span=(3, 6), match='123'>

In [13]:
s[3:6]

'123'

In [14]:
s = 'foo123bar'
re.search(r'[0-9][0-9][0-9]', s)

<re.Match object; span=(3, 6), match='123'>

In [15]:
re.search(r'[0-9][0-9][0-9]', 'foo456bar')

<re.Match object; span=(3, 6), match='456'>

In [16]:
re.search(r'[0-9][0-9][0-9]', '234baz')

<re.Match object; span=(0, 3), match='234'>

In [17]:
re.search(r'[0-9][0-9][0-9]', 'qux678')

<re.Match object; span=(3, 6), match='678'>

In [18]:
print(re.search(r'[0-9][0-9][0-9]', '12foo34'))

None


In [19]:
re.search(r'[0-9]{3}', 'qux678')

<re.Match object; span=(3, 6), match='678'>

# Google for Education

In [20]:
# Named `text` rather than `str` so the builtin `str` type is not shadowed.
text = 'an example word:cat!!'
# 'word:' followed by exactly three word characters.
match = re.search(r'word:\w\w\w', text)
# search() returns None when nothing matches, so test before calling group().
if match:
    print('found', match.group())  # found word:cat
else:
    print('did not find')

found word:cat


In [21]:
string = 'aaaabaa'
pattern = r'a+'

re.search(pattern, string)

<re.Match object; span=(0, 4), match='aaaa'>

In [22]:
string = 'aaaabaa'
pattern = r'a*'

re.search(pattern, string)

<re.Match object; span=(0, 4), match='aaaa'>

In [23]:
## i+ = one or more i's, as many as possible.
match = re.search(r'pi+', 'piiig') # found, match.group() == "piii"
match

<re.Match object; span=(0, 4), match='piii'>

In [24]:
## Finds the first/leftmost solution, and within it drives the +
## as far as possible (aka 'leftmost and largest').
## In this example, note that it does not get to the second set of i's.
match = re.search(r'i+', 'piigiiii') # found, match.group() == "ii"
match

<re.Match object; span=(1, 3), match='ii'>

In [25]:
## \s* = zero or more whitespace chars
## Look for 3 digits, possibly separated by whitespace, in each sample.
## Expected match.group() values, in order: "1 2   3", "12  3", "123".
three_digits = re.compile(r'\d\s*\d\s*\d')
for sample in ('xx1 2   3xx', 'xx12  3xx', 'xx123xx'):
    match = three_digits.search(sample)  # found in every sample
    print(match)

<re.Match object; span=(2, 9), match='1 2   3'>
<re.Match object; span=(2, 7), match='12  3'>
<re.Match object; span=(2, 5), match='123'>


In [26]:
## ^ = matches the start of string, so this fails:
match = re.search(r'^b\w+', 'foobar') # not found, match == None
match

In [27]:
## but without the ^ it succeeds:
match = re.search(r'b\w+', 'foobar') # found, match.group() == "bar"
match

<re.Match object; span=(3, 6), match='bar'>

In [28]:
# Named `text` rather than `str` so the builtin `str` type is not shadowed.
text = 'purple alice-b@google.com monkey dishwasher'
# \w matches neither '-' nor '.', so the match starts after the hyphen and
# stops before '.com'.
match = re.search(r'\w+@\w+', text)
if match:
    print(match.group())  # b@google

b@google


## Exercise 1
Remember to do these exercises in your own notebook in your assessment repository.

Write a Python function to remove all non-alphanumeric characters from a string.

## Answer
### Removing non-alphanumeric characters from a Python string


1. A simple solution is to use regular expressions for removing non-alphanumeric characters from a string. The idea is to use the special character \W, which matches any character which is not a word character.

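\W matches any character that is not a letter, a digit or an underscore; with the re.ASCII flag it is exactly equivalent to [^a-zA-Z0-9_]. Because the underscore counts as a word character, it is not removed by the examples below.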

In [29]:
 if __name__ == '__main__':
 
    input = "Welcome, User_12!!"
 
    s = re.sub(r'\W+', '', input)
    print(s)    # WelcomeUser_12

WelcomeUser_12


If the expression is used several times in a single program, you should compile and save the resulting regular expression object for reuse:

In [30]:
# Compile once, outside the guarded demo, so the same pattern object can be
# reused by every call. The raw string keeps the backslash in \W literal.
pattern = re.compile(r'\W')

if __name__ == '__main__':

    # Named `text` rather than `input` so the builtin is not shadowed.
    text = "Welcome, User_12!!"

    # Call sub() on the compiled object itself.
    s = pattern.sub('', text)

    print(s)    # WelcomeUser_12

WelcomeUser_12


In [31]:
# A regex-free alternative. Unlike the \W-based versions it also removes
# underscores, because str.isalnum() is False for '_'.

def remove_non_alphanumeric(text):
    """Return ``text`` keeping only the characters for which isalnum() is True."""
    return ''.join(c for c in text if c.isalnum())


if __name__ == '__main__':
    # The argument is passed directly instead of being stored in a variable
    # called `input`, which would shadow the builtin of the same name.
    s = remove_non_alphanumeric("Welcome, User_12!!")
    print(s)    # WelcomeUser12

WelcomeUser12


## Second Part of Real Python's Regexes

In [32]:
re.search(r'(\d+)', 'foo123bar')

<re.Match object; span=(3, 6), match='123'>

In [33]:
re.search(r'[a-z]+', '123FOO456', flags=re.IGNORECASE)

<re.Match object; span=(3, 6), match='FOO'>

In [34]:
print(re.search(r'\d+', 'foo.bar'))

None


In [35]:
re.search(r'\d+', '123foobar')

<re.Match object; span=(0, 3), match='123'>

In [36]:
re.search(r'\d+', 'foo123bar')

<re.Match object; span=(3, 6), match='123'>

In [37]:
re.match(r'\d+', '123foobar')

<re.Match object; span=(0, 3), match='123'>

In [38]:
print(re.match(r'\d+', 'foo123bar'))

None


In [39]:
print(re.fullmatch(r'\d+', '123foobar'))

None


In [40]:
re.fullmatch(r'\d+', '123')

<re.Match object; span=(0, 3), match='123'>

In [41]:
re.search(r'^\d+$', '123')

<re.Match object; span=(0, 3), match='123'>

In [42]:
print(re.search(r'^\d+', 'foo123bar'))

None


In [43]:
print(re.search(r'^\d+', '123foobar'))

<re.Match object; span=(0, 3), match='123'>


In [44]:
print(re.search(r'^\d+$', '123foobar'))

None


In [45]:
re.search(r'\d+', '123foo456bar789.')

<re.Match object; span=(0, 3), match='123'>

In [46]:
re.match(r'\d+', '123foo456bar789.')

<re.Match object; span=(0, 3), match='123'>

In [47]:
re.fullmatch(r'\d+', '123foo456bar789.')

In [48]:
re.findall(r'\d+', '123foo456bar789.')

['123', '456', '789']

https://realpython.com/introduction-to-python-generators/

In [49]:
matches = re.finditer(r'\d+', '123foo456bar789.')
matches

<callable_iterator at 0x2918bc08cd0>

In [50]:
next(matches)

<re.Match object; span=(0, 3), match='123'>

In [51]:
next(matches)

<re.Match object; span=(6, 9), match='456'>

In [52]:
next(matches)

<re.Match object; span=(12, 15), match='789'>

In [53]:
try:
    next(matches)
except StopIteration:
    # Only iterator exhaustion is expected here; any other error should
    # surface instead of being reported as None.
    print(None)

None


In [54]:
# Build a fresh iterator over each run of digits in the sample string.
digit_runs = re.compile(r'\d+')
matches = digit_runs.finditer('123foo456bar789.')

# Consume the iterator, printing one Match object per line.
for match in matches:
    print(match)

<re.Match object; span=(0, 3), match='123'>
<re.Match object; span=(6, 9), match='456'>
<re.Match object; span=(12, 15), match='789'>


 ## re.sub()

In [55]:
s = 'foo.123.bar.789.baz'

In [56]:
re.sub(r'\d+', '#', s)

'foo.#.bar.#.baz'

In [57]:
re.sub('[a-z]+', '(*)', s)

'(*).123.(*).789.(*)'

In [58]:
re.sub(r'([a-z]+)([0-9]+)', r'\2\1', 'foo123bar456')

'123foo456bar'

In [59]:
re.sub(r'(\w+),bar,baz,(\w+)', r'\2,bar,baz,\1', 'foo,bar,baz,qux')

'qux,bar,baz,foo'

 ## Compiling

In [60]:
my_regex = re.compile(r'([0-9]+)')

In [61]:
my_regex

re.compile(r'([0-9]+)', re.UNICODE)

In [62]:
my_regex.search('foo123bar456')

<re.Match object; span=(3, 6), match='123'>

In [63]:
my_regex.findall('foo123bar456')

['123', '456']

In [64]:
my_regex.sub(r'...', 'foo123bar456')

'foo...bar...'

## Regular Expressions on Iris

In [65]:
# https://stackoverflow.com/a/1393367

import urllib.request

url = r'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

# Read the response inside a with-block so the connection is closed as soon
# as every line has been decoded and stripped.
with urllib.request.urlopen(url) as response:
    iris = [line.decode('utf-8').strip() for line in response]

iris

['5.1,3.5,1.4,0.2,Iris-setosa',
 '4.9,3.0,1.4,0.2,Iris-setosa',
 '4.7,3.2,1.3,0.2,Iris-setosa',
 '4.6,3.1,1.5,0.2,Iris-setosa',
 '5.0,3.6,1.4,0.2,Iris-setosa',
 '5.4,3.9,1.7,0.4,Iris-setosa',
 '4.6,3.4,1.4,0.3,Iris-setosa',
 '5.0,3.4,1.5,0.2,Iris-setosa',
 '4.4,2.9,1.4,0.2,Iris-setosa',
 '4.9,3.1,1.5,0.1,Iris-setosa',
 '5.4,3.7,1.5,0.2,Iris-setosa',
 '4.8,3.4,1.6,0.2,Iris-setosa',
 '4.8,3.0,1.4,0.1,Iris-setosa',
 '4.3,3.0,1.1,0.1,Iris-setosa',
 '5.8,4.0,1.2,0.2,Iris-setosa',
 '5.7,4.4,1.5,0.4,Iris-setosa',
 '5.4,3.9,1.3,0.4,Iris-setosa',
 '5.1,3.5,1.4,0.3,Iris-setosa',
 '5.7,3.8,1.7,0.3,Iris-setosa',
 '5.1,3.8,1.5,0.3,Iris-setosa',
 '5.4,3.4,1.7,0.2,Iris-setosa',
 '5.1,3.7,1.5,0.4,Iris-setosa',
 '4.6,3.6,1.0,0.2,Iris-setosa',
 '5.1,3.3,1.7,0.5,Iris-setosa',
 '4.8,3.4,1.9,0.2,Iris-setosa',
 '5.0,3.0,1.6,0.2,Iris-setosa',
 '5.0,3.4,1.6,0.4,Iris-setosa',
 '5.2,3.5,1.5,0.2,Iris-setosa',
 '5.2,3.4,1.4,0.2,Iris-setosa',
 '4.7,3.2,1.6,0.2,Iris-setosa',
 '4.8,3.1,1.6,0.2,Iris-setosa',
 '5.4,3.

In [66]:
# One literal per capture group; adjacent string literals are concatenated,
# so the compiled pattern is unchanged.
strip_iris = re.compile(
    r'([0-9]\.[0-9]),'   # \1: first numeric field
    r'([0-9]\.[0-9]),'   # \2: second numeric field
    r'([0-9]\.[0-9]),'   # \3: third numeric field
    r'([0-9]\.[0-9]),'   # \4: fourth numeric field
    r'Iris-([a-z]+)'     # \5: lower-case name following the 'Iris-' prefix
)

In [67]:
[strip_iris.sub(r'\5,\4,\3,\2,\1', line) for line in iris if line]

['setosa,0.2,1.4,3.5,5.1',
 'setosa,0.2,1.4,3.0,4.9',
 'setosa,0.2,1.3,3.2,4.7',
 'setosa,0.2,1.5,3.1,4.6',
 'setosa,0.2,1.4,3.6,5.0',
 'setosa,0.4,1.7,3.9,5.4',
 'setosa,0.3,1.4,3.4,4.6',
 'setosa,0.2,1.5,3.4,5.0',
 'setosa,0.2,1.4,2.9,4.4',
 'setosa,0.1,1.5,3.1,4.9',
 'setosa,0.2,1.5,3.7,5.4',
 'setosa,0.2,1.6,3.4,4.8',
 'setosa,0.1,1.4,3.0,4.8',
 'setosa,0.1,1.1,3.0,4.3',
 'setosa,0.2,1.2,4.0,5.8',
 'setosa,0.4,1.5,4.4,5.7',
 'setosa,0.4,1.3,3.9,5.4',
 'setosa,0.3,1.4,3.5,5.1',
 'setosa,0.3,1.7,3.8,5.7',
 'setosa,0.3,1.5,3.8,5.1',
 'setosa,0.2,1.7,3.4,5.4',
 'setosa,0.4,1.5,3.7,5.1',
 'setosa,0.2,1.0,3.6,4.6',
 'setosa,0.5,1.7,3.3,5.1',
 'setosa,0.2,1.9,3.4,4.8',
 'setosa,0.2,1.6,3.0,5.0',
 'setosa,0.4,1.6,3.4,5.0',
 'setosa,0.2,1.5,3.5,5.2',
 'setosa,0.2,1.4,3.4,5.2',
 'setosa,0.2,1.6,3.2,4.7',
 'setosa,0.2,1.6,3.1,4.8',
 'setosa,0.4,1.5,3.4,5.4',
 'setosa,0.1,1.5,4.1,5.2',
 'setosa,0.2,1.4,4.2,5.5',
 'setosa,0.1,1.5,3.1,4.9',
 'setosa,0.2,1.2,3.2,5.0',
 'setosa,0.2,1.3,3.5,5.5',
 

## Exercise 2
Remember to do these exercises in your own notebook in your assessment repository.

Adapt the above code to capitalise the first letter of the iris species, using regular expressions.

In [68]:
# Reverse the column order exactly as in the cell above.
x = [strip_iris.sub(r'\5,\4,\3,\2,\1', line) for line in iris if line]
# Capitalise with a regular expression, as the exercise asks: match one
# lower-case letter at the start of each line and replace it with its
# upper-case form via a replacement function.
xa = [re.sub(r'^[a-z]', lambda m: m.group().upper(), i) for i in x]
xa

['Setosa,0.2,1.4,3.5,5.1',
 'Setosa,0.2,1.4,3.0,4.9',
 'Setosa,0.2,1.3,3.2,4.7',
 'Setosa,0.2,1.5,3.1,4.6',
 'Setosa,0.2,1.4,3.6,5.0',
 'Setosa,0.4,1.7,3.9,5.4',
 'Setosa,0.3,1.4,3.4,4.6',
 'Setosa,0.2,1.5,3.4,5.0',
 'Setosa,0.2,1.4,2.9,4.4',
 'Setosa,0.1,1.5,3.1,4.9',
 'Setosa,0.2,1.5,3.7,5.4',
 'Setosa,0.2,1.6,3.4,4.8',
 'Setosa,0.1,1.4,3.0,4.8',
 'Setosa,0.1,1.1,3.0,4.3',
 'Setosa,0.2,1.2,4.0,5.8',
 'Setosa,0.4,1.5,4.4,5.7',
 'Setosa,0.4,1.3,3.9,5.4',
 'Setosa,0.3,1.4,3.5,5.1',
 'Setosa,0.3,1.7,3.8,5.7',
 'Setosa,0.3,1.5,3.8,5.1',
 'Setosa,0.2,1.7,3.4,5.4',
 'Setosa,0.4,1.5,3.7,5.1',
 'Setosa,0.2,1.0,3.6,4.6',
 'Setosa,0.5,1.7,3.3,5.1',
 'Setosa,0.2,1.9,3.4,4.8',
 'Setosa,0.2,1.6,3.0,5.0',
 'Setosa,0.4,1.6,3.4,5.0',
 'Setosa,0.2,1.5,3.5,5.2',
 'Setosa,0.2,1.4,3.4,5.2',
 'Setosa,0.2,1.6,3.2,4.7',
 'Setosa,0.2,1.6,3.1,4.8',
 'Setosa,0.4,1.5,3.4,5.4',
 'Setosa,0.1,1.5,4.1,5.2',
 'Setosa,0.2,1.4,4.2,5.5',
 'Setosa,0.1,1.5,3.1,4.9',
 'Setosa,0.2,1.2,3.2,5.0',
 'Setosa,0.2,1.3,3.5,5.5',
 

### Resources:
1. https://www.techiedelight.com/remove-non-alphanumeric-characters-string-python/
2. https://bobbyhadz.com/blog/python-remove-non-alphanumeric-characters-from-string
3. https://thispointer.com/python-remove-all-non-alphanumeric-characters-from-string/
4. https://www.delftstack.com/howto/python/remove-non-alphanumeric-characters-python/
5. https://docs.intersystems.com/irislatest/csp/docbook/DocBook.UI.Page.cls?KEY=GCOS_REGEXP
6. https://stackoverflow.com/questions/6251463/regex-capitalize-first-letter-every-word-also-after-a-special-character-like-a
7. https://docs.intersystems.com/irislatest/csp/docbook/DocBook.UI.Page.cls?KEY=GCOS_REGEXP
8. https://favtutor.com/blogs/capitalize-first-letter-python
9. https://www.geeksforgeeks.org/string-capitalize-python/