In [1]:
import pandas as pd
import numpy as np
import re

### search
Returns a Match object if there is a match anywhere in the string

### Escape Codes

You can use special escape codes to find specific types of patterns in your data, such as digits, non-digits,whitespace, and more. For example:

<table border="1" class="docutils">
<colgroup>
<col width="14%" />
<col width="86%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Code</th>
<th class="head">Meaning</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">\d</span></tt></td>
<td>a digit</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">\D</span></tt></td>
<td>a non-digit</td>
</tr>
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">\s</span></tt></td>
<td>whitespace (tab, space, newline, etc.)</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">\S</span></tt></td>
<td>non-whitespace</td>
</tr>
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">\w</span></tt></td>
<td>alphanumeric</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">\W</span></tt></td>
<td>non-alphanumeric</td>
</tr>
</tbody>
</table>

### Meta Characters

In [3]:
sent = 'I was born in year 1996'

In [4]:
rr = re.search("[a-zA-Z]+", sent)
print(rr)

<re.Match object; span=(0, 1), match='I'>


In [5]:
rr = re.findall("[a-zA-Z]+", sent)
print(rr)

['I', 'was', 'born', 'in', 'year']


In [6]:
rr = re.search('[\d]+', sent)
print(rr)

<re.Match object; span=(19, 23), match='1996'>


In [7]:
sent = 'I was born in year 1996'
sent2 = re.sub('[\d]', '', sent)
sent2

'I was born in year '

In [8]:
sent = 'I was born ## @ ?? .  in year`s 1986'

sent2 = re.sub('[@#?\.\`]', '', sent)
sent2

'I was born      in years 1986'

### findall
Returns a list containing all matches

In [9]:
sent = 'the term match is in the middle'
rr = re.findall('match', sent)
print(rr)

['match']


In [10]:
# s followed by zero or more d's
phrase = 'sdsd sssddd sdddsddd dsds dsssss sdddd'
rr = re.findall('sd*', phrase)
print(rr)

['sd', 'sd', 's', 's', 'sddd', 'sddd', 'sddd', 'sd', 's', 's', 's', 's', 's', 's', 'sdddd']


In [11]:
# s followed by one or more d's
phrase = 'sdsd sssddd sdddsddd dsds dsssss sdddd'
rr = re.findall('sd+', phrase)
print(rr)

['sd', 'sd', 'sddd', 'sddd', 'sddd', 'sd', 'sdddd']


In [12]:
# s followed by zero or one d's
phrase = 'sdsd sssddd sdddsddd dsds dsssss sdddd'
rr = re.findall('sd?', phrase)
print(rr)

['sd', 'sd', 's', 's', 'sd', 'sd', 'sd', 'sd', 's', 's', 's', 's', 's', 's', 'sd']


In [13]:
# s followed by three d's
phrase = 'sdsd sssddd sdddsddd dsds dsssss sdddd'
rr = re.findall('sd{3}', phrase)
print(rr)

['sddd', 'sddd', 'sddd', 'sddd']


In [14]:
# s followed by two to three d's
phrase = 'sdsd sssddd sdddsddd dsds dsssss sdddd'
rr = re.findall('sd{2,3}', phrase)
print(rr)

['sddd', 'sddd', 'sddd', 'sddd']


In [15]:
# s or d
phrase = 'sdsd sssddd sdddsddd dsds dsssss sdddd'
rr = re.findall('[sd]', phrase)
print(rr)

['s', 'd', 's', 'd', 's', 's', 's', 'd', 'd', 'd', 's', 'd', 'd', 'd', 's', 'd', 'd', 'd', 'd', 's', 'd', 's', 'd', 's', 's', 's', 's', 's', 's', 'd', 'd', 'd', 'd']


### Exclusion

In [2]:
phrase = 'This is a string! But it has punctuation. How can we remove it?'
rr = re.findall('[^!.?]+', phrase)
print(rr)

['This is a string', ' But it has punctuation', ' How can we remove it']


In [3]:
phrase = 'This is a string! But it has punctuation. How can we remove it?'
rr = re.sub(r'[^\w\s]', '', phrase)
print(rr)

This is a string But it has punctuation How can we remove it


### Character Ranges

In [17]:
phrase = 'This is an example sentence. Lets see if we can find some letters.'
rr = re.findall('[a-z]+', phrase)
print(rr)

['his', 'is', 'an', 'example', 'sentence', 'ets', 'see', 'if', 'we', 'can', 'find', 'some', 'letters']


In [18]:
# sequences of lower or upper case letters
phrase = 'This is an example sentence. Lets see if we can find some letters.'
rr = re.findall('[a-zA-Z]+', phrase)
print(rr)

['This', 'is', 'an', 'example', 'sentence', 'Lets', 'see', 'if', 'we', 'can', 'find', 'some', 'letters']


In [19]:
# one upper case letter followed by lower case letters
phrase = 'This is an example sentence. Lets see if we can find some letters.'
rr = re.findall('[A-Z][a-z]+', phrase)
print(rr)

['This', 'Lets']


### Escape Codes

In [20]:
phrase = 'This is a string with some numbers 1233 and a symbol #hashtag'
rr = re.findall('\d+', phrase)
print(rr)

['1233']


In [21]:
phrase = 'This is a string with some numbers 1233 and a symbol #hashtag'
rr = re.findall('\D+', phrase)
print(rr)

['This is a string with some numbers ', ' and a symbol #hashtag']


In [22]:
phrase = 'This is a string with some numbers 1233 and a symbol #hashtag'
rr = re.findall('\W+', phrase)
print(rr)

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' #']


### Extracting e-mail addresses

In [23]:
phrase = "Hello from shubhamg199630@gmail.com to priya@yahoo.com about the meeting @2PM"
rr = re.findall('\S+@\S+', phrase)
print(rr)

['shubhamg199630@gmail.com', 'priya@yahoo.com']


In [24]:
phrase = "Hello from shubhamg199630@gmail.ac.in to priya@yahoo.com about the meeting @2PM"
re.findall(r'[\w.+-]+@[\w-]+\.[\w.-]+', phrase)

['shubhamg199630@gmail.ac.in', 'priya@yahoo.com']

### Extracting URLs

In [101]:
phrase = "This is a text with some URLs: https://www.google.com, https://www.facebook.com, and https://www.twitter.com"
rr = re.findall(r"https?://\S+", phrase)
print(rr)

['https://www.google.com,', 'https://www.facebook.com,', 'https://www.twitter.com']
