### Method/Attribute    : Purpose
match()                 :::::: Determine if the RE matches at the beginning of the string.

search()                :::::: Scan through a string, looking for any location where this RE matches.

findall()               :::::: Find all substrings where the RE matches, and return them as a list.

finditer()              :::::: Find all substrings where the RE matches, and return them as an iterator.

In [None]:
## SINGLE CHARACTERS

#### Use           To match any character
[set]      ::::::    In that set
 
 
[^set]     ::::::    Not in that set
 
 
[a-z]      ::::::    In the a-z range
                        
[^a-z]     ::::::    Not in the a-z range
 
 
.          :::::::   Any except \n (new line)
 
 
\char      :::::::    Escaped special character

In [1]:
## A character set such as [big] matches ONE character that is 'b', 'i' or 'g',
## not the three-letter word "big". To match the whole word use r'big'
## (and the pipe character for alternatives, e.g. r'big|small').

import re

string = 'this is the biggest test'
pattern = r'[big]'

matches = re.findall(pattern, string)
print(matches)



['i', 'i', 'b', 'i', 'g', 'g']


In [2]:
## A leading '^' inside the brackets negates the set: match any single
## character that is NOT one of the listed ones.

string = 'just check out apple carrot'
pattern = r'[^aeiou]'         # everything except the lowercase vowels (spaces included)

non_vowels = re.findall(pattern, string)
print(non_vowels)




['j', 's', 't', ' ', 'c', 'h', 'c', 'k', ' ', 't', ' ', 'p', 'p', 'l', ' ', 'c', 'r', 'r', 't']


In [3]:
## Negated character set: [^aeiou] matches any single character that is not
## a lowercase vowel.

string = 'just check out apple carrot'
pattern = r'[^aeiou]'         # '^' as the first character inside [] negates the set

print(re.findall(pattern=pattern, string=string))

['j', 's', 't', ' ', 'c', 'h', 'c', 'k', ' ', 't', ' ', 'p', 'p', 'l', ' ', 'c', 'r', 'r', 't']


In [4]:
## Ranges can be used inside a set, and several ranges can be combined.

string = 'just check out apple carrot 256'
pattern = r'[a-dx-z0-3]'         # any one of a,b,c,d,x,y,z,0,1,2,3

in_ranges = re.findall(pattern, string)
print(in_ranges)

['c', 'c', 'a', 'c', 'a', '2']


In [5]:
## Outside a set, '.' is a special character that matches everything except a
## new line. Inside a set, as in [.], it loses that meaning and matches only a
## literal dot.

string = 'let us liik at the . dot special text'
pattern = r'[.]'         # literal '.' only

literal_dots = re.findall(pattern, string)
print(literal_dots)

['.']


In [10]:
##  Here we will see how to tackle tabs: r'\t' matches a tab character.
##  re.match() only tries the pattern at the very start of the string, and this
##  string starts with 'l', so the result is None.

string = 'let-us liik--at the . dot          special text'
pattern = r'\t'

tab_at_start = re.match(pattern, string)
print(tab_at_start)

None


In [14]:
# Regular expressions are case sensitive by default; the inline flag (?i)
# switches the pattern to case-insensitive matching, so 'A' finds the 'a'.
string = 'this is a test'
pattern = r'(?i)A'
found = re.search(pattern, string)
print(found)

<_sre.SRE_Match object; span=(8, 9), match='a'>


### .span() returns a tuple containing the start and end positions of the match.
### .string returns the string passed into the function
### .group() returns the part of the string where there was a match

In [15]:
# span: (start, end) positions of the match within the searched string.
# The variable is called `sentence` rather than `str` so the built-in `str`
# type is not shadowed for the rest of the session.
sentence = "The rain in Spain"
x = re.search(r"\bS\w+", sentence)
print(x.span())

(12, 17)


In [16]:
## string: the original text that was passed to the search function.
## (Named `sentence`, not `str`, so the built-in `str` type stays usable.)
sentence = "The rain in Spain"
x = re.search(r"\bS\w+", sentence)
print(x.string)

The rain in Spain


In [17]:
## group: the part of the text where the pattern matched.
## (Named `sentence`, not `str`, so the built-in `str` type stays usable.)
sentence = "The rain in Spain"
x = re.search(r"\bS\w+", sentence)
print(x.group())

Spain


In [23]:
# Each '.' matches any single character, so r'...g' needs three characters
# followed by 'g'; the first place that fits is 'niig' (span 2-6).
match = re.search(pattern=r'...g', string='piniig')
match


<_sre.SRE_Match object; span=(2, 6), match='niig'>

In [24]:
# \b asserts a word boundary and \w+ takes the word characters in between,
# so the whole word 'foobar' is matched.
word_pattern = r'\b\w+\b'
match = re.search(word_pattern, 'foobar')
print(match)

<_sre.SRE_Match object; span=(0, 6), match='foobar'>


In [25]:
 ## With no '^' anchor in the pattern, search() may start the match anywhere,
 ## so r'b\w+' succeeds in the middle of the string:
match = re.search(pattern=r'b\w+', string='foobar')
print(match)

<_sre.SRE_Match object; span=(3, 6), match='bar'>


In [26]:
## Suppose you want to find the email address inside the string
## 'purple alice-b@google.com monkey dishwasher'.
## \w does not match '-' or '.', so this first attempt only captures
## 'b@google'; a character set such as [\w.-] is needed for the full address.
## (The string is named `line`, not `str`, to keep the built-in `str` usable.)
line = 'purple alice-b@google.com monkey dishwasher'
match = re.search(r'\w+@\w+', line)
if match:
    print(match.group())

b@google


### Group extraction:
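* The "group" feature of a regular expression allows you to pick out parts of the matching text.
*  Suppose for the emails problem that we want to extract the username and host separately. To do this, add parentheses ( ) around the username and host in the pattern, as in r'([\w.-]+)@([\w.-]+)'.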


In [22]:
# Parentheses create groups: group(1) is the username, group(2) the host.
# (The string is named `line`, not `str`, to keep the built-in `str` usable.)
line = 'purple alice-b@google.com monkey dishwasher'
match = re.search(r'([\w.-]+)@([\w.-]+)', line)

if match:
    print(match.group())        ## 'alice-b@google.com' (the whole match)
    print(match.group(1))       ## 'alice-b' (the username, group 1)
    print(match.group(2))       ## 'google.com' (the host, group 2)

alice-b@google.com
alice-b
google.com


### findall
findall() is probably the single most powerful function in the re module. Above we used re.search() to find the first match for a pattern. findall() finds *all* the matches and returns them as a list of strings, with each string representing one match.

In [53]:
  ## Suppose we have a text with many email addresses and we want all of them.
  ## (The text is named `line`, not `str`, to keep the built-in `str` usable.)
line = 'purple alice@google.com, blah monkey bob@abc.com blah dishwasher'

  ## Here re.findall() returns a list of all the found email strings
emails = re.findall(r'[\w\.-]+@[\w\.-]+', line) ## ['alice@google.com', 'bob@abc.com']
for email in emails:
    # do something with each found email string
    print(email)

alice@google.com
bob@abc.com


In [69]:
# With exactly one capturing group, findall() returns the text of that group;
# here the group wraps the entire address, so whole emails come back.
mails = 'purple alice@google.com, blah monkey bob@abc.com blah dishwasher'
pattern = r'([\w\.-]+@[\w.-]+)'
found_mails = re.findall(pattern, mails)
found_mails

['alice@google.com', 'bob@abc.com']

In [75]:
# Raw string: the backslash is kept literally, so this contains the two
# characters '\' and 'n' rather than a newline (the repr shows '\\n').
kakanee = r'just look now how am I,\n behaving, unable to understand'
kakanee

'just look now how am I,\\n behaving, unable to understand'

In [90]:
# Two sentences in one string; the literal now agrees with the output recorded
# for this cell ("... times. It was ...").
multi = 'It was the best of times. It was the worst of times.'
print(multi)

It was the best of times. It was the worst of times.


In [96]:
# str.isalpha() is True when the string is non-empty and every character is a letter.
s = 'digits'
s.isalpha()

True

In [101]:
# printf-style formatting: %d takes an integer, %s a string; the values are
# supplied as a tuple on the right-hand side of the % operator.
template = "%d little pigs come out, or I'll %s, and I'll %s, and I'll blow your %s down."
text = template % (3, 'huff', 'puff', 'house')
text

"3 little pigs come out, or I'll huff, and I'll puff, and I'll blow your house down."

In [103]:
# List comprehension with a condition: keep only the values below 25.
numbers = [34,23,2,32]
below_25 = [value for value in numbers if value < 25]
print(below_25)

[23, 2]


In [109]:
## Transform and filter in one comprehension: upper-case the strings that
## start with 'b' and append an extra word.

strings = ['bhawna', 'very', 'innocent', 'girl']
tagged = [word.upper() + ' kakane' for word in strings if word[0] == 'b']
print(tagged)

['BHAWNA kakane']


In [112]:
## Select the fruits whose name contains 'a' and convert them to upper case.
fruits = ['apple', 'cherry', 'banana', 'lemon']

with_a = [name.upper() for name in fruits if 'a' in name]
print(with_a)

['APPLE', 'BANANA']


In [116]:
import re
import requests
the_idiot_url = 'https://www.gutenberg.org/files/2638/2638-0.txt'

def get_book(url):
    """Download a Project Gutenberg text and return the start of its body.

    The returned text begins right after the
    ``*** START OF THIS PROJECT GUTENBERG EBOOK ... ***`` line and ends just
    before the first occurrence of ``II`` that follows it.

    Args:
        url: Address of the plain-text book.

    Returns:
        The selected slice of the downloaded text.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the start marker or the ``II`` stop marker is missing.
    """
    # A timeout keeps the call from hanging forever on a stalled connection;
    # raise_for_status() avoids parsing an HTTP error page as if it were the book.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    raw = response.text

    # Discards the metadata from the beginning of the book
    header = re.search(r"\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*", raw)
    if header is None:
        raise ValueError("Project Gutenberg start marker not found in %r" % url)
    start = header.end()

    # Discards everything from the first "II" onwards. The search begins at
    # `start` so an "II" inside the discarded header cannot produce an empty
    # or truncated slice.
    footer = re.compile(r"II").search(raw, start)
    if footer is None:
        raise ValueError("Stop marker 'II' not found after the header in %r" % url)
    stop = footer.start()

    # Keeps the relevant text
    return raw[start:stop]

def preprocess(sentence):
    """Normalise text for simple word-level processing.

    Every run of characters other than ASCII letters, digits and '.' is
    collapsed into a single space, then the result is lower-cased.
    """
    collapsed = re.sub('[^A-Za-z0-9.]+', ' ', sentence)
    return collapsed.lower()

# Download the book (network access required), normalise it, and print the result.
book = get_book(the_idiot_url)
processed_book = preprocess(book)
print(processed_book)

 produced by martin adamson david widger with corrections by andrew sly the idiot by fyodor dostoyevsky translated by eva martin part i i. towards the end of november during a thaw at nine o clock one morning a train on the warsaw and petersburg railway was approaching the latter city at full speed. the morning was so damp and misty that it was only with great difficulty that the day succeeded in breaking and it was impossible to distinguish anything more than a few yards away from the carriage windows. some of the passengers by this particular train were returning from abroad but the third class carriages were the best filled chiefly with insignificant persons of various occupations and degrees picked up at the different stations nearer town. all of them seemed weary and most of them had sleepy eyes and a shivering expression while their complexions generally appeared to have taken on the colour of the fog outside. when day dawned two passengers in one of the third class carriages fou

In [114]:
# Counts every occurrence of the letters 'the', including inside longer words
# such as 'then' or 'other'; r'\bthe\b' would count the standalone word only.
len(re.findall(r'the', processed_book))

302

In [115]:
# Capitalise a standalone 'i' (one with whitespace on both sides). Only the
# character is checked, not its meaning, so the roman numeral in "part i"
# is capitalised as well.
processed_book = re.sub(r'\si\s', " I ", processed_book)
print(processed_book)

 produced by martin adamson david widger with corrections by andrew sly the idiot by fyodor dostoyevsky translated by eva martin part I i. towards the end of november during a thaw at nine o clock one morning a train on the warsaw and petersburg railway was approaching the latter city at full speed. the morning was so damp and misty that it was only with great difficulty that the day succeeded in breaking and it was impossible to distinguish anything more than a few yards away from the carriage windows. some of the passengers by this particular train were returning from abroad but the third class carriages were the best filled chiefly with insignificant persons of various occupations and degrees picked up at the different stations nearer town. all of them seemed weary and most of them had sleepy eyes and a shivering expression while their complexions generally appeared to have taken on the colour of the fog outside. when day dawned two passengers in one of the third class carriages fou

In [117]:
# Words joined by a double hyphen. '*' allows zero characters, so either side
# of '--' may be empty (e.g. 'I--' or '--though').
re.findall(r'[a-zA-Z0-9]*--[a-zA-Z0-9]*', book)

['ironical--it',
 'malicious--smile',
 'fur--or',
 'astrachan--overcoat',
 'it--the',
 'Italy--was',
 'malady--a',
 'money--and',
 'little--to',
 'No--Mr',
 'is--where',
 'I--I',
 'I--',
 '--though',
 'crime--we',
 'or--judge',
 'gaiters--still',
 '--if',
 'through--well',
 'say--through',
 'however--and',
 'Epanchin--oh',
 'too--at',
 'was--and',
 'Andreevitch--that',
 'everyone--that',
 'reduce--or',
 'raise--to',
 'listen--and',
 'history--but',
 'individual--one',
 'yes--I',
 'but--',
 't--not',
 'me--then',
 'perhaps--',
 'Yes--those',
 'me--is',
 'servility--if',
 'Rogojin--hereditary',
 'citizen--who',
 'least--goodness',
 'memory--but',
 'latter--since',
 'Rogojin--hung',
 'him--I',
 'anything--she',
 'old--and',
 'you--scarecrow',
 'certainly--certainly',
 'father--I',
 'Barashkoff--I',
 'see--and',
 'everything--Lebedeff',
 'about--he',
 'now--I',
 'Lihachof--',
 'Zaleshoff--looking',
 'old--fifty',
 'so--and',
 'this--do',
 'day--not',
 'that--',
 'do--by',
 'know--my',
 'il

In [60]:
import pandas as pd

# Small demo frame; column 'three' mixes integers and strings.
data = {
    'One': [21, 34, 323, 55, 2357],
    'Two': ['werdf', 'ankush@gmail', '23_dkso', 'chalo2delhi', 'lat23_tr'],
    'three': [32, 44, 289, 'ankush21nk', 'pinkwilla'],
}
df = pd.DataFrame(data)

In [61]:
df

Unnamed: 0,One,Two,three
0,21,werdf,32
1,34,ankush@gmail,44
2,323,23_dkso,289
3,55,chalo2delhi,ankush21nk
4,2357,lat23_tr,pinkwilla


In [45]:
# Strip lowercase letters from column 'three'. regex=True is required for
# '[a-z]' to be treated as a pattern: pandas 2.0 changed the default to a
# literal replacement. Entries that are not strings come back as NaN, so
# convert the column with astype('str') first if every row should be kept.
df['four'] = df['three'].str.replace('[a-z]', '', regex=True)

In [62]:
# Convert every value in 'three' to a string so the .str accessor applies to all rows.
df['three'] = df['three'].astype('str')

In [63]:
df.dtypes

One       int64
Two      object
three    object
dtype: object

In [64]:
# Strip lowercase letters from column 'three', leaving only the other
# characters (here: the digits). regex=True is required for '[a-z]' to be
# treated as a pattern: pandas 2.0 changed the default to a literal replacement.
df['four'] = df['three'].str.replace('[a-z]', '', regex=True)

In [65]:
df

Unnamed: 0,One,Two,three,four
0,21,werdf,32,32.0
1,34,ankush@gmail,44,44.0
2,323,23_dkso,289,289.0
3,55,chalo2delhi,ankush21nk,21.0
4,2357,lat23_tr,pinkwilla,


In [None]:
def func(col):
    for i in col:
        re.
        

In [69]:
# NOTE: a pandas Series has no `.re` accessor, so this line raises
# AttributeError; the vectorised equivalent is df['three'].str.findall(r'\d').
df['five'] = df['three'].re.findall('\d')

AttributeError: 'Series' object has no attribute 're'

In [70]:
lists = ['erf23','fddghy54', 'gr543h']

In [104]:
# Collect the digit runs found in each string. The pattern is a raw string
# because '\d' in a normal literal is an invalid escape sequence (a
# SyntaxWarning on recent Python versions).
jk = [re.findall(r'\d+', item) for item in lists]
print(jk)

[['23'], ['54'], ['543']]


In [105]:
## Flatten the nested lists into one flat list of strings.
result = [item for sublist in jk for item in sublist]

print(result)

['23', '54', '543']
