In [1]:
import re

# How to find all matches?

In [5]:
# 'bar' occurs three times in this string, but search() reports only the
# first occurrence it finds.
text = 'barssbarssbar'
re.search(pattern=r'bar', string=text)

<re.Match object; span=(0, 3), match='bar'>

# findall

In [6]:
text = 'barssbarssbar'
pattern = r'bar'

# findall returns every non-overlapping occurrence as a list of strings
matches = re.compile(pattern).findall(text)
print(matches)

['bar', 'bar', 'bar']


## re.compile + finditer

In [4]:
text = 'barssbarssbar'
pattern = re.compile(r'bar')

# finditer yields one match object per occurrence; span() gives the
# (start, end) positions of that occurrence in a single call.
for match in pattern.finditer(text):
    start, end = match.span()
    print(f"'bar' found at position: {start}-{end}")

'bar' found at position: 0-3
'bar' found at position: 5-8
'bar' found at position: 10-13


# Search vs Match

In [12]:
text = "Python is fun"
# search() scans the whole string, so 'fun' at the end is still found.
result = re.search('fun', text)

print("Found!" if result else "Not found!")

Found!


In [14]:
text = "Python is fun"
# match() only tries the pattern at the start of the string, and the string
# starts with 'Python', so the result is None.
result = re.match('fun', text)

print("Found!" if result else "Not found!")

Not found!


# Date Example

In [9]:
text = "Logs received on 2021-12-14 and 2022-01-15. Previous entries were made on 14/12/2020 and 15/01/2021."
# Raw string literal: '\d' inside a normal string is an invalid escape sequence
# and makes Python emit a DeprecationWarning/SyntaxWarning. The pattern value
# itself is unchanged.
# Two alternatives, each in its own capture group:
#   group 1: 4 digits - 2 digits - 2 digits
#   group 2: 2 digits / 2 digits / 4 digits
pattern = r'(\d{4}-\d{2}-\d{2})|(\d{2}/\d{2}/\d{4})'

# With two capture groups, findall returns one 2-tuple per match in which only
# the group that actually matched is non-empty.
matches = re.findall(pattern, text)
# Flatten tuple list and remove empty strings
matches = [date for sublist in matches for date in sublist if date]
print(matches)  # Output: ['2021-12-14', '2022-01-15', '14/12/2020', '15/01/2021']


['2021-12-14', '2022-01-15', '14/12/2020', '15/01/2021']


# Greedy vs Non-Greedy

In [10]:
text = "<div>Text 1</div><div>Text 2</div>"
# '.*' is greedy: it runs to the last '</div>' in the string, so both
# elements come back as one single match.
pattern = r'<div>.*</div>'
result = [hit.group() for hit in re.finditer(pattern, text)]

print(result)

['<div>Text 1</div><div>Text 2</div>']


In [11]:
# '.*?' is the non-greedy form: each match stops at the nearest '</div>'.
# `text` is the string defined in the greedy example.
pattern = r'<div>.*?</div>'
result = [hit.group() for hit in re.finditer(pattern, text)]

print(result)

['<div>Text 1</div>', '<div>Text 2</div>']


# Grouping

In [23]:
# Optional leading whitespace, then "(ddd)ddd-dddd"; each of the three digit
# runs sits inside its own capture group.
res = re.compile(r'\s*\((\d{3})\)(\d{3})-(\d{4})').match('(848)555-4321')

In [24]:
# Print each view of the match on its own line.
print(
    res.group(),   # the whole match
    res.groups(),  # tuple of everything captured with ( )
    res.group(0),  # the whole match again, by explicit index
    res.group(1),  # first ( ) capture
    res.group(2),  # second ( ) capture
    res.group(3),  # third ( ) capture
    sep="\n",
)

(848)555-4321
('848', '555', '4321')
(848)555-4321
848
555
4321


# Parallel Computing 

In [25]:
import pandas as pd
import numpy as np
import time

# Build a reproducible one-column frame of random fruit names.
np.random.seed(0)  # fixed seed so every run draws the same values
N = 1_000_000  # 1 million rows
data = {
    'text_column': np.random.choice(['apple', 'banana', 'cherry'], size=N),
}

df = pd.DataFrame(data)


In [26]:
# Time the vectorised filter. perf_counter() is a monotonic, high-resolution
# clock intended for measuring intervals; time.time() is wall-clock time and
# can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Boolean mask: True for rows whose text contains the whole word "apple".
mask = df['text_column'].str.contains(r'\bapple\b', regex=True)
filtered_df_pandas = df[mask]

end_time = time.perf_counter()
pandas_time = end_time - start_time  # elapsed seconds

print(f"Pandas str.contains() took {pandas_time:.6f} seconds.")


Pandas str.contains() took 0.277800 seconds.


In [27]:
# Baseline for comparison: filter the frame row by row in plain Python.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted.
start_time = time.perf_counter()

filtered_data_for_loop = []
# iterrows() yields an (index, Series) pair for every row.
for index, row in df.iterrows():
    # Plain substring test on the row's text value.
    if "apple" in row['text_column']:
        filtered_data_for_loop.append(row)

# Rebuild a DataFrame from the collected row Series.
filtered_df_for_loop = pd.DataFrame(filtered_data_for_loop)

end_time = time.perf_counter()
for_loop_time = end_time - start_time  # elapsed seconds

print(f"For loop took {for_loop_time:.6f} seconds.")

For loop took 21.695211 seconds.
