## Regex Exercises

In [1]:
import pandas as pd
import re

1. Write a function named is_vowel. It should accept a string as input and use a regular expression to determine if the passed string is a vowel. While not explicity mentioned in the lesson, you can treat the result of re.search as a boolean value that indicates whether or not the regular expression matches the given string.

In [None]:
regexp = r'^[aeiouAEIOU]$'
subject = 'ab'

re.findall(regexp, subject)

In [None]:
def is_vowel(subject):
    '''
    This function will take in a string and look for an exact match for a single character vowel. 
    It will return a boolean value.
    '''
    regexp = r'^[aeiouAEIOU]$'
    
    vowel = re.search(regexp, subject)
    
    return bool(vowel)
    


In [None]:
is_vowel('aa')

2. Write a function named is_valid_username that accepts a string as input. A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the _ character. It should also be no longer than 32 characters. The function should return either True or False depending on whether the passed string is a valid username.

In [None]:
regexp = r'^[a-z]{1}[0-9_]*[^A-Z]{,31}$'
subject = 'ab12312adfsa12312_asdasd123123_0'

re.search(regexp, subject)

In [None]:
def is_valid_username(subject):
    '''
    This function accepts a username as a string and returns a boolean value based on whether or not the username meets the following requirements: 
    - starts with a lowercase letter
    - only consists of lowercase letter, numbers, or the '_' character
    - no longer than 32 characters
    '''
    
    regexp = r'^[a-z][a-z0-9_]{,31}$'
    
    username = re.search(regexp, subject)
    
    return bool(username)
    
    

In [None]:
is_valid_username('a12312_123123_0')

3. Write a regular expression to capture phone numbers. It should match all of the following:
- (210) 867 5309
- +1 210.867.5309
- 867-5309
- 210-867-5309

In [None]:
# Madeleine
df = pd.DataFrame()
df['number'] = [
    '(210) 867 5309',
    '+1 210.867.5309',
    '867-5309',
    '2108675309',
]


In [None]:
phone_regex = re.compile(
'''
^ 
(?P<country_code>\+\d+)?
\D*?
(?P<area_code>\d{3})?
\D*?
(?P<exchange_code>\d{3})
\D*?
(?P<line_number>\d{4})
\D*
$
''', re.VERBOSE)

In [None]:
df.number.str.extract(phone_regex)

In [None]:
def capture_phone_numbers(target):
    '''
    This function takes in a string and returns a boolean value based on whether or not the string is a valid phone number:
    - may start with '+', '(' or any digit
    - may be 8 to 15 characters
    - may contain whitespace
    '''
    
    # Create a blank dataframe
    df = pd.DataFrame()
    
    # assign the target variable list to a column in the df
    df['input_number'] = target
    
    # create the regexp to compile the sections of the phone numbers
    phone_regex = re.compile(
                            '''
                            ^ 
                            (?P<country_code>\+\d+)?
                            \D*?
                            (?P<area_code>\d{3})?
                            \D*?
                            (?P<exchange_code>\d{3})
                            \D*?
                            (?P<line_number>\d{4})
                            \D*
                            $
                            ''', re.VERBOSE)
    
    # Output results to the dataframe
    df = df['input_number'].str.extract(phone_regex)
    
    # creates a column with the original input
    df['input_number'] = target
    
    return df

In [None]:
test_list = [
    '(210) 867 5309',
    '+1 210.867.5309',
    '867-5309',
    '2108675309',
]

In [None]:
phones = capture_phone_numbers(test_list)

In [None]:
phones

4. Use regular expressions to convert the dates below to the standardized year-month-day format.
- 02/04/19
- 02/05/19
- 02/06/19
- 02/07/19
- 02/08/19
- 02/09/19
- 02/10/19

In [None]:
# currently in MM/DD/YY
# need to convert to YYYY-MM-DD
# 3 capture groups separated by '/', each two digit, although should be built to accept 1 digit month and day

In [None]:
# define the list of dates
dates_list = [
    '02/04/19',
    '02/05/19',
    '02/06/19',
    '02/07/19',
    '02/08/19',
    '02/09/19',
    '02/10/19']

In [None]:
# create our three capture groups, separated by '/''
# date_reg = r'(\d{1,2})/(\d{1,2})/(d{2})'
date_reg = r'(\d+)/(\d+)/(\d+)'


In [None]:
re.sub(date_reg, r'20\3-\1-\2',dates_list[0])

In [None]:
def convert_date_format(target):
    '''
    
    '''
    
    # Create a blank dataframe
    df = pd.DataFrame()
    
    # assign the target variable list to a column in the df
    df['input_date'] = target
        
    # create the regexp to compile the sections of the phone numbers
    date_regexp = r'(\d+)/(\d+)/(\d+)'

    # create output format
    output = r'20\3-\1-\2'
        
    # create new column of converted dates
    df['converted_date'] = [re.sub(date_regexp, output, i) for i in target]
    
    # convert to datetime
    df['converted_date'] = pd.to_datetime(df['converted_date'])
    
    return df

In [None]:
new_df = convert_date_format(dates_list)

In [None]:
new_df

In [None]:
new_df.info()

5. Write a regex to extract the various parts of these logfile lines:
- GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
- POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; -  - Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
- GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58

In [None]:
lines = """
GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58
"""

In [None]:
# parts:
# GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
# method GET
# path /api/v1/sales?page=86
# timestamp [16/Apr/2019:193452+0000]
# http version HTTP/1.1
# status code {200}
# bytes 510348
# user agent "python-requests/2.21.0"
# ip 97.105.19.58

In [None]:
regexp = r'''
^
(?P<method>GET|POST)
\s
(?P<path>/[/\w\-\?=]+)
\s
\[(?P<timestamp>.+)\]
\s
(?P<http_version>HTTP/\d+\.\d+)
\s
\{(?P<status_code>\d+)\}
\s
(?P<bytes>\d+)
\s
"(?P<user_agent>.+)"
\s
(?P<ip>\d+\.\d+\.\d+\.\d+)
$
'''

In [None]:
[re.search(regexp, line, re.VERBOSE).groupdict() for line in lines.strip().split('\n')]

In [None]:
# method, endpoint, date, protocol, http_status_code, some_number, "user_agent", ip_address

regex = r'''
(?P<method>[A-Z]+)
\s
(?P<path>.*)
\s
\[(?P<timestamp>.*)\]
\s
HTTP/1.1
\s
{(?P<status>\d+)}
\s
(?P<bytes_sent>\d+)
\s
"(?P<user_agent>.*)"
\s+
(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
'''

regex = re.compile(regex, re.VERBOSE)

df = pd.DataFrame()
df['input_line'] = lines.strip().split('\n')
df = pd.concat([df, df.line.str.extract(regex)], axis=1)
df

In [None]:
def extract_lines(target):
    '''
    This function takes in a string of logfiles. It creates an empty pandas DataFrame. 
    Creates an 'input_line' column that splits the original string by line, and returns the original input.
    Finally, it extracts the following sections of the original line, and returns a new column for each:
    - method
    - path
    - timestamp
    - status
    - bytes_sent
    - user_agent
    - ip
    '''
    
    # (?P<method>[A-Z]+) = begins with 1 or more cap letters, stored as 'method'
    # \s = separated by whitespace
    # (?P<path>.*) = 'path' could be any character(s) of any length
    # \s = separated by whitespace
    # HTTP/1.1 = literall HTTP/1.1
    # \s = separated by whitespace
    # {(?P<status>\d+)} = 'status' of 1 or more non-digit characters
    # \s = separated by whitespace
    # (?P<bytes_sent>\d+) = 'bytes_sent' of 1 or more digit characters
    # \s = separated by whitespace
    # "(?P<user_agent>.*)" = 'user_agent' inside "" of any character(s) zero or more times
    # \s = separated by whitespace 1 or more times
    # (?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) = 'ip' of 1 to 3 digits, '.' 4x


    
    regexp = r'''
(?P<method>[A-Z]+)
\s
(?P<path>.*)
\s
\[(?P<timestamp>.*)\]
\s
HTTP/1.1
\s
{(?P<status>\d+)}
\s
(?P<bytes_sent>\d+)
\s
"(?P<user_agent>.*)"
\s+
(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
'''
    # compiles the VERBOSE regexp
    regexp = re.compile(regexp, re.VERBOSE)
    
    # creates empty pandas DataFrame
    df = pd.DataFrame()
    
    # creates 'input_line' column of original data
    df['input_line'] = lines.strip().split('\n')
    
    # concatenates 'input_line' and the extracted regexp data
    df = pd.concat([df, df['input_line'].str.extract(regexp)], axis=1)
    
    return df

In [None]:
output = extract_lines(lines)

In [None]:
output.info()

In [None]:
output

## Bonus

You can find a list of words on your mac at /usr/share/dict/words. Use this file to answer the following questions:
- How many words have at least 3 vowels?
- How many words have at least 3 vowels in a row?
- How many words have at least 4 consonants in a row?
- How many words start and end with the same letter?
- How many words start and end with a vowel?
- How many words contain the same letter 3 times in a row?
- What other interesting patterns in words can you find?

#### Bring the data in

In [2]:
# bring in the data
# drop null values
# convert all to lowercase
words = pd.read_csv('/usr/share/dict/words', header=None, squeeze=True).dropna()
words = words.str.lower()

In [30]:
# let's take a quick peak...
words.head()

0        a
1        a
2       aa
3      aal
4    aalii
Name: 0, dtype: object

In [31]:
def get_words():
    '''
    This function looks for a locally stored list of words. It read them in, drops nulls, 
    and converts all characters to lowercase. Returns a pandas Series.
    '''
    # reads in the locally stored list of words, and drops null values
    words = pd.read_csv('/usr/share/dict/words', header=None, squeeze=True).dropna()
    
    # converts all characters to lowercase
    words = words.str.lower()
    
    return words

#### How many words have at least 3 vowels?

In [32]:
# create a variable of words that contain at least three vowels
vowels_3_or_more = (words.str.count(r'[aeiou]') >=3)

In [33]:
# sum the count...
total = vowels_3_or_more.sum()
total

191365

#### How many words have at least 3 vowels in a row?


In [34]:
vowels_3_in_a_row = (words.str.count(r'[aeiou]{3,}'))

In [35]:
vowels_3_in_a_row.sum()

6250

#### How many words have at least 4 consonants in a row?

In [36]:
cons_4_in_row = (words.str.count(r'[^aeiou]{4,}'))

In [37]:
cons_4_in_row.sum()

19601

#### How many words start and end with the same letter?

In [38]:
# creates a first capture group that starts with any character
# may or may not have middle characters
# ends with the same character as capture group 1
start_end_same = (words.str.contains(r'^(\w).*\1$'))

  return func(self, *args, **kwargs)


In [39]:
start_end_same.sum()

11452

#### How many words start and end with a vowel?

In [40]:
vowel_start_end = (words.str.contains(r'^[aeiou].*[aeiou]$'))

In [41]:
vowel_start_end.sum()

14657

#### How many words contain the same letter 3 times in a row?

In [42]:
# first capture group contains any character
# repeated exactly 2 more times
same_char_3 = (words.str.contains(r'(\w)\1\1'))

In [43]:
same_char_3.sum()

7

#### What other interesting patterns in words can you find?

In [3]:
# can I find words with more vowels than consonants?

In [4]:
words.head()

0        a
1        a
2       aa
3      aal
4    aalii
Name: 0, dtype: object

In [15]:
# make an empty DataFrame
df = pd.DataFrame()

In [16]:
# create input data column
df['input_data'] = words

In [17]:
# create vowel count column
df['vowel_count'] = words.str.count(r'[aeiou]')

In [18]:
# create consonant count column
df['cons_count'] = words.str.count(r'[^aeiou]')

In [19]:
df['mas_vowels'] = df['vowel_count'] > df['cons_count']

In [21]:
df['mas_vowels'].sum()

14201

In [22]:
def more_vowels(target_list):
    '''
    This function takes in a pandas Series in an attempt to determine if each string has more vowels.
    First, we create an empty pandas DataFrame. Then an 'input_data' column that contains the target_list.
    A 'vowel_count' column is created that has a sum of each strings vowel count.
    A 'cons_count' column is created that has a sum of each strings consonant count.
    A 'mas_vowels' column is created that holds a boolean value of whether or not the string has more vowels than consonants.
    Finally, an update pandas DataFrame, along with a total count are returned.
    '''
    
    # make an empty DataFrame
    df = pd.DataFrame()
    
    # create input data column
    df['input_data'] = words
    
    # create vowel count column
    df['vowel_count'] = words.str.count(r'[aeiou]')
    
    # create consonant count column
    df['cons_count'] = words.str.count(r'[^aeiou]')
    
    # create a more vowels column
    df['mas_vowels'] = df['vowel_count'] > df['cons_count']
    
    # assign a variable to the sum of strings with more vowels than consonants
    target_count = df['mas_vowels'].sum()
    
    return df, target_count
    

<class 'pandas.core.frame.DataFrame'>
Int64Index: 235884 entries, 0 to 235885
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   input_data   235884 non-null  object
 1   vowel_count  235884 non-null  int64 
 2   cons_count   235884 non-null  int64 
 3   mas_vowels   235884 non-null  bool  
dtypes: bool(1), int64(2), object(1)
memory usage: 7.4+ MB


0                  a
1                  a
2                 aa
3                aal
4              aalii
             ...    
235881        zythem
235882        zythia
235883        zythum
235884       zyzomys
235885    zyzzogeton
Name: 0, Length: 235713, dtype: object