In [1]:
# imports

import pandas as pd
import re

### 1. Write a function named is_vowel. It should accept a string as input and use a regular expression to determine if the passed string is a vowel. While not explicitly mentioned in the lesson, you can treat the result of re.search as a boolean value that indicates whether or not the regular expression matches the given string.

In [2]:
def is_vowel(letter):
    '''
    Report whether `letter` is exactly one vowel (a, e, i, o, u; either case).

    Prints a verdict message and also returns the verdict, so the function
    can be used as a predicate.

    Parameters
    ----------
    letter : str
        The string to test.

    Returns
    -------
    bool
        True when the whole string is a single vowel, otherwise False.
    '''
    # fullmatch requires the entire string to be one vowel. An unanchored
    # search would also accept strings that merely contain a vowel, such
    # as 'hello' or 'ab'.
    matched = re.fullmatch(r'[aeiouAEIOU]', letter) is not None

    if matched:
        print(f'Valid. {letter} is a vowel')
    else:
        print(f'Try again. {letter} is not a vowel.')
    return matched

In [4]:
is_vowel('B')

Try again. B is not a vowel.


In [5]:
is_vowel('o')

Valid. o is a vowel


### 2. Write a function named is_valid_username that accepts a string as input. A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the _ character. It should also be no longer than 32 characters. The function should return either True or False depending on whether the passed string is a valid username.

In [7]:
def is_valid_username(string):
    '''
    Check whether `string` is a valid username.

    A valid username starts with a lowercase letter, contains only
    lowercase letters, digits or `_`, and is at most 32 characters long.
    Prints a verdict message and also returns the verdict.

    Parameters
    ----------
    string : str
        The candidate username.

    Returns
    -------
    bool
        True when the username satisfies every rule above, otherwise False.
    '''
    # One leading lowercase letter plus up to 31 further allowed characters
    # gives the 32-character ceiling. The explicit class [a-z0-9_] is used
    # because \w would also admit uppercase and non-ASCII letters, and
    # fullmatch (unlike a trailing $) rejects a trailing newline.
    valid = re.fullmatch(r'[a-z][a-z0-9_]{0,31}', string) is not None

    if valid:
        print('Valid username')
    else:
        print('Not a valid username')
    return valid

In [8]:
is_valid_username('Snarkle_123$')

Not a valid username


In [9]:
is_valid_username('pao_alii')

Valid username


### 3. Write a regular expression to capture phone numbers. 

In [29]:
# Sample frame: one phone number per row, written in assorted formats

sample_numbers = [
    '(210) 867 5309',
    '+1 210.867.5309',
    '867-5309',
    '210-867-5309',
    '2108675309',
]
df = pd.DataFrame({'number': sample_numbers})

In [30]:
# Phone-number pattern with one named group per component.
# Only the exchange code and line number are required; the country code and
# area code are optional, and runs of non-digit separators are skipped.
# The pattern is a raw string so that \+, \d and \D reach the regex engine
# untouched instead of being parsed as (invalid) string escape sequences.

phone_regex = re.compile(
    r'''
    ^
    (?P<country_code>\+\d+)?    # optional: plus sign followed by digits
    \D*?                        # separators, if any
    (?P<area_code>\d{3})?       # optional: three digits
    \D*?
    (?P<exchange_code>\d{3})    # three digits
    \D*?
    (?P<line_number>\d{4})      # four digits
    $
    ''',
    re.VERBOSE,
)

In [31]:
# Split every phone number into its named components, one column per group

df['number'].str.extract(phone_regex, expand=True)

Unnamed: 0,country_code,area_code,exchange_code,line_number
0,,210.0,867,5309
1,1.0,210.0,867,5309
2,,,867,5309
3,,210.0,867,5309
4,,210.0,867,5309


In [32]:
# Show the raw numbers side by side with their extracted components

phone_parts = df['number'].str.extract(phone_regex)
pd.concat([df, phone_parts], axis='columns')

Unnamed: 0,number,country_code,area_code,exchange_code,line_number
0,(210) 867 5309,,210.0,867,5309
1,+1 210.867.5309,1.0,210.0,867,5309
2,867-5309,,,867,5309
3,210-867-5309,,210.0,867,5309
4,2108675309,,210.0,867,5309


### 4. Use regular expressions to convert the dates below to the standardized year-month-day format.

02/04/19
02/05/19
02/06/19
02/07/19
02/08/19
02/09/19
02/10/19

In [38]:
# Dates to convert, one per line; split() turns the block into a list

date_list = '''
02/04/19
02/05/19
02/06/19
02/07/19
02/08/19
02/09/19
02/10/19
'''.split()

In [39]:
# Match a date written as three two-digit fields separated by slashes.
# The fixed widths and \b anchors keep the pattern from matching inside a
# longer digit run: with open-ended \d+ groups a four-digit year such as
# '02/04/2019' would be captured whole and rewritten into a malformed date.
# Group numbering (1, 2, 3 in left-to-right order) is unchanged.

date_reg = r'\b(\d{2})/(\d{2})/(\d{2})\b'

In [40]:
# Reorder the three captured groups into 20<third>-<first>-<second>

iso_template = r'20\3-\1-\2'
[re.sub(date_reg, iso_template, raw_date) for raw_date in date_list]

['2019-02-04',
 '2019-02-05',
 '2019-02-06',
 '2019-02-07',
 '2019-02-08',
 '2019-02-09',
 '2019-02-10']

### 5. Write a regex to extract the various parts of these logfile lines:

In [41]:
# Sample log lines to parse, one request per line. The literal starts and
# ends with a newline, so it has to be stripped before splitting on '\n'.
lines = """
GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58
"""

In [42]:
# parts:
# GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
# method GET
# path /api/v1/sales?page=86
# timestamp [16/Apr/2019:193452+0000]
# http version HTTP/1.1
# status code {200}
# bytes 510348
# user agent "python-requests/2.21.0"
# ip 97.105.19.58

In [43]:
# Pattern for one log line. It must be used with re.VERBOSE so that the
# layout whitespace and the inline comments are ignored by the regex engine.
# The method and path groups accept any uppercase token and any run of
# non-whitespace, so requests other than GET/POST and paths containing
# characters such as '.', '&' or '%' still match.

regexp = r'''
^
(?P<method>[A-Z]+)                  # request method: GET, POST, PUT, DELETE, ...
\s
(?P<path>\S+)                       # path plus optional query string
\s
\[(?P<timestamp>.+)\]               # text between the square brackets
\s
(?P<http_version>HTTP/\d+\.\d+)
\s
\{(?P<status_code>\d+)\}            # digits between the curly braces
\s
(?P<bytes_out>\d+)
\s
"(?P<user_agent>.+)"                # text between the double quotes
\s
(?P<ip>\d+\.\d+\.\d+\.\d+)          # four dot-separated runs of digits
$'''

In [44]:
log_pattern = re.compile(regexp, re.VERBOSE)
[log_pattern.search(entry).groupdict() for entry in lines.strip().split('\n')]

[{'method': 'GET',
  'path': '/api/v1/sales?page=86',
  'timestamp': '16/Apr/2019:193452+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '200',
  'bytes_out': '510348',
  'user_agent': 'python-requests/2.21.0',
  'ip': '97.105.19.58'},
 {'method': 'POST',
  'path': '/users_accounts/file-upload',
  'timestamp': '16/Apr/2019:193452+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '201',
  'bytes_out': '42',
  'user_agent': 'User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
  'ip': '97.105.19.58'},
 {'method': 'GET',
  'path': '/api/v1/items?page=3',
  'timestamp': '16/Apr/2019:193453+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '429',
  'bytes_out': '3561',
  'user_agent': 'python-requests/2.21.0',
  'ip': '97.105.19.58'}]