In [13]:
import re

from datetime import date

## Extracting GPS location

In [6]:
lat = r'([-]?[0-9]?[0-9][.][0-9]{2,10})'
lon = r'([-]?1?[0-9]?[0-9][.][0-9]{2,10})'
sep = r'[,/ ]{1,3}'
re_gps = re.compile(lat + sep + lon)

In [7]:
re_gps.findall('http://...maps/@34.0551066,-118.2496763...')

[('34.0551066', '-118.2496763')]

In [8]:
re_gps.findall("https://www.openstreetmap.org/#map=10/5.9666/116.0566")

[('5.9666', '116.0566')]

In [9]:
re_gps.findall("Zig Zag Cafe is at 45.344, -121.9431 on my GPS.")

[('45.344', '-121.9431')]

## Extracting dates

In [10]:
us = r'((([01]?\d)[-/]([0123]?\d))([-/]([0123]\d)\d\d)?)'
mdy = re.findall(us, 'Santa came 12/25/2017. An elf appeared 12/12.')
mdy

[('12/25/2017', '12/25', '12', '25', '/2017', '20'),
 ('12/12', '12/12', '12', '12', '', '')]

In [11]:
dates = [{'mdy': x[0], 'my': x[1], 'm': int(x[2]), 'd': int(x[3]),
          'y': int(x[4].lstrip('/') or 0), 'c': int(x[5] or 0)} for x in mdy]
dates

[{'mdy': '12/25/2017', 'my': '12/25', 'm': 12, 'd': 25, 'y': 2017, 'c': 20},
 {'mdy': '12/12', 'my': '12/12', 'm': 12, 'd': 12, 'y': 0, 'c': 0}]

In [12]:
for i, d in enumerate(dates):
    for k, v in d.items():
        if not v:
            d[k] = dates[max(i - 1, 0)][k]
dates

[{'mdy': '12/25/2017', 'my': '12/25', 'm': 12, 'd': 25, 'y': 2017, 'c': 20},
 {'mdy': '12/12', 'my': '12/12', 'm': 12, 'd': 12, 'y': 2017, 'c': 20}]

In [14]:
datetimes = [date(d['y'], d['m'], d['d']) for d in dates]
datetimes

[datetime.date(2017, 12, 25), datetime.date(2017, 12, 12)]

In [15]:
eu = r'((([0123]?\d)[-/]([01]?\d))([-/]([0123]\d)?\d\d)?)'
dmy = re.findall(eu, 'Alan Mathison Turing OBE FRS (23/6/1912-7/6/1954) was an English computer scientist.')
dmy

[('23/6/1912', '23/6', '23', '6', '/1912', '19'),
 ('7/6/1954', '7/6', '7', '6', '/1954', '19')]

In [16]:
dmy = re.findall(eu, 'Alan Mathison Turing OBE FRS (23/6/12-7/6/54) was an English computer scientist.')
dmy

[('23/6/12', '23/6', '23', '6', '/12', ''),
 ('7/6/54', '7/6', '7', '6', '/54', '')]

In [17]:
yr_19xx = (r'\b(?P<yr_19xx>' + '|'.join('{}'.format(i) for i in range(30, 100)) + r')\b')
yr_20xx = (r'\b(?P<yr_20xx>' + '|'.join('{:02d}'.format(i) for i in range(10)) + '|' +
           '|'.join('{}'.format(i) for i in range(10, 30)) + r')\b')
yr_cent = r'\b(?P<yr_cent>' + '|'.join('{}'.format(i) for i in range(1, 40)) + r')'
yr_ccxx = r'(?P<yr_ccxx>' + '|'.join('{:02d}'.format(i) for i in range(0, 100)) + r')\b'
yr_xxxx = r'\b(?P<yr_xxxx>(' + yr_cent + ')(' + yr_ccxx + r'))\b'
yr = (r'\b(?P<yr>' + yr_19xx + '|' + yr_20xx + '|' + yr_xxxx + r')\b')
groups = list(re.finditer(yr, "0, 2000, 01, '08, 99, 1984, 2030/1970 85 47 `66"))
full_years = [g['yr'] for g in groups]
full_years

['2000', '01', '08', '99', '1984', '2030', '1970', '85', '47', '66']

In [18]:
yr_19xx

'\\b(?P<yr_19xx>30|31|32|33|34|35|36|37|38|39|40|41|42|43|44|45|46|47|48|49|50|51|52|53|54|55|56|57|58|59|60|61|62|63|64|65|66|67|68|69|70|71|72|73|74|75|76|77|78|79|80|81|82|83|84|85|86|87|88|89|90|91|92|93|94|95|96|97|98|99)\\b'

In [19]:
mon_words = 'January February March April May June July August September October November December'
mon = (r'\b(' + '|'.join('{}|{}|{}|{}|{:02d}'.format(m, m[:4], m[:3], i + 1, i + 1) for i, m in enumerate(mon_words.split())) + r')\b')

In [20]:
re.findall(mon, 'January has 31 days, February the 2nd month of 12, has 28, except in a Leap Year.')

['January', 'February', '12']

### Combining information extraction regular expressions

In [21]:
day = r'|'.join('{:02d}|{}'.format(i, i) for i in range(1, 32))
eu = (r'\b(' + day + r')\b[-,/ ]{0,2}\b(' + mon + r')\b[-,/ ]{0,2}\b(' + yr.replace('<yr', '<eu_yr') + r')\b')
us = (r'\b(' + mon + r')\b[-,/ ]{0,2}\b(' + day + r')\b[-,/ ]{0,2}\b(' + yr.replace('<yr', '<us_yr') + r')\b')

date_pattern = r'\b(' + eu + '|' + us + r')\b'
list(re.finditer(date_pattern, '31 Oct, 1970 25/12/2017'))

[<re.Match object; span=(0, 12), match='31 Oct, 1970'>,
 <re.Match object; span=(13, 23), match='25/12/2017'>]

In [22]:
import datetime

dates = []
for g in groups:
    month_num = (g['us_mon'] or g['eu_mon']).strip()
    try:
        month_num = int(month_num)
    except ValueError:
        month_num = [w[:len(month_num)] for w in mon_words].index(month_num) + 1
    date = datetime.date(int(g['us_yr'] or g['eu_yr']), month_num, int(g['us_day'] or g['eu_day']))
    dates.append(date)
dates

IndexError: no such group