### Objective
- Find and extract a date from raw text, where the date may be written in any of the formats below:
    - 04/20/2009; 04/20/09; 4/20/09; 4/3/09
    - Mar-20-2009; Mar 20, 2009; March 20, 2009;  Mar. 20, 2009; Mar 20 2009;
    - 20 Mar 2009; 20 March 2009; 20 Mar. 2009; 20 March, 2009
    - Mar 20th, 2009; Mar 21st, 2009; Mar 22nd, 2009
    - Feb 2009; Sep 2009; Oct 2010
    - 6/2008; 12/2009 
    - 2009; 2010

- Store the extracted date in a new column, converted to a datetime type

- Data type used for the demonstration: pandas DataFrame

In [1]:
import pandas as pd
from datetime import datetime 
import re 

# Read the raw text file; every line (trailing newline included) becomes one entry
with open('raw_text_with_dates.txt') as file:
    doc = file.readlines()

# Wrap the lines in a Series and preview the first ten entries
df = pd.Series(doc)
df.head(10)

0         03/25/93 Total time of visit (in minutes):\n
1                       6/18/85 Primary Care Doctor:\n
2    sshe plans to move as of 7/8/71 In-Home Servic...
3                7 on 9/27/75 Audit C Score Current:\n
4    2/6/96 sleep studyPain Treatment Pain Level (N...
5                    .Per 7/06/79 Movement D/O note:\n
6    4, 5/18/78 Patient's thoughts about current su...
7    10/24/89 CPT Code: 90801 - Psychiatric Diagnos...
8                         3/7/86 SOS-10 Total Score:\n
9             (4/10/71)Score-1Audit C Score Current:\n
dtype: object

### Functions 

In [2]:
# Convert various date string formats to date type
def parse_date(text):
    """Parse a single date string into a ``datetime``.

    The string is stripped of surrounding whitespace and of English ordinal
    suffixes on day numbers ("20th" -> "20"), then tried against a fixed list
    of ``strptime`` formats; the first format that matches wins. Missing
    components default as ``strptime`` defines them (day 1, month 1).

    Parameters
    ----------
    text : str
        Candidate date string, e.g. "04/20/09", "Mar 20th, 2009", "6/2008".

    Returns
    -------
    datetime.datetime or None
        The parsed date, or None when ``text`` is not a string or matches
        none of the supported formats.

    Notes
    -----
    ``%b`` / ``%B`` match month names of the active locale, so textual months
    are only recognised in English under an English/C locale.
    """
    if not isinstance(text, str):
        return None

    # strptime has no directive for ordinal suffixes, so "Mar 20th, 2009"
    # would fail every format; drop the suffix when it directly follows a digit.
    cleaned = re.sub(r'(?<=\d)(?:st|nd|rd|th)\b', '', text.strip(), flags=re.IGNORECASE)

    # The original list also contained '%-m/%d/%y'. '%-m' is not a valid
    # strptime directive (it raised ValueError, which was silently swallowed),
    # and '%m' already accepts single-digit months, so it is omitted here.
    formats = (
        # numeric month/day/year
        '%m/%d/%Y', '%m/%d/%y', '%m-%d-%y', '%m-%d-%Y',
        # textual month first
        '%b-%d-%Y', '%b %d %Y', '%b, %Y', '%B %d %Y', '%B, %Y',
        # day first
        '%d %b %Y',
        '%B %d, %Y', '%d %B %Y', '%b. %d, %Y', '%B. %d, %Y',
        '%b %Y', '%b %d, %Y', '%B %Y',
        # day-first variants with punctuation after the month
        '%d %b. %Y', '%d %B. %Y', '%d %b, %Y', '%d %B, %Y',
        # month/year and bare year
        '%m/%Y', '%m/%y', '%Y', '%y',
    )
    for fmt in formats:
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
    return None  # no supported format matched

def extract_date(df):
    """Extract one date per text entry and return the results as a DataFrame.

    For every entry the regex patterns are tried from most specific (full
    day/month/year forms) to least specific (bare four-digit year). Every
    match of a pattern is handed to ``parse_date`` in order of appearance,
    and the first candidate that parses is kept.

    Parameters
    ----------
    df : iterable of str
        Raw text entries (the notebook passes a pandas Series of lines).
        Entries that are not strings are reported with no date.

    Returns
    -------
    pandas.DataFrame
        One row per entry with the columns 'original_text' and 'date';
        'date' holds the value returned by ``parse_date`` or None when no
        candidate could be parsed. The two columns are present even when the
        input is empty.
    """
    month = r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*'

    # Regular expressions for all supported date variants, ordered from the
    # highest possibility of a correct date match to the lowest.
    patterns = [re.compile(expr) for expr in (
        r'(\d{1,2}(?:st|nd|rd|th)? ' + month + r'[-,.]?\s*\d{2,4})',
        r'(' + month + r'[-.]?\s* *\d{1,2}(?:st|nd|rd|th)?[-,.]?\s*\d{2,4})',
        r'(' + month + r'[-.,]?\s*\d{4})',
        r'(\d{1,2}[\/-]\d{2}[\/-](\d{4}|\d{2}))',
        r'(\d{1,2}\b[/-]\d{1,2}[/-]?\b(\d{4}|\d{2})\b)',
        r'(\d{1,2}\b[/-]\d{4}\b)',
        # The look-behind stops the year pattern from grabbing the tail of a
        # longer number (e.g. "0801" out of the code "90801").
        r'((?<!\d)\d{4}\b)',
    )]

    output = []
    for text in df:
        parsed_date = None
        if isinstance(text, str):
            # Lazily parse every match of every pattern; stop at the first
            # candidate that parse_date accepts, so an unparseable first hit
            # does not hide a valid date found later by the same pattern.
            candidates = (
                parse_date(match.group(0))
                for pattern in patterns
                for match in pattern.finditer(text)
            )
            parsed_date = next((found for found in candidates if found is not None), None)

        output.append({'original_text': text, 'date': parsed_date})

    # Explicit columns keep the schema stable when there are no rows at all.
    return pd.DataFrame(output, columns=['original_text', 'date'])


In [3]:
# Run the extraction over the loaded lines and preview the first rows of the result
result_df = extract_date(df)
result_df.head()

Unnamed: 0,original_text,date
0,03/25/93 Total time of visit (in minutes):\n,1993-03-25
1,6/18/85 Primary Care Doctor:\n,1985-06-18
2,sshe plans to move as of 7/8/71 In-Home Servic...,1971-07-08
3,7 on 9/27/75 Audit C Score Current:\n,1975-09-27
4,2/6/96 sleep studyPain Treatment Pain Level (N...,1996-02-06
