# Daten umwandeln

In [1]:
import re
import csv

In [2]:
# Lookup table from lower-cased month names and abbreviations to the
# zero-padded two-digit month number. Covers German and French spellings
# plus variants such as 'ian', 'iun' and 'iul'.
MONTHS = {
    name: number
    for number, names in (
        ('01', ('januar', 'jan', 'janvier', 'ian')),
        ('02', ('februar', 'feb', 'février', 'febr')),
        ('03', ('märz', 'maerz', 'mrz', 'mars', 'mart')),
        ('04', ('april', 'apr', 'avril')),
        ('05', ('mai',)),
        ('06', ('juni', 'jun', 'juin', 'iun')),
        ('07', ('juli', 'jul', 'juillet', 'iul')),
        ('08', ('august', 'aug', 'août')),
        ('09', ('september', 'sep', 'sept', 'septembre')),
        ('10', ('oktober', 'okt', 'octobre', 'oct')),
        ('11', ('november', 'nov', 'novembre')),
        ('12', ('dezember', 'dez', 'décembre', 'dec')),
    )
    for name in names
}

# Number of days in each month, keyed by the zero-padded month number; both
# keys and values are strings. February is fixed at 28 here, so leap years
# are not reflected in this table.
DAYS_PER_MONTH = {
    f'{month:02d}': str(length)
    for month, length in enumerate(
        (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31), start=1
    )
}


In [None]:
def extract_month(text):
    """Return the month numbers of all month names found in ``text``.

    The text is lower-cased and split into runs of letters, and every run
    that is a key of ``MONTHS`` contributes its two-digit month number.
    Splitting on letter runs (instead of on whitespace) also finds a month
    name that is glued to digits or punctuation, e.g. ``"27.Mai"`` or
    ``"Mai/Juni"``.

    Args:
        text: Free-text date string.

    Returns:
        List of two-digit month strings in order of appearance; empty if
        no known month name occurs.
    """
    # [^\W\d_]+ matches word characters that are neither digits nor
    # underscores, i.e. runs of letters (including accented ones).
    words = re.findall(r'[^\W\d_]+', text.lower())
    return [MONTHS[word] for word in words if word in MONTHS]

def extract_day(text):
    """Return all plausible day-of-month numbers found in ``text``.

    Every stand-alone one- or two-digit number between 1 and 31 is kept
    and returned zero-padded to two digits, in order of appearance.
    """
    found = []
    for number in re.findall(r'\b\d{1,2}\b', text):
        value = int(number)
        if value < 1 or value > 31:
            # Not a possible day of the month.
            continue
        found.append(number.zfill(2))
    return found

def _last_day_of_month(year, month):
    """Return the last day of ``month`` in ``year`` as a two-digit string.

    ``year`` is a four-digit string and ``month`` a two-digit string.
    February yields '29' in leap years.
    """
    # NOTE(review): the leap-year rule below is the proleptic Gregorian one,
    # which keeps the output a valid ISO 8601 date. For sources dated in the
    # Julian calendar it differs only for century years such as 1100 --
    # confirm which rule is wanted.
    if month == '02':
        number = int(year)
        if number % 4 == 0 and (number % 100 != 0 or number % 400 == 0):
            return '29'
    return DAYS_PER_MONTH.get(month, '31')


def _normalize_date_text(original):
    """Strip editorial marks and expand abbreviated year ranges in a date string."""
    # Brackets, parentheses and question marks carry no date information.
    text = re.sub(r'[\(\)\[\]\?]', ' ', original)
    # Slashes and all hyphen/dash variants become one uniform separator.
    text = re.sub(r'[\/]', ' - ', text)
    text = re.sub(r'[\u002D\u2010\u2011\u2012\u2013\u2014\u2015\u2212]', ' - ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Expand an abbreviated second year: "1145 - 46" -> "1145 1146".
    text = re.sub(r'(\d\d)(\d\d) - (\d\d)(?=\s|\.|$)', r'\1\2 \1\3', text)
    # Same expansion for the "1145. 46" notation.
    text = re.sub(r'(\d\d)(\d\d)\. (\d\d)(?=\s|\.|$)', r'\1\2 \1\3', text)
    return text


def _date_bounds(original):
    """Return the ``(notBefore, notAfter)`` pair for one raw date string.

    Both values are ``YYYY-MM-DD`` strings, or empty strings when the text
    contains no four-digit year.
    """
    text = _normalize_date_text(original)

    years = re.findall(r'\b\d{4}\b', text)
    if not years:
        return ('', '')
    months = extract_month(text)
    days = extract_day(text)

    # notBefore: first year/month/day mentioned; missing parts default to
    # the earliest possible value.
    first_month = months[0] if months else '01'
    first_day = days[0] if days else '01'

    # notAfter: last year/month/day mentioned; missing parts default to the
    # latest possible value.
    last_year = years[-1]
    last_month = months[-1] if months else '12'
    last_day = days[-1] if days else _last_day_of_month(last_year, last_month)

    return (
        f"{years[0]}-{first_month}-{first_day}",
        f"{last_year}-{last_month}-{last_day}",
    )


def process_and_save(in_path, out_path):
    """Convert free-text dates into notBefore/notAfter bounds and save them.

    Each line of the input file is treated as one date description. The
    output is a tab-separated file with a ``notBefore``/``notAfter`` header
    followed by exactly one row per input line, so rows stay aligned with
    the input; lines without a four-digit year produce two empty fields.

    Args:
        in_path: Path of the UTF-8 text file to read, one date per line.
        out_path: Path of the UTF-8 TSV file to write (overwritten).
    """
    with open(in_path, 'r', encoding='utf-8') as file:
        result = [_date_bounds(line.strip()) for line in file]

    # write TSV file
    with open(out_path, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        writer.writerow(['notBefore', 'notAfter'])
        writer.writerows(result)

In [None]:
# Run the conversion: read 'date_in.tsv' and write the bounds to 'date_out.tsv'.
process_and_save(in_path='date_in.tsv', out_path='date_out.tsv')

[1145 oder 1146] Mai 27
	1145 oder 1146 Mai 27
	1145 oder 1146 Mai 27
	1145 oder 1146 Mai 27
1145 Oktober 26
	1145 Oktober 26
	1145 Oktober 26
	1145 Oktober 26
[1146] Mai 25
	1146 Mai 25
	1146 Mai 25
	1146 Mai 25
1147 April 20
	1147 April 20
	1147 April 20
	1147 April 20
1147 August 25
	1147 August 25
	1147 August 25
	1147 August 25
1147 September 17
	1147 September 17
	1147 September 17
	1147 September 17
1147 September 22
	1147 September 22
	1147 September 22
	1147 September 22
[1148] April 5
	1148 April 5
	1148 April 5
	1148 April 5
[1150] Mai 3
	1150 Mai 3
	1150 Mai 3
	1150 Mai 3
[1150 ca. Juni 24]
	1150 ca. Juni 24
	1150 ca. Juni 24
	1150 ca. Juni 24
[1151 November]
	1151 November
	1151 November
	1151 November
[1152] Januar 9
	1152 Januar 9
	1152 Januar 9
	1152 Januar 9
[1152] Januar 9
	1152 Januar 9
	1152 Januar 9
	1152 Januar 9
[1152] Januar 9
	1152 Januar 9
	1152 Januar 9
	1152 Januar 9
[1152 Januar 9
	1152 Januar 9
	1152 Januar 9
	1152 Januar 9
1152 Januar 9
	1152 Januar 9
	11