In [88]:
import pandas as pd
import re

def convert_date(date_str: str) -> int | str:
    """Convert date strings to year integers where possible."""
    
    date_str = date_str.replace('l', '1')
    try:
        date = int(date_str.lstrip('[').lstrip('-')[:4])
    except ValueError:
        try:
            date = int(date_str.rstrip('.').rstrip(']').rstrip('?')[-4:])
        except ValueError:
            try:
                date = date_str.replace('-', '0').replace('?', '0')
                return int(re.compile(r'(1[5-8]\d{2})').search(re.sub(r'[\[\]\.]', '', date_str)).group(1))
            except Exception as e:
                date = 0
    return date if (date > 1500 and date < 1900) else date_str

In [32]:
corpus = 'eebo'
df = pd.read_csv(f'data/{corpus}_pages_full.csv')


In [89]:
df['date_convert'] = df['date'].apply(convert_date)

In [90]:
convert_date('109 [i.e. 1609]..')

'109 [i.e. 1609]..'

In [None]:
df['date_convert'].unique()

array([1593, 1623, 1590, 1607, 1554, 1553, 1559, 1538, 1547, 1558, 1561,
       1614, 1562, 1568, 1577, 1621, 1615, 1633, 1624, 1548, 1619, 1631,
       1627, 1580, 1605, 1600, 1634, 1639, 1571, 1573, 1609, 1618, 1635,
       1592, 1586, 1584, 1552, 1630, 1589, 1598, 1591, 1617, 1620, 1585,
       1582, 1569, 1599, 1613, 1638, 1583, 1636, 1566, 1581, 1570, 1572,
       1612, 1626, 1642, 1532, 1556, 1550, 1560, 1567, 1616, 1594, 1603,
       1602, 1632, 1519, 1575, 1595, 1625, 1535, 1525, 1555, 1565, 1640,
       1596, 1604, 1606, 1637, 1579, 1622, 1530, 1608, 1629, 1578, 1574,
       1628, 1529, 1610, 1611, 1597, 1517, 1587, 1540, 1563, 1601, 1528,
       1551, 1542, 1588, 1512, 1536, 1564, 1531, 1543,
       'In the yere of our 1orde god. M.D.xxvi. the xxvii. day of Iu1y]',
       1511, 1509, 1537, 1510, '1500?]', 1546, '1495]', 1513, 1539, 1523,
       1515, 1544, 1549, 1576, 1522, 1651, 'ca. 1475?]', 1533, 1545, 1534,
       '1477-1478]', '1484]', 1526, 1516, 1507, 1505, 1557, 1520,

In [97]:
import json
api_key = json.load(open('api_key.json'))['key']

In [104]:
from ollama import chat, Client
from ollama import ChatResponse
from tqdm import tqdm
import os


client = Client(
    host="https://ollama.com",
    headers={'Authorization': 'Bearer ' + api_key}
)

def convert_date_ollama(string: str) -> int | str:
    """Convert date strings to year integers where possible using Ollama.  
    Year should be an integer between 1400 and 1900.
    If not possible, return 'NaN'"""
    response: ChatResponse = client.chat(model='mistral-large-3:675b-cloud', messages=[
        {
    'role': 'system',
    'content': 'Convert historical date strings to year integers where possible. Only return the year as an integer, nothing else.',
        },
        {
        'role': 'user',
        'content': 'Please convert the date string to a year integer: ' + string,
    }
    ])
    # or access fields directly from the response object
    return response.message.content.strip()
string = 'Anno .M.D.XL..'

date_dict = {d : convert_date_ollama(d)for d in tqdm(df['date_convert'].unique()) if isinstance(d, str)}


100%|██████████| 306/306 [01:24<00:00,  3.62it/s] 


In [105]:
date_dict

{'In the yere of our 1orde god. M.D.xxvi. the xxvii. day of Iu1y]': '1526',
 '1500?]': '1500',
 '1495]': '1495',
 'ca. 1475?]': '1475',
 '1477-1478]': '1477',
 '1484]': '1484',
 'ca. 1500]': '1500',
 '1500]': '1500',
 '[1500?]]': '1500',
 '1493]': '1493',
 'In the yere of our 1ord god M. CCCCC. and .xxxix.': '1539',
 '1482?]': '1482',
 '[1497?]]': '1497',
 '1497?]': '1497',
 '[1500]': '1500',
 'Imprinted Anno Dom. cIc. Ic. c.xxiiii.': '1524',
 '1485?]': '1485',
 '1483]': '1483',
 '[1496?]]': '1496',
 '1491]': '1491',
 '1498]': '1498',
 'M.DC.XXVIII.': '1628',
 '1483?]': '1483',
 '1492?]': '1492',
 '169-?]': '169',
 '166-?]': '166',
 '2671 [i.e. 1671]': '1671',
 '168-?]': '168',
 '164-?]': '164',
 '[16--?]': '1600',
 'Printed in the year,': '1584',
 '[166-?]': '166',
 'printed in the year,': '1500',
 'MDCXCVIII.': '1698',
 'M DC XLII.': '1642',
 '16--?]': '1600',
 '[169-?]': '169',
 '16-?]': '16',
 'M DC XCI.': '1691',
 '1983 [i.e. 1683]': '1683',
 '[167-?]': '167',
 'MDCXCI.': '1691',


In [None]:

df.to_csv(f'data/{corpus}_pages_cleaned.csv', index=False)